Example #1
    def sparkSession(cls):
        if not hasattr(cls, "spark"):
            # We can't use the SparkSession Builder here, since we need to call
            # Scala side's SmvTestHive.createContext to create the HiveTestContext's
            # SparkSession.
            # So we need to
            #   * Create a java_gateway
            #   * Create a SparkConf using the jgw (since without it SparkContext will ignore the given conf)
            #   * Create python SparkContext using the SparkConf (so we can specify the warehouse.dir)
            #   * Create Scala side HiveTestContext SparkSession
            #   * Create python SparkSession
            jgw = launch_gateway(None)
            jvm = jgw.jvm
            import tempfile
            import getpass
            hivedir = "file://{0}/{1}/smv_hive_test".format(tempfile.gettempdir(), getpass.getuser())
            sConf = SparkConf(False, _jvm=jvm).set("spark.sql.test", "")\
                                              .set("spark.sql.hive.metastore.barrierPrefixes",
                                                   "org.apache.spark.sql.hive.execution.PairSerDe")\
                                              .set("spark.sql.warehouse.dir", hivedir)\
                                              .set("spark.ui.enabled", "false")
            sc = SparkContext(master="local[1]", appName="SMV Python Test", conf=sConf, gateway=jgw).getOrCreate()
            jss = sc._jvm.org.apache.spark.sql.hive.test.SmvTestHive.createContext(sc._jsc.sc())
            cls.spark = SparkSession(sc, jss.sparkSession())
        return cls.spark
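For comparison, when no Hive test context is needed, the same gateway-first pattern reduces to a plain local SparkSession. A minimal sketch, assuming local mode and default settings (the app name and variable names here are illustrative, not taken from the example above):

from pyspark import SparkConf, SparkContext
from pyspark.java_gateway import launch_gateway
from pyspark.sql import SparkSession

# Launch the py4j gateway first so the SparkConf is backed by the same JVM.
jgw = launch_gateway(None)
conf = SparkConf(False, _jvm=jgw.jvm).set("spark.ui.enabled", "false")
sc = SparkContext(master="local[1]", appName="gateway-demo", conf=conf, gateway=jgw)
spark = SparkSession(sc)  # no Scala-side test helper required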
Example #2
    def _ensure_initialized(cls, instance=None, gateway=None, conf=None):
        """
        Checks whether a SparkContext is initialized or not.
        Throws error if a SparkContext is already running.
        """
        with SparkContext._lock:
            if not SparkContext._gateway:
                SparkContext._gateway = gateway or launch_gateway(conf)
                SparkContext._jvm = SparkContext._gateway.jvm

            if instance:
                if (SparkContext._active_spark_context
                        and SparkContext._active_spark_context != instance):
                    currentMaster = SparkContext._active_spark_context.master
                    currentAppName = SparkContext._active_spark_context.appName
                    callsite = SparkContext._active_spark_context._callsite

                    # Raise error if there is already a running Spark context
                    raise ValueError(
                        "Cannot run multiple SparkContexts at once; "
                        "existing SparkContext(app=%s, master=%s)"
                        " created by %s at %s:%s " %
                        (currentAppName, currentMaster, callsite.function,
                         callsite.file, callsite.linenum))
                else:
                    SparkContext._active_spark_context = instance
Example #3
    def _ensure_initialized(cls, instance=None, gateway=None, conf=None):
        """
        Checks whether a SparkContext is initialized or not.
        Throws error if a SparkContext is already running.
        """
        with SparkContext._lock:
            if not SparkContext._gateway:
                SparkContext._gateway = gateway or launch_gateway(conf)
                SparkContext._jvm = SparkContext._gateway.jvm

            if instance:
                if (SparkContext._active_spark_context and
                        SparkContext._active_spark_context != instance):
                    currentMaster = SparkContext._active_spark_context.master
                    currentAppName = SparkContext._active_spark_context.appName
                    callsite = SparkContext._active_spark_context._callsite

                    # Raise error if there is already a running Spark context
                    raise ValueError(
                        "Cannot run multiple SparkContexts at once; "
                        "existing SparkContext(app=%s, master=%s)"
                        " created by %s at %s:%s "
                        % (currentAppName, currentMaster,
                            callsite.function, callsite.file, callsite.linenum))
                else:
                    SparkContext._active_spark_context = instance
Example #4
            def __init__(self):
                spark_conf = SparkConf()
                spark_conf.setAppName(spark_nlp_config.app_name)
                spark_conf.setMaster(spark_nlp_config.master)
                spark_conf.set("spark.driver.memory", memory)
                spark_conf.set("spark.serializer", spark_nlp_config.serializer)
                spark_conf.set("spark.kryoserializer.buffer.max",
                               spark_nlp_config.serializer_max_buffer)
                spark_conf.set("spark.driver.maxResultSize",
                               spark_nlp_config.driver_max_result_size)

                # Check the GPU + Spark 3.2 combination first so it is not
                # shadowed by the plain Spark 3.2 branch.
                if gpu and spark32:
                    spark_conf.set("spark.jars.packages",
                                   spark_nlp_config.maven_gpu_spark32)
                elif spark32:
                    spark_conf.set("spark.jars.packages",
                                   spark_nlp_config.maven_spark32)
                elif gpu:
                    spark_conf.set("spark.jars.packages",
                                   spark_nlp_config.maven_gpu_spark)
                else:
                    spark_conf.set("spark.jars.packages",
                                   spark_nlp_config.maven_spark)

                if cache_folder != '':
                    spark_conf.set(
                        "spark.jsl.settings.pretrained.cache_folder",
                        cache_folder)
                if log_folder != '':
                    spark_conf.set(
                        "spark.jsl.settings.annotator.log_folder", log_folder)
                if cluster_tmp_dir != '':
                    spark_conf.set(
                        "spark.jsl.settings.storage.cluster_tmp_dir",
                        cluster_tmp_dir)

                # Make the py4j JVM stdout and stderr available without buffering
                popen_kwargs = {
                    'stdout': subprocess.PIPE,
                    'stderr': subprocess.PIPE,
                    'bufsize': 0
                }

                # Launch the gateway with our custom settings
                self.gateway = launch_gateway(conf=spark_conf,
                                              popen_kwargs=popen_kwargs)
                self.process = self.gateway.proc
                # Use the gateway we launched
                spark_context = SparkContext(gateway=self.gateway)
                self.spark_session = SparkSession(spark_context)

                self.out_thread = threading.Thread(target=self.output_reader)
                self.error_thread = threading.Thread(target=self.error_reader)
                self.std_background_listeners()
Example #5
    def _ensure_initialized(cls, instance=None):
        with SparkContext._lock:
            if not SparkContext._gateway:
                SparkContext._gateway = launch_gateway()
                SparkContext._jvm = SparkContext._gateway.jvm
                SparkContext._writeToFile = SparkContext._jvm.PythonRDD.writeToFile

            if instance:
                if SparkContext._active_spark_context and SparkContext._active_spark_context != instance:
                    raise ValueError("Cannot run multiple SparkContexts at once")
                else:
                    SparkContext._active_spark_context = instance
Example #6
    def _ensure_initialized(cls, instance=None, gateway=None):
        with SparkContext._lock:
            if not SparkContext._gateway:
                SparkContext._gateway = gateway or launch_gateway()
                SparkContext._jvm = SparkContext._gateway.jvm
                SparkContext._writeToFile = SparkContext._jvm.PythonRDD.writeToFile

            if instance:
                if SparkContext._active_spark_context and SparkContext._active_spark_context != instance:
                    raise ValueError("Cannot run multiple SparkContexts at once")
                else:
                    SparkContext._active_spark_context = instance
Example #7
    def _ensure_initialized(cls, instance=None):
        with SparkContext._lock:
            if not SparkContext._gateway:
                SparkContext._gateway = launch_gateway()
                SparkContext._jvm = SparkContext._gateway.jvm
                SparkContext._writeIteratorToPickleFile = \
                    SparkContext._jvm.PythonRDD.writeIteratorToPickleFile
                SparkContext._takePartition = \
                    SparkContext._jvm.PythonRDD.takePartition

            if instance:
                if SparkContext._active_spark_context and SparkContext._active_spark_context != instance:
                    raise ValueError("Cannot run multiple SparkContexts at once")
                else:
                    SparkContext._active_spark_context = instance
Example #8
    def get_context(cls):
        """Get the current global spark context.

        Returns:
            pyspark.SparkContext or None (if wasn't set before).
        """
        assert cls.is_activated()

        state = None

        with open(cls.LOCK_FILE_PATH) as lock:
            serialised_state = lock.read()
            if serialised_state:
                try:
                    state = json.loads(serialised_state)
                except ValueError:
                    logger.error(
                        'Unable to deserialize lock file. Try to reactivate instant testing. '
                        'The broken content is: %s',
                        serialised_state,
                    )

        if state:
            logger.info(
                'Recovering context for the instant testing [pid=%s, gateway=%s]',
                state['session_pid'],
                state['gateway_port'],
            )

            os.environ['PYSPARK_GATEWAY_PORT'] = str(state['gateway_port'])
            os.environ['PYSPARK_GATEWAY_SECRET'] = str(state['gateway_secret'])
            gateway = launch_gateway()
            java_import(gateway.jvm, 'org.apache.spark.SparkContext')
            jvm_spark_context = gateway.jvm.SparkContext.getOrCreate()
            jvm_java_spark_context = gateway.jvm.JavaSparkContext(
                jvm_spark_context)

            SparkContext._gateway = gateway
            SparkContext._jvm = gateway.jvm

            return SparkContext(
                appName=jvm_spark_context.appName(),
                master=jvm_spark_context.master(),
                gateway=gateway,
                jsc=jvm_java_spark_context,
            )
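A hedged usage sketch of the recovery path above; the enclosing class name InstantTesting is an assumption made for illustration, while get_context() and is_activated() come from the example itself:

# Hypothetical caller: reuse the SparkContext recorded by an earlier session,
# assuming instant testing was activated and the lock file was written.
if InstantTesting.is_activated():
    sc = InstantTesting.get_context()   # pyspark.SparkContext or None
    if sc is not None:
        print(sc.appName, sc.master)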
Example #9
    def _ensure_initialized(cls, instance=None, gateway=None):
        with SparkContext._lock:
            if not SparkContext._gateway:
                SparkContext._gateway = gateway or launch_gateway()
                SparkContext._jvm = SparkContext._gateway.jvm
                SparkContext._writeToFile = SparkContext._jvm.PythonRDD.writeToFile

            if instance:
                if SparkContext._active_spark_context and SparkContext._active_spark_context != instance:
                    currentMaster = SparkContext._active_spark_context.master
                    currentAppName = SparkContext._active_spark_context.appName
                    callsite = SparkContext._active_spark_context._callsite

                    # Raise error if there is already a running Spark context
                    raise ValueError("Cannot run multiple SparkContexts at once; existing SparkContext(app=%s, master=%s)" \
                        " created by %s at %s:%s " \
                        % (currentAppName, currentMaster, callsite.function, callsite.file, callsite.linenum))
                else:
                    SparkContext._active_spark_context = instance
Example #10
    def _ensure_initialized(cls, instance=None, gateway=None):
        with SparkContext._lock:
            if not SparkContext._gateway:
                SparkContext._gateway = gateway or launch_gateway()
                SparkContext._jvm = SparkContext._gateway.jvm
                SparkContext._writeToFile = SparkContext._jvm.PythonRDD.writeToFile

            if instance:
                if SparkContext._active_spark_context and SparkContext._active_spark_context != instance:
                    currentMaster = SparkContext._active_spark_context.master
                    currentAppName = SparkContext._active_spark_context.appName
                    callsite = SparkContext._active_spark_context._callsite

                    # Raise error if there is already a running Spark context
                    raise ValueError("Cannot run multiple SparkContexts at once; existing SparkContext(app=%s, master=%s)" \
                        " created by %s at %s:%s " \
                        % (currentAppName, currentMaster, callsite.function, callsite.file, callsite.linenum))
                else:
                    SparkContext._active_spark_context = instance
Example #11
def run(*args, **kwargs):
    global GATEWAY

    if GATEWAY is None:
        from pyspark.java_gateway import launch_gateway

        GATEWAY = launch_gateway()
        TunnelProcess(GATEWAY_PORT,
                      GATEWAY.gateway_parameters.port,
                      keep_alive=True)

    if 'debug' not in kwargs or kwargs['debug'] is False:
        app.logger.removeHandler(default_handler)
        app.logger = logger

        logger.info('Starting pyspark gateway server')

    if 'port' not in kwargs:
        kwargs['port'] = HTTP_PORT

    app.run(*args, **kwargs)
Example #12
    def __init__(self, master, jobName, sparkHome=None, pyFiles=None,
                 environment=None, batchSize=1024):
        """
        Create a new SparkContext.

        @param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        @param jobName: A name for your job, to display on the cluster web UI
        @param sparkHome: Location where Spark is installed on cluster nodes.
        @param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        @param environment: A dictionary of environment variables to set on
               worker nodes.
        @param batchSize: The number of Python objects represented as a single
               Java object.  Set 1 to disable batching or -1 to use an
               unlimited batch size.
        """
        with SparkContext._lock:
            if SparkContext._active_spark_context:
                raise ValueError("Cannot run multiple SparkContexts at once")
            else:
                SparkContext._active_spark_context = self
                if not SparkContext._gateway:
                    SparkContext._gateway = launch_gateway()
                    SparkContext._jvm = SparkContext._gateway.jvm
                    SparkContext._writeIteratorToPickleFile = \
                        SparkContext._jvm.PythonRDD.writeIteratorToPickleFile
                    SparkContext._takePartition = \
                        SparkContext._jvm.PythonRDD.takePartition
        self.master = master
        self.jobName = jobName
        self.sparkHome = sparkHome or None # None becomes null in Py4J
        self.environment = environment or {}
        self.batchSize = batchSize  # -1 represents an unlimited batch size

        # Create the Java SparkContext through Py4J
        empty_string_array = self._gateway.new_array(self._jvm.String, 0)
        self._jsc = self._jvm.JavaSparkContext(master, jobName, sparkHome,
                                              empty_string_array)

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jsc.accumulator(
                self._jvm.java.util.ArrayList(),
                self._jvm.PythonAccumulatorParam(host, port))

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')
        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        # Deploy any code dependencies specified in the constructor
        for path in (pyFiles or []):
            self.addPyFile(path)
        SparkFiles._sc = self
        sys.path.append(SparkFiles.getRootDirectory())

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.spark.Utils.getLocalDir()
        self._temp_dir = \
            self._jvm.spark.Utils.createTempDir(local_dir).getAbsolutePath()
Example #13
def generate_functions_examples_html(jvm, jspark, html_output_dir):
    """
    Generates an HTML file after listing and executing the function information.
    The output file is created under `html_output_dir`.

    Expected output:

    <div class="codehilite"><pre><span></span>
      <span class="c1">-- func</span>
      <span class="k">SELECT</span>
      ...
    </pre></div>

    """
    print("Running SQL examples to generate formatted output.")
    for key, infos in _list_grouped_function_infos(jvm):
        examples = _make_pretty_examples(jspark, infos)
        key = key.replace("_", "-")
        if examples is not None:
            with open("%s/generated-%s-examples.html" % (html_output_dir, key),
                      'w') as examples_html:
                examples_html.write(examples)


if __name__ == "__main__":
    jvm = launch_gateway().jvm
    jspark = jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate()
    jspark.sparkContext().setLogLevel("ERROR")  # Make it less noisy.
    spark_root_dir = os.path.dirname(os.path.dirname(__file__))
    html_output_dir = os.path.join(spark_root_dir, "docs")
    generate_functions_table_html(jvm, html_output_dir)
    generate_functions_examples_html(jvm, jspark, html_output_dir)
Example #14
def get_hadoop_version() -> str:
    """
    Get the HADOOP version pyspark will use
    """
    return launch_gateway().jvm.org.apache.hadoop.util.VersionInfo.getVersion()
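A quick usage sketch; the printed value depends on the Hadoop build bundled with the local pyspark installation:

if __name__ == "__main__":
    # Spins up a throwaway gateway JVM just to read the Hadoop version string.
    print("Hadoop version:", get_hadoop_version())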
Example #15
    def __init__(self,
                 master,
                 jobName,
                 sparkHome=None,
                 pyFiles=None,
                 environment=None,
                 batchSize=1024):
        """
        Create a new SparkContext.

        @param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        @param jobName: A name for your job, to display on the cluster web UI
        @param sparkHome: Location where Spark is installed on cluster nodes.
        @param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        @param environment: A dictionary of environment variables to set on
               worker nodes.
        @param batchSize: The number of Python objects represented as a single
               Java object.  Set 1 to disable batching or -1 to use an
               unlimited batch size.
        """
        with SparkContext._lock:
            if SparkContext._active_spark_context:
                raise ValueError("Cannot run multiple SparkContexts at once")
            else:
                SparkContext._active_spark_context = self
                if not SparkContext._gateway:
                    SparkContext._gateway = launch_gateway()
                    SparkContext._jvm = SparkContext._gateway.jvm
                    SparkContext._writeIteratorToPickleFile = \
                        SparkContext._jvm.PythonRDD.writeIteratorToPickleFile
                    SparkContext._takePartition = \
                        SparkContext._jvm.PythonRDD.takePartition
        self.master = master
        self.jobName = jobName
        self.sparkHome = sparkHome or None  # None becomes null in Py4J
        self.environment = environment or {}
        self.batchSize = batchSize  # -1 represents an unlimited batch size

        # Create the Java SparkContext through Py4J
        empty_string_array = self._gateway.new_array(self._jvm.String, 0)
        self._jsc = self._jvm.JavaSparkContext(master, jobName, sparkHome,
                                               empty_string_array)

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jsc.accumulator(
            self._jvm.java.util.ArrayList(),
            self._jvm.PythonAccumulatorParam(host, port))

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')
        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        # Deploy any code dependencies specified in the constructor
        for path in (pyFiles or []):
            self.addPyFile(path)
        SparkFiles._sc = self
        sys.path.append(SparkFiles.getRootDirectory())

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.spark.Utils.getLocalDir()
        self._temp_dir = \
            self._jvm.spark.Utils.createTempDir(local_dir).getAbsolutePath()
Example #16
    def __init__(self, arglist, _sparkSession, py_module_hotload=True):
        self.smvHome = os.environ.get("SMV_HOME")
        if (self.smvHome is None):
            raise SmvRuntimeError("SMV_HOME env variable not set!")

        self.sparkSession = _sparkSession

        if (self.sparkSession is not None):
            sc = self.sparkSession.sparkContext
            sc.setLogLevel("ERROR")

            self.sc = sc
            self.sqlContext = self.sparkSession._wrapped
            self._jvm = sc._jvm
            self.j_smvPyClient = self._jvm.org.tresamigos.smv.python.SmvPyClientFactory.init(
                self.sparkSession._jsparkSession)
            self.j_smvApp = self.j_smvPyClient.j_smvApp()
        else:
            _gw = launch_gateway(None)
            self._jvm = _gw.jvm

        self.py_module_hotload = py_module_hotload

        java_import(self._jvm, "org.tresamigos.smv.ColumnHelper")
        java_import(self._jvm, "org.tresamigos.smv.SmvDFHelper")
        java_import(self._jvm, "org.tresamigos.smv.dqm.*")
        java_import(self._jvm, "org.tresamigos.smv.panel.*")
        java_import(self._jvm, "org.tresamigos.smv.python.SmvPythonHelper")
        java_import(self._jvm, "org.tresamigos.smv.SmvHDFS")
        java_import(self._jvm, "org.tresamigos.smv.DfCreator")

        self.smvSchemaObj = self._jvm.SmvPythonHelper.getSmvSchema()

        self.py_smvconf = SmvConfig(arglist, self._jvm)

        # configure spark sql params
        if (self.sparkSession is not None):
            for k, v in self.py_smvconf.spark_sql_props().items():
                self.sqlContext.setConf(k, v)

        # issue #429 set application name from smv config
        if (self.sparkSession is not None):
            sc._conf.setAppName(self.appName())

        # CmdLine is static, so can be an attribute
        cl = self.py_smvconf.cmdline
        self.cmd_line = namedtuple("CmdLine", cl.keys())(*cl.values())

        # shortcut is meant for internal use only
        self.dsm = DataSetMgr(self._jvm, self.py_smvconf)

        # computed df cache, keyed by m.versioned_fqn
        self.data_cache = {}

        # AFTER app is available but BEFORE stages,
        # use the dynamically configured app dir to set the source path, library path
        self.prependDefaultDirs()

        self.repoFactory = DataSetRepoFactory(self)
        self.dsm.register(self.repoFactory)

        # provider cache, keyed by providers' fqn
        self.provider_cache = {}
        self.refresh_provider_cache()

        # Initialize DataFrame and Column with helper methods
        smv.helpers.init_helpers()
Example #17
    def setUpClass(cls):
        gateway = launch_gateway(SparkConf())
        cls._jvm = gateway.jvm
        cls.longMessage = True
        random.seed(42)
Example #18
    def setUpClass(cls):
        gateway = launch_gateway(SparkConf())
        cls._jvm = gateway.jvm
        cls.longMessage = True
        random.seed(42)
Example #19
    def __init__(self, arglist, _sparkSession, py_module_hotload=True):
        self.smvHome = os.environ.get("SMV_HOME")
        if (self.smvHome is None):
            raise SmvRuntimeError("SMV_HOME env variable not set!")

        self.sparkSession = _sparkSession

        if (self.sparkSession is not None):
            sc = self.sparkSession.sparkContext
            sc.setLogLevel("ERROR")

            self.sc = sc
            self.sqlContext = self.sparkSession._wrapped
            self._jvm = sc._jvm
            self.j_smvPyClient = self._jvm.org.tresamigos.smv.python.SmvPyClientFactory.init(self.sparkSession._jsparkSession)
            self.j_smvApp = self.j_smvPyClient.j_smvApp()
        else:
            _gw = launch_gateway(None)
            self._jvm = _gw.jvm

        self.py_module_hotload = py_module_hotload

        java_import(self._jvm, "org.tresamigos.smv.ColumnHelper")
        java_import(self._jvm, "org.tresamigos.smv.SmvDFHelper")
        java_import(self._jvm, "org.tresamigos.smv.dqm.*")
        java_import(self._jvm, "org.tresamigos.smv.panel.*")
        java_import(self._jvm, "org.tresamigos.smv.python.SmvPythonHelper")
        java_import(self._jvm, "org.tresamigos.smv.SmvHDFS")
        java_import(self._jvm, "org.tresamigos.smv.DfCreator")

        self.smvSchemaObj = self._jvm.SmvPythonHelper.getSmvSchema()

        self.py_smvconf = SmvConfig(arglist)

        # configure spark sql params
        if (self.sparkSession is not None):
            for k, v in self.py_smvconf.spark_sql_props().items():
                self.sqlContext.setConf(k, v)

        # issue #429 set application name from smv config
        if (self.sparkSession is not None):
            sc._conf.setAppName(self.appName())

        # CmdLine is static, so can be an attribute
        cl = self.py_smvconf.cmdline
        self.cmd_line = namedtuple("CmdLine", cl.keys())(*cl.values())

        # shortcut is meant for internal use only
        self.dsm = DataSetMgr(self._jvm, self.py_smvconf)

        # computed df cache, keyed by m.versioned_fqn
        self.data_cache = {}

        # AFTER app is available but BEFORE stages,
        # use the dynamically configured app dir to set the source path, library path
        self.prependDefaultDirs()

        self.repoFactory = DataSetRepoFactory(self)
        self.dsm.register(self.repoFactory)

        # provider cache, keyed by providers' fqn
        self.provider_cache = {}
        self.refresh_provider_cache()

        # Initialize DataFrame and Column with helper methods
        smv.helpers.init_helpers()
Example #20
print('Start reading review.json')
review = spark.read.json(
    'hdfs://master:9000/user/serverteam_1/FinalFrontier/review.json'
).repartition(150)
print('Finished repartition')
bus = spark.read.json(
    'hdfs://master:9000/user/serverteam_1/FinalFrontier/business.json')
print('Finished reading business.json')
_yyy = review.join(bus, review.business_id == bus.business_id,
                   'inner').drop(bus.business_id).drop(bus.stars)
print('Finished simple join')

print("begin remane columns")
review_business = _yyy.withColumnRenamed(
    "stars",
    "business_avg_stars").withColumnRenamed("attributes",
                                            "business_type").withColumnRenamed(
                                                "review_count",
                                                "business_review_count")
print('end review_business creation')
from pyspark import StorageLevel
review_business.persist(StorageLevel.MEMORY_ONLY)
from pyspark.java_gateway import launch_gateway
print("start gateway")
launch_gateway()
import time
start_time = time.time()
print("begin to pandas")
_review_business = review_business.toPandas()
print("--- %s seconds ---" % (time.time() - start_time))