def sparkSession(cls):
    if not hasattr(cls, "spark"):
        # We can't use the SparkSession Builder here, since we need to call
        # Scala side's SmvTestHive.createContext to create the HiveTestContext's
        # SparkSession.
        # So we need to
        #   * Create a java_gateway
        #   * Create a SparkConf using the jgw (since without it SparkContext will ignore the given conf)
        #   * Create python SparkContext using the SparkConf (so we can specify the warehouse.dir)
        #   * Create Scala side HiveTestContext SparkSession
        #   * Create python SparkSession
        jgw = launch_gateway(None)
        jvm = jgw.jvm
        import tempfile
        import getpass
        hivedir = "file://{0}/{1}/smv_hive_test".format(tempfile.gettempdir(), getpass.getuser())
        sConf = SparkConf(False, _jvm=jvm) \
            .set("spark.sql.test", "") \
            .set("spark.sql.hive.metastore.barrierPrefixes",
                 "org.apache.spark.sql.hive.execution.PairSerDe") \
            .set("spark.sql.warehouse.dir", hivedir) \
            .set("spark.ui.enabled", "false")
        sc = SparkContext(master="local[1]", appName="SMV Python Test",
                          conf=sConf, gateway=jgw).getOrCreate()
        jss = sc._jvm.org.apache.spark.sql.hive.test.SmvTestHive.createContext(sc._jsc.sc())
        cls.spark = SparkSession(sc, jss.sparkSession())
    return cls.spark
def _ensure_initialized(cls, instance=None, gateway=None, conf=None):
    """
    Checks whether a SparkContext is initialized or not.
    Throws error if a SparkContext is already running.
    """
    with SparkContext._lock:
        if not SparkContext._gateway:
            SparkContext._gateway = gateway or launch_gateway(conf)
            SparkContext._jvm = SparkContext._gateway.jvm

        if instance:
            if (SparkContext._active_spark_context
                    and SparkContext._active_spark_context != instance):
                currentMaster = SparkContext._active_spark_context.master
                currentAppName = SparkContext._active_spark_context.appName
                callsite = SparkContext._active_spark_context._callsite

                # Raise error if there is already a running Spark context
                raise ValueError(
                    "Cannot run multiple SparkContexts at once; "
                    "existing SparkContext(app=%s, master=%s)"
                    " created by %s at %s:%s "
                    % (currentAppName, currentMaster, callsite.function,
                       callsite.file, callsite.linenum))
            else:
                SparkContext._active_spark_context = instance
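# A minimal usage sketch (assumption, not from the original source): reuse an
# already-launched py4j gateway so that SparkContext does not spawn a second JVM.
# It relies only on the `_ensure_initialized(instance, gateway, conf)` signature
# shown above; names such as "gateway-reuse-demo" are illustrative.
from pyspark import SparkConf, SparkContext
from pyspark.java_gateway import launch_gateway

conf = SparkConf().setMaster("local[2]").setAppName("gateway-reuse-demo")
gw = launch_gateway(conf)                                 # start the JVM once
SparkContext._ensure_initialized(gateway=gw, conf=conf)   # register the shared gateway
sc = SparkContext(conf=conf)                              # picks up SparkContext._gateway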
def __init__(self, gpu=False, spark32=False, memory="16G",
             cache_folder='', log_folder='', cluster_tmp_dir=''):
    # NOTE: the parameter list above is assumed from the names referenced in the
    # body; the original signature only showed `self`.
    spark_conf = SparkConf()
    spark_conf.setAppName(spark_nlp_config.app_name)
    spark_conf.setMaster(spark_nlp_config.master)
    spark_conf.set("spark.driver.memory", memory)
    spark_conf.set("spark.serializer", spark_nlp_config.serializer)
    spark_conf.set("spark.kryoserializer.buffer.max", spark_nlp_config.serializer_max_buffer)
    spark_conf.set("spark.driver.maxResultSize", spark_nlp_config.driver_max_result_size)

    # Check the GPU + Spark 3.2 combination first; otherwise a plain `spark32`
    # check would shadow it and the GPU package would never be selected.
    if gpu and spark32:
        spark_conf.set("spark.jars.packages", spark_nlp_config.maven_gpu_spark32)
    elif spark32:
        spark_conf.set("spark.jars.packages", spark_nlp_config.maven_spark32)
    elif gpu:
        spark_conf.set("spark.jars.packages", spark_nlp_config.maven_gpu_spark)
    else:
        spark_conf.set("spark.jars.packages", spark_nlp_config.maven_spark)

    # SparkConf has no `config` method; these settings also go through `set`.
    if cache_folder != '':
        spark_conf.set("spark.jsl.settings.pretrained.cache_folder", cache_folder)
    if log_folder != '':
        spark_conf.set("spark.jsl.settings.annotator.log_folder", log_folder)
    if cluster_tmp_dir != '':
        spark_conf.set("spark.jsl.settings.storage.cluster_tmp_dir", cluster_tmp_dir)

    # Make the py4j JVM stdout and stderr available without buffering
    popen_kwargs = {
        'stdout': subprocess.PIPE,
        'stderr': subprocess.PIPE,
        'bufsize': 0
    }

    # Launch the gateway with our custom settings
    self.gateway = launch_gateway(conf=spark_conf, popen_kwargs=popen_kwargs)
    self.process = self.gateway.proc

    # Use the gateway we launched
    spark_context = SparkContext(gateway=self.gateway)
    self.spark_session = SparkSession(spark_context)

    self.out_thread = threading.Thread(target=self.output_reader)
    self.error_thread = threading.Thread(target=self.error_reader)
    self.std_background_listeners()
def _ensure_initialized(cls, instance=None):
    with SparkContext._lock:
        if not SparkContext._gateway:
            SparkContext._gateway = launch_gateway()
            SparkContext._jvm = SparkContext._gateway.jvm
            SparkContext._writeToFile = SparkContext._jvm.PythonRDD.writeToFile

        if instance:
            if SparkContext._active_spark_context and SparkContext._active_spark_context != instance:
                raise ValueError("Cannot run multiple SparkContexts at once")
            else:
                SparkContext._active_spark_context = instance
def _ensure_initialized(cls, instance=None, gateway=None):
    with SparkContext._lock:
        if not SparkContext._gateway:
            SparkContext._gateway = gateway or launch_gateway()
            SparkContext._jvm = SparkContext._gateway.jvm
            SparkContext._writeToFile = SparkContext._jvm.PythonRDD.writeToFile

        if instance:
            if SparkContext._active_spark_context and SparkContext._active_spark_context != instance:
                raise ValueError("Cannot run multiple SparkContexts at once")
            else:
                SparkContext._active_spark_context = instance
def _ensure_initialized(cls, instance=None):
    with SparkContext._lock:
        if not SparkContext._gateway:
            SparkContext._gateway = launch_gateway()
            SparkContext._jvm = SparkContext._gateway.jvm
            SparkContext._writeIteratorToPickleFile = \
                SparkContext._jvm.PythonRDD.writeIteratorToPickleFile
            SparkContext._takePartition = \
                SparkContext._jvm.PythonRDD.takePartition

        if instance:
            if SparkContext._active_spark_context and SparkContext._active_spark_context != instance:
                raise ValueError("Cannot run multiple SparkContexts at once")
            else:
                SparkContext._active_spark_context = instance
def get_context(cls):
    """Get the current global spark context.

    Returns:
        pyspark.SparkContext or None (if it was not set before).
    """
    assert cls.is_activated()

    state = None

    with open(cls.LOCK_FILE_PATH) as lock:
        serialised_state = lock.read()

        if serialised_state:
            try:
                state = json.loads(serialised_state)
            except ValueError:
                logger.error(
                    'Unable to deserialize lock file. Try to reactivate instant testing. '
                    'The broken content is: %s',
                    serialised_state,
                )

    if state:
        logger.info(
            'Recovering context for the instant testing [pid=%s, gateway=%s]',
            state['session_pid'], state['gateway_port'],
        )

        os.environ['PYSPARK_GATEWAY_PORT'] = str(state['gateway_port'])
        os.environ['PYSPARK_GATEWAY_SECRET'] = str(state['gateway_secret'])
        gateway = launch_gateway()
        java_import(gateway.jvm, 'org.apache.spark.SparkContext')
        jvm_spark_context = gateway.jvm.SparkContext.getOrCreate()
        jvm_java_spark_context = gateway.jvm.JavaSparkContext(jvm_spark_context)

        SparkContext._gateway = gateway
        SparkContext._jvm = gateway.jvm

        return SparkContext(
            appName=jvm_spark_context.appName(),
            master=jvm_spark_context.master(),
            gateway=gateway,
            jsc=jvm_java_spark_context,
        )
def _ensure_initialized(cls, instance=None, gateway=None):
    with SparkContext._lock:
        if not SparkContext._gateway:
            SparkContext._gateway = gateway or launch_gateway()
            SparkContext._jvm = SparkContext._gateway.jvm
            SparkContext._writeToFile = SparkContext._jvm.PythonRDD.writeToFile

        if instance:
            if SparkContext._active_spark_context and SparkContext._active_spark_context != instance:
                currentMaster = SparkContext._active_spark_context.master
                currentAppName = SparkContext._active_spark_context.appName
                callsite = SparkContext._active_spark_context._callsite

                # Raise error if there is already a running Spark context
                raise ValueError(
                    "Cannot run multiple SparkContexts at once; existing SparkContext(app=%s, master=%s)"
                    " created by %s at %s:%s "
                    % (currentAppName, currentMaster, callsite.function, callsite.file, callsite.linenum))
            else:
                SparkContext._active_spark_context = instance
def run(*args, **kwargs):
    global GATEWAY
    if GATEWAY is None:
        from pyspark.java_gateway import launch_gateway
        GATEWAY = launch_gateway()

    TunnelProcess(GATEWAY_PORT, GATEWAY.gateway_parameters.port, keep_alive=True)

    if 'debug' not in kwargs or ('debug' in kwargs and kwargs['debug'] == False):
        app.logger.removeHandler(default_handler)
        app.logger = logger

    logger.info('Starting pyspark gateway server')

    if 'port' not in kwargs:
        kwargs['port'] = HTTP_PORT

    app.run(*args, **kwargs)
def __init__(self, master, jobName, sparkHome=None, pyFiles=None,
             environment=None, batchSize=1024):
    """
    Create a new SparkContext.

    @param master: Cluster URL to connect to
           (e.g. mesos://host:port, spark://host:port, local[4]).
    @param jobName: A name for your job, to display on the cluster web UI
    @param sparkHome: Location where Spark is installed on cluster nodes.
    @param pyFiles: Collection of .zip or .py files to send to the cluster
           and add to PYTHONPATH.  These can be paths on the local file
           system or HDFS, HTTP, HTTPS, or FTP URLs.
    @param environment: A dictionary of environment variables to set on
           worker nodes.
    @param batchSize: The number of Python objects represented as a single
           Java object.  Set 1 to disable batching or -1 to use an
           unlimited batch size.
    """
    with SparkContext._lock:
        if SparkContext._active_spark_context:
            raise ValueError("Cannot run multiple SparkContexts at once")
        else:
            SparkContext._active_spark_context = self
            if not SparkContext._gateway:
                SparkContext._gateway = launch_gateway()
                SparkContext._jvm = SparkContext._gateway.jvm
                SparkContext._writeIteratorToPickleFile = \
                    SparkContext._jvm.PythonRDD.writeIteratorToPickleFile
                SparkContext._takePartition = \
                    SparkContext._jvm.PythonRDD.takePartition
    self.master = master
    self.jobName = jobName
    self.sparkHome = sparkHome or None  # None becomes null in Py4J
    self.environment = environment or {}
    self.batchSize = batchSize  # -1 represents an unlimited batch size

    # Create the Java SparkContext through Py4J
    empty_string_array = self._gateway.new_array(self._jvm.String, 0)
    self._jsc = self._jvm.JavaSparkContext(master, jobName, sparkHome,
                                           empty_string_array)

    # Create a single Accumulator in Java that we'll send all our updates through;
    # they will be passed back to us through a TCP server
    self._accumulatorServer = accumulators._start_update_server()
    (host, port) = self._accumulatorServer.server_address
    self._javaAccumulator = self._jsc.accumulator(
        self._jvm.java.util.ArrayList(),
        self._jvm.PythonAccumulatorParam(host, port))

    self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')
    # Broadcast's __reduce__ method stores Broadcast instances here.
    # This allows other code to determine which Broadcast instances have
    # been pickled, so it can determine which Java broadcast objects to
    # send.
    self._pickled_broadcast_vars = set()

    # Deploy any code dependencies specified in the constructor
    for path in (pyFiles or []):
        self.addPyFile(path)
    SparkFiles._sc = self
    sys.path.append(SparkFiles.getRootDirectory())

    # Create a temporary directory inside spark.local.dir:
    local_dir = self._jvm.spark.Utils.getLocalDir()
    self._temp_dir = \
        self._jvm.spark.Utils.createTempDir(local_dir).getAbsolutePath()
def generate_functions_examples_html(jvm, jspark, html_output_dir):
    """
    Generates an HTML file after listing and executing the function information.
    The output file is created under `html_output_dir`.

    Expected output:

    <div class="codehilite"><pre><span></span>
      <span class="c1">-- func</span>
      <span class="k">SELECT</span>
      ...
    </pre></div>
    """
    print("Running SQL examples to generate formatted output.")
    for key, infos in _list_grouped_function_infos(jvm):
        examples = _make_pretty_examples(jspark, infos)
        key = key.replace("_", "-")
        if examples is not None:
            with open("%s/generated-%s-examples.html" % (html_output_dir, key), 'w') as examples_html:
                examples_html.write(examples)


if __name__ == "__main__":
    jvm = launch_gateway().jvm
    jspark = jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate()
    jspark.sparkContext().setLogLevel("ERROR")  # Make it less noisy.
    spark_root_dir = os.path.dirname(os.path.dirname(__file__))
    html_output_dir = os.path.join(spark_root_dir, "docs")
    generate_functions_table_html(jvm, html_output_dir)
    generate_functions_examples_html(jvm, jspark, html_output_dir)
def get_hadoop_version() -> str:
    """ Get the HADOOP version pyspark will use """
    return launch_gateway().jvm.org.apache.hadoop.util.VersionInfo.getVersion()
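# A minimal usage sketch (assumption, not from the original source): the helper
# above only needs a gateway JVM, so it can be called before any SparkContext exists.
if __name__ == "__main__":
    print("Hadoop version used by pyspark:", get_hadoop_version())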
def __init__(self, arglist, _sparkSession, py_module_hotload=True):
    self.smvHome = os.environ.get("SMV_HOME")
    if (self.smvHome is None):
        raise SmvRuntimeError("SMV_HOME env variable not set!")

    self.sparkSession = _sparkSession

    if (self.sparkSession is not None):
        sc = self.sparkSession.sparkContext
        sc.setLogLevel("ERROR")

        self.sc = sc
        self.sqlContext = self.sparkSession._wrapped
        self._jvm = sc._jvm

        self.j_smvPyClient = self._jvm.org.tresamigos.smv.python.SmvPyClientFactory.init(
            self.sparkSession._jsparkSession)
        self.j_smvApp = self.j_smvPyClient.j_smvApp()
    else:
        _gw = launch_gateway(None)
        self._jvm = _gw.jvm

    self.py_module_hotload = py_module_hotload

    java_import(self._jvm, "org.tresamigos.smv.ColumnHelper")
    java_import(self._jvm, "org.tresamigos.smv.SmvDFHelper")
    java_import(self._jvm, "org.tresamigos.smv.dqm.*")
    java_import(self._jvm, "org.tresamigos.smv.panel.*")
    java_import(self._jvm, "org.tresamigos.smv.python.SmvPythonHelper")
    java_import(self._jvm, "org.tresamigos.smv.SmvHDFS")
    java_import(self._jvm, "org.tresamigos.smv.DfCreator")

    self.smvSchemaObj = self._jvm.SmvPythonHelper.getSmvSchema()

    self.py_smvconf = SmvConfig(arglist, self._jvm)

    # configure spark sql params
    if (self.sparkSession is not None):
        for k, v in self.py_smvconf.spark_sql_props().items():
            self.sqlContext.setConf(k, v)

    # issue #429 set application name from smv config
    if (self.sparkSession is not None):
        sc._conf.setAppName(self.appName())

    # CmdLine is static, so can be an attribute
    cl = self.py_smvconf.cmdline
    self.cmd_line = namedtuple("CmdLine", cl.keys())(*cl.values())

    # shortcut is meant for internal use only
    self.dsm = DataSetMgr(self._jvm, self.py_smvconf)

    # computed df cache, keyed by m.versioned_fqn
    self.data_cache = {}

    # AFTER app is available but BEFORE stages,
    # use the dynamically configured app dir to set the source path, library path
    self.prependDefaultDirs()

    self.repoFactory = DataSetRepoFactory(self)
    self.dsm.register(self.repoFactory)

    # provider cache, keyed by providers' fqn
    self.provider_cache = {}
    self.refresh_provider_cache()

    # Initialize DataFrame and Column with helper methods
    smv.helpers.init_helpers()
def setUpClass(cls):
    gateway = launch_gateway(SparkConf())
    cls._jvm = gateway.jvm
    cls.longMessage = True
    random.seed(42)
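# A hypothetical companion test method (assumption, not from the original source):
# with only a gateway and no SparkContext, the py4j JVM view stored on the class
# can still instantiate plain Java objects by fully-qualified name.
def test_jvm_is_reachable(self):
    arr = self._jvm.java.util.ArrayList()
    arr.add("spark")
    self.assertEqual(arr.size(), 1)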
def __init__(self, arglist, _sparkSession, py_module_hotload=True):
    self.smvHome = os.environ.get("SMV_HOME")
    if (self.smvHome is None):
        raise SmvRuntimeError("SMV_HOME env variable not set!")

    self.sparkSession = _sparkSession

    if (self.sparkSession is not None):
        sc = self.sparkSession.sparkContext
        sc.setLogLevel("ERROR")

        self.sc = sc
        self.sqlContext = self.sparkSession._wrapped
        self._jvm = sc._jvm

        self.j_smvPyClient = self._jvm.org.tresamigos.smv.python.SmvPyClientFactory.init(self.sparkSession._jsparkSession)
        self.j_smvApp = self.j_smvPyClient.j_smvApp()
    else:
        _gw = launch_gateway(None)
        self._jvm = _gw.jvm

    self.py_module_hotload = py_module_hotload

    java_import(self._jvm, "org.tresamigos.smv.ColumnHelper")
    java_import(self._jvm, "org.tresamigos.smv.SmvDFHelper")
    java_import(self._jvm, "org.tresamigos.smv.dqm.*")
    java_import(self._jvm, "org.tresamigos.smv.panel.*")
    java_import(self._jvm, "org.tresamigos.smv.python.SmvPythonHelper")
    java_import(self._jvm, "org.tresamigos.smv.SmvHDFS")
    java_import(self._jvm, "org.tresamigos.smv.DfCreator")

    self.smvSchemaObj = self._jvm.SmvPythonHelper.getSmvSchema()

    self.py_smvconf = SmvConfig(arglist)

    # configure spark sql params
    if (self.sparkSession is not None):
        for k, v in self.py_smvconf.spark_sql_props().items():
            self.sqlContext.setConf(k, v)

    # issue #429 set application name from smv config
    if (self.sparkSession is not None):
        sc._conf.setAppName(self.appName())

    # CmdLine is static, so can be an attribute
    cl = self.py_smvconf.cmdline
    self.cmd_line = namedtuple("CmdLine", cl.keys())(*cl.values())

    # shortcut is meant for internal use only
    self.dsm = DataSetMgr(self._jvm, self.py_smvconf)

    # computed df cache, keyed by m.versioned_fqn
    self.data_cache = {}

    # AFTER app is available but BEFORE stages,
    # use the dynamically configured app dir to set the source path, library path
    self.prependDefaultDirs()

    self.repoFactory = DataSetRepoFactory(self)
    self.dsm.register(self.repoFactory)

    # provider cache, keyed by providers' fqn
    self.provider_cache = {}
    self.refresh_provider_cache()

    # Initialize DataFrame and Column with helper methods
    smv.helpers.init_helpers()
print('Finished repartition')

review = spark.read.json(
    'hdfs://master:9000/user/serverteam_1/FinalFrontier/review.json'
).repartition(150)
print('Finished repartition')

bus = spark.read.json(
    'hdfs://master:9000/user/serverteam_1/FinalFrontier/business.json')
print('Finished repartition')

_yyy = review.join(bus, review.business_id == bus.business_id,
                   'inner').drop(bus.business_id).drop(bus.stars)
print('Finished simple join')

print("begin rename columns")
review_business = _yyy.withColumnRenamed(
    "stars", "business_avg_stars").withColumnRenamed(
        "attributes", "business_type").withColumnRenamed(
            "review_count", "business_review_count")
print('end review_business creation')

from pyspark import StorageLevel
review_business.persist(StorageLevel.MEMORY_ONLY)

from pyspark.java_gateway import launch_gateway
print("start gateway")
launch_gateway()

import time
start_time = time.time()
print("begin to pandas")
_review_business = review_business.toPandas()
print("--- %s seconds ---" % (time.time() - start_time))