def _create_shell_session() -> "SparkSession":
    """
    Initialize a :class:`SparkSession` for a pyspark shell session. This is called from
    shell.py to make error handling simpler without needing to declare local variables in
    that script, which would expose those to users.
    """
    import py4j
    from pyspark.conf import SparkConf
    from pyspark.context import SparkContext

    try:
        # Try to access HiveConf, it will raise exception if Hive is not added
        conf = SparkConf()
        if conf.get('spark.sql.catalogImplementation', 'hive').lower() == 'hive':
            (SparkContext._jvm  # type: ignore[attr-defined]
             .org.apache.hadoop.hive.conf.HiveConf())
            return SparkSession.builder\
                .enableHiveSupport()\
                .getOrCreate()
        else:
            return SparkSession.builder.getOrCreate()
    except (py4j.protocol.Py4JError, TypeError):
        if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive':
            warnings.warn("Fall back to non-hive support because failing to access HiveConf, "
                          "please make sure you build spark with hive")

    return SparkSession.builder.getOrCreate()
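A minimal sketch (not part of the snippet above) of the setting that drives the Hive/non-Hive branch: "hive" and "in-memory" are the real values of spark.sql.catalogImplementation, while the rest of the example is an assumed usage.

# Illustrative only: force the non-Hive path by picking the in-memory catalog.
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf().set("spark.sql.catalogImplementation", "in-memory")
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print(spark.conf.get("spark.sql.catalogImplementation"))  # -> "in-memory"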
def configure(self, opts, ports):
    """ Initializes Spark configuration object """

    # Check if there's already a conf variable
    # If using SparkMonitor, this is defined but is of type SparkConf
    conf = self.connector.ipython.user_ns.get('swan_spark_conf')
    if conf:
        self.connector.log.warn("conf already exists: %s", conf.toDebugString())
        if not isinstance(conf, SparkConf):
            raise Exception('There is already a "swan_spark_conf" variable defined and it is not of type SparkConf.')
    else:
        conf = SparkConf()  # Create a new conf

    options = self._parse_options(opts)

    # Do not overwrite the existing driver extraClassPath with option, add instead
    def_conf_extra_class_path = conf.get('spark.driver.extraClassPath', '')
    options_extra_class_path = options.get('spark.driver.extraClassPath', '')
    if def_conf_extra_class_path != '' and options_extra_class_path != '':
        options['spark.driver.extraClassPath'] = def_conf_extra_class_path + ":" + options_extra_class_path
    elif def_conf_extra_class_path != '' and options_extra_class_path == '':
        options['spark.driver.extraClassPath'] = def_conf_extra_class_path
    elif def_conf_extra_class_path == '' and options_extra_class_path != '':
        options['spark.driver.extraClassPath'] = options_extra_class_path

    # Add options to the default conf
    for name, value in options.items():
        conf.set(name, value)

    # Extend conf adding logging of log4j to java options
    base_extra_java_options = "-Dlog4j.configuration=file:%s" % self.connector.log4j_file
    extra_java_options = conf.get("spark.driver.extraJavaOptions")
    if extra_java_options:
        extra_java_options = base_extra_java_options + " " + extra_java_options
    else:
        extra_java_options = base_extra_java_options
    conf.set("spark.driver.extraJavaOptions", extra_java_options)

    # Extend conf ensuring that LD_LIBRARY_PATH on executors is the same as on the driver
    ld_library_path = conf.get('spark.executorEnv.LD_LIBRARY_PATH')
    if ld_library_path:
        ld_library_path = ld_library_path + ":" + os.environ.get('LD_LIBRARY_PATH', '')
    else:
        ld_library_path = os.environ.get('LD_LIBRARY_PATH', '')
    conf.set('spark.executorEnv.LD_LIBRARY_PATH', ld_library_path)

    # Extend conf with ports for the driver and block manager
    conf.set('spark.driver.host', os.environ.get('SERVER_HOSTNAME', 'localhost'))
    conf.set('spark.driver.port', ports[0])
    conf.set('spark.driver.blockManager.port', ports[1])
    conf.set('spark.port.maxRetries', 100)
    conf.set('spark.ui.port', ports[2])

    # Extend conf with spark app name to allow the monitoring and filtering of SWAN jobs in the Spark clusters
    app_name = conf.get('spark.app.name')
    conf.set('spark.app.name', app_name + '_swan' if app_name else 'pyspark_shell_swan')

    return conf
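A small, assumed illustration of the extraClassPath merge performed above (the jar paths are made up): a user-supplied option never overwrites a pre-existing driver extraClassPath, the two are joined with ":".

from pyspark import SparkConf

conf = SparkConf().set('spark.driver.extraClassPath', '/opt/site/jars/*')   # pre-existing value (assumed)
options = {'spark.driver.extraClassPath': '/home/user/extra.jar'}           # user option (assumed)

existing = conf.get('spark.driver.extraClassPath', '')
supplied = options.get('spark.driver.extraClassPath', '')
if existing and supplied:
    options['spark.driver.extraClassPath'] = existing + ":" + supplied
print(options['spark.driver.extraClassPath'])  # /opt/site/jars/*:/home/user/extra.jar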
def __call__(self):
    log.info('Processing wiki dump: %s ...', self.wk_dump_path)

    c = SparkConf().setAppName('Wikijson')
    log.info('Using spark master: %s', c.get('spark.master'))
    sc = SparkContext(conf=c)

    if os.path.isdir(self.output_path):
        log.warn('Writing over output path: %s', self.output_path)
        shutil.rmtree(self.output_path)

    # rdd of tuples: (title, namespace, id, redirect, content)
    pages = wikispark.get_pages_from_wikidump(sc, self.wk_dump_path)
    pages.cache()

    articles = wikispark.get_articles_from_pages(pages)
    redirects = wikispark.get_redirects_from_pages(pages)

    if self.redirect_links:
        articles = wikispark.redirect_article_links(articles, redirects)

    articles.map(self.article_to_json)\
            .map(json.dumps)\
            .saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec')

    log.info('Done.')
def __call__(self):
    log.info("Processing wiki dump: %s ...", self.wk_dump_path)

    c = SparkConf().setAppName("Wikijson")
    log.info("Using spark master: %s", c.get("spark.master"))
    sc = SparkContext(conf=c)

    if os.path.isdir(self.output_path):
        log.warn("Writing over output path: %s", self.output_path)
        shutil.rmtree(self.output_path)

    # rdd of tuples: (title, namespace, id, redirect, content)
    pages = wikispark.get_pages_from_wikidump(sc, self.wk_dump_path)
    pages.cache()

    articles = wikispark.get_articles_from_pages(pages)
    redirects = wikispark.get_redirects_from_pages(pages)

    if self.redirect_links:
        articles = wikispark.redirect_article_links(articles, redirects)

    articles.map(self.article_to_json).map(json.dumps).saveAsTextFile(
        self.output_path, "org.apache.hadoop.io.compress.GzipCodec"
    )

    log.info("Done.")
def get_default_spark_conf(additional_conf=None):
    if additional_conf is None:
        additional_conf = {}
    conf = SparkConf(). \
        setAppName("pyunit-test"). \
        setMaster("local[*]"). \
        set("spark.driver.memory", "2g"). \
        set("spark.executor.memory", "2g"). \
        set("spark.ext.h2o.client.log.level", "DEBUG"). \
        set("spark.ext.h2o.repl.enabled", "false"). \
        set("spark.task.maxFailures", "1"). \
        set("spark.rpc.numRetries", "1"). \
        set("spark.deploy.maxExecutorRetries", "1"). \
        set("spark.network.timeout", "360s"). \
        set("spark.worker.timeout", "360"). \
        set("spark.ext.h2o.cloud.name", unique_cloud_name("test")). \
        set("spark.ext.h2o.external.start.mode", "auto"). \
        set("spark.ext.h2o.node.log.dir", "build/h2ologs-pyunit/workers"). \
        set("spark.ext.h2o.client.log.dir", "build/h2ologs-pyunit/client")

    for key in additional_conf:
        conf.set(key, additional_conf[key])

    if conf.get("spark.ext.h2o.backend.cluster.mode") == "external":
        conf.set("spark.ext.h2o.client.ip", local_ip())
        conf.set("spark.ext.h2o.external.cluster.num.h2o.nodes", "1")

    return conf
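A brief usage sketch (assumed, not taken from the original test harness): overriding one property through additional_conf and reading values back from the returned SparkConf.

conf = get_default_spark_conf(additional_conf={"spark.ext.h2o.repl.enabled": "true"})
print(conf.get("spark.app.name"))               # "pyunit-test"
print(conf.get("spark.ext.h2o.repl.enabled"))   # "true" (overridden by additional_conf)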
def main():
    conf = SparkConf()
    dateStr = conf.get('spark.date')
    sc = SparkContext(conf=conf, appName='Loc City Data Prepare, ' + dateStr)
    hc = HiveContext(sc)

    sqlDict = prepareSql(dateStr)
    #mergedRdd = sc.emptyRDD()
    mergedRdd = sc.parallelize([])
    for prod, sql in sqlDict.items():
        print sql
        df = hc.sql(sql)
        #print 'df count:', df.count()
        rdd = df.map(lambda x: toCityLoc(x, prod))
        rdd = rdd.filter(lambda x: x[0] is not None)
        rdd = rdd.map(lambda x: x[0])
        mergedRdd = mergedRdd.union(rdd)
        #break
    mergedRdd.cache()
    print 'mergedRdd count:', mergedRdd.count()

    fromRdd = mergedRdd.map(lambda cityLoc: (
        (cityLoc.area, cityLoc.fromPoi.displayName), (cityLoc.fromPoi, 1L)))
    toRdd = mergedRdd.map(lambda cityLoc: (
        (cityLoc.area, cityLoc.toPoi.displayName), (cityLoc.toPoi, 1L)))
    count(fromRdd, dateStr, 'from')
    count(toRdd, dateStr, 'to')

    print 'success'
    sc.stop()
def main():
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    cmdargs = conf.get('spark.pythonargs')
    parser = argparse.ArgumentParser(description="Image to Caption Util")
    parser.add_argument('-input', action="store", dest="input")
    parser.add_argument('-model', action="store", dest="model")
    parser.add_argument('-imagenet', action="store", dest="imagenet")
    parser.add_argument('-lstmnet', action="store", dest="lstmnet")
    parser.add_argument('-vocab', action="store", dest="vocab")
    parser.add_argument('-output', action="store", dest="output")

    args = parser.parse_args(cmdargs.split(" "))

    df_input = sqlContext.read.parquet(str(args.input))
    images = df_input.select("data.image", "data.height", "data.width", "id")
    df = get_predictions(sqlContext, images, str(args.model), str(args.imagenet),
                         str(args.lstmnet), str(args.vocab))

    df.write.json(str(args.output))
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    if not is_spark_below_2_2():
        extend_spark_driver_cp(sparkConf, get_bigdl_classpath())

    # add content in PYSPARK_FILES in spark.submit.pyFiles
    # This is a workaround for current Spark on k8s
    python_lib = os.environ.get('PYSPARK_FILES', None)
    if python_lib:
        existing_py_files = sparkConf.get("spark.submit.pyFiles")
        if existing_py_files:
            sparkConf.set(key="spark.submit.pyFiles",
                          value="%s,%s" % (python_lib, existing_py_files))
        else:
            sparkConf.set(key="spark.submit.pyFiles", value=python_lib)

    return sparkConf
def __call__(self):
    c = SparkConf().setAppName("Build %s" % self.model_name)

    log.info("Using spark master: %s", c.get("spark.master"))
    sc = SparkContext(conf=c)

    kwargs = self.model.prepare(sc)
    m = self.model.build(**kwargs)
    m = self.model.format_items(m)
    m = self.formatter(m)

    if self.output_path:
        log.info("Saving to: %s", self.output_path)
        if os.path.isdir(self.output_path):
            log.warn("Writing over output path: %s", self.output_path)
            shutil.rmtree(self.output_path)
        m.saveAsTextFile(self.output_path, "org.apache.hadoop.io.compress.GzipCodec")
    elif self.sample > 0:
        print "\n".join(str(i) for i in m.take(self.sample))

    log.info("Done.")
def main():
    conf = SparkConf()
    dateStr = conf.get('spark.date')
    sc = SparkContext(appName='get sug poi of today from log table', conf=conf)
    hc = HiveContext(sc)

    sql = '''select log.param['json_str'] as json_str
             from pbs_dw.ods_log_ws_addrsuggestion as log
             where concat(year,month,day)=dateStr'''
    sql = sql.replace('dateStr', dateStr)
    print sql

    df = hc.sql(sql)
    rdd = df.flatMap(lambda x: format(x)).reduceByKey(
        lambda x, y: x, 200).map(lambda x: x[0] + '\t' + x[1])

    fPath = POI_DAILY_ROOT + '/' + 'poilist.' + dateStr
    print fPath
    print IoHelper.deleteFileInHDFS(fPath)
    rdd.saveAsTextFile(fPath)

    sc.stop()
    print 'over'
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    if os.environ.get("BIGDL_JARS", None) and not is_spark_below_2_2():
        for jar in os.environ["BIGDL_JARS"].split(":"):
            extend_spark_driver_cp(sparkConf, jar)

    # add content in PYSPARK_FILES in spark.submit.pyFiles
    # This is a workaround for current Spark on k8s
    python_lib = os.environ.get("PYSPARK_FILES", None)
    if python_lib:
        existing_py_files = sparkConf.get("spark.submit.pyFiles")
        if existing_py_files:
            sparkConf.set(
                key="spark.submit.pyFiles",
                value="%s,%s" % (python_lib, existing_py_files),
            )
        else:
            sparkConf.set(key="spark.submit.pyFiles", value=python_lib)

    return sparkConf
def __call__(self):
    c = SparkConf().setAppName('Build %s' % self.model_name)

    log.info('Using spark master: %s', c.get('spark.master'))
    sc = SparkContext(conf=c)

    kwargs = self.model.prepare(sc)
    m = self.model.build(**kwargs)
    m = self.model.format_items(m)
    m = self.formatter(m)

    if self.output_path:
        log.info("Saving to: %s", self.output_path)
        if os.path.isdir(self.output_path):
            log.warn('Writing over output path: %s', self.output_path)
            shutil.rmtree(self.output_path)
        m.saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec')
    elif self.sample > 0:
        print '\n'.join(str(i) for i in m.take(self.sample))

    log.info('Done.')
def __call__(self):
    log.info('Processing corpus: %s ...', self.corpus_path)

    c = SparkConf().setAppName('Build %s' % self.model_name)
    log.info('Using spark master: %s', c.get('spark.master'))
    sc = SparkContext(conf=c)

    corpus = sc.textFile(self.corpus_path).map(json.loads)

    m = self.model.build(corpus)
    m = self.model.format(m)

    if self.sample > 0:
        if self.sort:
            m = m.map(lambda (k, v): (v, k)).sortByKey(False)
        print '\n'.join(str(i) for i in m.take(self.sample))
    elif self.output_path:
        log.info("Saving to: %s", self.output_path)
        if os.path.isdir(self.output_path):
            log.warn('Writing over output path: %s', self.output_path)
            shutil.rmtree(self.output_path)
        m.saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec')

    log.info('Done.')
#==================================================
appName = "Kafka_to_HBase"

config = SparkConf().setAppName(appName)
props = []
props.append(("spark.rememberDuration", "10"))
props.append(("spark.batchDuration", "10"))
props.append(("spark.eventLog.enabled", "true"))
props.append(("spark.streaming.timeout", "30"))
props.append(("spark.ui.enabled", "true"))
config = config.setAll(props)

sc = SparkContext(conf=config)
ssc = StreamingContext(sc, int(config.get("spark.batchDuration")))

#==================================================
# Main application execution function
#==================================================
def runApplication(ssc, config):
    ssc.start()
    if config.get("spark.streaming.timeout") == '':
        ssc.awaitTermination()
    else:
        stopped = ssc.awaitTerminationOrTimeout(int(config.get("spark.streaming.timeout")))
        if not stopped:
            print("Stopping streaming context after timeout...")
            ssc.stop(True)
            print("Streaming context stopped.")
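A hypothetical way the helper above gets invoked once the Kafka DStream and its HBase sink have been defined (that wiring is not shown in this snippet):

# ... create the Kafka direct stream on ssc and attach the HBase writer here (omitted) ...
runApplication(ssc, config)   # starts the context and honours spark.streaming.timeout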
    df_dataset = sqlContext.read.format("jdbc").options(
        url=url,
        driver=driver,
        dbtable=dbtable,
        user=user,
        password=password).load()
    return df_dataset


try:
    conf = SparkConf().setAppName("Spark_ETL")
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    conf = ConfigParser.ConfigParser()
    conf.read("param.config")

    url = conf.get("MySQL", "url")
    driver = conf.get("MySQL", "driver")
    dbtable_A = conf.get("MySQL", "dbtable_A")
    dbtable_B = conf.get("MySQL", "dbtable_B")
    user = conf.get("MySQL", "user")
    password = conf.get("MySQL", "password")
    HiveSchema = conf.get("HiveSchema", "schema")

    print(url, driver, dbtable_A, dbtable_B, user, password)

    # check if we have received arguments as inputs to the script
    strWeek = sys.argv[1] if len(sys.argv) == 2 else "all_weeks"

    # logic to derive where clause based on input provided
    strSql = "" if strWeek == "all_weeks" else " where week = \'" + strWeek + "\'"
                             '\t' + res)
    return abnormal_features


if __name__ == "__main__":
    sparkConf = SparkConf()
    sparkConf.setAppName("dagang abnormal segment")
    sparkConf.set("spark.kryoserializer.buffer.max", "128")
    sc = SparkContext(conf=sparkConf)
    sc.setLogLevel("WARN")
    sqlCtx = HiveContext(sc)
    sqlCtx.setConf("spark.sql.parquet.binaryAsString", "true")
    sqlCtx.setConf("spark.sql.hive.convertMetastoreParquet", "true")
    sqlCtx.setConf("spark.sql.parquet.int96AsTimestamp", "true")

    executor_cores = int(sparkConf.get('spark.executor.cores'))
    num_executors = int(sparkConf.get('spark.executor.instances'))
    num_partitions = executor_cores * num_executors * 3

    features = [
        'area', 'down_oscillation', 'down_stroke', 'down_stroke_ratio',
        'down_stroke_zaihe', 'down_up_oscillation_ratio',
        'down_up_stroke_zaihe_ratio', 'down_up_zaihe_ratio', 'down_zaihe',
        'left_upper_area', 'left_upper_area_ratio', 'max_weiyi',
        'max_weiyi_zaihe', 'max_zaihe', 'min_max_zaihe_ratio', 'min_weiyi',
        'min_weiyi_zaihe', 'min_zaihe', 'up_oscillation', 'up_stroke',
        'up_stroke_ratio', 'up_stroke_zaihe', 'up_zaihe'
    ]

    print(current_timestamp(), '-' * 30 + 'starting')
    abnormal_sql = """
import py4j

from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])

SparkContext._ensure_initialized()

try:
    # Try to access HiveConf, it will raise exception if Hive is not added
    conf = SparkConf()
    if conf.get('spark.sql.catalogImplementation', 'hive').lower() == 'hive':
        SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
        spark = SparkSession.builder\
            .enableHiveSupport()\
            .getOrCreate()
    else:
        spark = SparkSession.builder.getOrCreate()
except py4j.protocol.Py4JError:
    if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive':
        warnings.warn(
            "Fall back to non-hive support because failing to access HiveConf, "
            "please make sure you build spark with hive")
    spark = SparkSession.builder.getOrCreate()
except TypeError:
    if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive':
        warnings.warn(
from pyspark import SparkContext
from pyspark import SparkConf
import sys
from pyspark.rdd import RDD

def update_dictionary(map):
    map.update(test="alex")
    return map

if __name__ == "__main__":
    print("here1")
    conf = SparkConf()
    sc = SparkContext(appName="alex_test_app")
    print("here2")
    print("here2b: " + conf.get("spark.aleph2_job_config"))
    aleph2 = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader().loadClass("com.ikanow.aleph2.analytics.spark.utils.SparkPyTechnologyUtils").newInstance().getAleph2(sc._jsc, sys.argv[1])
    print("here3")
    print aleph2.getRddInputNames()
    print("here4")
    #print RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()), sc).count()
    print("here5")
    to_output = RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()), sc).map(lambda m: update_dictionary(m))
    aleph2.emitRdd(to_output._to_java_object_rdd())
    for row in f:
        row = row.strip().split(' ')
        vals = []
        for val in row:
            vals.append(float(val))
        ret.append(vals)
    return ret


### METADATA
conf = SparkConf().setAppName('GenerateTracerData')
sc = SparkContext(conf=conf)

ACCESS_KEY = conf.get('SPARK_ACCESS_KEY')
SECRET_KEY = conf.get('SPARK_SECRET_KEY')
s3 = s3fs.S3FileSystem(key=ACCESS_KEY, secret=SECRET_KEY)

subhalo_id = 89587
subhalo_positions = get_subhalo_pos(s3)
print('length subhalo positions')
print(len(subhalo_positions))
radius = 140

snaps = range(0, 4380)
last_snap = snaps[-1]
lastfiles = s3.glob(snap_tofile(last_snap))

blackhole_id, tracer_ids = gen_tracer_ids_blackhole(
run from command line
spark-submit --master yarn-client --conf key=value --conf someotherkey=someothervalue your_code.py
"""
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row

conf = SparkConf().setAppName("hello-world").setMaster('yarn-client')
conf.set("spark.files.overwrite", "true")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# log
log4jLogger = sc._jvm.org.apache.log4j
LOG = log4jLogger.LogManager.getLogger("hello.world.spark")
LOG.info("Args = " + conf.getAll().__str__())

inputFile = conf.get("spark.input")
outputFile = conf.get("spark.output")

wordcount = sc.textFile(inputFile).map(lambda line: line.replace("\"", " ").replace("{", " ").replace("}", " ").replace(".", " ").replace(":", " ")) \
    .flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .map(lambda (k, v): (v, k)) \
    .sortByKey(ascending=False) \
    .map(lambda (k, v): (v, k))

df = wordcount.toDF(['word', 'count'])
df.save(path=outputFile, source='json', mode='overwrite')
    # similarly we can extract the valid records set and persist data in the respective table by extending the ruleEngine definition
    # df_valid = df_full_data.where("reason_flag is null or log_level = 'warn'")

    # unpersisting df_full_data and rules dataframes
    df_full_data.unpersist()
    rules.unpersist()


if __name__ == "__main__":
    # initializing SparkConf
    conf = SparkConf().setAppName('Streaming Rule Engine')

    # reading property file
    try:
        batch_time = int(conf.get('spark.ruleEngine.batchTimeInSec'))
    except ValueError:
        print 'ERROR: batch time must be integer value, found string'
        exit()

    file_dir_path = conf.get('spark.ruleEngine.streamingFilesDirath')
    rule_config_db_name = conf.get('spark.ruleEngine.ruleConfigDbName')
    rule_config_tbl_name = conf.get('spark.ruleEngine.ruleConfigTableName')
    invalid_rec_db_name = conf.get('spark.ruleEngine.invalidRecordDbName')
    invalid_rec_tbl_name = conf.get('spark.ruleEngine.invalidRecordTableName')

    # initializing spark context, streaming context and hive enabled sql context
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, batch_time)
    sqlContext = HiveContext(sc)

    # reading files from input file dir (HDFS Location)
class CommonSparkContext(object):
    __metaclass__ = Singleton

    def __init__(self):
        """
        Create a spark context.

        The spark configuration is taken from xframes/config.ini and from
        the values set in SparkInitContext.set() if this has been called.
        """

        # This is placed here because otherwise it causes an error when used in a spark slave.
        from pyspark import SparkConf, SparkContext, SQLContext, HiveContext

        # This reads from default.ini and then xframes/config.ini
        # if they exist.
        self._env = Environment.create()
        context = create_spark_config(self._env)
        verbose = self._env.get_config("xframes", "verbose", "false").lower() == "true"
        hdfs_user_name = self._env.get_config("webhdfs", "user", "hdfs")
        os.environ["HADOOP_USER_NAME"] = hdfs_user_name
        config_pairs = [(k, v) for k, v in context.iteritems()]
        self._config = SparkConf().setAll(config_pairs)
        if verbose:
            print "Spark Config: {}".format(config_pairs)

        self._sc = SparkContext(conf=self._config)
        self._sqlc = SQLContext(self._sc)
        self._hivec = HiveContext(self._sc)
        self.zip_path = []
        version = [int(n) for n in self._sc.version.split(".")]
        self.status_tracker = self._sc.statusTracker()
        if cmp(version, [1, 4, 1]) >= 0:
            self.application_id = self._sc.applicationId
        else:
            self.application_id = None

        if verbose:
            print "Spark Version: {}".format(self._sc.version)
            if self.application_id:
                print "Application Id: {}".format(self.application_id)

        if not context["spark.master"].startswith("local"):
            zip_path = self.build_zip(get_xframes_home())
            if zip_path:
                self._sc.addPyFile(zip_path)
                self.zip_path.append(zip_path)

        trace_flag = self._env.get_config("xframes", "rdd-trace", "false").lower() == "true"
        XRdd.set_trace(trace_flag)
        atexit.register(self.close_context)

    def spark_add_files(self, dirs):
        """
        Adds python files in the given directory or directories.

        Parameters
        ----------
        dirs : str or list(str)
            If a str, the pathname to a directory containing a python module.
            If a list, then it is a list of such directories.

        The python files in each directory are compiled, packed into a zip,
        distributed to each spark slave, and placed in PYTHONPATH.

        This is only done if spark is deployed on a cluster.
        """
        props = self.config()
        if props.get("spark.master", "local").startswith("local"):
            return
        if isinstance(dirs, basestring):
            dirs = [dirs]
        for path in dirs:
            zip_path = self.build_zip(path)
            if zip_path:
                self._sc.addPyFile(zip_path)
                self.zip_path.append(zip_path)

    def close_context(self):
        if self._sc:
            self._sc.stop()
            self._sc = None
            for zip_path in self.zip_path:
                os.remove(zip_path)

    def config(self):
        """
        Gets the configuration parameters used to initialize the spark context.

        Returns
        -------
        out : dict
            A dict of the properties used to initialize the spark context.
        """
        props = self._config.getAll()
        return {prop[0]: prop[1] for prop in props}

    def env(self):
        """
        Gets the config environment.

        Returns
        -------
        out : Environment
            The environment.  This contains all the values from the configuration file(s).
        """
        return self._env

    def sc(self):
        """
        Gets the spark context.

        Returns
        -------
        out : SparkContext
            The spark context.  There is a single spark context per process.
        """
        return self._sc

    def sqlc(self):
        """
        Gets the spark sql context.

        Returns
        -------
        out : sql.SqlContext
            The spark sql context.
        """
        return self._sqlc

    def hivec(self):
        """
        Gets the hive context.

        Returns
        -------
        out : sql.HiveContext
            The hive context.
        """
        return self._hivec

    def version(self):
        """
        Gets the spark version.

        Returns
        -------
        out : list[int]
            The spark version, as a list of integers.
""" return [int(n) for n in self._sc.version.split(".")] def jobs(self): """ Get the spark job ID and info for the active jobs. This method would normally be called by another thread from the executing job. Returns ------- out: map(job_id: job_info} A map of the active job IDs and their corresponding job info """ return {job_id: self.status_tracker.getJobInfo(job_id) for job_id in self.status_tracker.getActiveJobIds()} def cluster_mode(self): """ Get the cluster mode of the spark cluster. Returns ------- out: boolean True if spark is running in cluster mode. Cluster mode means that spark is running on a platform separate the program. In practice, cluster mode means that file arguments must be located on a network filesystem such as HDFS or NFS. """ return not self._config.get("spark.master").startswith("local") # noinspection PyBroadException @staticmethod def build_zip(module_dir): # This can fail at writepy if there is something wrong with the files # in xframes. Go ahead anyway, but things will probably fail if this job is # distributed try: tf = NamedTemporaryFile(suffix=".zip", delete=False) z = PyZipFile(tf, "w") z.writepy(module_dir) z.close() return tf.name except: logging.warn("Zip file distribution failed -- workers will not get xframes code.") logging.warn("Check for unexpected files in xframes directory.") return None @staticmethod def spark_context(): """ Returns the spark context. Returns ------- out : pyspark.SparkContext The SparkContext object from spark. """ return CommonSparkContext().sc() @staticmethod def spark_config(): """ Returns the spark cofig parameters. Returns ------- out : list A list of the key-value pairs stored as tuples, used to initialize the spark context. """ return CommonSparkContext().config() @staticmethod def spark_sql_context(): """ Returns the spark sql context. Returns ------- out : pyspark.sql.SQLContext The SQLContext object from spark. """ return CommonSparkContext().sqlc() @staticmethod def hive_context(): """ Returns the hive context. Returns ------- out : pyspark.sql.HiveContext The Hive object from spark. """ return CommonSparkContext().hivec() @staticmethod def spark_version(): """ Gets the spark version. Returns ------- out: list[int] The spark version, as a list of integers. """ return CommonSparkContext().version() @staticmethod def spark_cluster_mode(): """ Gets the cluster mode Returns ------- out: boolean True if spark is running in cluster mode. Cluster mode means that spark is running on a platform separate the program. In practice, cluster mode means that file arguments must be located on a network filesystem such as HDFS or NFS. """ env = Environment.create() config = create_spark_config(env) return not config.get("spark.master").startswith("local")
from pyspark import SparkContext
from pyspark import SparkConf
import sys
from pyspark.rdd import RDD

def update_dictionary(map):
    map.update(test="alex")
    return map

if __name__ == "__main__":
    print("here1")
    conf = SparkConf()
    sc = SparkContext(appName="alex_test_app")
    print("here2")
    print("here2b: " + conf.get("spark.aleph2_job_config"))
    aleph2 = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader(
    ).loadClass(
        "com.ikanow.aleph2.analytics.spark.utils.SparkPyTechnologyUtils"
    ).newInstance().getAleph2(sc._jsc, sys.argv[1])
    print("here3")
    print aleph2.getRddInputNames()
    print("here4")
    #print RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()), sc).count()
    print("here5")
    to_output = RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()),
                    sc).map(lambda m: update_dictionary(m))
    aleph2.emitRdd(to_output._to_java_object_rdd())
if __name__ == "__main__": argv = sys.argv master = argv[1] file_path = argv[2] save_path = argv[3] print("file_path = ", file_path) print("save_path = ", save_path) if 'xml' in file_path or 'wiki' in file_path: m_filter_fn = large_file_filter_fn partition_size = 500 App_name = 'Part3-t1-large-partition-%d' % partition_size else: m_filter_fn = small_file_filter_fn partition_size = 5 App_name = 'Part3-t1-small-partition-%d' % partition_size conf = SparkConf().setAppName( App_name).setMaster(master).set("spark.local.dir", "/mnt/data/tmp/").set("spark.eventLog.enabled", "true").set("spark.driver.memory", "25g").set("spark.executor.memory", "25g").set("spark.executor.cores", "5").set("spark.tmp.dir", "/mnt/data/tmp/").set("spark.eventLog.dir", "file:///users/yunjia/spark_log/") sc = SparkContext(conf=conf) lines = sc.textFile(file_path).repartition(partition_size) print("Total partition: ", lines.getNumPartitions()) print("Tmp dir: ", conf.get('spark.tmp.dir')) rank = calculate_page_rank(lines, m_filter_fn) rank.saveAsTextFile(save_path)
from pyspark.mllib.linalg import Vectors
from pyspark.sql import Row
from pyspark import SparkConf, SparkContext
from itertools import izip_longest
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.sql import SQLContext

def grouper(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)

conf = SparkConf()
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

#Initialize all objects
cos = CaffeOnSpark(sc, sqlContext)
cmdargs = conf.get('spark.pythonargs')
args = dict(grouper(cmdargs.split(), 2))
cfg = Config(sc, args)

dl_train_source = DataSource(sc).getSource(cfg, True)
#Train
cos.train(dl_train_source)

lr_raw_source = DataSource(sc).getSource(cfg, False)
#Extract features
extracted_df = cos.features(lr_raw_source)

# Do multiclass LogisticRegression
data = extracted_df.map(lambda row: LabeledPoint(row.label[0], Vectors.dense(row.ip1)))
lr = LogisticRegressionWithLBFGS.train(data, numClasses=10, iterations=10)
predictions = lr.predict(data.map(lambda pt: pt.features))
from pyspark import SparkConf, SparkContext
from itertools import izip_longest
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.sql import SQLContext

def grouper(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)

conf = SparkConf()
sc = SparkContext(conf=conf)

#Initialize all objects
cos = CaffeOnSpark(sc)
cmdargs = conf.get('spark.pythonargs')
args = dict(grouper(cmdargs.split(), 2))
cfg = Config(sc, args)

dl_train_source = DataSource(sc).getSource(cfg, True)
#Train
cos.train(dl_train_source)

lr_raw_source = DataSource(sc).getSource(cfg, False)
#Extract features
extracted_df = cos.features(lr_raw_source)

# Do multiclass LogisticRegression
data = extracted_df.map(
    lambda row: LabeledPoint(row.label[0], Vectors.dense(row.ip1)))
lr = LogisticRegressionWithLBFGS.train(data, numClasses=10, iterations=10)
predictions = lr.predict(data.map(lambda pt: pt.features))
This file mainly introduces SparkContext; it covers the core components other than the RDD class.

Core components:
pyspark.SparkContext  Main entry point for Spark functionality.
pyspark.RDD           A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
"""

conf = SparkConf()
conf.setAppName("appName")
conf.setMaster("local")
conf.set("key", "value")  # setIfMissing, setSparkHome
print(conf.getAll())
print(conf.get('spark.app.name'))
print(conf.contains('spark.app.name'))
print(conf.toDebugString())

# show_profiles()
conf.set('spark.python.profile', 'true')

# Main entry point for Spark functionality. A SparkContext represents the connection to a
# Spark cluster, and can be used to create RDDs and broadcast variables on that cluster.
# Only one SparkContext may be active per JVM. You must stop() the active SparkContext
# before creating a new one.
# class pyspark.SparkContext(master=None, appName=None, sparkHome=None, pyFiles=None,
#     environment=None, batchSize=0, serializer=PickleSerializer(), conf=None, gateway=None,
#     jsc=None, profiler_cls=<class 'pyspark.profiler.BasicProfiler'>)
# sc = SparkContext(conf=conf)
print(
    "--------------------SparkConf ends---------SparkContext begins-------------------------------"
)
sc = SparkContext(conf=conf)
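Since spark.python.profile is enabled above before the context is created, a small follow-on sketch (assumed, not part of the original snippet) of what the profiler collects: run an RDD job, then dump the worker-side cProfile output.

rdd = sc.parallelize(range(100)).map(lambda x: x * x)
print(rdd.sum())     # 328350
sc.show_profiles()   # prints accumulated Python worker profiles per RDD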