def create_sc(master=None,
              py_files=None,
              spark_home=None,
              sparktk_home=None,
              pyspark_submit_args=None,
              app_name="sparktk",
              extra_conf=None):
    """
    Creates a SparkContext with sparktk defaults

    Many parameters can be overwritten

    :param master: spark master setting
    :param py_files: list of str of paths to python dependencies; Note that the current python package
                     will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
    :param spark_home: override $SPARK_HOME
    :param sparktk_home: override $SPARKTK_HOME
    :param pyspark_submit_args: extra args passed to the pyspark submit
    :param app_name: name of spark app
    :param extra_conf: dict for any extra spark conf settings,
                       for ex. {"spark.hadoop.fs.default.name": "file:///"}
    :return: pyspark SparkContext
    """

    set_env_for_sparktk(spark_home, sparktk_home, pyspark_submit_args)

    # bug/behavior of PYSPARK_SUBMIT_ARGS requires 'pyspark-shell' on the end --check in future spark versions
    set_env('PYSPARK_SUBMIT_ARGS', ' '.join([os.environ['PYSPARK_SUBMIT_ARGS'], 'pyspark-shell']))

    if not master:
        master = default_spark_master
        logger.info("sparktk.create_sc() master not specified, setting to %s", master)
    conf = SparkConf().setMaster(master).setAppName(app_name)

    if extra_conf:
        for k, v in extra_conf.items():
            conf = conf.set(k, v)

    if not py_files:
        py_files = []

    # zip up the relevant pieces of sparktk and put it in the py_files...
    path = zip_sparktk()
    tmp_dir = os.path.dirname(path)
    logger.info("sparkconf created tmp dir for sparktk.zip %s" % tmp_dir)
    atexit.register(shutil.rmtree, tmp_dir)  # make python delete this folder when it shuts down
    py_files.append(path)

    msg = '\n'.join(["=" * 80,
                     "Creating SparkContext with the following SparkConf",
                     "pyFiles=%s" % str(py_files),
                     conf.toDebugString(),
                     "=" * 80])
    logger.info(msg)

    sc = SparkContext(conf=conf, pyFiles=py_files)
    return sc
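# Illustrative usage of the create_sc() variant above (not taken from the source);
# the master URL, app name, and extra_conf values are placeholders.
sc = create_sc(master="local[4]",
               app_name="my-sparktk-app",
               extra_conf={"spark.hadoop.fs.default.name": "file:///"})
print(sc.version)  # sanity check that the context came up
sc.stop()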
def create_sc(master=None,
              py_files=None,
              spark_home=None,
              sparktk_home=None,
              pyspark_submit_args=None,
              app_name="sparktk"):
    """
    Creates a SparkContext with sparktk defaults

    Many parameters can be overwritten

    :param master: spark master setting
    :param py_files: list of str of paths to python dependencies; Note that the current python package
                     will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
    :param spark_home: override $SPARK_HOME
    :param sparktk_home: override $SPARKTK_HOME
    :param pyspark_submit_args: extra args passed to the pyspark submit
    :param app_name: name of spark app
    :return: pyspark SparkContext
    """

    set_env_for_sparktk(spark_home, sparktk_home, pyspark_submit_args)

    # bug/behavior of PYSPARK_SUBMIT_ARGS requires 'pyspark-shell' on the end --check in future spark versions
    set_env('PYSPARK_SUBMIT_ARGS', ' '.join([os.environ['PYSPARK_SUBMIT_ARGS'], 'pyspark-shell']))

    if not master:
        master = default_spark_master
        logger.info("sparktk.create_sc() master not specified, setting to %s", master)
    conf = SparkConf().setMaster(master).setAppName(app_name)

    if not py_files:
        py_files = []

    # zip up the relevant pieces of sparktk and put it in the py_files...
    path = zip_sparktk()
    tmp_dir = os.path.dirname(path)
    logger.info("sparkconf created tmp dir for sparktk.zip %s" % tmp_dir)
    atexit.register(shutil.rmtree, tmp_dir)  # make python delete this folder when it shuts down
    py_files.append(path)

    msg = '\n'.join(["=" * 80,
                     "Creating SparkContext with the following SparkConf",
                     "pyFiles=%s" % str(py_files),
                     conf.toDebugString(),
                     "=" * 80])
    logger.info(msg)

    sc = SparkContext(conf=conf, pyFiles=py_files)
    return sc
def __connected_spark_cluster(self, resource_url, pilot_description=None):
    conf = SparkConf()
    conf.setAppName("Pilot-Spark")
    if pilot_description is not None:
        for i in pilot_description.keys():
            if i.startswith("spark"):
                conf.set(i, pilot_description[i])
    conf.setMaster(resource_url)
    print(conf.toDebugString())
    sc = SparkContext(conf=conf)
    sqlCtx = SQLContext(sc)
    pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
    return pilot
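# Illustrative only (not from the source): the kind of pilot_description dict the
# method above expects -- every key starting with "spark" is copied straight into
# the SparkConf before the SparkContext is created.
example_pilot_description = {
    "spark.executor.memory": "4g",
    "spark.cores.max": "8",
}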
    # tail of a generator defined earlier in the script: emit the record when the hit flag is set
    if bHit:
        yield t


if __name__ == '__main__':
    # assumes op (os.path), pk (pickle), ba (bitarray), sHdfsDir and calcBfPar are
    # defined in the omitted part of the script
    sApp = 'spark'
    nPart = 38 * 14 * 4
    sRef = op.join(sHdfsDir, 'hg38.fa.nb.enc.gzip')
    # sRef = op.join(sHdfsDir, 'chr21.fa.nb.enc.gzip')
    sInput = op.join(sHdfsDir, 'half.enc')
    sSeeds = op.join(sHdfsDir, 'seed.enc')

    # print default SparkConf
    sf = SparkConf()
    print(sf.toDebugString())
    sc = SparkContext(appName=sApp)

    capacity = 661512672
    error_rate = 0.1
    bf, num_slices, bits_per_slice, hashfn, seeds, fmt = calcBfPar(capacity, error_rate)
    with open('/hddscratch/half.enc.bf', 'r') as fIn:
        bf = ba.bitarray(endian='little')
        bf.fromstring(pk.load(fIn))

    # ship the Bloom filter bits and its parameters to the executors
    bcBitarray = sc.broadcast(bf.tostring())
    bcNumSlices = sc.broadcast(num_slices)
    bcBitsPerSlice = sc.broadcast(bits_per_slice)
    bcHashfn = sc.broadcast(hashfn)
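# Hypothetical worker-side helper (not in the original excerpt): rebuild the Bloom
# filter from the broadcast variables and test membership for a key. Assumes a
# pybloom-style hashfn that yields one per-slice index per call; the real calcBfPar
# may differ.
def bfContains(sKey):
    bfLocal = ba.bitarray(endian='little')
    bfLocal.fromstring(bcBitarray.value)
    offset = 0
    for nIdx in bcHashfn.value(sKey):      # one index per slice
        if not bfLocal[offset + nIdx]:
            return False
        offset += bcBitsPerSlice.value
    return True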
def create_sc(master=None,
              py_files=None,
              spark_home=None,
              sparktk_home=None,
              pyspark_submit_args=None,
              app_name="sparktk",
              other_libs=None,
              extra_conf=None,
              use_local_fs=False,
              debug=None):
    """
    Creates a SparkContext with sparktk defaults

    Many parameters can be overwritten

    :param master: (str) spark master setting; for ex. 'local[4]' or 'yarn-client'
    :param py_files: (list) list of str of paths to python dependencies; Note that the current python package
                     will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
    :param spark_home: (str) override $SPARK_HOME, the location of spark
    :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
    :param pyspark_submit_args: (str) extra args passed to the pyspark submit
    :param app_name: (str) name of spark app that will be created
    :param other_libs: (list) other libraries (actual packages/modules) that are compatible with spark-tk,
                       which need to be added to the spark context.  These libraries must be developed for usage
                       with spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
    :param extra_conf: (dict) dict for any extra spark conf settings,
                       for ex. {"spark.hadoop.fs.default.name": "file:///"}
    :param use_local_fs: (bool) simpler way to specify using local file system, rather than hdfs or other
    :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
    :return: pyspark SparkContext
    """

    set_env_for_sparktk(spark_home, sparktk_home, pyspark_submit_args, other_libs, debug)

    # bug/behavior of PYSPARK_SUBMIT_ARGS requires 'pyspark-shell' on the end --check in future spark versions
    set_env('PYSPARK_SUBMIT_ARGS', ' '.join([os.environ['PYSPARK_SUBMIT_ARGS'], 'pyspark-shell']))

    if not master:
        master = default_spark_master
        logger.info("sparktk.create_sc() master not specified, setting to %s", master)
    conf = SparkConf().setMaster(master).setAppName(app_name)

    if extra_conf:
        for k, v in extra_conf.items():
            conf = conf.set(k, v)

    if use_local_fs:
        conf.set("spark.hadoop.fs.default.name", "file:///")

    if not py_files:
        py_files = []

    # zip up the relevant pieces of sparktk and put it in the py_files...
    path = zip_sparktk()
    tmp_dir = os.path.dirname(path)
    logger.info("sparkconf created tmp dir for sparktk.zip %s" % tmp_dir)
    atexit.register(shutil.rmtree, tmp_dir)  # make python delete this folder when it shuts down
    py_files.append(path)

    msg = '\n'.join(["=" * 80,
                     "Creating SparkContext with the following SparkConf",
                     "pyFiles=%s" % str(py_files),
                     conf.toDebugString(),
                     "=" * 80])
    logger.info(msg)

    sc = SparkContext(conf=conf, pyFiles=py_files)
    return sc
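# Illustrative call of the variant above (not from the source): run against the local
# filesystem instead of HDFS and attach a JVM debugger; master and port are placeholders.
sc = create_sc(master="local[2]",
               app_name="sparktk-local-test",
               use_local_fs=True,
               debug=5005)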
    # tail of a helper defined earlier in the script: drain up to `need` items from
    # the queue and pair them with the leading word
    while not q.empty() and need > 0:
        temp += [q.get()]
        need = need - 1
    newTemp = [word[0], temp]
    return newTemp


if __name__ == "__main__":
    # assumes `import sys` and `from pyspark import SparkConf, SparkContext`
    # earlier in the script
    # if len(sys.argv) != 2:
    #     print("Usage: bigram <file>", file=sys.stderr)
    #     exit(-1)
    conf = SparkConf().set("spark.executor.memory", "10g") \
                      .set("spark.driver.memory", "10g") \
                      .set("spark.driver.maxResultSize", "10g")
    print(conf.toDebugString())
    sc = SparkContext(conf=conf)

    lines = sc.textFile(sys.argv[1], 1)
    # join each partition's lines into one string, then split into sentences on '.'
    sentences = lines.glom() \
        .map(lambda x: " ".join(x)) \
        .flatMap(lambda x: x.split("."))
    # despite the "bigram" usage string, this builds 4-grams and keeps those seen more than 5 times
    words = sentences.map(lambda word: (word.encode('utf-8'))) \
        .map(lambda x: x.lower().split()) \
        .flatMap(lambda x: ((x[i], x[i + 1], x[i + 2], x[i + 3]) for i in range(len(x) - 3))) \
        .map(lambda x: (x, 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .filter(lambda x: x[1] > 5)
    # re-key each 4-gram by its last three words, concatenating the (first_word, count) tuples
    last = words.map(lambda x: ((x[0][1], x[0][2], x[0][3]), (x[0][0], x[1]))) \
        .reduceByKey(lambda x, y: x + y) \
        .collect()
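# Quick illustration (not in the original) of the sliding-window 4-gram generator used above:
sample = "the quick brown fox jumps".lower().split()
fourgrams = [(sample[i], sample[i + 1], sample[i + 2], sample[i + 3])
             for i in range(len(sample) - 3)]
# -> [('the', 'quick', 'brown', 'fox'), ('quick', 'brown', 'fox', 'jumps')]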
def create_sc(master=None,
              py_files=None,
              spark_home=None,
              sparktk_home=None,
              pyspark_submit_args=None,
              app_name="sparktk",
              extra_conf=None,
              use_local_fs=False,
              debug=None):
    """
    Creates a SparkContext with sparktk defaults

    Many parameters can be overwritten

    :param master: spark master setting
    :param py_files: list of str of paths to python dependencies; Note that the current python package
                     will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
    :param spark_home: override $SPARK_HOME
    :param sparktk_home: override $SPARKTK_HOME
    :param pyspark_submit_args: extra args passed to the pyspark submit
    :param app_name: name of spark app
    :param extra_conf: dict for any extra spark conf settings, for ex. {"spark.hadoop.fs.default.name": "file:///"}
    :param use_local_fs: simpler way to specify using local file system, rather than hdfs or other
    :param debug: provide a port address to attach a debugger to the JVM that gets started
    :return: pyspark SparkContext
    """

    set_env_for_sparktk(spark_home, sparktk_home, pyspark_submit_args, debug)

    # bug/behavior of PYSPARK_SUBMIT_ARGS requires 'pyspark-shell' on the end --check in future spark versions
    set_env('PYSPARK_SUBMIT_ARGS', ' '.join([os.environ['PYSPARK_SUBMIT_ARGS'], 'pyspark-shell']))

    if not master:
        master = default_spark_master
        logger.info("sparktk.create_sc() master not specified, setting to %s", master)
    conf = SparkConf().setMaster(master).setAppName(app_name)

    if extra_conf:
        for k, v in extra_conf.items():
            conf = conf.set(k, v)

    if use_local_fs:
        conf.set("spark.hadoop.fs.default.name", "file:///")

    if not py_files:
        py_files = []

    # zip up the relevant pieces of sparktk and put it in the py_files...
    path = zip_sparktk()
    tmp_dir = os.path.dirname(path)
    logger.info("sparkconf created tmp dir for sparktk.zip %s" % tmp_dir)
    atexit.register(shutil.rmtree, tmp_dir)  # make python delete this folder when it shuts down
    py_files.append(path)

    msg = '\n'.join(["=" * 80,
                     "Creating SparkContext with the following SparkConf",
                     "pyFiles=%s" % str(py_files),
                     conf.toDebugString(),
                     "=" * 80])
    logger.info(msg)

    sc = SparkContext(conf=conf, pyFiles=py_files)
    return sc
def create_sc(master=None,
              py_files=None,
              spark_home=None,
              sparktk_home=None,
              pyspark_submit_args=None,
              app_name=None,
              other_libs=None,
              extra_conf_file=None,
              extra_conf_dict=None,
              use_local_fs=False,
              debug=None):
    """
    Creates a SparkContext with sparktk defaults

    Many parameters can be overwritten

    :param master: (str) spark master setting; for ex. 'local[4]' or 'yarn-client'
    :param py_files: (list) list of str of paths to python dependencies; Note that the current python package
                     will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
    :param spark_home: (str) override $SPARK_HOME, the location of spark
    :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
    :param pyspark_submit_args: (str) extra args passed to the pyspark submit
    :param app_name: (str) name of spark app that will be created
    :param other_libs: (list) other libraries (actual packages/modules) that are compatible with spark-tk,
                       which need to be added to the spark context.  These libraries must be developed for usage
                       with spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
    :param extra_conf_file: (str) local file path to a spark conf file to supplement the spark conf
                            File format is basic key-value pairs per line, like:

                                spark.executor.memory=6g
                                spark.files.overwrite=true

                            (NOTE: if env var $SPARKTK_EXTRA_CONF is set, the file it indicates will be used.)
    :param extra_conf_dict: (dict) dict for any extra spark conf settings,
                            for ex. {"spark.hadoop.fs.default.name": "file:///"}
                            these will override any matching settings from extra_conf_file, if provided
    :param use_local_fs: (bool) simpler way to specify using local file system, rather than hdfs or other
    :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
    :return: pyspark SparkContext
    """

    extra = {}
    if extra_conf_file:
        logger.info("create_sc() conf_file specified: %s" % extra_conf_file)
        extra = _parse_spark_conf(extra_conf_file)
    else:
        env_extra_conf_file = os.getenv('SPARKTK_EXTRA_CONF', None)
        if env_extra_conf_file:
            logger.info("create_sc() using env SPARKTK_EXTRA_CONF for extra conf file: %s" % env_extra_conf_file)
            extra = _parse_spark_conf(env_extra_conf_file)

    if extra_conf_dict:
        # extra_conf_dict overrides settings from the conf file
        logger.info("create_sc() overriding conf with given extra_conf_dict")
        extra.update(extra_conf_dict)

    master_in_extra = 'spark.master' in extra
    app_name_in_extra = 'spark.app.name' in extra
    if 'spark.driver.memory' in extra:
        pyspark_submit_args = "%s --driver-memory=%s" % (pyspark_submit_args or '', extra['spark.driver.memory'])

    set_env_for_sparktk(spark_home, sparktk_home, pyspark_submit_args, other_libs, debug)

    # bug/behavior of PYSPARK_SUBMIT_ARGS requires 'pyspark-shell' on the end --check in future spark versions
    set_env('PYSPARK_SUBMIT_ARGS', ' '.join([os.environ['PYSPARK_SUBMIT_ARGS'], 'pyspark-shell']))

    conf = SparkConf()  # env must be set before creating SparkConf

    for k, v in extra.items():
        conf.set(k, v)

    if not master and not master_in_extra:
        master = default_spark_master
        logger.info("create_sc() master not specified, setting to %s", master)
    if master:
        conf.setMaster(master)

    if not app_name and not app_name_in_extra:
        app_name = default_spark_app_name
        logger.info("create_sc() app_name not specified, setting to %s", app_name)
    if app_name:
        conf.setAppName(app_name)

    if use_local_fs:
        conf.set("spark.hadoop.fs.default.name", "file:///")

    if not py_files:
        py_files = []

    # zip up the relevant pieces of sparktk and put it in the py_files...
    path = zip_sparktk()
    tmp_dir = os.path.dirname(path)
    logger.info("sparkconf created tmp dir for sparktk.zip %s" % tmp_dir)
    atexit.register(shutil.rmtree, tmp_dir)  # make python delete this folder when it shuts down
    py_files.append(path)

    msg = '\n'.join(["=" * 80,
                     "Creating SparkContext with the following SparkConf",
                     "pyFiles=%s" % str(py_files),
                     conf.toDebugString(),
                     "=" * 80])
    logger.info(msg)

    sc = SparkContext(conf=conf, pyFiles=py_files)
    return sc
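# _parse_spark_conf is referenced above but not shown in this excerpt. A minimal
# sketch, assuming the simple "key=value per line" format described in the docstring
# (blank lines and '#' comments skipped); the real helper may differ.
def _parse_spark_conf(path):
    props = {}
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            k, _, v = line.partition('=')
            props[k.strip()] = v.strip()
    return props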