Code Example #1
File: sparkconf.py Project: AayushiD/spark-tk
def create_sc(master=None,
              py_files=None,
              spark_home=None,
              sparktk_home=None,
              pyspark_submit_args=None,
              app_name="sparktk",
              extra_conf=None):
    """
    Creates a SparkContext with sparktk defaults

    Many parameters can be overwritten

    :param master: spark master setting
    :param py_files: list of str of paths to python dependencies; Note that the current python
    package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
    :param spark_home: override $SPARK_HOME
    :param sparktk_home: override $SPARKTK_HOME
    :param app_name: name of spark app
    :param extra_conf: dict for any extra spark conf settings, for ex. {"spark.hadoop.fs.default.name": "file:///"}
    :return: pyspark SparkContext
    """

    set_env_for_sparktk(spark_home, sparktk_home, pyspark_submit_args)

    # bug/behavior of PYSPARK_SUBMIT_ARGS requires 'pyspark-shell' on the end --check in future spark versions
    set_env('PYSPARK_SUBMIT_ARGS', ' '.join([os.environ['PYSPARK_SUBMIT_ARGS'], 'pyspark-shell']))

    if not master:
        master = default_spark_master
        logger.info("sparktk.create_sc() master not specified, setting to %s", master)

    conf = SparkConf().setMaster(master).setAppName(app_name)
    if extra_conf:
        for k, v in extra_conf.items():
            conf = conf.set(k, v)

    if not py_files:
        py_files = []

    # zip up the relevant pieces of sparktk and put it in the py_files...
    path = zip_sparktk()
    tmp_dir = os.path.dirname(path)
    logger.info("sparkconf created tmp dir for sparktk.zip %s" % tmp_dir)
    atexit.register(shutil.rmtree, tmp_dir)  # make python delete this folder when it shuts down

    py_files.append(path)

    msg = '\n'.join(["=" * 80,
                     "Creating SparkContext with the following SparkConf",
                     "pyFiles=%s" % str(py_files),
                     conf.toDebugString(),
                     "=" * 80])
    logger.info(msg)

    sc = SparkContext(conf=conf, pyFiles=py_files)

    return sc
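
As a point of reference, a minimal usage sketch follows; the import path, master URL, app name and extra conf value are illustrative assumptions, not taken from the project.

from sparktk.sparkconf import create_sc   # assumed import path for this module

sc = create_sc(master="local[4]",                                # illustrative master
               app_name="sparktk-demo",                          # illustrative app name
               extra_conf={"spark.executor.memory": "2g"})       # illustrative setting
print(sc.version)
sc.stop()
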
Code Example #2
File: sparkconf.py Project: tlisonbee/spark-tk
def create_sc(master=None,
              py_files=None,
              spark_home=None,
              sparktk_home=None,
              pyspark_submit_args=None,
              app_name="sparktk"):
    """
    Creates a SparkContext with sparktk defaults

    Many parameters can be overwritten

    :param master: spark master setting
    :param py_files: list of str of paths to python dependencies; Note that the current python
    package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
    :param spark_home: override $SPARK_HOME
    :param sparktk_home: override $SPARKTK_HOME
    :param app_name: name of spark app
    :return: pyspark SparkContext
    """

    set_env_for_sparktk(spark_home, sparktk_home, pyspark_submit_args)

    # bug/behavior of PYSPARK_SUBMIT_ARGS requires 'pyspark-shell' on the end --check in future spark versions
    set_env('PYSPARK_SUBMIT_ARGS', ' '.join([os.environ['PYSPARK_SUBMIT_ARGS'], 'pyspark-shell']))

    if not master:
        master = default_spark_master
        logger.info("sparktk.create_sc() master not specified, setting to %s", master)

    conf = SparkConf().setMaster(master).setAppName(app_name)

    if not py_files:
        py_files = []

    # zip up the relevant pieces of sparktk and put it in the py_files...
    path = zip_sparktk()
    tmp_dir = os.path.dirname(path)
    logger.info("sparkconf created tmp dir for sparktk.zip %s" % tmp_dir)
    atexit.register(shutil.rmtree, tmp_dir)  # make python delete this folder when it shuts down

    py_files.append(path)

    msg = '\n'.join(["=" * 80,
                     "Creating SparkContext with the following SparkConf",
                     "pyFiles=%s" % str(py_files),
                     conf.toDebugString(),
                     "=" * 80])
    logger.info(msg)

    sc = SparkContext(conf=conf, pyFiles=py_files)

    return sc
Code Example #3
 def __connected_spark_cluster(self, resource_url, pilot_description=None):
     conf = SparkConf()
     conf.setAppName("Pilot-Spark")
     if pilot_description is not None:
         for i in pilot_description.keys():
             if i.startswith("spark"):
                 conf.set(i, pilot_description[i])
     conf.setMaster(resource_url)
     print(conf.toDebugString())
     sc = SparkContext(conf=conf)
     sqlCtx = SQLContext(sc)
     pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
     return pilot
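
Only keys beginning with "spark" are forwarded into the SparkConf, so a pilot_description like the following (values are illustrative, not from the source) maps directly onto Spark settings while other keys are ignored:

pilot_description = {
    "resource": "yarn-cluster",            # ignored: no "spark" prefix
    "spark.executor.memory": "4g",         # copied via conf.set(...)
    "spark.executor.instances": "8",       # copied via conf.set(...)
}
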
Code Example #4
File: __init__.py Project: drelu/SAGA-Hadoop
 def __connected_spark_cluster(self, resource_url, pilot_description=None):
     conf = SparkConf()
     conf.setAppName("Pilot-Spark")
     if pilot_description is not None:
         for i in pilot_description.keys():
             if i.startswith("spark"):
                 conf.set(i, pilot_description[i])
     conf.setMaster(resource_url)
     print(conf.toDebugString())
     sc = SparkContext(conf=conf)
     sqlCtx = SQLContext(sc)
     pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
     return pilot
Code Example #5
        if bHit:
            yield t


if __name__ == '__main__':

    sApp = 'spark'
    nPart = 38 * 14 * 4
    sRef = op.join(sHdfsDir, 'hg38.fa.nb.enc.gzip')
    #sRef = op.join(sHdfsDir, 'chr21.fa.nb.enc.gzip')
    sInput = op.join(sHdfsDir, 'half.enc')
    sSeeds = op.join(sHdfsDir, 'seed.enc')

    # print default SparkConf
    sf = SparkConf()
    print(sf.toDebugString())
    sc = SparkContext(appName=sApp)

    capacity = 661512672
    error_rate = 0.1
    bf, num_slices, bits_per_slice, hashfn, seeds, fmt = calcBfPar(
        capacity, error_rate)

    with open('/hddscratch/half.enc.bf', 'rb') as fIn:  # pickled Bloom-filter bits; read in binary mode
        bf = ba.bitarray(endian='little')
        bf.fromstring(pk.load(fIn))

    bcBitarray = sc.broadcast(bf.tostring())
    bcNumSlices = sc.broadcast(num_slices)
    bcBitsPerSlice = sc.broadcast(bits_per_slice)
    bcHashfn = sc.broadcast(hashfn)
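
The listing stops after the broadcasts are created; a sketch of how such broadcast variables are typically consumed on the executors is shown below. The hit() membership test and the input RDD are hypothetical placeholders, since the remainder of the script is not included here.

def filter_partition(part):
    # Rebuild the Bloom-filter bits once per partition from the broadcast payload.
    bits = ba.bitarray(endian='little')
    bits.fromstring(bcBitarray.value)
    for t in part:
        # hit() is a hypothetical membership test built on the broadcast parameters.
        if hit(t, bits, bcNumSlices.value, bcBitsPerSlice.value, bcHashfn.value):
            yield t

# matches = sc.textFile(sInput, nPart).mapPartitions(filter_partition)   # hypothetical driver-side call
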
Code Example #6
def create_sc(master=None,
              py_files=None,
              spark_home=None,
              sparktk_home=None,
              pyspark_submit_args=None,
              app_name="sparktk",
              other_libs=None,
              extra_conf=None,
              use_local_fs=False,
              debug=None):
    """
    Creates a SparkContext with sparktk defaults

    Many parameters can be overwritten

    :param master: (str) spark master setting; for ex. 'local[4]' or 'yarn-client'
    :param py_files: (list) list of str of paths to python dependencies; Note that the current python
    package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
    :param spark_home: (str) override $SPARK_HOME, the location of spark
    :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
    :param pyspark_submit_args: (str) extra args passed to the pyspark submit
    :param app_name: (str) name of spark app that will be created
    :param other_libs: (list) other libraries (actual packages/modules) that are compatible with spark-tk,
                       which need to be added to the spark context.  These libraries must be developed for usage with
                       spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
    :param extra_conf: (dict) dict for any extra spark conf settings, for ex. {"spark.hadoop.fs.default.name": "file:///"}
    :param use_local_fs: (bool) simpler way to specify using local file system, rather than hdfs or other
    :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
    :return: pyspark SparkContext
    """

    set_env_for_sparktk(spark_home, sparktk_home, pyspark_submit_args, other_libs, debug)

    # bug/behavior of PYSPARK_SUBMIT_ARGS requires 'pyspark-shell' on the end --check in future spark versions
    set_env('PYSPARK_SUBMIT_ARGS', ' '.join([os.environ['PYSPARK_SUBMIT_ARGS'], 'pyspark-shell']))

    if not master:
        master = default_spark_master
        logger.info("sparktk.create_sc() master not specified, setting to %s", master)

    conf = SparkConf().setMaster(master).setAppName(app_name)
    if extra_conf:
        for k, v in extra_conf.items():
            conf = conf.set(k, v)

    if use_local_fs:
        conf.set("spark.hadoop.fs.default.name", "file:///")

    if not py_files:
        py_files = []

    # zip up the relevant pieces of sparktk and put it in the py_files...
    path = zip_sparktk()
    tmp_dir = os.path.dirname(path)
    logger.info("sparkconf created tmp dir for sparktk.zip %s" % tmp_dir)
    atexit.register(shutil.rmtree, tmp_dir)  # make python delete this folder when it shuts down

    py_files.append(path)

    msg = '\n'.join(["=" * 80,
                     "Creating SparkContext with the following SparkConf",
                     "pyFiles=%s" % str(py_files),
                     conf.toDebugString(),
                     "=" * 80])
    logger.info(msg)

    sc = SparkContext(conf=conf, pyFiles=py_files)

    return sc
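
Relative to the earlier variants, this one adds other_libs, use_local_fs and debug; an illustrative call exercising those switches (the library object and port number are placeholders):

sc = create_sc(use_local_fs=True,              # shorthand for spark.hadoop.fs.default.name=file:///
               other_libs=[my_tk_extension],   # placeholder for a spark-tk-compatible package
               debug=5005)                     # placeholder port for attaching a JVM debugger
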
Code Example #7
File: fourfirst30G.py Project: ZXShwan/WordCatch
    while not q.empty() and need > 0:
        temp += [q.get()]
        need = need-1

    newTemp = [word[0],temp]
    return newTemp


if __name__ == "__main__":
    # if len(sys.argv) != 2:
    #     print("Usage: bigram <file>", file=sys.stderr)
    #     exit(-1)
    conf = SparkConf().set("spark.executor.memory", "10g")\
             .set("driver-memory","10g")\
             .set("spark.driver.maxResultSize","10g")
    print(conf.toDebugString())

    sc = SparkContext(conf=conf)
    lines = sc.textFile(sys.argv[1], 1)

    sentences = lines.glom() \
                  .map(lambda x: " ".join(x)) \
                  .flatMap(lambda x: x.split(".")) 

    words = sentences.map(lambda word: (word.encode('utf-8')))\
                .map(lambda x : x.lower().split()) \
                .flatMap(lambda x: ((x[i], x[i + 1], x[i+2], x[i+3]) for i in range(len(x) - 3))) \
                .map(lambda x : (x, 1)).reduceByKey(lambda x, y: x + y).filter(lambda x: x[1]>5)
    

    last = words.map(lambda x: ((x[0][1],x[0][2],x[0][3]),(x[0][0],x[1]))).reduceByKey(lambda x, y: x+y).collect()
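
Note that spark.driver.memory cannot be raised from inside a client-mode application, because the driver JVM is already running by the time the SparkConf is applied; it has to be supplied at launch instead, for example (script name as above, input path and submit options illustrative):

spark-submit --driver-memory 10g --conf spark.driver.maxResultSize=10g fourfirst30G.py hdfs:///path/to/input.txt
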
Code Example #8
File: sparkconf.py Project: mapleNvg/spark-tk
def create_sc(master=None,
              py_files=None,
              spark_home=None,
              sparktk_home=None,
              pyspark_submit_args=None,
              app_name="sparktk",
              extra_conf=None,
              use_local_fs=False,
              debug=None):
    """
    Creates a SparkContext with sparktk defaults

    Many parameters can be overwritten

    :param master: spark master setting
    :param py_files: list of str of paths to python dependencies; Note that the current python
    package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
    :param spark_home: override $SPARK_HOME
    :param sparktk_home: override $SPARKTK_HOME
    :param app_name: name of spark app
    :param extra_conf: dict for any extra spark conf settings, for ex. {"spark.hadoop.fs.default.name": "file:///"}
    :param use_local_fs: simpler way to specify using local file system, rather than hdfs or other
    :param debug: provide a port address to attach a debugger to the JVM that gets started
    :return: pyspark SparkContext
    """

    set_env_for_sparktk(spark_home, sparktk_home, pyspark_submit_args, debug)

    # bug/behavior of PYSPARK_SUBMIT_ARGS requires 'pyspark-shell' on the end --check in future spark versions
    set_env('PYSPARK_SUBMIT_ARGS', ' '.join([os.environ['PYSPARK_SUBMIT_ARGS'], 'pyspark-shell']))

    if not master:
        master = default_spark_master
        logger.info("sparktk.create_sc() master not specified, setting to %s", master)

    conf = SparkConf().setMaster(master).setAppName(app_name)
    if extra_conf:
        for k, v in extra_conf.items():
            conf = conf.set(k, v)

    if use_local_fs:
        conf.set("spark.hadoop.fs.default.name", "file:///")

    if not py_files:
        py_files = []

    # zip up the relevant pieces of sparktk and put it in the py_files...
    path = zip_sparktk()
    tmp_dir = os.path.dirname(path)
    logger.info("sparkconf created tmp dir for sparktk.zip %s" % tmp_dir)
    atexit.register(shutil.rmtree, tmp_dir)  # make python delete this folder when it shuts down

    py_files.append(path)

    msg = '\n'.join(["=" * 80,
                     "Creating SparkContext with the following SparkConf",
                     "pyFiles=%s" % str(py_files),
                     conf.toDebugString(),
                     "=" * 80])
    logger.info(msg)

    sc = SparkContext(conf=conf, pyFiles=py_files)

    return sc
Code Example #9
File: sparkconf.py Project: Haleyo/spark-tk
def create_sc(master=None,
              py_files=None,
              spark_home=None,
              sparktk_home=None,
              pyspark_submit_args=None,
              app_name=None,
              other_libs=None,
              extra_conf_file=None,
              extra_conf_dict=None,
              use_local_fs=False,
              debug=None):
    """
    Creates a SparkContext with sparktk defaults

    Many parameters can be overwritten

    :param master: (str) spark master setting; for ex. 'local[4]' or 'yarn-client'
    :param py_files: (list) list of str of paths to python dependencies; Note that the current python
    package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
    :param spark_home: (str) override $SPARK_HOME, the location of spark
    :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
    :param pyspark_submit_args: (str) extra args passed to the pyspark submit
    :param app_name: (str) name of spark app that will be created
    :param other_libs: (list) other libraries (actual packages/modules) that are compatible with spark-tk,
                       which need to be added to the spark context.  These libraries must be developed for usage with
                       spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
    :param extra_conf_file: (str) local file path to a spark conf file to supplement the spark conf
                            File format is basic key-value pairs per line, like:

                                spark.executor.memory=6g
                                spark.files.overwrite=true

    (NOTE: if env var $SPARKTK_EXTRA_CONF is set, the file it indicates will be used.)

    :param extra_conf_dict: (dict) dict for any extra spark conf settings,
                            for ex. {"spark.hadoop.fs.default.name": "file:///"}
                            these will override any matching settings from extra_conf_file, if provided
    :param use_local_fs: (bool) simpler way to specify using local file system, rather than hdfs or other
    :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
    :return: pyspark SparkContext
    """

    extra = {}
    if extra_conf_file:
        logger.info("create_sc() conf_file specified: %s" % extra_conf_file)
        extra = _parse_spark_conf(extra_conf_file)
    else:
        env_extra_conf_file = os.getenv('SPARKTK_EXTRA_CONF', None)
        if env_extra_conf_file:
            logger.info("create_sc() using env SPARKTK_EXTRA_CONF for extra conf file: %s" % env_extra_conf_file)
            extra = _parse_spark_conf(env_extra_conf_file)

    if extra_conf_dict:
        # extra_conf overrides settings in the conf_file
        logger.info("create_sc() overriding conf with given extra_conf_dict")
        extra.update(extra_conf_dict)

    master_in_extra = 'spark.master' in extra
    app_name_in_extra = 'spark.app.name' in extra
    if 'spark.driver.memory' in extra:
        pyspark_submit_args = "%s --driver-memory=%s" % (pyspark_submit_args or '', extra['spark.driver.memory'])

    set_env_for_sparktk(spark_home, sparktk_home, pyspark_submit_args, other_libs, debug)

    # bug/behavior of PYSPARK_SUBMIT_ARGS requires 'pyspark-shell' on the end --check in future spark versions
    set_env('PYSPARK_SUBMIT_ARGS', ' '.join([os.environ['PYSPARK_SUBMIT_ARGS'], 'pyspark-shell']))

    conf = SparkConf()  # env must be set before creating SparkConf
    for k, v in extra.items():
        conf.set(k, v)

    if not master and not master_in_extra:
        master = default_spark_master
        logger.info("create_sc() master not specified, setting to %s", master)
    if master:
        conf.setMaster(master)

    if not app_name and not app_name_in_extra:
        app_name = default_spark_app_name
        logger.info("create_sc() app_name not specified, setting to %s", app_name)
    if app_name:
        conf.setAppName(app_name)

    if use_local_fs:
        conf.set("spark.hadoop.fs.default.name", "file:///")

    if not py_files:
        py_files = []

    # zip up the relevant pieces of sparktk and put it in the py_files...
    path = zip_sparktk()
    tmp_dir = os.path.dirname(path)
    logger.info("sparkconf created tmp dir for sparktk.zip %s" % tmp_dir)
    atexit.register(shutil.rmtree, tmp_dir)  # make python delete this folder when it shuts down

    py_files.append(path)

    msg = '\n'.join(["=" * 80,
                     "Creating SparkContext with the following SparkConf",
                     "pyFiles=%s" % str(py_files),
                     conf.toDebugString(),
                     "=" * 80])
    logger.info(msg)

    sc = SparkContext(conf=conf, pyFiles=py_files)

    return sc
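
_parse_spark_conf itself is not part of this listing; given the key=value file format described in the docstring, a minimal sketch of such a parser might look like the following (the blank-line and comment handling are assumptions):

def _parse_spark_conf(path):
    """Hypothetical sketch: read key=value pairs from a conf file into a dict."""
    settings = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):   # assumption: skip blanks and comments
                continue
            key, _, value = line.partition('=')
            settings[key.strip()] = value.strip()
    return settings
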
Code Example #10
File: sparkconf.py Project: lewisc/spark-tk-1
def create_sc(master=None,
              py_files=None,
              spark_home=None,
              sparktk_home=None,
              pyspark_submit_args=None,
              app_name=None,
              other_libs=None,
              extra_conf_file=None,
              extra_conf_dict=None,
              use_local_fs=False,
              debug=None):
    """
    Creates a SparkContext with sparktk defaults

    Many parameters can be overwritten

    :param master: (str) spark master setting; for ex. 'local[4]' or 'yarn-client'
    :param py_files: (list) list of str of paths to python dependencies; Note that the current python
    package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
    :param spark_home: (str) override $SPARK_HOME, the location of spark
    :param sparktk_home: (str) override $SPARKTK_HOME, the location of spark-tk
    :param pyspark_submit_args: (str) extra args passed to the pyspark submit
    :param app_name: (str) name of spark app that will be created
    :param other_libs: (list) other libraries (actual packages/modules) that are compatible with spark-tk,
                       which need to be added to the spark context.  These libraries must be developed for usage with
                       spark-tk and have particular methods implemented.  (See sparkconf.py _validate_other_libs)
    :param extra_conf_file: (str) local file path to a spark conf file to supplement the spark conf
                            File format is basic key-value pairs per line, like:

                                spark.executor.memory=6g
                                spark.files.overwrite=true

    (NOTE: if env var $SPARKTK_EXTRA_CONF is set, the file it indicates will be used.)

    :param extra_conf_dict: (dict) dict for any extra spark conf settings,
                            for ex. {"spark.hadoop.fs.default.name": "file:///"}
                            these will override any matching settings from extra_conf_file, if provided
    :param use_local_fs: (bool) simpler way to specify using local file system, rather than hdfs or other
    :param debug: (int or str) provide a port address to attach a debugger to the JVM that gets started
    :return: pyspark SparkContext
    """

    extra = {}
    if extra_conf_file:
        logger.info("create_sc() conf_file specified: %s" % extra_conf_file)
        extra = _parse_spark_conf(extra_conf_file)
    else:
        env_extra_conf_file = os.getenv('SPARKTK_EXTRA_CONF', None)
        if env_extra_conf_file:
            logger.info(
                "create_sc() using env SPARKTK_EXTRA_CONF for extra conf file: %s"
                % env_extra_conf_file)
            extra = _parse_spark_conf(env_extra_conf_file)

    if extra_conf_dict:
        # extra_conf overrides settings in the conf_file
        logger.info("create_sc() overriding conf with given extra_conf_dict")
        extra.update(extra_conf_dict)

    master_in_extra = 'spark.master' in extra
    app_name_in_extra = 'spark.app.name' in extra
    if 'spark.driver.memory' in extra:
        pyspark_submit_args = "%s --driver-memory=%s" % (
            pyspark_submit_args or '', extra['spark.driver.memory'])

    set_env_for_sparktk(spark_home, sparktk_home, pyspark_submit_args,
                        other_libs, debug)

    # bug/behavior of PYSPARK_SUBMIT_ARGS requires 'pyspark-shell' on the end --check in future spark versions
    set_env('PYSPARK_SUBMIT_ARGS',
            ' '.join([os.environ['PYSPARK_SUBMIT_ARGS'], 'pyspark-shell']))

    conf = SparkConf()  # env must be set before creating SparkConf
    for k, v in extra.items():
        conf.set(k, v)

    if not master and not master_in_extra:
        master = default_spark_master
        logger.info("create_sc() master not specified, setting to %s", master)
    if master:
        conf.setMaster(master)

    if not app_name and not app_name_in_extra:
        app_name = default_spark_app_name
        logger.info("create_sc() app_name not specified, setting to %s",
                    app_name)
    if app_name:
        conf.setAppName(app_name)

    if use_local_fs:
        conf.set("spark.hadoop.fs.default.name", "file:///")

    if not py_files:
        py_files = []

    # zip up the relevant pieces of sparktk and put it in the py_files...
    path = zip_sparktk()
    tmp_dir = os.path.dirname(path)
    logger.info("sparkconf created tmp dir for sparktk.zip %s" % tmp_dir)
    atexit.register(
        shutil.rmtree,
        tmp_dir)  # make python delete this folder when it shuts down

    py_files.append(path)

    msg = '\n'.join([
        "=" * 80, "Creating SparkContext with the following SparkConf",
        "pyFiles=%s" % str(py_files),
        conf.toDebugString(), "=" * 80
    ])
    logger.info(msg)

    sc = SparkContext(conf=conf, pyFiles=py_files)

    return sc