Example #1
    def _create_shell_session() -> "SparkSession":
        """
        Initialize a :class:`SparkSession` for a pyspark shell session. This is called from
        shell.py to make error handling simpler without needing to declare local variables in
        that script, which would expose those to users.
        """
        import warnings

        import py4j
        from pyspark.conf import SparkConf
        from pyspark.context import SparkContext
        try:
            # Try to access HiveConf, it will raise exception if Hive is not added
            conf = SparkConf()
            if conf.get('spark.sql.catalogImplementation', 'hive').lower() == 'hive':
                (SparkContext._jvm  # type: ignore[attr-defined]
                 .org.apache.hadoop.hive.conf.HiveConf())
                return SparkSession.builder\
                    .enableHiveSupport()\
                    .getOrCreate()
            else:
                return SparkSession.builder.getOrCreate()
        except (py4j.protocol.Py4JError, TypeError):
            if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive':
                warnings.warn("Fall back to non-hive support because failing to access HiveConf, "
                              "please make sure you build spark with hive")

        return SparkSession.builder.getOrCreate()
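The helper above probes for Hive by instantiating a HiveConf through the JVM gateway and falls back to the in-memory catalog if that fails. A minimal standalone sketch of the same probe-and-fallback idea, assuming only a stock pyspark install (the function name and warning text below are illustrative, not part of the snippet above):

import warnings

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

def build_session():
    # Default to 'hive', mirroring the shell helper above.
    conf = SparkConf()
    wants_hive = conf.get('spark.sql.catalogImplementation', 'hive').lower() == 'hive'
    if wants_hive:
        try:
            return SparkSession.builder.enableHiveSupport().getOrCreate()
        except Exception:
            # Hive classes are missing or misconfigured; fall back below.
            warnings.warn("Hive support unavailable; using the in-memory catalog")
    return SparkSession.builder.getOrCreate()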
Example #2
    def configure(self, opts, ports):
        """ Initializes Spark configuration object """

        # Check if there's already a conf variable
        # If using SparkMonitor, this is defined but is of type SparkConf
        conf = self.connector.ipython.user_ns.get('swan_spark_conf')
        if conf:
            self.connector.log.warn("conf already exists: %s", conf.toDebugString())
            if not isinstance(conf, SparkConf):
                raise Exception('There is already a "swan_spark_conf" variable defined and is not of type SparkConf.')
        else:
            conf = SparkConf()  # Create a new conf

        options = self._parse_options(opts)

        # Do not overwrite the existing driver extraClassPath with option, add instead
        def_conf_extra_class_path = conf.get('spark.driver.extraClassPath', '')
        options_extra_class_path = options.get('spark.driver.extraClassPath', '')
        if def_conf_extra_class_path != '' and options_extra_class_path != '':
            options['spark.driver.extraClassPath'] = def_conf_extra_class_path + ":" + options_extra_class_path
        elif def_conf_extra_class_path != '' and options_extra_class_path == '':
            options['spark.driver.extraClassPath'] = def_conf_extra_class_path
        elif def_conf_extra_class_path == '' and options_extra_class_path != '':
            options['spark.driver.extraClassPath'] = options_extra_class_path

        # Add options to the default conf
        for name, value in options.items():
            conf.set(name, value)

        # Extend conf adding logging of log4j to java options
        base_extra_java_options = "-Dlog4j.configuration=file:%s" % self.connector.log4j_file
        extra_java_options = conf.get("spark.driver.extraJavaOptions")
        if extra_java_options:
            extra_java_options = base_extra_java_options + " " + extra_java_options
        else:
            extra_java_options = base_extra_java_options
        conf.set("spark.driver.extraJavaOptions", extra_java_options)

        # Extend conf ensuring that LD_LIBRARY_PATH on executors is the same as on the driver
        ld_library_path = conf.get('spark.executorEnv.LD_LIBRARY_PATH')
        if ld_library_path:
            ld_library_path = ld_library_path + ":" + os.environ.get('LD_LIBRARY_PATH', '')
        else:
            ld_library_path = os.environ.get('LD_LIBRARY_PATH', '')
        conf.set('spark.executorEnv.LD_LIBRARY_PATH', ld_library_path)

        # Extend conf with ports for the driver and block manager
        conf.set('spark.driver.host', os.environ.get('SERVER_HOSTNAME', 'localhost'))
        conf.set('spark.driver.port', ports[0])
        conf.set('spark.driver.blockManager.port', ports[1])
        conf.set('spark.port.maxRetries', 100)
        conf.set('spark.ui.port', ports[2])

        # Extend conf with spark app name to allow the monitoring and filtering of SWAN jobs in the Spark clusters
        app_name = conf.get('spark.app.name')
        conf.set('spark.app.name', app_name + '_swan' if app_name else 'pyspark_shell_swan')

        return conf
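The three-branch merge of spark.driver.extraClassPath above reduces to joining whichever of the two values are non-empty with ':'. A compact equivalent, shown purely as an illustration and assuming the same two local variables:

# Illustrative equivalent of the extraClassPath merge above.
merged = ":".join(p for p in (def_conf_extra_class_path, options_extra_class_path) if p)
if merged:
    options['spark.driver.extraClassPath'] = merged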
Example #3
    def __call__(self):
        log.info('Processing wiki dump: %s ...', self.wk_dump_path)
        c = SparkConf().setAppName('Wikijson')

        log.info('Using spark master: %s', c.get('spark.master'))
        sc = SparkContext(conf=c)

        if os.path.isdir(self.output_path):
            log.warn('Writing over output path: %s', self.output_path)
            shutil.rmtree(self.output_path)

        # rdd of tuples: (title, namespace, id, redirect, content)
        pages = wikispark.get_pages_from_wikidump(sc, self.wk_dump_path)
        pages.cache()

        articles = wikispark.get_articles_from_pages(pages)
        redirects = wikispark.get_redirects_from_pages(pages)

        if self.redirect_links:
            articles = wikispark.redirect_article_links(articles, redirects)

        articles.map(self.article_to_json)\
                .map(json.dumps)\
                .saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec')

        log.info('Done.')
Example #4
    def __call__(self):
        log.info("Processing wiki dump: %s ...", self.wk_dump_path)
        c = SparkConf().setAppName("Wikijson")

        log.info("Using spark master: %s", c.get("spark.master"))
        sc = SparkContext(conf=c)

        if os.path.isdir(self.output_path):
            log.warn("Writing over output path: %s", self.output_path)
            shutil.rmtree(self.output_path)

        # rdd of tuples: (title, namespace, id, redirect, content)
        pages = wikispark.get_pages_from_wikidump(sc, self.wk_dump_path)
        pages.cache()

        articles = wikispark.get_articles_from_pages(pages)
        redirects = wikispark.get_redirects_from_pages(pages)

        if self.redirect_links:
            articles = wikispark.redirect_article_links(articles, redirects)

        articles.map(self.article_to_json).map(json.dumps).saveAsTextFile(
            self.output_path, "org.apache.hadoop.io.compress.GzipCodec"
        )

        log.info("Done.")
Example #5
def get_default_spark_conf(additional_conf=None):
    if additional_conf is None:
        additional_conf = {}
    conf = SparkConf(). \
        setAppName("pyunit-test"). \
        setMaster("local[*]"). \
        set("spark.driver.memory", "2g"). \
        set("spark.executor.memory", "2g"). \
        set("spark.ext.h2o.client.log.level", "DEBUG"). \
        set("spark.ext.h2o.repl.enabled", "false"). \
        set("spark.task.maxFailures", "1"). \
        set("spark.rpc.numRetries", "1"). \
        set("spark.deploy.maxExecutorRetries", "1"). \
        set("spark.network.timeout", "360s"). \
        set("spark.worker.timeout", "360"). \
        set("spark.ext.h2o.cloud.name", unique_cloud_name("test")). \
        set("spark.ext.h2o.external.start.mode", "auto"). \
        set("spark.ext.h2o.node.log.dir", "build/h2ologs-pyunit/workers"). \
        set("spark.ext.h2o.client.log.dir", "build/h2ologs-pyunit/client")

    for key in additional_conf:
        conf.set(key, additional_conf[key])

    if conf.get("spark.ext.h2o.backend.cluster.mode") == "external":
        conf.set("spark.ext.h2o.client.ip", local_ip())
        conf.set("spark.ext.h2o.external.cluster.num.h2o.nodes", "1")

    return conf
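A hypothetical call to the helper above, overriding one property for a single run; the SparkContext construction only shows where the resulting conf would be consumed:

from pyspark import SparkContext

# Hypothetical usage; the override key/value is an example only.
conf = get_default_spark_conf({"spark.ext.h2o.backend.cluster.mode": "external"})
sc = SparkContext(conf=conf)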
Example #6
def get_default_spark_conf(additional_conf=None):
    if additional_conf is None:
        additional_conf = {}
    conf = SparkConf(). \
        setAppName("pyunit-test"). \
        setMaster("local[*]"). \
        set("spark.driver.memory", "2g"). \
        set("spark.executor.memory", "2g"). \
        set("spark.ext.h2o.client.log.level", "DEBUG"). \
        set("spark.ext.h2o.repl.enabled", "false"). \
        set("spark.task.maxFailures", "1"). \
        set("spark.rpc.numRetries", "1"). \
        set("spark.deploy.maxExecutorRetries", "1"). \
        set("spark.network.timeout", "360s"). \
        set("spark.worker.timeout", "360"). \
        set("spark.ext.h2o.cloud.name", unique_cloud_name("test")). \
        set("spark.ext.h2o.external.start.mode", "auto"). \
        set("spark.ext.h2o.node.log.dir", "build/h2ologs-pyunit/workers"). \
        set("spark.ext.h2o.client.log.dir", "build/h2ologs-pyunit/client")

    for key in additional_conf:
        conf.set(key, additional_conf[key])

    if conf.get("spark.ext.h2o.backend.cluster.mode") == "external":
        conf.set("spark.ext.h2o.client.ip", local_ip())
        conf.set("spark.ext.h2o.external.cluster.num.h2o.nodes", "1")

    return conf
Example #7
def main():
    conf = SparkConf()
    dateStr = conf.get('spark.date')
    sc = SparkContext(conf=conf, appName='Loc City Data Prepare, ' + dateStr)
    hc = HiveContext(sc)

    sqlDict = prepareSql(dateStr)
    #mergedRdd = sc.emptyRDD()
    mergedRdd = sc.parallelize([])
    for prod, sql in sqlDict.items():
        print sql
        df = hc.sql(sql)
        #print 'df count:', df.count()
        rdd = df.map(lambda x: toCityLoc(x, prod))
        rdd = rdd.filter(lambda x: x[0] is not None)
        rdd = rdd.map(lambda x: x[0])
        mergedRdd = mergedRdd.union(rdd)
        #break

    mergedRdd.cache()
    print 'mergedRdd count:', mergedRdd.count()
    fromRdd = mergedRdd.map(lambda cityLoc: (
        (cityLoc.area, cityLoc.fromPoi.displayName), (cityLoc.fromPoi, 1L)))
    toRdd = mergedRdd.map(lambda cityLoc: (
        (cityLoc.area, cityLoc.toPoi.displayName), (cityLoc.toPoi, 1L)))
    count(fromRdd, dateStr, 'from')
    count(toRdd, dateStr, 'to')
    print 'success'
    sc.stop()
Example #8
def main():
  conf = SparkConf()
  sc = SparkContext(conf=conf)
  sqlContext = SQLContext(sc)
  cmdargs = conf.get('spark.pythonargs')
  parser = argparse.ArgumentParser(description="Image to Caption Util")
  parser.add_argument('-input', action="store", dest="input")
  parser.add_argument('-model', action="store", dest="model")
  parser.add_argument('-imagenet', action="store", dest="imagenet")
  parser.add_argument('-lstmnet', action="store", dest="lstmnet")
  parser.add_argument('-vocab', action="store", dest="vocab")
  parser.add_argument('-output', action="store", dest="output")
  
  args=parser.parse_args(cmdargs.split(" "))

  df_input = sqlContext.read.parquet(str(args.input))
  images = df_input.select("data.image","data.height", "data.width", "id")
  df=get_predictions(sqlContext, images, str(args.model), str(args.imagenet), str(args.lstmnet), str(args.vocab))
  df.write.json(str(args.output))
Example #9
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    if not is_spark_below_2_2():
        extend_spark_driver_cp(sparkConf, get_bigdl_classpath())

    # add content in PYSPARK_FILES in spark.submit.pyFiles
    # This is a workaround for current Spark on k8s
    python_lib = os.environ.get('PYSPARK_FILES', None)
    if python_lib:
        existing_py_files = sparkConf.get("spark.submit.pyFiles")
        if existing_py_files:
            sparkConf.set(key="spark.submit.pyFiles",
                          value="%s,%s" % (python_lib, existing_py_files))
        else:
            sparkConf.set(key="spark.submit.pyFiles", value=python_lib)

    return sparkConf
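A minimal sketch of handing the conf built above to a context (it assumes the BigDL helpers used in the example are importable in the same environment):

from pyspark import SparkContext

# Sketch only: build the BigDL-aware conf and start a context with it.
sc = SparkContext(appName="bigdl-app", conf=create_spark_conf())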
Example #10
    def __call__(self):
        c = SparkConf().setAppName("Build %s" % self.model_name)

        log.info("Using spark master: %s", c.get("spark.master"))
        sc = SparkContext(conf=c)

        kwargs = self.model.prepare(sc)
        m = self.model.build(**kwargs)
        m = self.model.format_items(m)
        m = self.formatter(m)

        if self.output_path:
            log.info("Saving to: %s", self.output_path)
            if os.path.isdir(self.output_path):
                log.warn("Writing over output path: %s", self.output_path)
                shutil.rmtree(self.output_path)
            m.saveAsTextFile(self.output_path, "org.apache.hadoop.io.compress.GzipCodec")
        elif self.sample > 0:
            print "\n".join(str(i) for i in m.take(self.sample))

        log.info("Done.")
Example #11
def main():
    conf = SparkConf()
    dateStr = conf.get('spark.date')
    sc = SparkContext(appName='get sug poi of today from log table', conf=conf)
    hc = HiveContext(sc)

    sql = '''select log.param['json_str'] as json_str from pbs_dw.ods_log_ws_addrsuggestion as log
    where concat(year,month,day)=dateStr'''
    sql = sql.replace('dateStr', dateStr)
    print sql

    df = hc.sql(sql)
    rdd = df.flatMap(lambda x: format(x)).reduceByKey(
        lambda x, y: x, 200).map(lambda x: x[0] + '\t' + x[1])
    fPath = POI_DAILY_ROOT + '/' + 'poilist.' + dateStr
    print fPath
    print IoHelper.deleteFileInHDFS(fPath)
    rdd.saveAsTextFile(fPath)

    sc.stop()
    print 'over'
Example #12
def create_spark_conf():
    bigdl_conf = get_bigdl_conf()
    sparkConf = SparkConf()
    sparkConf.setAll(bigdl_conf.items())
    if os.environ.get("BIGDL_JARS", None) and not is_spark_below_2_2():
        for jar in os.environ["BIGDL_JARS"].split(":"):
            extend_spark_driver_cp(sparkConf, jar)

    # add content in PYSPARK_FILES in spark.submit.pyFiles
    # This is a workaround for current Spark on k8s
    python_lib = os.environ.get("PYSPARK_FILES", None)
    if python_lib:
        existing_py_files = sparkConf.get("spark.submit.pyFiles")
        if existing_py_files:
            sparkConf.set(
                key="spark.submit.pyFiles",
                value="%s,%s" % (python_lib, existing_py_files),
            )
        else:
            sparkConf.set(key="spark.submit.pyFiles", value=python_lib)

    return sparkConf
Example #13
    def __call__(self):
        c = SparkConf().setAppName('Build %s' % self.model_name)

        log.info('Using spark master: %s', c.get('spark.master'))
        sc = SparkContext(conf=c)

        kwargs = self.model.prepare(sc)
        m = self.model.build(**kwargs)
        m = self.model.format_items(m)
        m = self.formatter(m)

        if self.output_path:
            log.info("Saving to: %s", self.output_path)
            if os.path.isdir(self.output_path):
                log.warn('Writing over output path: %s', self.output_path)
                shutil.rmtree(self.output_path)
            m.saveAsTextFile(self.output_path,
                             'org.apache.hadoop.io.compress.GzipCodec')
        elif self.sample > 0:
            print '\n'.join(str(i) for i in m.take(self.sample))

        log.info('Done.')
Example #14
    def __call__(self):
        log.info('Processing corpus: %s ...', self.corpus_path)
        c = SparkConf().setAppName('Build %s' % self.model_name)

        log.info('Using spark master: %s', c.get('spark.master'))
        sc = SparkContext(conf=c)

        corpus = sc.textFile(self.corpus_path).map(json.loads)
        m = self.model.build(corpus)
        m = self.model.format(m)

        if self.sample > 0:
            if self.sort:
                m = m.map(lambda (k,v): (v,k)).sortByKey(False)
            print '\n'.join(str(i) for i in m.take(self.sample))
        elif self.output_path:
            log.info("Saving to: %s", self.output_path)
            if os.path.isdir(self.output_path):
                log.warn('Writing over output path: %s', self.output_path)
                shutil.rmtree(self.output_path)
            m.saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec')

        log.info('Done.')
Example #15
#==================================================

appName = "Kafka_to_HBase"
config = SparkConf().setAppName(appName)  

props = []
props.append(("spark.rememberDuration", "10"))
props.append(("spark.batchDuration", "10"))
props.append(("spark.eventLog.enabled", "true"))
props.append(("spark.streaming.timeout", "30"))
props.append(("spark.ui.enabled", "true"))

config = config.setAll(props)

sc = SparkContext(conf=config)  
ssc = StreamingContext(sc, int(config.get("spark.batchDuration")))

#==================================================
# Main application execution function
#==================================================

def runApplication(ssc, config):
  ssc.start()
  if config.get("spark.streaming.timeout") == '':
    ssc.awaitTermination()
    stopped = True  # awaitTermination() only returns once the context has been stopped
  else:
    stopped = ssc.awaitTerminationOrTimeout(int(config.get("spark.streaming.timeout")))
  if not stopped:
    print("Stopping streaming context after timeout...")
    ssc.stop(True)
    print("Streaming context stopped.")
Example #16
    df_dataset = sqlContext.read.format("jdbc").options(
        url=url, driver=driver, dbtable=dbtable, user=user,
        password=password).load()

    return df_dataset


try:
    conf = SparkConf().setAppName("Spark_ETL")
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # Note: `conf` is rebound here from the SparkConf above to a ConfigParser instance
    conf = ConfigParser.ConfigParser()
    conf.read("param.config")

    url = conf.get("MySQL", "url")
    driver = conf.get("MySQL", "driver")
    dbtable_A = conf.get("MySQL", "dbtable_A")
    dbtable_B = conf.get("MySQL", "dbtable_B")
    user = conf.get("MySQL", "user")
    password = conf.get("MySQL", "password")
    HiveSchema = conf.get("HiveSchema", "schema")

    print(url, driver, dbtable_A, dbtable_B, user, password)

    #check if we have received arguments as inputs to the script
    strWeek = sys.argv[1] if len(sys.argv) == 2 else "all_weeks"

    #logic to derive where clause based on input provided
    strSql = "" if strWeek == "all_weeks" else " where week = \'" + strWeek + "\'"
Example #17
                                 '\t' + res)
    return abnormal_features


if __name__ == "__main__":
    sparkConf = SparkConf()
    sparkConf.setAppName("dagang abnormal segment")
    sparkConf.set("spark.kryoserializer.buffer.max", "128")
    sc = SparkContext(conf=sparkConf)
    sc.setLogLevel("WARN")
    sqlCtx = HiveContext(sc)
    sqlCtx.setConf("spark.sql.parquet.binaryAsString", "true")
    sqlCtx.setConf("spark.sql.hive.convertMetastoreParquet", "true")
    sqlCtx.setConf("spark.sql.parquet.int96AsTimestamp", "true")

    executor_cores = int(sparkConf.get('spark.executor.cores'))
    num_executors = int(sparkConf.get('spark.executor.instances'))
    num_partitions = executor_cores * num_executors * 3

    features = [
        'area', 'down_oscillation', 'down_stroke', 'down_stroke_ratio',
        'down_stroke_zaihe', 'down_up_oscillation_ratio',
        'down_up_stroke_zaihe_ratio', 'down_up_zaihe_ratio', 'down_zaihe',
        'left_upper_area', 'left_upper_area_ratio', 'max_weiyi',
        'max_weiyi_zaihe', 'max_zaihe', 'min_max_zaihe_ratio', 'min_weiyi',
        'min_weiyi_zaihe', 'min_zaihe', 'up_oscillation', 'up_stroke',
        'up_stroke_ratio', 'up_stroke_zaihe', 'up_zaihe'
    ]

    print(current_timestamp(), '-' * 30 + 'starting')
    abnormal_sql = """
Example #18
import os
import warnings

import py4j

from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri",
                                   os.environ["SPARK_EXECUTOR_URI"])

SparkContext._ensure_initialized()

try:
    # Try to access HiveConf, it will raise exception if Hive is not added
    conf = SparkConf()
    if conf.get('spark.sql.catalogImplementation', 'hive').lower() == 'hive':
        SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
        spark = SparkSession.builder\
            .enableHiveSupport()\
            .getOrCreate()
    else:
        spark = SparkSession.builder.getOrCreate()
except py4j.protocol.Py4JError:
    if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive':
        warnings.warn(
            "Fall back to non-hive support because failing to access HiveConf, "
            "please make sure you build spark with hive")
    spark = SparkSession.builder.getOrCreate()
except TypeError:
    if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive':
        warnings.warn(
Example #19
from pyspark import SparkContext
from pyspark import SparkConf
import sys
from pyspark.rdd import RDD

def update_dictionary(map):
        map.update(test="alex")
        return map

if __name__ == "__main__":
        print("here1")
        conf = SparkConf()
        sc = SparkContext(appName="alex_test_app")
        print("here2")
        print("here2b: " + conf.get("spark.aleph2_job_config"))
        aleph2 = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader().loadClass("com.ikanow.aleph2.analytics.spark.utils.SparkPyTechnologyUtils").newInstance().getAleph2(sc._jsc, sys.argv[1])
        print("here3")
        print aleph2.getRddInputNames()
        print("here4")
        #print RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()), sc).count()
        print("here5")
        to_output = RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()), sc).map(lambda m: update_dictionary(m))
        aleph2.emitRdd(to_output._to_java_object_rdd())
Example #20
    for row in f:
        row = row.strip().split(' ')
        vals = []
        for val in row:
            vals.append(float(val))

        ret.append(vals)
    return ret


### METADATA

conf = SparkConf().setAppName('GenerateTracerData')
sc = SparkContext(conf=conf)

ACCESS_KEY = conf.get('SPARK_ACCESS_KEY')
SECRET_KEY = conf.get('SPARK_SECRET_KEY')

s3 = s3fs.S3FileSystem(key=ACCESS_KEY, secret=SECRET_KEY)

subhalo_id = 89587
subhalo_positions = get_subhalo_pos(s3)
print('length subhalo positions')
print(len(subhalo_positions))
radius = 140

snaps = range(0, 4380)

last_snap = snaps[-1]
lastfiles = s3.glob(snap_tofile(last_snap))
blackhole_id, tracer_ids = gen_tracer_ids_blackhole(
Example #21
run from command line
spark-submit --master yarn-client --conf key=value --conf someotherkey=someothervalue your_code.py
"""

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row

conf = SparkConf().setAppName("hello-world").setMaster('yarn-client')
conf.set("spark.files.overwrite","true")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

#log
log4jLogger = sc._jvm.org.apache.log4j
LOG = log4jLogger.LogManager.getLogger("hello.world.spark")
LOG.info("Args = " + conf.getAll().__str__())

inputFile = conf.get("spark.input")
outputFile = conf.get("spark.output")

wordcount = sc.textFile(inputFile).map(lambda line: line.replace("\"", " ").replace("{", " ").replace("}", " ").replace(".", " ").replace(":", " ")) \
    .flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .map(lambda (k, v): (v, k)) \
    .sortByKey(ascending=False) \
    .map(lambda (k, v): (v, k))

df = wordcount.toDF(['word', 'count'])
df.save(path=outputFile, source='json', mode='overwrite')
		
Example #22
		#similarly we can extract the valid records set and persist the data in the respective table by extending the ruleEngine definition
		#df_valid = df_full_data.where("reason_flag is null or log_level = 'warn'")
		
		#unpersisting df_full_data and rules dataframes
		df_full_data.unpersist()
		rules.unpersist()

if __name__ == "__main__":
	
	#initializing SparkConf
	conf = SparkConf().setAppName('Streaming Rule Engine')
	
	#reading property file
	try:
		batch_time = int(conf.get('spark.ruleEngine.batchTimeInSec'))
	except ValueError:
		print 'ERROR: batch time must be integer value, found string'
		exit()
	file_dir_path = conf.get('spark.ruleEngine.streamingFilesDirath')
	rule_config_db_name = conf.get('spark.ruleEngine.ruleConfigDbName')
	rule_config_tbl_name = conf.get('spark.ruleEngine.ruleConfigTableName')
	invalid_rec_db_name = conf.get('spark.ruleEngine.invalidRecordDbName')
	invalid_rec_tbl_name = conf.get('spark.ruleEngine.invalidRecordTableName')
	
	#initializing spark context, streaming context and hive-enabled sql context
	sc = SparkContext(conf = conf)
	ssc = StreamingContext(sc, batch_time)
	sqlContext = HiveContext(sc)
	
	#reading files from input file dir (HDFS Location)
Example #23
class CommonSparkContext(object):
    __metaclass__ = Singleton

    def __init__(self):
        """
        Create a spark context.

        The spark configuration is taken from xframes/config.ini and from
        the values set in SparkInitContext.set() if this has been called.
        """

        # This is placed here because otherwise it causes an error when used in a spark slave.
        from pyspark import SparkConf, SparkContext, SQLContext, HiveContext

        # This reads from default.ini and then xframes/config.ini
        # if they exist.
        self._env = Environment.create()
        context = create_spark_config(self._env)
        verbose = self._env.get_config("xframes", "verbose", "false").lower() == "true"
        hdfs_user_name = self._env.get_config("webhdfs", "user", "hdfs")
        os.environ["HADOOP_USER_NAME"] = hdfs_user_name
        config_pairs = [(k, v) for k, v in context.iteritems()]
        self._config = SparkConf().setAll(config_pairs)
        if verbose:
            print "Spark Config: {}".format(config_pairs)

        self._sc = SparkContext(conf=self._config)
        self._sqlc = SQLContext(self._sc)
        self._hivec = HiveContext(self._sc)
        self.zip_path = []
        version = [int(n) for n in self._sc.version.split(".")]
        self.status_tracker = self._sc.statusTracker()
        if cmp(version, [1, 4, 1]) >= 0:
            self.application_id = self._sc.applicationId
        else:
            self.application_id = None

        if verbose:
            print "Spark Version: {}".format(self._sc.version)
            if self.application_id:
                print "Application Id: {}".format(self.application_id)

        if not context["spark.master"].startswith("local"):
            zip_path = self.build_zip(get_xframes_home())
            if zip_path:
                self._sc.addPyFile(zip_path)
                self.zip_path.append(zip_path)

        trace_flag = self._env.get_config("xframes", "rdd-trace", "false").lower() == "true"
        XRdd.set_trace(trace_flag)
        atexit.register(self.close_context)

    def spark_add_files(self, dirs):
        """
        Adds python files in the given directory or directories.

        Parameters
        ----------
        dirs: str or list(str)
            If a str, the pathname to a directory containing a python module.
            If a list, then it is a list of such directories.

            The python files in each directory are compiled, packed into a zip, distributed to each
            spark slave, and placed in PYTHONPATH.

            This is only done if spark is deployed on a cluster.
        """
        props = self.config()
        if props.get("spark.master", "local").startswith("local"):
            return
        if isinstance(dirs, basestring):
            dirs = [dirs]
        for path in dirs:
            zip_path = self.build_zip(path)
            if zip_path:
                self._sc.addPyFile(zip_path)
                self.zip_path.append(zip_path)

    def close_context(self):
        if self._sc:
            self._sc.stop()
            self._sc = None
            for zip_path in self.zip_path:
                os.remove(zip_path)

    def config(self):
        """
        Gets the configuration parameters used to initialize the spark context.

        Returns
        -------
        out : dict
            A dict of the properties used to initialize the spark context.
        """
        props = self._config.getAll()
        return {prop[0]: prop[1] for prop in props}

    def env(self):
        """
        Gets the config environment.

        Returns
        -------
        out : Environment
            The environment.  This contains all the values from the configuration file(s).
        """

        return self._env

    def sc(self):
        """
        Gets the spark context.

        Returns
        -------
        out : SparkContext
            The spark context.  There is a single spark context per process.
        """
        return self._sc

    def sqlc(self):
        """
        Gets the spark sql context.

        Returns
        -------
        out : sql.SqlContext
            The spark sql context.
        """
        return self._sqlc

    def hivec(self):
        """
        Gets the hive context.

        Returns
        -------
        out : sql.HiveContext
            The hive context.
        """
        return self._hivec

    def version(self):
        """
        Gets the spark version.

        Returns
        -------
        out: lst[int]
            The spark version, as a list of integers.
        """
        return [int(n) for n in self._sc.version.split(".")]

    def jobs(self):
        """
        Get the spark job ID and info for the active jobs.

        This method would normally be called by another thread from the executing job.

        Returns
        -------
        out: map(job_id: job_info}
            A map of the active job IDs and their corresponding job info
        """
        return {job_id: self.status_tracker.getJobInfo(job_id) for job_id in self.status_tracker.getActiveJobIds()}

    def cluster_mode(self):
        """
        Get the cluster mode of the spark cluster.

        Returns
        -------
        out: boolean
            True if spark is running in cluster mode.  Cluster mode means that spark is running on a platform separate
            from the program.  In practice, cluster mode means that file arguments must be located on
            a network filesystem such as HDFS or NFS.
        """
        return not self._config.get("spark.master").startswith("local")

    # noinspection PyBroadException
    @staticmethod
    def build_zip(module_dir):
        # This can fail at writepy if there is something wrong with the files
        #  in xframes.  Go ahead anyway, but things will probably fail if this job is
        #  distributed
        try:
            tf = NamedTemporaryFile(suffix=".zip", delete=False)
            z = PyZipFile(tf, "w")
            z.writepy(module_dir)
            z.close()
            return tf.name
        except:
            logging.warn("Zip file distribution failed -- workers will not get xframes code.")
            logging.warn("Check for unexpected files in xframes directory.")
            return None

    @staticmethod
    def spark_context():
        """
        Returns the spark context.

        Returns
        -------
        out : pyspark.SparkContext
            The SparkContext object from spark.
        """
        return CommonSparkContext().sc()

    @staticmethod
    def spark_config():
        """
        Returns the spark config parameters.

        Returns
        -------
        out : list
            A list of the key-value pairs stored as tuples, used to initialize the spark context.
        """
        return CommonSparkContext().config()

    @staticmethod
    def spark_sql_context():
        """
        Returns the spark sql context.

        Returns
        -------
        out : pyspark.sql.SQLContext
            The SQLContext object from spark.
        """
        return CommonSparkContext().sqlc()

    @staticmethod
    def hive_context():
        """
        Returns the hive context.

        Returns
        -------
        out : pyspark.sql.HiveContext
            The Hive object from spark.
        """
        return CommonSparkContext().hivec()

    @staticmethod
    def spark_version():
        """
        Gets the spark version.

        Returns
        -------
        out: list[int]
            The spark version, as a list of integers.
        """
        return CommonSparkContext().version()

    @staticmethod
    def spark_cluster_mode():
        """
        Gets the cluster mode

        Returns
        -------
        out: boolean
            True if spark is running in cluster mode.  Cluster mode means that spark is running on a platform separate
            from the program.  In practice, cluster mode means that file arguments must be located on
            a network filesystem such as HDFS or NFS.
        """
        env = Environment.create()
        config = create_spark_config(env)
        return not config.get("spark.master").startswith("local")
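A short, hypothetical usage of the singleton wrapper above; the static accessors are the ones defined in the class, the RDD computation is illustrative only, and it assumes the xframes configuration files the class reads are in place:

# Hypothetical usage of CommonSparkContext.
sc = CommonSparkContext.spark_context()
print(CommonSparkContext.spark_version())
print(sc.parallelize(range(10)).map(lambda x: x * x).sum())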
Example #24
from pyspark import SparkContext
from pyspark import SparkConf
import sys
from pyspark.rdd import RDD


def update_dictionary(map):
    map.update(test="alex")
    return map


if __name__ == "__main__":
    print("here1")
    conf = SparkConf()
    sc = SparkContext(appName="alex_test_app")
    print("here2")
    print("here2b: " + conf.get("spark.aleph2_job_config"))
    aleph2 = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader(
    ).loadClass(
        "com.ikanow.aleph2.analytics.spark.utils.SparkPyTechnologyUtils"
    ).newInstance().getAleph2(sc._jsc, sys.argv[1])
    print("here3")
    print aleph2.getRddInputNames()
    print("here4")
    #print RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()), sc).count()
    print("here5")
    to_output = RDD(sc._jvm.SerDe.javaToPython(aleph2.getAllRddInputs()),
                    sc).map(lambda m: update_dictionary(m))
    aleph2.emitRdd(to_output._to_java_object_rdd())
Example #25
if __name__ == "__main__":
    argv = sys.argv
    master = argv[1]
    file_path = argv[2]
    save_path = argv[3]

    print("file_path = ", file_path)
    print("save_path = ", save_path)

    if 'xml' in file_path or 'wiki' in file_path:
        m_filter_fn = large_file_filter_fn
        partition_size = 500
        App_name = 'Part3-t1-large-partition-%d' % partition_size
    else:
        m_filter_fn = small_file_filter_fn
        partition_size = 5
        App_name = 'Part3-t1-small-partition-%d' % partition_size

    conf = SparkConf().setAppName(App_name).setMaster(master) \
        .set("spark.local.dir", "/mnt/data/tmp/") \
        .set("spark.eventLog.enabled", "true") \
        .set("spark.driver.memory", "25g") \
        .set("spark.executor.memory", "25g") \
        .set("spark.executor.cores", "5") \
        .set("spark.tmp.dir", "/mnt/data/tmp/") \
        .set("spark.eventLog.dir", "file:///users/yunjia/spark_log/")
    sc = SparkContext(conf=conf)

    lines = sc.textFile(file_path).repartition(partition_size)

    print("Total partition: ", lines.getNumPartitions())
    print("Tmp dir: ", conf.get('spark.tmp.dir'))

    rank = calculate_page_rank(lines, m_filter_fn)

    rank.saveAsTextFile(save_path)
Example #26
from pyspark.mllib.linalg import Vectors
from pyspark.sql import Row
from pyspark import SparkConf,SparkContext
from itertools import izip_longest
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.sql import SQLContext

def grouper(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)

conf = SparkConf()
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
#Initialize all objects
cos=CaffeOnSpark(sc,sqlContext)
cmdargs = conf.get('spark.pythonargs')
args= dict(grouper(cmdargs.split(),2))
cfg=Config(sc,args)
dl_train_source = DataSource(sc).getSource(cfg,True)
#Train
cos.train(dl_train_source)
lr_raw_source = DataSource(sc).getSource(cfg,False)
#Extract features
extracted_df = cos.features(lr_raw_source)
# Do multiclass LogisticRegression
data = extracted_df.map(lambda row: LabeledPoint(row.label[0], Vectors.dense(row.ip1)))
lr = LogisticRegressionWithLBFGS.train(data, numClasses=10, iterations=10)
predictions = lr.predict(data.map(lambda pt : pt.features))
Example #27
from pyspark import SparkConf, SparkContext
from itertools import izip_longest
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.sql import SQLContext


def grouper(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)


conf = SparkConf()
sc = SparkContext(conf=conf)
#Initialize all objects
cos = CaffeOnSpark(sc)
cmdargs = conf.get('spark.pythonargs')
args = dict(grouper(cmdargs.split(), 2))
cfg = Config(sc, args)
dl_train_source = DataSource(sc).getSource(cfg, True)
#Train
cos.train(dl_train_source)
lr_raw_source = DataSource(sc).getSource(cfg, False)
#Extract features
extracted_df = cos.features(lr_raw_source)
# Do multiclass LogisticRegression
data = extracted_df.map(
    lambda row: LabeledPoint(row.label[0], Vectors.dense(row.ip1)))
lr = LogisticRegressionWithLBFGS.train(data, numClasses=10, iterations=10)
predictions = lr.predict(data.map(lambda pt: pt.features))
Example #28
This file mainly introduces SparkContext; the RDD class is covered elsewhere.

Core components:

pyspark.SparkContext
Main entry point for Spark functionality.

pyspark.RDD
A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
"""

conf = SparkConf()
conf.setAppName("appName")
conf.setMaster("local")
conf.set("key", "value")  # setIfMissing setSparkHome
print(conf.getAll())
print(conf.get('spark.app.name'))
print(conf.contains('spark.app.name'))
print(conf.toDebugString())

# show_profiles()
conf.set('spark.python.profile', 'true')

# Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
# cluster and can be used to create RDDs and broadcast variables on that cluster.
# Only one SparkContext may be active per JVM. You must stop() the active SparkContext before
# creating a new one.
# class pyspark.SparkContext(master=None, appName=None, sparkHome=None, pyFiles=None, environment=None, batchSize=0, serializer=PickleSerializer(), conf=None, gateway=None, jsc=None, profiler_cls=<class 'pyspark.profiler.BasicProfiler'>)
# sc = SparkContext(conf=conf)
print(
    "--------------------SparkConf结束---------SparkContext开始-------------------------------"
)
sc = SparkContext(conf=conf)
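Continuing the walkthrough, a minimal RDD round-trip on the context just created, since the RDD is the basic abstraction mentioned above (illustrative only):

# Illustrative continuation: a tiny RDD transformation and action.
rdd = sc.parallelize([1, 2, 3, 4])
print(rdd.map(lambda x: x * 2).collect())  # [2, 4, 6, 8]
sc.stop()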