Example #1
def init_spark_context(details=[]):
    global spark_context
    if spark_context:
        return
    build_type = yb_dist_tests.global_conf.build_type
    from pyspark import SparkContext
    # We sometimes fail tasks due to unsynchronized clocks, so we should tolerate a fair number of
    # retries.
    # https://stackoverflow.com/questions/26260006/are-failed-tasks-resubmitted-in-apache-spark
    # NOTE: we never retry failed tests to avoid hiding bugs. This failure tolerance mechanism
    #       is just for the resilience of the test framework itself.
    SparkContext.setSystemProperty('spark.task.maxFailures',
                                   str(SPARK_TASK_MAX_FAILURES))
    spark_master_url = os.environ.get('YB_SPARK_MASTER_URL',
                                      DEFAULT_SPARK_MASTER_URL)
    details += [
        'user: {}'.format(getpass.getuser()),
        'build type: {}'.format(build_type)
    ]

    if 'BUILD_URL' in os.environ:
        details.append('URL: {}'.format(os.environ['BUILD_URL']))

    spark_context = SparkContext(spark_master_url,
                                 "YB tests ({})".format(', '.join(details)))
    spark_context.addPyFile(yb_dist_tests.__file__)
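A note that applies to most examples on this page: SparkContext.setSystemProperty sets a JVM system property, so it only influences settings that Spark reads while the SparkContext is being created. Below is a minimal ordering sketch; the property value is illustrative and not taken from the example above.

# Minimal ordering sketch; the property value is illustrative.
from pyspark import SparkContext

# Set the JVM system property first; Spark reads it while the context is created.
SparkContext.setSystemProperty('spark.task.maxFailures', '12')

sc = SparkContext('local[*]', 'ordering-demo')
print(sc.getConf().get('spark.task.maxFailures', 'not set'))  # should print '12'
sc.stop()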
Example #2
def init_spark_context(details=[]):
    global spark_context
    if spark_context:
        return
    build_type = yb_dist_tests.global_conf.build_type
    from pyspark import SparkContext
    # We sometimes fail tasks due to unsynchronized clocks, so we should tolerate a fair number of
    # retries.
    # https://stackoverflow.com/questions/26260006/are-failed-tasks-resubmitted-in-apache-spark
    # NOTE: we never retry failed tests to avoid hiding bugs. This failure tolerance mechanism
    #       is just for the resilience of the test framework itself.
    SparkContext.setSystemProperty('spark.task.maxFailures', str(SPARK_TASK_MAX_FAILURES))
    if yb_dist_tests.global_conf.build_type == 'tsan':
        logging.info("Using a separate default Spark cluster for TSAN tests")
        default_spark_master_url = DEFAULT_SPARK_MASTER_URL_TSAN
    else:
        logging.info("Using the regular default Spark cluster for non-TSAN tests")
        default_spark_master_url = DEFAULT_SPARK_MASTER_URL

    spark_master_url = os.environ.get('YB_SPARK_MASTER_URL', default_spark_master_url)
    details += [
        'user: {}'.format(getpass.getuser()),
        'build type: {}'.format(build_type)
        ]

    if 'BUILD_URL' in os.environ:
        details.append('URL: {}'.format(os.environ['BUILD_URL']))

    spark_context = SparkContext(spark_master_url, "YB tests ({})".format(', '.join(details)))
    spark_context.addPyFile(yb_dist_tests.__file__)
Example #3
def main(input, output, alpha, iters):
    SparkContext.setSystemProperty('spark.executor.memory', '3g')
    conf = SparkConf().setAppName("SparkPageRank")
    sc = SparkContext(conf=conf)

    lines = sc.textFile(input).filter(lambda x: len(x) and x[0] != '#')
    vertexes = lines.flatMap(lambda x: map(int, x.strip().split())) \
                    .distinct().map(lambda x: [x, None])

    edges = lines.map(lambda x: map(int, x.strip().split())).groupByKey()

    t = vertexes.leftOuterJoin(edges).map(lambda x: (x[0], x[1][1]))

    pr = vertexes.map(lambda x: (x[0], 1 / N))
    hang_pr = t.join(pr).filter(lambda x: x[1][0] is None).map(
        lambda x: x[1][1]).sum()

    for i in range(iters):
        emits = t.join(pr).flatMap(lambda x: mapper(x[0], x[1][1], x[1][0]))
        pr = emits.groupByKey().map(
            lambda x: reducer(x[0], x[1], hang_pr, alpha)).cache()
        hang_pr = t.join(pr).filter(lambda x: x[1][0] is None).map(
            lambda x: x[1][1]).sum()

    pr = pr.sortBy(lambda x: x[1], False)
    pr.saveAsTextFile(output)
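The driver above references mapper, reducer, and a vertex count N that are not part of the snippet. The following is a hypothetical, simplified reconstruction of what those helpers might look like under one common PageRank convention (Python 3 division semantics assumed); it is provided only for orientation, and the original implementations may differ.

# Hypothetical helpers assumed by the driver above; not from the original source.
N = 1000  # assumed total number of vertices (the snippet treats N as a global)


def mapper(node, rank, neighbors):
    # Spread this node's rank evenly over its out-links; dangling nodes emit nothing.
    if neighbors is None:
        return []
    neighbors = list(neighbors)
    return [(dst, rank / len(neighbors)) for dst in neighbors]


def reducer(node, contributions, hang_pr, alpha):
    # Combine incoming contributions with the dangling-node mass and the teleport term.
    rank = (1 - alpha) / N + alpha * (sum(contributions) + hang_pr / N)
    return (node, rank)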
Example #4
def init_spark(verbose_logging=False, show_progress=False):
    if not show_progress:
        SparkContext.setSystemProperty('spark.ui.showConsoleProgress', 'false')
    sc = SparkContext()
    sqlContext = HiveContext(sc)
    if verbose_logging:
        sc.setLogLevel(
            'INFO' if isinstance(verbose_logging, bool) else verbose_logging)
    return sc, sqlContext
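A short usage sketch for init_spark, not part of the original example; it assumes the imports the snippet relies on (SparkContext, HiveContext) and shows how the verbose_logging argument doubles as a log-level string.

# Usage sketch; assumes the imports used by init_spark (SparkContext, HiveContext).
sc, sqlContext = init_spark(verbose_logging='WARN', show_progress=False)
df = sqlContext.sql('SELECT 1 AS one')
df.show()
sc.stop()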
Example #5
def main(*argv):

    #Setup to read from s3
    aws_access_key = os.getenv('AWS_ACCESS_KEY_ID', 'default')
    aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'default')

    conf = SparkConf().setAppName("taxishift1")
    SparkContext.setSystemProperty('spark.executor.memory', '5g')
    sc = SparkContext(conf=conf)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId",aws_access_key)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey",aws_secret_access_key)

    #Read from s3
    for i in range(1,13):

        raw_data = sc.hadoopFile('s3a://ddrum-s3/trip_data/trip_data_' + str(i) + '.csv',\
                                 'org.apache.hadoop.mapred.TextInputFormat',\
                                 'org.apache.hadoop.io.Text',\
                                 'org.apache.hadoop.io.LongWritable')

        #Call user defined function to map raw data into key-value pairs 
        new_data = raw_data.map(lambda x:splitit(x[1]))

        #Combine data from multiple csv files
        if i<2:
            total_data = new_data
        else:
            total_data = total_data.union(new_data)

        #Create total list of rides/shifts for each driver
        total_data = total_data.reduceByKey(lambda x,y:x+y)\
                               .mapValues(lambda x:mergeit(x))

    #Create key-value for each shift, and set shifts greater than 10 hours to 10 hours (We want to know exactly when they go over)
    ungrouped_data = total_data.flatMap(lambda x: [(x[0],r) for r in x[1]])\
                               .mapValues(lambda x: x if x[1]-x[0] < 36000 else [x[0],x[0]+36000]) 

    #Extract only 10 hour shift offenders
    offenders = ungrouped_data.filter(lambda x:x[1][1] - x[1][0] >= 36000)\
                              .map(lambda x:(x[1][1]))\
                              .collect()

    #Plot number of offenders for every 30 minutes
    offender_hist = plt.hist(offenders, bins=range(int(min(offenders)), int(max(offenders)) + 1800, 1800))

    # plt.hist returns (counts, bin_edges, patches); the histogram starts at min(offenders)
    hist_min = int(min(offenders))
    hist_min += 900

    #Save to file
    hist_csv  = open('/home/ubuntu/offenders.csv','w')

    for i in offender_hist[0]:
        hist_csv.write(str(hist_min)+','+str(i)+'\n')
        hist_min+=1800
    hist_csv.close()

    return
Example #6
    def initialize_params(self, partitions=2100, cores=5, memory=11):
        conf = SparkConf()
        conf.set('spark.sql.shuffle.partitions', str(partitions))
        conf.set("spark.executor.cores", str(cores))
        SparkContext.setSystemProperty('spark.executor.memory',
                                       str(memory) + 'g')
        SparkContext.setSystemProperty('spark.driver.memory',
                                       str(memory) + 'g')
        self.sc = SparkContext(appName='mm_exp', conf=conf)
        self.sqlContext = pyspark.SQLContext(self.sc)
Example #7
def main():
    if len(sys.argv) < 3:
        print("Usage: input <file> mapping <file>", file=sys.stderr)
        exit(-1)

    time1 = datetime.datetime.fromtimestamp(time.time())
    os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH/lib/spark"
    usage = "usage: run_index_field_extractor.py [options]"
    #       --subapp data_load --op0 source_db --op1 source_table --op2 target_db --op3 target_table --op4 partitoins_column=date --op5 10
    global return_code, listOfPartitions, final_properties, sourceDB, sourceTable, targetDB, targetTable, partitonColumn, partitonColumnDataType, numberOfPartitins, app, sub_app, env, env_ver, group, common_properties, minPartition, maxPartition, start_line
    sourceFile, mappingFile, outputDir, lookupTable = arg_handle()
    #
    SparkContext.setSystemProperty(
        'spark.serializer', 'org.apache.spark.serializer.KryoSerializer')

    conf = SparkConf().setAppName("CrissIndexExtractor")
    sc = SparkContext(conf=conf)

    #sc = SparkContext(appName="IndexFieldExtractor");
    lines = sc.textFile(sourceFile, 1)
    # Filter header records
    headerList = lines.filter(lambda x: "HDR:" in x)
    # Filter data records
    dataFilter = lines.filter(lambda data: "DTL" in data)
    # dataFilter.cache()

    referenceMappedData = dataFilter.map(processDataLine(headerList.collect()))

    # get required index mapping of each record in
    indexMappingLines = sc.textFile(mappingFile, 1)
    indexFieldList = list(
        itertools.chain.from_iterable(
            indexMappingLines.map(createIndexColumnList).collect()))

    # print("IndexmappingList  " , indexFieldList)

    # groupDataby reference Key

    referenceGroupedData = referenceMappedData.groupByKey().map(
        lambda x: processReferenceData(x[0], list(x[1]), indexFieldList))
    # print("referenceMappedData" , referenceGroupedData.collect())

    referenceGroupedData.saveAsTextFile(outputDir)

    formidReferenceList = dataFilter.map(getReferenceFormidList)
    formidReferenceList.saveAsTextFile(lookupTable)

    # print(createTableStatement(indexFieldList))
    sc.stop()
    time2 = datetime.datetime.fromtimestamp(time.time())
    print("time taken", time2 - time1)

    sys.exit()
Example #8
    def getSpark(self):
        SparkContext.setSystemProperty("hive.metastore.uris", "thrift://localhost:9083")

        spark = SparkSession \
                        .builder \
                        .appName('example-pyspark-read-and-write-from-hive') \
                        .config("spark.sql.warehouse.dir", "spark_warehouse") \
                        .enableHiveSupport() \
                        .getOrCreate()
        #spark.conf.get("spark.sql.hive.metastore.version")
        spark.sql("SET spark.sql.hive.metastore.version=2.3.2").show()

        #spark = SparkSession.builder.master("local").appName("Word Count").getOrCreate()
        return spark
Example #9
def context_spark():
	conf = (SparkConf().setMaster("local[4]").set("spark.executor.extraJavaOptions","-Dcom.amazonaws.services.s3.enableV4=true").set("spark.driver.extraJavaOptions","-Dcom.amazonaws.services.s3.enableV4=true"))
	sc = SparkContext(conf=conf)
	sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
	
	sql = SQLContext(sc)
	
	hadoopConf = sc._jsc.hadoopConfiguration()
	hadoopConf.set("fs.s3a.awsAccessKeyId", "---")
	hadoopConf.set("fs.s3a.awsSecretAccessKey", "---")
	hadoopConf.set("fs.s3a.endpoint", "s3.us-east-1.amazonaws.com")
	hadoopConf.set("com.amazonaws.services.s3a.enableV4", "true")
	hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
	print(sc._conf.getAll())
	return sql
Example #10
def create_spark_session(app_name="SparkApplication"):

    memory = '1g'
    pyspark_submit_args = ' --driver-memory ' + memory + ' pyspark-shell'
    os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args
    SparkContext.setSystemProperty('spark.executor.memory', '1g')
    # SparkContext.setSystemProperty('spark.driver.maxResultSize', '25g')

    spark_session = SparkSession.builder \
        .appName(app_name) \
        .master("local[*]") \
        .getOrCreate()

    spark_session.sparkContext.setLogLevel("WARN")
    return spark_session
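A usage sketch for create_spark_session, not in the original; it assumes the snippet's os and SparkSession imports, and the DataFrame contents are illustrative.

# Usage sketch with illustrative data.
spark = create_spark_session(app_name="ExampleApp")
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
df.show()
spark.stop()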
Example #11
def main(*argv):

    #Setup to read from s3
    aws_access_key = os.getenv('AWS_ACCESS_KEY_ID', 'default')
    aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'default')

    conf = SparkConf().setAppName("taxishift2")
    SparkContext.setSystemProperty('spark.executor.memory', '5g')
    sc = SparkContext(conf=conf)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", aws_access_key)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey",
                                      aws_secret_access_key)

    #Check for new data every five minutes
    ssc = StreamingContext(sc, 300)
    ssc.checkpoint('s3a://ddrum-s3/checkpoint/')

    #Update state with new data
    def updateFunc(currentTaxi, taxiState):
        if taxiState is None:
            taxiState = []

        #Create total list of rides/shifts for each driver
        return mergeit(currentTaxi + taxiState)

    #Read streaming from s3
    raw_data = ssc.textFileStream('s3a://ddrum-s3/trip_data/')

    #Call user defined function to map raw data into key-value pairs, and update state
    total_data = raw_data.map(lambda x:splitit(x))\
                         .updateStateByKey(updateFunc)

    #Create key-value for each shift, and return people with nine hour shifts or greater within 30 minutes of zerohour
    warn_list = total_data.flatMap(lambda x: [(x[0],r) for r in x[1]])\
                          .filter(lambda x:x[1][1] - x[1][0] > 32400)\
                          .filter(lambda x:x[1][1] < (zerohour + 1800) and x[1][1] > zerohour)

    #Indicator warning
    warn_list.pprint()

    #Begin stream
    ssc.start()
    ssc.awaitTermination()

    return
Example #12
def spark_write_to_db(source, table_name, db_user, db_psrwd, db_endpoint,
                      aws_access_key_id, aws_secret_access_key):
    """
    Dumb copy of source data (parquet) into DB table using spark. 
    
    :params source (str) - s3 or local path to parquet file
    :params table_name (str) - name of table in planetpulse postgresql db
    :params db_user (str) - username for db access
    :params db_psrwd (str) - password for db access
    :params db_endpoint (str) - endpoint of planetpulse postgresql db
    :params aws_access_key_id (str) - AWS access key id used for s3 access
    :params aws_secret_access_key (str) - AWS secret access key used for s3 access
    """
    # Again, This is a work-around since we are running spark locally on EC2...
    # TODO - this is a similar to spark setup in intake/spark_etl.py - let's refactor to use shared utils code...
    # We should also be able to remove some of this when we move from PythonOperator -> SparkSubmitOperator
    os.environ[
        'PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.hadoop:hadoop-common:3.0.0,org.apache.hadoop:hadoop-aws:3.0.0,org.apache.hadoop:hadoop-client:3.0.0 pyspark-shell'
    os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id
    os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key
    jars_path = pkg_resources.resource_filename('intake.jars',
                                                'postgresql-42.2.23.jar')
    conf = SparkConf().set('spark.jars', jars_path)
    sc = SparkContext(conf=conf)
    sc.setSystemProperty('com.amazonaws.services.s3.enableV4', 'true')
    hadoopConf = sc._jsc.hadoopConfiguration()
    hadoopConf.set("fs.s3a.awsAccessKeyId", aws_access_key_id)
    hadoopConf.set("fs.s3a.awsSecretAccessKey", aws_secret_access_key)
    hadoopConf.set("fs.s3a.endpoint", "s3.amazonaws.com")
    hadoopConf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
    spark = SparkSession(sc)

    # Always overwrite data. We are always processing all data from
    # source, rather than just new data. No need to load current db data
    # and decide what to write.
    mode = 'overwrite'
    properties = {
        "user": db_user,
        "password": db_psrwd,
        "driver": "org.postgresql.Driver"
    }
    df = spark.read.parquet(source)
    df.write.jdbc(url=db_endpoint,
                  table=table_name,
                  mode=mode,
                  properties=properties)
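A usage sketch for spark_write_to_db, not from the original project; every path, credential, and endpoint below is a placeholder.

# Usage sketch; every value below is a placeholder.
spark_write_to_db(
    source='s3a://example-bucket/table.parquet',
    table_name='example_table',
    db_user='example_user',
    db_psrwd='example_password',
    db_endpoint='jdbc:postgresql://db-host:5432/planetpulse',
    aws_access_key_id='EXAMPLE_KEY_ID',
    aws_secret_access_key='EXAMPLE_SECRET_KEY',
)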
Example #13
def main(*argv):

    #Setup to read from s3
    aws_access_key = os.getenv('AWS_ACCESS_KEY_ID', 'default')
    aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'default')

    conf = SparkConf().setAppName("taxishift3")
    SparkContext.setSystemProperty('spark.executor.memory', '5g')
    sc = SparkContext(conf=conf)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", aws_access_key)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey",
                                      aws_secret_access_key)

    #Read from s3
    for i in range(1, 13):

        raw_data = sc.hadoopFile('s3a://ddrum-s3/trip_data/trip_data_' + str(i) + '.csv',\
                                 'org.apache.hadoop.mapred.TextInputFormat',\
                                 'org.apache.hadoop.io.Text',\
                                 'org.apache.hadoop.io.LongWritable')

        #Call user defined function to map raw data into key-value pairs
        new_data = raw_data.map(lambda x: splitit(x[1]))

        #Combine data from multiple csv files
        if i < 2:
            total_data = new_data
        else:
            total_data = total_data.union(new_data)

    #Create total list of pickup chains and pickups for each coordinate
    #If there are more than three times as many pickups as pickup chains, this might be a taxistand
    total_data = total_data.reduceByKey(lambda x,y:x+y)\
                           .mapValues(lambda x:mergeit(x))\
                           .filter(lambda x:x[1][1]/x[1][0] > 3)\
                           .collect()

    #Print results
    total_data.sort()
    for i in total_data:
        print i

    return
Example #14
def spark_run_etl(source,
                  output_path,
                  aws_access_key_id='',
                  aws_secret_access_key='',
                  local=False):
    """
    Run ETL from Source and output to Parquet

    :params source (str) - source from intake/sources
    :params output_path (str) - output path to s3 or local file system
    :params aws_access_key_id (str) - defaults to None
    :params aws_secret_access_key (str) - defaults to None
    :params local (bool) - Run pipeline for AWS or Local (s3 or local file system)
    """
    # This is a work-around since we are running spark locally on EC2
    # If we were running on a hadoop cluster, we could bypass this...
    # Unfortunately, we are cheap and spend too much of our money on
    # NYC rent and street food...
    if not local:
        os.environ[
            'PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.hadoop:hadoop-common:3.0.0,org.apache.hadoop:hadoop-aws:3.0.0,org.apache.hadoop:hadoop-client:3.0.0 pyspark-shell'
        # Seems like we need to export env vars, too. Another
        # hacky workaround that will stick for now...
        os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id
        os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key
        print(f"ACCESS KEY SECRET: {aws_secret_access_key}")
        sc = SparkContext()
        sc.setSystemProperty('com.amazonaws.services.s3.enableV4', 'true')
        hadoopConf = sc._jsc.hadoopConfiguration()
        hadoopConf.set("fs.s3a.awsAccessKeyId", aws_access_key_id)
        hadoopConf.set("fs.s3a.awsSecretAccessKey", aws_secret_access_key)
        hadoopConf.set("fs.s3a.endpoint", "s3.amazonaws.com")
        hadoopConf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')

    else:
        sc = SparkContext()

    spark = SparkSession(sc)
    print(f'Reading from {source}!')
    print(f'Writing to {output_path}')
    run_etl(source, output_path, spark=spark)
Example #15
def init_spark_context(details=[]):
    global spark_context
    if spark_context:
        return
    build_type = yb_dist_tests.global_conf.build_type
    from pyspark import SparkContext
    # We sometimes fail tasks due to unsynchronized clocks, so we should tolerate a fair number of
    # retries.
    # https://stackoverflow.com/questions/26260006/are-failed-tasks-resubmitted-in-apache-spark
    # NOTE: we never retry failed tests to avoid hiding bugs. This failure tolerance mechanism
    #       is just for the resilience of the test framework itself.
    SparkContext.setSystemProperty('spark.task.maxFailures',
                                   str(SPARK_TASK_MAX_FAILURES))

    spark_master_url = g_spark_master_url_override
    if spark_master_url is None:
        if is_macos():
            logging.info("This is macOS, using the macOS Spark cluster")
            spark_master_url = SPARK_URLS['macos']
        elif yb_dist_tests.global_conf.build_type in ['asan', 'tsan']:
            logging.info(
                "Using a separate Spark cluster for ASAN and TSAN tests")
            spark_master_url = SPARK_URLS['linux_asan_tsan']
        else:
            logging.info(
                "Using the regular Spark cluster for non-ASAN/TSAN tests")
            spark_master_url = SPARK_URLS['linux_default']

    logging.info("Spark master URL: %s", spark_master_url)
    spark_master_url = os.environ.get('YB_SPARK_MASTER_URL', spark_master_url)
    details += [
        'user: {}'.format(getpass.getuser()),
        'build type: {}'.format(build_type)
    ]

    if 'BUILD_URL' in os.environ:
        details.append('URL: {}'.format(os.environ['BUILD_URL']))

    spark_context = SparkContext(spark_master_url,
                                 "YB tests ({})".format(', '.join(details)))
    spark_context.addPyFile(yb_dist_tests.__file__)
Example #16
def main(test_file, model_file, output_file):
    SparkContext.setSystemProperty('spark.executor.memory', '4g')
    SparkContext.setSystemProperty('spark.driver.memory', '4g')
    sc = SparkContext.getOrCreate()
    start = time.time()

    # Re-save the model split into 42 partitions, then load it back as a broadcast lookup map.
    sc.textFile(model_file, 42).saveAsTextFile('task2_model')
    model = sc.broadcast(sc.textFile('task2_model', 42).map(json.loads).collectAsMap())
    shutil.rmtree('task2_model')

    data = sc.textFile(test_file).map(json.loads)
    profiles = data.map(lambda d: predict(d, model)).filter(lambda d: d["sim"] >= 0.01).map(json.dumps)

    json_string = profiles.reduce(lambda x, y: x + "\n" + y)

    # write your string to a file
    with open(output_file, "w") as f:
        f.write(json_string)

    print("Duration:",time.time()-start)
Example #17
def main(train_file, model_file, stopwords_file):
    SparkContext.setSystemProperty('spark.executor.memory', '4g')
    SparkContext.setSystemProperty('spark.driver.memory', '4g')
    sc = SparkContext.getOrCreate()
    start = time.time()
    stopwords = {s for s in sc.textFile(stopwords_file).collect()}

    reviews = (sc.textFile(train_file).map(
        json.loads).map(lambda d: tokenize(d, stopwords)).persist(
            StorageLevel(True, True, False, False)))

    n = reviews.count()

    #calculating number of documents the term appears in
    dfs = (reviews.flatMap(lambda d: d["tokens"]).map(
        lambda t: (t, 1)).reduceByKey(add).collectAsMap())

    idfs = {k: math.log(n / v) for k, v in dfs.items()}

    def add_key_prefix(rdd, prefix):
        return rdd.map(lambda x: ("{}_{}".format(prefix, x[0]), x[1]))

    business_profiles = build_profile(reviews, "business_id", idfs)

    user_profiles = (reviews.map(lambda d: (d["business_id"], d["user_id"])).
                     join(business_profiles).values().aggregateByKey({}, merge,
                                                                     merge))

    add_key_prefix(user_profiles, "u").union(
        add_key_prefix(business_profiles,
                       "b")).map(json.dumps).saveAsTextFile('task2_model')

    with open(model_file, 'wb') as outfile:
        for filename in glob.glob('task2_model/part*'):
            with open(filename, 'rb') as readfile:
                shutil.copyfileobj(readfile, outfile)

    shutil.rmtree('task2_model')

    print("Duration:", time.time() - start)
Example #18
def create_spark_sql_context(app_name):
    """
    Instantiates spark and sql contexts.
    If executed twice, it will return the first instance.

    :param app_name: name of the app to assign to the created spark context.
    :return:
    """
    # Initialize Spark and SQL context

    # set spark config and master
    conf = copy(spark_conf)
    conf.setMaster(spark_master).setAppName(app_name)

    # set PROFILE environment in executors
    conf.setExecutorEnv('PROFILE', os.environ.get('PROFILE'))

    # set spark system properties
    for k, v in spark_sys_properties.items():
        SparkContext.setSystemProperty(k, v)

    sc = SparkContext(conf=conf)

    if sc is None:
        raise SparkClusterException("Unable to instantiate SparkContext")

    # Adding spark_helpers.zip to SparkContext so that workers can load modules from spark_helpers
    # http://apache-spark-user-list.1001560.n3.nabble.com/Loading-Python-libraries-into-Spark-td7059.html
    tmp_dir = tempfile.mkdtemp()
    sc.addPyFile(
        shutil.make_archive(base_name='{}/spark_cluster_pkg'.format(tmp_dir), format='zip',
                            root_dir=os.path.abspath(path_pkg)))

    sq = SQLContext(sc)

    if sq is None:
        raise SparkClusterException("Unable to instantiate SQLContext")

    return sc, sq
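create_spark_sql_context references module-level names that are not shown here (spark_conf, spark_master, spark_sys_properties, path_pkg, SparkClusterException). The following is a hypothetical sketch of such definitions, purely for orientation; the real project may configure them differently.

# Hypothetical module-level setup assumed by create_spark_sql_context(); values are illustrative.
import os

from pyspark import SparkConf


class SparkClusterException(Exception):
    """Raised when a Spark or SQL context cannot be created."""


spark_master = os.environ.get('SPARK_MASTER', 'local[*]')
spark_conf = SparkConf()
spark_sys_properties = {
    'spark.executor.memory': '4g',
    'spark.driver.maxResultSize': '2g',
}
path_pkg = 'spark_helpers'  # directory that gets zipped and shipped to executors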
Example #19
def setup_spark_connection():
    """
    Set up Spark connection

    @input
    None
    @output
    sqlconn(a HiveContext object) --
    """

    # check Spark Expiration date

    SparkContext.setSystemProperty("hive.metastore.uris",
                                   "thrift://z9awsspsyn2m52.celgene.com:9083")

    # set up connection
    sparkconf = SparkConf().setAppName("upload_data")
    sc = SparkContext(conf=sparkconf)
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)
    sqlconn = HiveContext(sc)

    return (sqlconn)
Example #20
def get_sc(row_id_str,sp_master, exe_memory, core_max):
    
    from pyspark import SparkContext
    from pyspark.sql import SQLContext
    from pyspark.mllib.util import MLUtils
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.classification import SVMWithSGD, SVMModel
    from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionWithSGD, LogisticRegressionModel
    from pyspark.mllib.linalg import SparseVector
    from pyspark.mllib.evaluation import BinaryClassificationMetrics
    from pyspark.mllib.tree import DecisionTree
    from pyspark.mllib.clustering import KMeans, KMeansModel, GaussianMixture, GaussianMixtureModel
    from pyspark.mllib.linalg import Vectors 
    
    SparkContext.setSystemProperty('spark.rdd.compress', config.get('spark', 'spark_rdd_compress'))
    SparkContext.setSystemProperty('spark.driver.maxResultSize', config.get('spark', 'spark_driver_maxResultSize'))
    SparkContext.setSystemProperty('spark.executor.memory', exe_memory)
    SparkContext.setSystemProperty('spark.cores.max', core_max)

    sc = SparkContext(sp_master, 'single_predict:'+row_id_str)
    
    return sc
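get_sc reads from a module-level config object that is not shown; it appears to be a ConfigParser-style object with a 'spark' section. A hypothetical sketch of that setup and one call, with illustrative values:

# Hypothetical config assumed by get_sc(); keys and values are illustrative (Python 3 configparser).
import configparser

config = configparser.ConfigParser()
config['spark'] = {
    'spark_rdd_compress': 'true',
    'spark_driver_maxResultSize': '2g',
}

sc = get_sc('row42', 'local[*]', '4g', '4')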
Example #21
def main():
    # parse arguments
    print "INFO: creating parquet ..."
    args = read_args(sys.argv[1:])

    SparkContext.setSystemProperty('spark.rdd.compress',
                                   config.get('spark', 'spark_rdd_compress'))
    SparkContext.setSystemProperty(
        'spark.driver.maxResultSize',
        config.get('spark', 'spark_driver_maxResultSize'))
    SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory)
    SparkContext.setSystemProperty('spark.cores.max', args.core_max)
    sc = SparkContext(args.sp_master, 'parquet_creator:' + str(args.row_id))
    sqlCtx = SQLContext(sc)

    # load json obj from file to srdd
    ifname = args.hd_master + os.path.join(args.src_dir, args.src_files)

    df = sqlCtx.read.json(ifname)

    out_fname = args.hd_master + os.path.join(args.src_dir, args.out_dir)
    print "INFO: out_dir=" + args.out_dir
    # clean up existing hdfs file
    try:
        hdfs.rmr(out_fname)
    except:
        e = sys.exc_info()[0]
        print "WARNING: ", e

    # convert dataframe format
    print "INFO: SQL=", args.sql_script
    df.registerTempTable(args.tblname)
    df2 = sqlCtx.sql(args.sql_script)
    # save as parquet
    df2.write.parquet(out_fname)

    df2.printSchema()
Example #22
from pyspark import SparkConf, SparkContext
SparkContext.setSystemProperty("hadoop.home.dir", "C:\\spark-1.5.1-bin-hadoop2.6\\")
import sys, pickle,math
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('random-forest')
sc = SparkContext(conf=conf)

input = sys.argv[1]

# Load and parse the data
def parsePoint(line):
    return LabeledPoint(float(line[1]), line[0])

train = sc.pickleFile(input+'/bow_train/part-00000')
test = sc.pickleFile(input+'/bow_test/part-00000')
parsedtrain=train.map(parsePoint).filter(lambda line:len(line.features)!=0 or len(line.label)!=0)
parsedtest = test.map(parsePoint).filter(lambda line:len(line.features)!=0 or len(line.label)!=0).cache()
model = GradientBoostedTrees.trainRegressor(parsedtrain,categoricalFeaturesInfo={}, numIterations=1)
predictions = model.predict(parsedtest.map(lambda x: x.features))
labelsAndPredictions = parsedtest.map(lambda lp: lp.label).zip(predictions)
val_err = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(parsedtest.count())
parsedtest.unpersist()
RMSE=math.sqrt(val_err)

print("Root Mean Squared Error Test= " + str(RMSE))

Example #23
from math import radians, sin, cos, atan2

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator


def distance(long1, lat1, long2, lat2):
    radius = 6371
    diff_lat = radians(lat2 - lat1)
    diff_long = radians(long2 - long1)
    a = sin(diff_lat / 2)**2 + cos(lat1) * cos(lat2) * sin(diff_long / 2)**2
    c = 2 * atan2(a**0.5, (1 - a)**0.5)
    return radius * c


if __name__ == "__main__":
    SparkContext.setSystemProperty("saprk.executor.memory", "12g")
    spark = SparkSession.builder.appName("RegressionTree").getOrCreate()

    # Load up data as dataframe
    data = spark.read.csv(
        "/Users/rafaelchen/Documents/MapReduce/hw2 decision tree/src/train.csv",
        header=True)
    # Data preprocessing
    data = data.withColumn(
        "pickup_longitude", data["pickup_longitude"].cast("float")).withColumn(
            "pickup_latitude",
            data["pickup_latitude"].cast("float")).withColumn(
                "dropoff_longitude",
                data["dropoff_longitude"].cast("float")).withColumn(
                    "dropoff_latitude",
                    data["dropoff_latitude"].cast("float")).withColumn(
Example #24
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, IntegerType, LongType
import couchdb
import requests
import json
import ast

SparkContext.setSystemProperty('spark.driver.memory', '1g')
SparkContext.setSystemProperty('spark.driver.cores', '3')
SparkContext.setSystemProperty('spark.executor.memory', '1g')
SparkContext.setSystemProperty('spark.executor.cores', '3')
SparkContext.setSystemProperty('spark.driver.memoryOverhead', '1g')
SparkContext.setSystemProperty('spark.storage.memoryFraction', '0.9')
SparkContext.setSystemProperty('spark.sql.codegen.wholeStage', 'false')

spark = SparkSession.builder.appName("Movies: DATA PREP")\
                            .config("dfs.client.read.shortcircuit.skip.checksum", "true")\
                            .getOrCreate()

print("##################################### LOADING MOVIES DATASET")
### LOAD MOVIES DATASET
movies_df = spark.read.csv('ml-latest/movies.csv',header=True,inferSchema=True) \
    .select('movieId', 'genres').rdd.flatMapValues(lambda x:x.split("|")) \
    .toDF(['IdMovie', 'genres'])

print("##################################### GATHERING DATA FROM COUCHDB")
response = requests.get(
    "http://SERVER:5984/moviesdetails/_all_docs?include_docs=true")
y = [response.text][0]
Example #25
    # >>> excute_sql = job_day(date= job_date,module_sql= module_sql)
    # >>> print("excute_sql:",excute_sql)
    "excute_sql":  " select * from  db.table where day between '2019-03-01' and '2019-03-31' "
    """

    date_start = date.strftime('%Y-%m-%d')
    monthrange = calendar.monthrange(date.year, date.month)[1]
    date_end = (datetime.datetime(date.year, date.month, 1) +
                datetime.timedelta(monthrange - 1)).strftime('%Y-%m-%d')
    moudle_sql = moudle_sql.replace('__DAY1__', str(date_start)).replace(
        '__DAY2__', str(date_end))
    return moudle_sql


if __name__ == "__main__":
    SparkContext.setSystemProperty("hive.metastore.uris",
                                   "thrift://hdp-0:9083")

    sparksession = (SparkSession.builder.appName(
        'device_increase').enableHiveSupport().getOrCreate())

    # spark-submit passes a time-type argument (day, week, month) to compute daily, weekly, or monthly results
    parser = argparse.ArgumentParser()
    parser.add_argument('time_type')
    parser.add_argument('excute_day')  # add_argument() declares a command-line option the program accepts
    args = parser.parse_args()
    time_type = args.time_type
    excute_date = parse(str(args.excute_day))

    day_sql = """
    select 
    t1.*
Example #26
import json

import redis

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SparkSession, SQLContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

from config.config import KAFKA_NODES
SparkContext.setSystemProperty("spark.cassandra.connection.host",
                               'ec2-54-85-200-216.compute-1.amazonaws.com')
"""
    processParition will compute the spread among different exchanges
    and returns the best exchange at the discrete timestamp
"""


def processPartition(partition, table, keyspace, sc):
    if partition.isEmpty():
        return
    else:
        spark = SparkSession(sc)

        def f(accum, x):
            if ('asks' in list(accum.keys()) and 'asks' in list(x.keys())):
                if ((float(accum['bids']) - float(accum['asks'])) <
                    (float(x['bids']) - float(x['asks']))):
                    return accum
                else:
                    return x
Example #27
import time
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from nvludfs import *

@nvl("(long,long,long)->long", tupargs=True)
def dot_prod(a,b,c):
  return 3*a+2*b+c

SparkContext.setSystemProperty("useNvl", "true")
SparkContext.setSystemProperty("offHeap", "true")
SparkContext.setSystemProperty("pythonNvl", "true")
conf = (SparkConf()
         .setMaster("local")
         .setAppName("udf_example")
         .set("spark.executor.memory", "2g"))
sc = SparkContext(conf = conf)
sqlContext = SQLContext(sc)
dot_udf = udf(dot_prod, LongType())
df = sqlContext.read.parquet("assembly/udf-test-s").cache()
times = []
for i in range(0, 11):
  t = time.time() 
  df.withColumn("udf", dot_udf(df['a'], df['b'], df['c'])).selectExpr("sum(udf)").show()
  times.append(time.time() - t)
print "average time: " + str(sum(times[1:])/10.0)
Example #28
# -*- coding: utf-8 -*-
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, HiveContext
from pyspark.sql import functions as F
import pyspark.sql.types as T

SparkContext.setSystemProperty(
    "hive.metastore.uris",
    "thrift://bdcsceprod-bdcsce-1.compute-590737110.oraclecloud.internal:9083"
)  ##nn1:9083

# hive metastore dir: /apps/hive/warehouse
sparkSession = (SparkSession.builder.appName(
    'hive_connection').enableHiveSupport().getOrCreate())
sparkSession.sparkContext.setLogLevel("ERROR")

hive_command = sparkSession.sql('USE MEDIX')

#%% update the STAGE tables according to the cycle
#CUP = sparkSession.read.option('header','true').csv('hdfs:///user/oracle/medix/auditoria/CUP/cup_medibutil_rev.csv')
#for x in list(range(1,16)):
#    CUP = CUP.withColumn('val'+str(x),CUP['val'+str(x)].cast('Int'))

#%% catalog
catalogo = sparkSession.sql('SELECT * FROM STAGE_AUDITORIA_CATALOGO')
rutas = sparkSession.sql('SELECT * FROM EXT_TABLE_AUDITORIA_RUTAS')
Ct = catalogo.alias('Ct')
catalogo = catalogo.join(rutas,Ct.clo_brick_num == rutas.ar_brick_num,how='left')\
                   .select(F.col('clo_brick_num').alias('Brick_num'),\
                           F.col('clo_cdg_postal_num').alias('Cdg_postal'),\
                           F.col('clo_tipo_asent').alias('Tipo_asentamiento'),\
Example #29
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml.recommendation import ALSModel
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml import Pipeline
from pyspark.sql.functions import explode
from functools import reduce
from pyspark.sql import DataFrame

SparkContext.setSystemProperty('spark.driver.memory', '512m')
SparkContext.setSystemProperty('spark.driver.cores', '1')
SparkContext.setSystemProperty('spark.executor.memory', '2560m')  #2560m
SparkContext.setSystemProperty('spark.executor.cores', '8')

#SparkContext.setSystemProperty('spark.executor.memoryOverhead', '1536m')
SparkContext.setSystemProperty("spark.scheduler.mode", "FAIR")
SparkContext.setSystemProperty('spark.memory.fraction', '0.8')
SparkContext.setSystemProperty('spark.memory.storageFraction', '0.1')
SparkContext.setSystemProperty("spark.default.parallelism", "256")
SparkContext.setSystemProperty("spark.num.executors", "1")
SparkContext.setSystemProperty("spark.local.dir", "/tmp")

conf = SparkConf().setAppName('MoviesRec: Predictions')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

sc.setCheckpointDir('/ML/movies/checkpoint/')

df = sqlContext.read.load(path='/ML/movies/data/*',
                          format='com.databricks.spark.csv',
                          delimiter=',',
Example #30
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

# Initialize Spark
SparkContext.setSystemProperty("spark.executor.memory", "4g")
conf = SparkConf()
conf.set("spark.executor.instances", 20)
sc = SparkContext("yarn-client", "kdd99", conf=conf)
hc = HiveContext(sc)

kdd = hc.table("kdd99")

(trainData, testData) = kdd.randomSplit([0.7, 0.3], seed=42)
trainData.cache()
services = trainData.withColumnRenamed("service", "srvc").select("srvc").distinct()
testData = testData.join(services, testData.service == services.srvc)
# filter out any rows with a service not trained upon
testData.cache()

print "training set has " + str(trainData.count()) + " instances"
print "test set has " + str(testData.count()) + " instances"

# Build model
inx1 = StringIndexer(inputCol="protocol", outputCol="protocol-cat")
inx2 = StringIndexer(inputCol="service", outputCol="service-cat")
inx3 = StringIndexer(inputCol="flag", outputCol="flag-cat")
inx4 = StringIndexer(inputCol="is_anomaly", outputCol="label")
ohe2 = OneHotEncoder(inputCol="service-cat", outputCol="service-ohe")
Example #31
	GatherStates()
	CalculateConditionScores()
	CalculateReadmissionAndDeathScore()
	CalculateFinalScores()
	SanityCheckScores()
	finalScores = sqlContext.sql("""
		SELECT
			st.State,
			s.ConditionScoreAverage,
			s.ReAdmissionAndDeathScore,
			(s.ConditionScoreAverage + s.ReAdmissionAndDeathScore) / 2 as FinalScore
		FROM
			score_data_tmp s
			JOIN states_tmp st on st.State = s.State
		ORDER BY
			FinalScore DESC
	""")
	finalScores.show(numberToShow, False)
	return

#
#	Main body
#

#	Set up spark environment, requesting a little more memory than the default
SparkContext.setSystemProperty('spark.executor.memory', '4g')
sc = SparkContext("local", "transformer app")
sqlContext = HiveContext(sc)

ShowTopStates(10)
Example #32
# Students: A. Romriell, D. Wen, J. Pastor, J. Pollard
# MSAN 694 Project


from pyspark import SparkContext
SparkContext.setSystemProperty('spark.executor.memory', '45g')
sc = SparkContext("local", "arXiv")
from pyspark.mllib.clustering import PowerIterationClustering


def parse_meta(line):
	"""
	"""
	pieces = line.strip().split("|")
	return {"id": pieces[0], "subj": pieces[1], "dt": pieces[2], "title": pieces[3]}


def get_paper_subj(d):
	"""
	"""
	return (d["id"], d["subj"])


def parse_auth(line):
	"""
	"""
	paper_id, the_authors = line.strip().split(":", 1)
	authors = the_authors.strip().split("|")
	return {"id": paper_id, "authors": authors}

Example #33
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author [email protected]
import os
import sys

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")

from pyspark import SQLContext, HiveContext
from pyspark import SparkContext

import ta

if __name__ == "__main__":
    sc = SparkContext(appName="bintrade_candidate", master="yarn-client")
    sc.setSystemProperty("spark.driver.memory",     "1g")
    sc.setSystemProperty("spark.executor.memory",   "8g")
    sc.setSystemProperty("spark.executor.instances", "8")
    sc.setSystemProperty("spark.executor.cores",    "4")

    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "32")
    sqlContext.sql("use fex")

    ta.run(sc, sqlContext, isHive=True)
Example #34
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author [email protected]
import os
import sys

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")

from pyspark import SQLContext, HiveContext
from pyspark import SparkContext

import eod





if __name__ == "__main__":
    sc = SparkContext(appName="bintrade_candidate", master="yarn-client")
    sc.setSystemProperty("spark.driver.memory",     "1g")
    sc.setSystemProperty("spark.executor.memory",   "8g")
    sc.setSystemProperty("spark.executor.cores",    "2")

    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "16")
    sqlContext.sql("use fex")

    eod.run(sc, sqlContext, isHive=True)
Example #35
def sc_start(app):
    global sc
    SparkContext.setSystemProperty("spark.port.maxRetries", "100")
    SparkContext.setSystemProperty("spark.ui.enabled", "false")
    SparkContext.setSystemProperty("spark.task.cpus", "2")
    SparkContext.setSystemProperty("spark.driver.memory", "100g")
    SparkContext.setSystemProperty("spark.driver.maxResultSize", "20g")
    SparkContext.setSystemProperty("spark.driver.cores", "4")
    SparkContext.setSystemProperty("spark.executor.instances", "25")
    sc = SparkContext.getOrCreate()
Example #36
        client1.close()
        logger.add(
            'shopid={shopid} ,cid={cid} et_update_list get Subscription {c}.......'
            .format(shopid=shopid, cid=cid, c=len(filters)))

        merge_path = "/user/athena/{}/meb_attribute/{}".format(shopid, lsdate)
        if not util_hadoop.PathIsExit(merge_path):
            ls = util_hadoop.GetPath(merge_path.rsplit("/", 1)[0])
            lsdate = max(ls).rsplit('/', 1)[1]
            merge_path = "/user/athena/{}/meb_attribute/{}".format(
                shopid, lsdate)
            logger.add(
                'shopid={shopid} ,cid={cid} et_update_list get ta by last date {lsdate}.......'
                .format(shopid=shopid, cid=cid, lsdate=lsdate))

        SparkContext.setSystemProperty('spark.cores.max', '56')
        sc = SparkContext(appName="et_update_list_{}".format(shopid))

        def mapp(l):
            npt = l.split("\t")[12] if l.split("\t")[12] != 'cindy1' else 'S3'
            return {
                'memberid': l.split("\t")[1],
                'tag': l.split("\t")[4],
                'channel': [1, 0, 0],
                'npt': npt
            }

        member_ls = sc.textFile(merge_path, 30).filter(lambda l: l.split("\t")[0] == shopid and l.split("\t")[3] == "L7D" and l.split("\t")[1] in filters) \
                .map(lambda l: mapp(l)).collect()

        logger.add(
Example #37
#!/usr/bin/env python

import matplotlib.pyplot as plot
import csv
from ast import literal_eval
from pyspark import SparkContext, SparkConf, StorageLevel
from operator import add

"""
--------------------------------------------------------
SPARK CONFIGURATION

Used only for standalone execution via bin/spark-submit
--------------------------------------------------------
"""
SparkContext.setSystemProperty("spark.executor.memory", "28g")
SparkContext.setSystemProperty("spark.default.parallelism", "500")

conf = (SparkConf()
        .setMaster("local")
        .setAppName("Uptime per machine")
        .set("spark.worker.memory", "28g")
        .set("spark.driver.memory", "28g")
        .set("spark.local.dir", "/Users/ksmuga/workspace/data/out"))
sc = SparkContext(conf = conf)


"""
--------------------------------------------------------
FIRST MAPPING TRANSFORMATION 
Example #38
        df = pd.DataFrame(names, columns=["name", "count"])
        data = go.Data([go.Bar(x=df["name"], y=df["count"])])
        layout = go.Layout(title='Top 10 names who liked my posts most')
        fig = go.Figure(data=data, layout=layout)
        py.offline.plot(
            fig,
            filename="/Users/sunling/MUM/BDT/project/BGFacebook/output/test/" +
            fpa_conf.user + "_top10names.html",
            auto_open=False)


if __name__ == "__main__":

    sc = SparkContext(appName="CS523FinalProject")
    sc.setLogLevel("ERROR")
    sc.setSystemProperty("hive.metastore.uris", "")

    ssc = StreamingContext(sc, 10)
    print("start reading data from kafka...")
    kvs = KafkaUtils.createDirectStream(
        ssc, [fpa_conf.topic], {"metadata.broker.list": fpa_conf.brokers})
    parsed = kvs.map(lambda v: json.loads(v[1])).flatMap(
        lambda post: post.values())

    if parsed is not None:
        print("start analysising...")
        hc = getHiveContextInstance(sc)
        hc.sql("drop table if exists t_posts")
        posts = parsed.map(lambda r: (r['id'],r['message'],r['created_time'],\
            r['likes'],r['comment_count'],r['like_names']))
Example #39
# -*- coding: utf-8 -*-
from pyspark import SparkConf
from pyspark import SparkContext

from pyspark.sql import SparkSession
from pyspark.sql import HiveContext
import requests
import json

SparkContext.setSystemProperty('spark.executor.memory', '10g')
SparkContext.setSystemProperty("spark.executor.cores",'4')

class SparkHiveExample:

    def __init__(self):
        ## initialize spark session
        self.spark = SparkSession.builder.appName("Spark Hive example").enableHiveSupport().getOrCreate()

    def run(self):
        ## download with opendata API
        url = "http://data.coa.gov.tw/Service/OpenData/ODwsv/ODwsvTravelFood.aspx?"
        data = requests.get(url)

        ## convert from JSON to dataframe
        df = self.spark.createDataFrame(data.json())

        ## display schema
        df.printSchema()

        ## creates a temporary view using the DataFrame
        df.createOrReplaceTempView("travelfood")