Example #1
def get_context_test():
    conf = SparkConf()
    sc = SparkContext('local[1]', conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex_test""")
    sql_context.setConf("spark.sql.shuffle.partitions", "1")
    return sc, sql_context
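
# A brief usage sketch for the helper above; "some_table" is a placeholder table
# name and not part of the original example. The returned HiveContext is already
# pointed at the fex_test database, so queries can be issued directly.
def _demo_get_context_test():
    sc, sql_context = get_context_test()
    sql_context.sql("select count(*) as cnt from some_table").show()
    sc.stop()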
Example #2
def table_schema_from_spark(hcat_table_name):
    # returns the schema of the table identified by database.name in HCatalog
    #   (Spark workaround as long as the HCat web API is not available...)
    # initialize spark
    import findspark
    findspark.init()
     
    import pyspark
    from pyspark.sql import HiveContext
    
    sc_conf = pyspark.SparkConf()
    #sc_conf.set('spark.executor.extraClassPath','/opt/cloudera/parcels/CDH/lib/hive/lib/*')
    #sc_conf.set('spark.master','yarn-client')
    
    sc = pyspark.SparkContext(appName = 'ade_get_table_schema', conf=sc_conf)
    hc = HiveContext(sc)
    
    hive_schema = hc.table(hcat_table_name).schema.jsonValue()
    
    print hive_schema
    
    sc.stop()
    
    table_schema = {'columns':{}}
    
    col_sequence = 0
    for field in hive_schema['fields']:
        table_schema['columns'][field['name']] = {'col_sequence': col_sequence, 'type':field['type']}
        col_sequence += 1
    
    return table_schema
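
# Hypothetical invocation of the helper above; 'default.my_table' is a placeholder
# database.table name and not part of the original example.
def _demo_table_schema():
    schema = table_schema_from_spark('default.my_table')
    for name, info in sorted(schema['columns'].items(),
                             key=lambda kv: kv[1]['col_sequence']):
        print name, info['type']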
Example #3
def main():
    sc = SparkContext()
    hc = HiveContext(sc)

    df = hc.sql("""{{sql}}""")
    df_writer = DataFrameWriter(df)
    df_writer.saveAsTable(name='{{tableName}}',
                          format='json',
                          mode='overwrite',
                          path='s3://data/{{tableName}}')
Example #4
def get_context():
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName="__file__", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex""")
    sql_context.setConf("spark.sql.shuffle.partitions", "32")
    return sc, sql_context
Example #5
 def gen_report_table(hc,curUnixDay):
     rows_indoor=sc.textFile("/data/indoor/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4]),utoday=int(p[5]),ufirstday=int(p[6])))
     HiveContext.createDataFrame(hc,rows_indoor).registerTempTable("df_indoor")
     #ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY 
     sql="select entityid,clientmac,utoday,UFIRSTDAY,seconds,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_indoor order by entityid,clientmac,utoday" 
     df_id_stat=hc.sql(sql)
     df_id_mm=df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     # df_id_mm is the min/max df, used to calculate first arrival and last arrival
     df_id_stat_distinct=df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     #distinct df is for lag function to work
     df_id_prepremon=df_id_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac, df_id_mm.entityid == df_id_prepremon.entityid, df_id_mm.UFIRSTDAY==df_id_prepremon.UFIRSTDAY]
     df_indoor_fin_tmp=df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(df_id_mm.entityid,df_id_mm.clientmac,df_id_mm.utoday,df_id_mm.UFIRSTDAY,df_id_mm.seconds,df_id_mm.day_30,df_id_mm.day_7,df_id_mm.min,df_id_mm.max,df_id_mm.total_cnt,df_id_prepremon.prepre_mon)
     df_indoor_fin_tmp=df_indoor_fin_tmp.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","seconds as secondsbyday","day_30 as indoors30","day_7 as indoors7","min as FirstIndoor","max as LastIndoor","total_cnt as indoors","prepre_mon as indoorsPrevMonth")
     
     #newly added part for indoors7 and indoors30 based on current date
     df_indoor_fin_tmp1= df_indoor_fin_tmp.withColumn("r_day_7", func.when((curUnixDay- df_indoor_fin_tmp.utoday)/86400<7 , 1).otherwise(0))
     df_indoor_fin_tmp2=df_indoor_fin_tmp1.withColumn("r_day_30", func.when((curUnixDay- df_indoor_fin_tmp1.utoday)/86400<30 , 1).otherwise(0))
     df_indoor_fin_tmp3=df_indoor_fin_tmp2.withColumn("r_indoors7",func.sum("r_day_7").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin_tmp4=df_indoor_fin_tmp3.withColumn("r_indoors30",func.sum("r_day_30").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin=df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30")
     hc.sql("drop table if exists df_indoor_fin")
     df_indoor_fin.write.saveAsTable("df_indoor_fin")
     
     rows_flow=sc.textFile("/data/flow/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),utoday=int(p[4]),ufirstday=int(p[5])))
     HiveContext.createDataFrame(hc,rows_flow).registerTempTable("df_flow")
     
     # ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday
     sql="select entityid,clientmac,utoday,UFIRSTDAY,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_flow order by entityid,clientmac,utoday" 
     df_fl_stat=hc.sql(sql)
     df_fl_mm=df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     # df_fl_mm is the min/max df, used to calculate first arrival and last arrival
     df_fl_stat_distinct=df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     #distinct df is for lag function to work
     df_fl_prepremon=df_fl_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac, df_fl_mm.entityid == df_fl_prepremon.entityid, df_fl_mm.UFIRSTDAY==df_fl_prepremon.UFIRSTDAY]
     df_flow_fin=df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(df_fl_mm.entityid,df_fl_mm.clientmac,df_fl_mm.utoday,df_fl_mm.UFIRSTDAY,df_fl_mm.day_30,df_fl_mm.day_7,df_fl_mm.min,df_fl_mm.max,df_fl_mm.total_cnt,df_fl_prepremon.prepre_mon)
     df_flow_fin=df_flow_fin.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","day_30 as visits30","day_7 as visits7","min as FirstVisit","max as LastVisit","total_cnt as visits","prepre_mon as visitsPrevMonth")
     hc.sql("drop table if exists df_flow_fin")
     df_flow_fin.write.saveAsTable("df_flow_fin") 
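
# A minimal sketch of the module-level setup gen_report_table relies on: it reads
# `sc`, `Row`, `HiveContext`, `func` (pyspark.sql.functions) and `Window` from the
# enclosing scope. The app name and the day-granularity timestamp are assumptions,
# since the surrounding module is not shown here.
import time
from pyspark import SparkContext
from pyspark.sql import HiveContext, Row, Window
import pyspark.sql.functions as func

sc = SparkContext(appName='indoor_flow_report')
hc = HiveContext(sc)
cur_unix_day = int(time.time()) // 86400 * 86400  # midnight of the current day, epoch seconds
gen_report_table(hc, cur_unix_day)
sc.stop()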
def query12_no(query_name, conf=None):
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # SQL statements can be run by using the sql methods provided by sqlContext
    sql = "use tpcds_text_db_1_50"
    _ = sqlContext.sql(sql)

    output = execute_sql(query_name, sqlContext)
    output['describe'] = output['output'].describe().show()

    sc.stop()
    return output
def query12_input(query_name, conf=None, output_persist=False):
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # SQL statements can be run by using the sql methods provided by sqlContext
    sql = "use tpcds_text_db_1_50"
    _ = sqlContext.sql(sql)

#    web_sales_sql = "select * from web_sales"
#    web_sales = sqlContext.sql(web_sales_sql)
#    web_sales.persist()
#    web_sales.registerAsTable("web_sales")
#    item_sql = "select * from item"
#    item = sqlContext.sql(item_sql)
#    item.persist()
#    item.registerAsTable("item")
#    date_dim_sql = "select * from date_dim"
#    date_dim = sqlContext.sql(date_dim_sql)
#    date_dim.persist()
#    date_dim.registerAsTable("date_dim")
    sqlContext.cacheTable("web_sales")
    sqlContext.cacheTable("item")
    sqlContext.cacheTable("date_dim")

    # discard the first query
    output = execute_sql(query_name, sqlContext, output_persist)
    # check the re-run statistics
    output = execute_sql(query_name, sqlContext)
    output['describe'] = output['output'].describe().show()

    sc.stop()
    return output
Example #8
def _test():
    import doctest
    import os
    import tempfile
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext, HiveContext
    import pyspark.sql.readwriter

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')

    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['hiveContext'] = HiveContext._createForTesting(sc)
    globs['df'] = \
        globs['sqlContext'].read.format('text').stream('python/test_support/sql/streaming')

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.readwriter, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example #9
 def __init__(self, sc, debug=False):
     self.export_path = os.environ['COOPERHEWITT_ROOT'] + "/export/"
     self.sc = sc
     # hive requires writable permissions: ~/ephemeral-hdfs/bin/hadoop fs -chmod 777 /tmp/hive
     self.hive_cxt = HiveContext(sc)
     self.sql_cxt  = SQLContext(sc)
     if debug:
         print "{0}\n{1}\n{2}\n".format(sc.master, self.hive_cxt, self.sql_cxt)
         print sc._conf.getAll()
    def run(self):
        sc = SparkContext("local", "Course Activity")
        # sqlHC is the SQL HiveContext
        sqlHC = HiveContext(sc)

        lines = sqlHC.sql("""select courseName, lmsUserId, createDateTime,
                             eventType, eventName, eventNo from logdata where
                             eventType not in ('enrollment', 'instructor', 'admin')
                             and lmsUserId is not NULL
                             and courseName is not NULL
                             and eventNo is not NULL limit 10""")

        maplvl1 = lines.flatMap(lambda p: mapp(p[0], str(p[1]), p[2].strftime('%Y-%m-%d'), p[4]))
        reduceRDD = maplvl1.reduceByKey(lambda a, b: a + b)
        with self.output().open('w') as out_file:
            for line in reduceRDD.collect():
                out_file.write(line[0][0] + "\x01" + line[0][1] + "\x01" + line[0][2] + "\x01"
                               + line[0][3] + "\x01" + str(line[1]) + "\n")
Example #11
def ch9_sql():
    # Import Spark SQL
    from pyspark.sql import HiveContext, Row
    # Or if you can't include the hive requirements 
    from pyspark.sql import SQLContext, Row

    hiveCtx = HiveContext(sc)

    input_file = hiveCtx.read.json("testweet.json")
    # Register the input_file schema RDD 
    input_file.registerTempTable("tweets")
    # Select tweets based on the retweetCount
    topTweets = hiveCtx.sql("""SELECT text, retweetCount FROM
      tweets ORDER BY retweetCount LIMIT 10""")

    topTweetText = topTweets.map(lambda row: row.text)  
    topTweetText.collect()

    topTweets.schema
    hiveCtx.cacheTable("tweets")
def run(inpath, outpath, mode='append'):
    
    gc.disable()
    print("===== Checking if Log Exists =====")
    check_log(inpath)
    print("===== Pass Log Checking =====")
    
    # initial SparkContext
    conf = SparkConf().setAppName("Forgate Log Parser")
    sc = SparkContext(conf=conf)
    sqlCtx = HiveContext(sc)
    start_time = time.time()
    print("===== INPUT FILE PATH: %s =====" % (str(inpath)))
    print("===== OUTPUT FILE PATH: %s =====" % (str(outpath)))
    print("===== %s Reading Data From HDFS" % (now()))
    distFile = sc.textFile(inpath)
    cnt_raw = distFile.count()
    print("===== Count of Input Data: %s =====" % (str(cnt_raw)))
    
    print("===== %s Parsing Data" % (now()))
    parsedData = parse_data(sc, distFile)
    print("===== Count of Parsed Data: %s =====" % (str(parsedData.count())))
    
    print("===== %s Saving Data" % (now()))
    jsonData = sqlCtx.jsonRDD(parsedData)
    old_col=['time','date']
    new_col=['time_','dt']
    jsonData = rename_column(jsonData, old_col, new_col)
    jsonData.write.partitionBy('dt').parquet(outpath, mode=mode)
    
    print("===== %s Checking Data" % (now()))
    confirm_row(sqlCtx, outpath)
    write_log(inpath)
    print("---Total took %s seconds ---" % (time.time() - start_time))
    
    sc.stop()
    gc.enable()
Example #13
        without this change they would appear as u'Foo' and 'Foo' in
        the final key string. Although python doesn't care about this
        difference, hadoop does, and will bucket the values
        separately. Which is not what we want.
        """
        # TODO: refactor this into a utility function and update jobs
        # to always UTF8 encode mapper keys.
        if len(values) > 1:
            return tuple([value.encode('utf8') for value in values])
        else:
            return values[0].encode('utf8')

    
sc = SparkContext("local", "Course Activity")
	#sqlHC is the SQLHiveContext        
sqlHC = HiveContext(sc)

lines=sqlHC.sql(""" select courseName,lmsUserId,createDateTime,
		            eventType,eventName,eventNo from logdata where 
			    eventType not in ('enrollment','instructor','admin') 
			    and lmsUserId is not NULL 
   			    and courseName is not NULL 
			    and eventNo is not NULL limit 100""")


maplvl1=lines.flatMap(lambda p: mapp(p[0],str(p[1]),p[2].strftime('%Y-%m-%d'),p[4]))
for linet in maplvl1.collect():
	print linet

reduceRDD = maplvl1.reduceByKey(lambda a, b : a + b)
Example #14
from pyspark import SparkContext
from pyspark.sql import HiveContext

from pyspark.mllib.classification import SVMWithSGD, SVMModel, LogisticRegressionWithSGD

from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.functions import col, sum

from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics

from copy import deepcopy

sc = SparkContext()
sqlContext = HiveContext(sc)
qry = """SELECT *,white/population as white_percent,
         black/population as black_percent,
         asian/population as asian_percent,
         pacific_islander/population as pi_percent,
         other_race/population as other_race_percent,
         multiple_race/population as multiple_percent,
         hispanic/population as hispanic_percent
         FROM census_rest_success where days_open > 365"""

df = sqlContext.sql(qry)

## Let's train a Support Vector Classifier on this data
#CITATION:
#http://stackoverflow.com/questions/33900726/count-number-of-non-nan-entries-in-each-column-of-spark-dataframe-with-pyspark
def count_not_null(c):
    return sum(col(c).isNotNull().cast("integer")).alias(c)
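
# A hedged usage sketch for the helper above: applying it to every column of the
# census DataFrame already loaded as `df` in a single aggregation.
null_counts = df.agg(*[count_not_null(c) for c in df.columns])
null_counts.show()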
Example #15
def run(cfg, yesterday, model_name, model_version, serving_url):

    # os.environ[
    #     'PYSPARK_SUBMIT_ARGS'] = '--jars /home/reza/eshadoop/elasticsearch-hadoop-6.5.2/dist/elasticsearch-hadoop-6.5.2.jar pyspark-shell'

    es_write_conf = {"es.nodes": cfg['es_host'],
                     "es.port": cfg['es_port'],
                     "es.resource": cfg['es_predictions_index']+'/'+cfg['es_predictions_type'],
                     "es.batch.size.bytes": "1000000",
                     "es.batch.size.entries": "100",
                     "es.input.json": "yes",
                     "es.mapping.id": "uckey",
                     "es.nodes.wan.only": "true",
                     "es.write.operation": "upsert"}

    sc = SparkContext()
    hive_context = HiveContext(sc)
    forecaster = Forecaster(cfg)
    sc.setLogLevel(cfg['log_level'])

    # Reading the max bucket_id
    bucket_size = cfg['bucket_size']
    bucket_step = cfg['bucket_step']
    factdata = cfg['factdata']
    distribution_table = cfg['distribution_table']
    norm_table = cfg['norm_table']
    traffic_dist = cfg['traffic_dist']

    model_stats = get_model_stats(cfg, model_name, model_version)

    # Read dist
    command = """
        SELECT
        DIST.uckey,
        DIST.ratio,
        DIST.cluster_uckey,
        DIST.price_cat
        FROM {} AS DIST
        """.format(distribution_table)
    df_dist = hive_context.sql(command)

    # Read norm table
    # DataFrame[uckey: string, ts: array<int>, p: float, a__n: float, a_1_n: float, a_2_n: float, a_3_n: float, a_4_n: float, a_5_n: float, a_6_n: float, t_UNKNOWN_n: float, t_3G_n: float, t_4G_n: float, t_WIFI_n: float, t_2G_n: float, g__n: float, g_g_f_n: float, g_g_m_n: float, g_g_x_n: float, price_cat_1_n: float, price_cat_2_n: float, price_cat_3_n: float, si_vec_n: array<float>, r_vec_n: array<float>, p_n: float, ts_n: array<float>]
    command = """
        SELECT
        uckey AS cluster_uckey,
        price_cat,
        a__n,a_1_n,a_2_n,a_3_n,a_4_n,a_5_n,a_6_n,
        t_UNKNOWN_n,t_3G_n,t_4G_n,t_WIFI_n,t_2G_n,
        g__n, g_g_f_n, g_g_m_n, g_g_x_n,
        price_cat_1_n, price_cat_2_n, price_cat_3_n,
        si_vec_n,
        r_vec_n
        FROM {}
        """.format(norm_table)
    df_norm = hive_context.sql(command)
    # df_norm = df_norm.groupBy('cluster_uckey', 'a__n', 'a_1_n', 'a_2_n', 'a_3_n', 'a_4_n', 'a_5_n', 'a_6_n', 't_UNKNOWN_n',
    #                           't_3G_n', 't_4G_n', 't_WIFI_n', 't_2G_n', 'g__n', 'g_g_f_n', 'g_g_m_n', 'g_g_x_n', 'si_vec_n').count().drop('count')


    # create day_list from yesterday for train_window
    duration = model_stats['model']['duration']
    day = datetime.strptime(yesterday, '%Y-%m-%d')
    day_list = []
    for _ in range(0, duration):
        day_list.append(datetime.strftime(day, '%Y-%m-%d'))
        day = day + timedelta(days=-1)
    day_list.sort()

    df_prediction_ready = None
    df_uckey_cluster = None
    start_bucket = 0

    while True:
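        # Each pass reads a slice of factdata buckets (start_bucket..end_bucket,
        # at most bucket_step apart) so only a bounded chunk of the table is
        # aggregated per iteration.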

        end_bucket = min(bucket_size, start_bucket + bucket_step)

        if start_bucket > end_bucket:
            break

        # Read factdata table
        command = """
        SELECT
        FACTDATA.count_array,
        FACTDATA.day,
        FACTDATA.hour,
        FACTDATA.uckey
        FROM {} AS FACTDATA
        WHERE FACTDATA.bucket_id BETWEEN {} AND {}
        """.format(factdata, str(start_bucket), str(end_bucket))

        start_bucket = end_bucket + 1

        df = hive_context.sql(command)

        # [Row(count_array=[u'1:504'], day=u'2019-11-02', hour=2, uckey=u'magazinelock,04,WIFI,g_m,1,CPM,78', hour_price_imp_map={2: [u'1:504']})]
        df = df.withColumn('hour_price_imp_map',
                           expr("map(hour, count_array)"))

        # [Row(uckey=u'native,68bcd2720e5011e79bc8fa163e05184e,4G,g_m,2,CPM,19', day=u'2019-11-02', hour_price_imp_map_list=[{15: [u'3:3']}, {7: [u'3:5']}, {10: [u'3:3']}, {9: [u'3:1']}, {16: [u'3:2']}, {22: [u'3:11']}, {23: [u'3:3']}, {18: [u'3:7']}, {0: [u'3:4']}, {1: [u'3:2']}, {19: [u'3:10']}, {8: [u'3:4']}, {21: [u'3:2']}, {6: [u'3:1']}])]
        df = df.groupBy('uckey', 'day').agg(
            collect_list('hour_price_imp_map').alias('hour_price_imp_map_list'))

        # [Row(uckey=u'native,68bcd2720e5011e79bc8fa163e05184e,4G,g_m,2,CPM,19', day=u'2019-11-02', day_price_imp=[u'3:58'])]
        df = df.withColumn('day_price_imp', udf(
            sum_count_array, ArrayType(StringType()))(df.hour_price_imp_map_list)).drop('hour_price_imp_map_list')

        # [Row(uckey=u'native,68bcd2720e5011e79bc8fa163e05184e,4G,g_m,2,CPM,19', day=u'2019-11-02', day_price_imp=[u'3:58'], day_price_imp_map={u'2019-11-02': [u'3:58']})]
        df = df.withColumn('day_price_imp_map', expr(
            "map(day, day_price_imp)"))

        # [Row(uckey=u'native,z041bf6g4s,WIFI,g_f,1,CPM,71', day_price_imp_map_list=[{u'2019-11-02': [u'1:2', u'2:261']}, {u'2019-11-03': [u'2:515']}])])
        df = df.groupBy('uckey').agg(collect_list(
            'day_price_imp_map').alias('day_price_imp_map_list'))

        # [Row(uckey=u'native,z041bf6g4s,WIFI,g_f,1,CPM,71', day_price_imp_map_list=[{u'2019-11-02': [u'1:2', u'2:261']}, {u'2019-11-03': [u'2:515']}], ratio=0.09467455744743347, cluster_uckey=u'892', price_cat=u'1')]
        df = df.join(df_dist, on=['uckey'], how='inner')

        # df_uckey_cluster keeps the ratio and cluster_key for only uckeys that are being processed
        if not df_uckey_cluster:
            df_uckey_cluster = df.select(
                'uckey', 'cluster_uckey', 'ratio', 'price_cat')
            df_uckey_cluster.cache()
        else:
            df_uckey_cluster = df.select(
                'uckey', 'cluster_uckey', 'ratio', 'price_cat').union(df_uckey_cluster)
            df_uckey_cluster.cache()

        # [Row(cluster_uckey=u'2469', price_cat=u'2', cluster_day_price_imp_list=[[{u'2019-11-02': [u'2:90']}, {u'2019-11-03': [u'2:172']}]])])
        df = df.groupBy('cluster_uckey', 'price_cat').agg(
            collect_list('day_price_imp_map_list').alias('cluster_day_price_imp_list'))

        df = df.withColumn('ts', udf(sum_day_count_array,
                                     ArrayType(MapType(StringType(), ArrayType(StringType()))))(df.cluster_day_price_imp_list))

        # [Row(cluster_uckey=u'2469', price_cat=u'2', ts=[{u'2019-11-02': [u'2:90'], u'2019-11-03': [u'2:172']}])]
        df = df.drop('cluster_day_price_imp_list')

        if not df_prediction_ready:
            df_prediction_ready = df
            df_prediction_ready.cache()
        else:
            df = df_prediction_ready.union(df)
            df = df.groupBy('cluster_uckey', 'price_cat').agg(
                collect_list('ts').alias('ts_list'))
            df = df.withColumn('ts', udf(sum_day_count_array,
                                         ArrayType(MapType(StringType(), ArrayType(StringType()))))(df.ts_list))
            df = df.drop('ts_list')

            # [Row(cluster_uckey=u'magazinelock,03,WIFI,g_f,1,CPM,60', ts=[{u'2019-11-02': [u'1:2']}])]
            df_prediction_ready = df
            df_prediction_ready.cache()

    # [Row(cluster_uckey=u'1119', price_cat=u'2', ts=[{u'2019-11-02': [u'1:862', u'3:49', u'2:1154'], u'2019-11-03': [u'1:596', u'3:67', u'2:1024']}])]
    df = df_prediction_ready

    df = df.join(df_norm, on=['cluster_uckey', 'price_cat'], how='inner')

    # [Row(cluster_uckey=u'1119', price_cat=u'2', ts=[{u'2019-11-02': [u'1:862', u'3:49', u'2:1154'], u'2019-11-03': [u'1:596', u'3:67', u'2:1024']}], a__n=-0.005224577616900206, a_1_n=0.6089736819267273, a_2_n=-0.21013110876083374, a_3_n=0.16884993016719818, a_4_n=-0.3416250944137573, a_5_n=0.15184317529201508, a_6_n=-0.16529197990894318, t_UNKNOWN_n=-0.4828081429004669, t_3G_n=1.2522615194320679, t_4G_n=-0.15080969035625458, t_WIFI_n=-0.35078370571136475, t_2G_n=1.991615653038025, g__n=-0.08197031915187836, g_g_f_n=0.010901159606873989, g_g_m_n=-0.21557298302650452, g_g_x_n=1.4449801445007324, price_cat_1_n=-1.2043436765670776, price_cat_2_n=1.885549783706665, price_cat_3_n=-0.48205748200416565, si_vec_n=[-0.20294927060604095, -0.27017056941986084, -0.16821187734603882, -0.20294314622879028, -0.11777336895465851, 0.9738097786903381, 0.23326143622398376, -0.16500996053218842, -0.19148004055023193, -0.15753313899040222, -0.149298757314682, -0.19954630732536316, -0.15968738496303558, 0.12466698884963989, -0.15369804203510284, 0.04789407551288605, -0.22501590847969055, 0.14411255717277527, -0.209896981716156, -0.17969290912151337, 0.06794296950101852, -0.12367484718561172, 0.5581679344177246, 0.8108972311019897, -0.20487570762634277, 2.597964286804199, -0.2720063328742981, 0.1152268648147583, 0.27174681425094604, -0.20653237402439117, -0.2899857461452484, -0.15441325306892395, -0.17766059935092926, -0.11622612923383713, 0.3738412857055664, 1.0858312845230103, 0.6114567518234253], r_vec_n=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], uckey=u'native,66bcd2720e5011e79bc8fa163e05184e,WIFI,g_m,5,CPC,5', ratio=0.11989551782608032)]
    df = df.join(df_uckey_cluster, on=[
                 'cluster_uckey', 'price_cat'], how='inner')

    predictor_udf = udf(transform.predict_daily_uckey(days=day_list,
        serving_url=serving_url, forecaster=forecaster, model_stats=model_stats, columns=df.columns), MapType(StringType(), FloatType()))

    df = df.withColumn('day_prediction_map',
                       predictor_udf(struct([df[name] for name in df.columns])))

    # [Row(cluster_uckey=u'1119', price_cat=u'2', day_prediction_map={u'2019-11-02': 220.0, u'2019-11-03': 305.0}, ratio=0.11989551782608032, uckey=u'native,66bcd2720e5011e79bc8fa163e05184e,WIFI,g_m,5,CPC,5')]
    df = df.select('cluster_uckey', 'price_cat',
                   'day_prediction_map', 'ratio', 'uckey')

    # [Row(ucdoc_elements=Row(price_cat=u'2', ratio=0.11989551782608032, day_prediction_map={u'2019-11-02': 220.0, u'2019-11-03': 305.0}), uckey=u'native,66bcd2720e5011e79bc8fa163e05184e,WIFI,g_m,5,CPC,5')]
    ucdoc_elements_type = StructType([StructField('price_cat', StringType(), False), StructField(
        'ratio', FloatType(), False), StructField('day_prediction_map', MapType(StringType(), FloatType()), False)])
    df = df.withColumn('ucdoc_elements_pre_price_cat', udf(lambda price_cat, ratio, day_prediction_map:
                                                           (price_cat, ratio, day_prediction_map), ucdoc_elements_type)(df.price_cat, df.ratio, df.day_prediction_map)).select('ucdoc_elements_pre_price_cat', 'uckey')

    # [Row(uckey=u'splash,d971z9825e,WIFI,g_m,1,CPT,74', ucdoc_elements=[Row(price_cat=u'1', ratio=0.5007790923118591, day_prediction_map={u'2019-11-02': 220.0, u'2019-11-03': 305.0})])]
    df = df.groupBy('uckey').agg(collect_list('ucdoc_elements_pre_price_cat').alias('ucdoc_elements'))

    df = df.withColumn('prediction_output', udf(transform.generate_ucdoc(traffic_dist), StringType())(
        df.uckey, df.ucdoc_elements))

    df_predictions_doc = df.select('uckey', 'prediction_output')
    rdd = df_predictions_doc.rdd.map(lambda x: transform.format_data(x, 'ucdoc'))
    rdd.saveAsNewAPIHadoopFile(
        path='-',
        outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
        conf=es_write_conf)
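    # Given es_write_conf above, EsOutputFormat upserts each JSON document into the
    # configured predictions index/type, using the 'uckey' field as the document id.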

    sc.stop()
def main():
    # set up the logger
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'mpg_cluster.log'),
                            level=logging.INFO,
                            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # NSJOIN dayidx # only partitioned by DAY
    day_idx = beeline.get_last_partitions('mapper.nsjoin').split('=')[1]
    # BAREBONES dayidx # only partitioned by DAY
    day_bb = [x for x in beeline.show_partitions('mapper.barebones').split('\n') if '=%s' % (day_idx) in x]
    # MAPPOINTS dayidx # partitioned by DAY and UUID (pick the last uuid)
    mappoints_data = sorted([x for x in beeline.show_partitions('mapper.mappoints').split('\n') if '=%s' % (day_idx) in x])[-1].split('/')
    [day_mps, uuid_idx] = [x.split('=')[1] for x in mappoints_data]

    if day_idx != day_mps:
        logger.error('mapper.mappoints and mapper.nsjoin are from different days; possible missing data in the source.')
        return

    if len(day_bb) == 0:
        logger.warning('mapper.barebones data missing for this particular day.')
        #return

    logger.info('Processing data in day=%s, uuid=%s' % (day_idx, uuid_idx))

    logger.info('begin spark process.')
    getting_mappoint_data = ''' select b1.mpgid mpgid, b1.lat lat, b1.lon lon, b1.country country, b1.mpgload mpgload, b1.allowed_private_regions allowed_private_regions, b2.asnum asnum, b2.ip ip from (select mpgid, lat, lon, country, mpgload, allowed_private_regions from mapper.mappoints where day=%s and uuid="%s" and lat is not NULL and lon is not NULL and ghostonly=0 ) b1 left outer join (select collect_set(ns_ip) ip, collect_set(asnum) asnum, mpgid from (select ns_ip, mpd_uuid, mpgid, asnum, demand, day from mapper.nsjoin where day=%s and mpd_uuid="%s" and demand>0.01 order by demand desc) a group by mpgid) b2 on b2.mpgid=b1.mpgid ''' % (day_idx, uuid_idx, day_idx, uuid_idx)
    geo_total_cap_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='private' group by country, network) a ''' % day_idx
    geo_total_cap_public_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='public' group by country, network) a ''' % day_idx

    sc = SparkContext()
    hiveCtx = HiveContext(sc)

    rows = hiveCtx.sql(getting_mappoint_data)

    regInfoRows = hiveCtx.sql('select * from mapper.regioncapday where day=%s and peak_bitcap_mbps is not null and peak_flitcap_mfps is not null' % (day_idx))
    geo_total_cap = hiveCtx.sql(geo_total_cap_query)
    geo_total_cap_p = hiveCtx.sql(geo_total_cap_public_query)


    # rdd format: [regionid, [mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load, mpg-asnum, mpg-nsip]]
    region_mpginfo_pair = rows.map(lambda x: [[x.mpgid,
                                               x.lat,
                                               x.lon,
                                               x.country,
                                               x.mpgload,
                                               x.asnum,
                                               x.ip], x.allowed_private_regions])\
                                .flatMapValues(lambda x: x).map(lambda x: [x[1], x[0]])

    #region_mpginfo_pair.first()

    # rdd format: [regionid, [reg-lat, reg-lon, reg-capacity(bit mbps), reg-capacity(bit mfps), reg-country, reg-numvips, reg-service, reg-prp]]
    # ps. prp=1: private, prp=0: public
    region_latlon = regInfoRows.map(lambda x: [x.region, [x.latitude,
                                                          x.longitude,
                                                          x.peak_bitcap_mbps,
                                                          x.peak_flitcap_mfps,
                                                          x.country,
                                                          x.numvips,
                                                          'W' if x.network=='freeflow' else ('S' if x.network=='essl' else 'O'),
                                                          1 if x.prp=='private' else 0]])\
                                .filter(lambda x: x[1][6]=='W' or x[1][6]=='S')

    region_public_list = region_latlon\
        .filter(lambda x: x[1][7] == 0)\
        .map(lambda x: ('all', [[x[0]]]))\
        .reduceByKey(lambda a, b: [a[0]+b[0]])\
        .map(lambda x: x[1][0]).collect()

    region_public_list = [0] + sorted(region_public_list[0])

    # dummy region
    rdd2 = sc.parallelize([([0, [0, 0, 0.0, 0.0, 'US', 0, 'W', 1]])])
    region_latlon = region_latlon.union(rdd2)

    # perform the join into tuple of (K, (V1, V2):
    # (regionid, ([mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load], [reg-lat, reg-lon, reg-cap, reg-country, reg-numvips, reg-service]))
    # rdd  = (mpgid, regionid, [lat1, lon1, lat2, lon2, distance],
    #               reg-cap-bit(gbps), reg-cap-flit(gbps), reg-country, reg-numvips, reg-services,
    #               mpg-country, mpg-load, mpg-asnum, mpg-nsip,
    #               mpg-lat, mpg-lon)
    mpgid_reg_geo = region_mpginfo_pair.join(region_latlon).map(lambda x: [x[1][0][0],
                                                                           x[0],
                                                                           geodesic_distance(x[1][0][1],
                                                                                             x[1][0][2],
                                                                                             x[1][1][0],
                                                                                             x[1][1][1]),
                                                                           round(float(x[1][1][2])/1000.0, 3),
                                                                           round(float(x[1][1][3])/1000.0, 3),
                                                                           x[1][1][4], # reg-country
                                                                           x[1][1][5], # reg-numvips
                                                                           x[1][1][6], # reg-services
                                                                           x[1][0][3],
                                                                           x[1][0][4],
                                                                           x[1][0][5],
                                                                           x[1][0][6],
                                                                           x[1][0][1],
                                                                           x[1][0][2]])

    # filtering on mapping distance < 500 miles
    # filtering on reg-country = mpg-country
    # filtering on region capacity fbps > 1Gbps
    # rdd format = (mpgid, [[regionid], distance, [capacity-w, capacity-s], numvips, 1, mpg-country, mpg-load, mpg-asnum, mpg-nsip,
    #                        mpg-lat, mpg-lon])
    #mpgid_reg_distance = mpgid_reg_geo.filter(lambda x: x[2][4] < 500)\
    #    .filter(lambda x: x[5] == x[8])\
    #    .filter(lambda x: x[3] > 1)\
    #    .map(lambda x: (x[0], [[x[1]], x[2][4], [x[3], 0] if x[7]=='W' else [0, x[3]], x[6], 1, x[8], x[9], x[10], x[11], x[12], x[13]]))

    # or this one, no-same-country constraint:
    mpgid_reg_distance = mpgid_reg_geo.filter(lambda x: (x[2][4] < 500) or (x[5]==x[8] and x[2][4] < 1000))\
        .filter(lambda x: x[3] > 1)\
        .map(lambda x: (x[0], [[x[1]], x[2][4], [x[3], 0] if x[7]=='W' else [0, x[3]], x[6], 1, x[8], x[9], x[10], x[11], x[12], x[13]]))

    #mpgid_reg_distance.first()

    # group by mpgid
    # rdd format = (mpgid, [[reg-list],
    #                       avg_distance,
    #                       total_cap freeflow,
    #                       total_cap essl,
    #                       total num vips,
    #                       rg_count,
    #                       mpg-country,
    #                       mpg-load,
    #                       [mpg-asnum],
    #                       [mpg-nsip])
    mpgid_reglist_avgDistance_capacity_nReg = mpgid_reg_distance\
        .reduceByKey(lambda a, b: [a[0]+b[0], a[1]+b[1], [a[2][0]+b[2][0], a[2][1]+b[2][1]], a[3]+b[3], a[4]+b[4],
                                   a[5], a[6], a[7], a[8], a[9], a[10]])\
        .map(lambda x: (x[0], [sorted(x[1][0]), # region_list
                               round(x[1][1]/x[1][4], 2), # avg distance
                               round(x[1][2][0], 2), # total capacity - w
                               round(x[1][2][1], 2), # total capacity - s
                               x[1][3], # numvips
                               x[1][4], # total region count
                               x[1][5], # mpg country
                               x[1][6], # mpg load
                               x[1][7], # mpg asnum
                               x[1][8], # mpg nsip
                               x[1][9], # mpg lat
                               x[1][10]])) # mpg lon

    # disable the count
    #total_mpg_with_region = mpgid_reglist_avgDistance_capacity_nReg.count()

    # rdd format = (reg, [(reg-list), [[mpg-list], avg_distance, total_cap_w, total_cap_s, total_numvips
    #                           reg-count, cluster_country, mpg-load, mpg-count, mpg-lat, mpg-lon]])
    reg_reglist_mpgid_avgDistance_capacity_nReg_country = mpgid_reglist_avgDistance_capacity_nReg\
        .map(lambda x: (tuple(x[1][0]), [[x[0]], # mpgid list
                                          x[1][1], # avg_distance
                                          x[1][2], # region total capacity freeflow
                                          x[1][3], # region total capacity essl
                                          x[1][4], # total num vips
                                          x[1][5], # total region count
                                          [x[1][6]], # mpg country list
                                          x[1][7], # mpg load
                                          1, # mpg-count
                                          x[1][8] if x[1][8] else [], # [mpg-asnum]
                                          x[1][9] if x[1][9] else [], # [mpg-nsip]
                                          [x[1][10]], # [mpg-lat] # single element array
                                          [x[1][11]], # [mpg-lon] # single element array
                                          [x[1][7]] # [mpg-load] # single element array
                                         ]))\
        .reduceByKey(lambda a, b: [a[0]+b[0],
                                   a[1],
                                   a[2],
                                   a[3],
                                   a[4],
                                   a[5],
                                   a[6]+b[6],
                                   a[7]+b[7],
                                   a[8]+b[8],
                                   a[9]+b[9],
                                   a[10]+b[10],
                                   a[11]+b[11],
                                   a[12]+b[12],
                                   a[13]+b[13]])\
        .filter(lambda x: sum(x[1][13]) > 0.0001)\
        .map(lambda x: (x[0], [sorted(x[1][0]), # mpgid list
                               x[1][1], # avg_distance
                               x[1][2], # reg-cap-w
                               x[1][3], # reg-cap-s
                               x[1][4], # numvips
                               x[1][5], # reg-count
                               [str(y) for y in sorted(list(set(x[1][6])))], # mpg-country list
                               x[1][7], # mpg-load
                               x[1][8], # mpg-count
                               [str(y) for y in sorted(list(set(x[1][9])))], # [mpg-asnum]
                               [str(y) for y in sorted(list(set(x[1][10])))], # [mpg-nsip]
                               geo_centroid(x[1][11], x[1][12], x[1][13]) # [mpg: lat, lon, por, porsigma]
                               ]))\
        .map(lambda x: ([':'.join([str(y) for y in list(x[1][6])]), # [mpg-country list]
                        x[1][1], # avg_distance
                        x[1][2], # reg-cap-w
                        x[1][3], # reg-cap-s
                        x[1][4], # numvips
                        x[1][5], # reg-count
                        x[1][7], # mpg-load
                        x[1][8], # mpg-count
                        ':'.join([str(y) for y in x[0]]), # [region-list]
                        ':'.join([str(y) for y in list(x[1][0])]), # [mpg-list]
                        ':'.join([str(y) for y in x[1][9]]) if len(x[1][9])>0 else 'NULL', # [mpg-asnum]
                        ':'.join([str(y) for y in x[1][10]]) if len(x[1][10])>0 else 'NULL', # [mpg-nsip]
                        x[1][11] # [mpg-lat, mpg-lon, mpg-por, mpg-porsigma]
                        ],
                        region_public_list
                        ))\
        .flatMapValues(lambda x: x)\
        .map(lambda x: [x[1], x[0]])

    reglist_mpgid_avgDistance_capacity_nReg_country = reg_reglist_mpgid_avgDistance_capacity_nReg_country\
        .join(region_latlon)\
        .map(lambda x: [x[1][0]]+[x[1][1]]+[geodesic_distance(x[1][0][12][0],
                                                             x[1][0][12][1],
                                                             x[1][1][0],
                                                             x[1][1][1])] + [x[0]] if x[0] > 0\
             else [x[1][0]]+[x[1][1]]+[[x[1][0][12][0],
                                       x[1][0][12][1],
                                       x[1][1][0],
                                       x[1][1][1],
                                       0.0]] + [x[0]])\
        .filter(lambda x: x[2][4] < 500)\
        .map(lambda x: (tuple([x[0][0],
                              x[0][1],
                              x[0][2],
                              x[0][3],
                              x[0][4],
                              x[0][5],
                              x[0][6],
                              x[0][7],
                              x[0][8],
                              x[0][9],
                              x[0][10],
                              x[0][11],
                              x[0][12][0],
                              x[0][12][1],
                              x[0][12][2],
                              x[0][12][3]]), # mpg-information
                        [x[1][2], # pub.region.cap.ff
                         x[1][3], # pub.region.cap.essl
                         x[1][5], # pub.region.vip
                         [x[3]] # single element region id
                         ]))\
        .reduceByKey(lambda a, b: [a[0]+b[0], # sum( pub.region.cap.ff )
                                   a[1]+b[1], # sum( pub.region.cap.essl )
                                   a[2]+b[2], # sum( pub.region.cap.vip )
                                   a[3]+b[3] # [pub.regions]
                                   ])\
        .map(lambda x: [x[0][0], # [mpg-country-list]
                        x[0][1], # avg-distance
                        x[0][12], # mpg-lat
                        x[0][13], # mpg-lon
                        x[0][14], # mpg-por
                        x[0][15], # mpg-porsigma
                        x[0][2], # pri.region.cap.ff (gbps)
                        x[0][3], # pri.region.cap.essl (gbps)
                        x[0][4], # pri.vips
                        x[0][5], # pri.region.count
                        round(float(x[1][0])/1000.0, 3), # pub.region.cap.ff (gbps)
                        round(float(x[1][1])/1000.0, 3), # pub.region.cap.essl (gbps)
                        x[1][2], # pub.vips
                        len(x[1][3])-1, # pub.region.count
                        x[0][6], # mpg-load
                        round(x[0][7], 6), # mpg-count
                        x[0][8], # [pri reg-list]
                        ':'.join([str(y) for y in sorted(x[1][3])][1:]) if len(x[1][3])>1 else 'NULL', # [pub reg-list])
                        x[0][9], # [mpg-list]
                        x[0][10], # [mpg-assum]
                        x[0][11] # [mpg-nsip]
                        ])

    # data exporting to local
    country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist = pd.DataFrame(columns=['cl_geoname',
                                                                                           'cl_avgDistance',
                                                                                           'cl_lat',
                                                                                           'cl_lon',
                                                                                           'cl_por',
                                                                                           'cl_porsigma',
                                                                                           'pri_cap_ff_gbps',
                                                                                           'pri_cap_essl_gbps',
                                                                                           'pri_nvips',
                                                                                           'pri_nReg',
                                                                                           'pub_cap_ff_gbps',
                                                                                           'pub_cap_essl_gbps',
                                                                                           'pub_nvips',
                                                                                           'pub_nReg',
                                                                                           'cl_mpgLoad',
                                                                                           'cl_nMpg',
                                                                                           'pri_regList',
                                                                                           'pub_regList',
                                                                                           'mpgList',
                                                                                           'mpgASList',
                                                                                           'mpgNSIPList'])

    geo_cluster_full_info = reglist_mpgid_avgDistance_capacity_nReg_country.collect()

    logger.info('begin write to local disk.')
    for item in range(len(geo_cluster_full_info)):
        temp = geo_cluster_full_info[item]
        country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist.loc[item] = temp # the above should be temp[1][0] for the mpglist

    data_folder = '/home/testgrp/MRQOS/project_mpd_clustering/data'
    filename = 'geo_full_cluster_info.%s.%s.csv' % (day_idx, uuid_idx)
    fileDestination = os.path.join(data_folder, filename)
    country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist.to_csv(fileDestination,
                                                                          sep=',', index=False, header=False)

    logger.info('begin to upload to hdfs.')
    tablename = 'mrqos.mpg_cluster'
    hdfs_d = os.path.join(config.hdfs_table,
                          'mpg_cluster',
                          'datestamp=%s' % day_idx,
                          'uuid=%s' % uuid_idx)
    partition = '''datestamp=%s, uuid='%s' ''' % (day_idx, uuid_idx)
    processed_filename = '.'.join(filename.split('.')[0:-1])+'.processed.csv'
    cmd_str = ''' cat %s | awk -F, '{n=split($21,a,":"); if(n>5){$21=a[1]":"a[2]":"a[3]":"a[4]":"a[5];} m=split($20,b,":"); if(m>5){$20=b[1]":"b[2]":"b[3]":"b[4]":"b[5];}print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$20,$21;}' > %s ''' % (os.path.join(data_folder, filename),
                                                                                                                                                                                                                                                              os.path.join(data_folder, processed_filename))
    sp.check_call(cmd_str, shell=True)
    try:
        beeline.upload_to_hive(fileDestination, hdfs_d, partition, tablename, logger)
        # os.remove(fileDestination)
    except sp.CalledProcessError as e:
        logger.info('upload to HDFS + update Hive table failed.')
Example #17
    def __init__(self, 
                 data_source='local', 
                 activity_directory='strava-activities-subset',
                 s3bucket='larsbk',
                 athletes=None,
                 activity_types=[
                    'Ride',
                    'Run',
                    'NordicSki'
                 ],
                 sc=None,
                 hiveContext=None,
                 conf=(SparkConf().setAppName('Strava analysis')),
                 filter_bug_inducing_rows=True
                 ):

        ''' Initialize Strava Analysis object'''


        # INPUT PARAMETERS

        self.athletes = athletes # Athletes to analyze (optional)
        self.activity_types = activity_types # Activity_types to consider (default)
        self.filter_bug_inducing_rows = filter_bug_inducing_rows


        # CONFIGURE SPARK

        if sc != None and hiveContext != None: # Both contexts were supplied by user
            print 'Info: Using supplied SparkContext and HiveContext'
            self.sc = sc
            self.hiveContext = hiveContext

        else: # Initialize new contexts
            print 'Info: Initializing SparkContext and HiveContext from (default) conf'
            self.sc = SparkContext(conf=conf)
            self.hiveContext = HiveContext(self.sc)

        self.schema = pickle.load(open('./schema.p', 'rb')) # The pre-defined schema
        self.df = None # Empty DataFrame to be populated later


        # CONFIGURE DATA SOURCE

        data_root_path = {
                's3': 's3n://%s/%s/' % (s3bucket, activity_directory), 
                'local': './%s/' % activity_directory
        }
        
        if data_source not in data_root_path.keys(): # Check if data source is valid 
            raise Exception(('Unrecognized data source %s. '
                             'Supported sources: "%s".')
                            % (data_source, '", "'.join(data_root_path.keys())))
        
        self.data_source = data_source # This is a valid data source
        self.path = data_root_path[data_source] # This is the path to the data


        # (S3 SPECIFIC STUFF)

        if data_source == 's3':

            # Get a list of files in the activity_directory
            bucket = boto3.resource('s3').Bucket(s3bucket) 
            objects = bucket.objects.filter(Prefix='%s/gpx/' % activity_directory)
            files = [obj.key for obj in objects] 

            # Make set of observed combinations of athlete and activity_type
            athlete_and_type = set([]) # Empty set to populate
            fpattern = '\/([\w]+)\/(?:[\w-]+)-([\w]+)\.gpx' # File name pattern
            for fname in files:
                match = re.match(activity_directory+'/gpx'+fpattern, fname)
                if match:
                    athlete_and_type.add((match.group(1), match.group(2)))

            self.s3_athlete_and_type = athlete_and_type # Save set for later use

        pass
Example #18
            if i + 5 < len(x): dx["5day"] = int(x[i + 5].close / dx["high"])
        else:
            dx["1day"] = -1
            dx["2day"] = -1
            dx["3day"] = -1
            dx["4day"] = -1
            dx["5day"] = -1
            dx["label"] = 0
        l.append(dx)
    return l


def main(sc, sqlContext):
    dfSC = sqlContext.read.table("eod2")
    rddMat = cal(sc, sqlContext, dfSC)
    df = sqlContext.createDataFrame(rddMat)
    dfToTable(sqlContext, df, "ta_gupbreak")


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName=__file__, conf=conf)
    #sc = SparkContext("local[4]", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("use fex")
    main(sc, sql_context)
    sc.stop()
Example #19
# encoding: utf-8

from pyspark.ml.pipeline import PipelineModel 
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import HiveContext
from pyspark import SparkContext

sc = SparkContext(appName = "Eval Model App")
log4jLogger = sc._jvm.org.apache.log4j 
log = log4jLogger.LogManager.getLogger(__name__) 

sqlContext = HiveContext(sc)

log.warn("Lectura de la informacion")
df = sqlContext.read.json("/raw/msmk")
df.registerTempTable("jsons")
sqlContext.sql("""
    select retweet_count, count(*) as total
    from jsons
    group by retweet_count
    order by 2 desc
""").show()

log.warn("Aplicación del modelo")
model = PipelineModel.load("/models/best_model")

dataset = sqlContext.sql("""
    select  id as _id,
        text, 
        retweet_count,
        user.screen_name as user
Example #20
from sys                            import argv
from pyspark.sql.functions          import *
from pyspark.sql                    import HiveContext
from pyspark                        import SparkContext

##################################################################################################################################################################

##################################################################################################################################################################
#Set arguments
dicParameters = {}
dicParameters['arg1'] = argv[1] # arg1: first argument passed in from the shell

#Create context
yarn_name = "NOME_QUE_IRA_APARECER_NO_YARN"
sc = SparkContext('yarn-cluster', yarn_name)

sqlContx = HiveContext(sc)
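# Parquet output is snappy-compressed and Hive dynamic partitioning is enabled
# (nonstrict mode: no static partition column is required on insert).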
sqlContx.setConf('spark.sql.parquet.compression.codec', 'snappy')
sqlContx.setConf('hive.exec.dynamic.partition', 'true')
sqlContx.setConf('hive.exec.dynamic.partition.mode', 'nonstrict')

# print the contents of the created dictionary
for key, value in dicParameters.items():
    if not key:
        print('Missing configuration for {0} : {1}'.format(key, value))
    else:
        print('Parameter {0} : {1}'.format(key, value))
##################################################################################################################################################################
    
##################################################################################################################################################################
def startProcess(dicParameters):
Example #21
class SparkOperator(DBOperator):
    """SPARK操作类"""
    def __init__(self, database_cat, database_str, logger):
        '''\
        Initialize a connection
        '''
        DBOperator.__init__(self, database_cat, database_str, logger)

        script_name = __name__
        conf = SparkConf().setAppName(script_name).setMaster("yarn")

        conf.set('spark.executor.instances', '5')
        conf.set('spark.executor.cores', '1')
        conf.set('spark.executor.memory', '2g')
        conf.set('spark.driver.memory', '1g')
        conf.set('spark.sql.shuffle.partitions', '5')
        conf.set("hive.exec.dynamic.partition.mode", "nonstrict")
        conf.set("hive.exec.max.dynamic.partitions", "100000")
        conf.set("hive.exec.max.dynamic.partitions.pernode", "100000")
        self.sc = SparkContext(conf=conf)
        self.hc = HiveContext(self.sc)
        self.logger = logger
        self.logger.info(u'SPARK connection established.')

    def excute_sql(self, sql_str, log_flag='1'):
        '''\
        Function: execute a SQL statement in Spark SQL
        Date    : 2018-03-12 16:01:29
        Author  : Liuym
        Notes   : Takes a SQL string, submits it to Spark SQL, and returns a success flag and a log message
        Argvs   : sql_str -- SQL statement string
        Result  : msg_code -- 0 = success, -1 = failure
                  msg_info -- log message
        '''
        DBOperator.excute_sql(self, sql_str, log_flag)

        # If the SQL statement is empty, skip the actual execution.
        if self.msg_code == 9:
            return (0, self.msg_info)

        try:
            self.hc.sql(self.sql_str)
        except Exception as e:
            self.msg_code = -1
            self.msg_info = u"SQL execution error: " + str(e).replace("\'", "\"")[0:3000]
        else:
            self.msg_code = 0
            self.msg_info = u"Script ran successfully"

        return (self.msg_code, self.msg_info)

    def get_results(self, sql_str, log_flag='1'):
        '''\
        Function: fetch a result set from Spark SQL
        Date    : 2018-10-11 16:01:29
        Author  : Liuym
        Notes   : Takes a SQL string, submits it to Spark SQL, and returns a success flag, a log message and the result set
        Argvs   : sql_str -- SQL statement string
        Result  : msg_code -- 0 = success, -1 = failure
                  msg_info -- log message
                  results  -- result set
        '''
        DBOperator.get_results(self, sql_str, log_flag)

        # If the SQL statement is empty, skip the actual execution.
        if self.msg_code == 9:
            return (0, self.msg_info)

        try:
            self.results = self.hc.sql(self.sql_str)  # get a DataFrame from Spark
            #self.results.cache() # persist the current DataFrame
            self.record_list = self.results.rdd.map(
                lambda line: line.asDict().values()).collect()
        except Exception as e:
            self.msg_code = -1
            self.msg_info = u"SQL execution error: " + str(e).replace("\'", "\"")[0:3000]
        else:
            self.msg_code = 0
            self.msg_info = u"Script ran successfully"

        return (self.msg_code, self.msg_info, self.record_list)

    def close_connection(self):
        '''\
        Function: Close the database connection
        Date    : 2018-10-11 16:01:29
        Author  : Liuym
        Notes   : Close the database connection and release its resources
        '''
        DBOperator.close_connection(self)
        self.sc.stop()
        self.logger.info(u'SPARK connection closed.')
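
# A minimal usage sketch (hypothetical arguments and logger; assumes the DBOperator base
# class that this snippet extends is available):
#
#   import logging
#   logger = logging.getLogger('etl')
#   op = SparkOperator('SPARK', 'default', logger)
#   code, info = op.excute_sql("INSERT OVERWRITE TABLE demo_db.demo_tbl SELECT * FROM demo_db.src_tbl")
#   code, info, rows = op.get_results("SELECT count(1) FROM demo_db.demo_tbl")
#   op.close_connection()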
Example #22
0
def main():
    sc = SparkContext(conf=SparkConf().setAppName("wil_hot_sku_calc_online"))
    hc = HiveContext(sc)
    sc.addPyFile(sys.argv[1])
    from core.common import common
    param = common.init_params(sys.argv, p)
    date, pid, ts, dc_id = param["date"], param["pid"], param["ts"], param[
        "dc_id"]
    today = dt.datetime.strptime(date, "%Y-%m-%d").date()
    yesterday = today - dt.timedelta(1)
    # Candidate SKUs and the SKUs selected into the hot-item warehouse
    sql_sku_all = """
            select distinct t.sku_id as sku_id,
            t.current_source_store_id as store_id,
            t.future_source_store_id as future_store_id,
            case when t.b_sku_id is not null then 1 else 0 end as is_selected
            from
            (select a.sku_id as sku_id,
                   a.current_source_store_id as current_source_store_id,
                   a.future_source_store_id as future_source_store_id,
                   b.sku_id as b_sku_id
            from app.app_wil_hot_sku_all a
            left join
            app.app_wil_hot_sku_selected b
            on a.sku_id = b.sku_id
            ) t
    """
    hc.sql(sql_sku_all).createOrReplaceTempView("tb_sku_all")

    # Order data associated with all candidate SKUs
    sql_data_all = """
        select a.sku_id as sku_id,
               a.future_store_id as future_store_id,
               a.is_selected as is_selected,
               b.out_wh_tm as out_wh_tm,
               b.ord_id as ord_id,
               b.sale_qtty as sale_qtty
        from
            (select sku_id,
                     store_id,
                    future_store_id,
                    is_selected
            from tb_sku_all
            ) a
        left join
            (select sku_id,
                   store_id,
                   out_wh_tm,
                   parent_sale_ord_id as ord_id,
                   sale_qtty
            from app.app_wil_hot_sale_store_sku_ord_sale
            where dt = '""" + str(yesterday) + """'
                and sale_qtty >= 0
            ) b
        on a.store_id = b.store_id
            and a.sku_id = b.sku_id
    """
    hc.sql(sql_data_all).createOrReplaceTempView("tb_data_all")
    # Label the orders
    sql_choice_ord = """
        select a.ord_id,
               case when (a.f1=1 and a.f2=1) or (a.f1=2 and a.f2=1) or (a.f1=2 and a.f2=2 and (a.g1=2 or (a.g2=2 and a.g3=1))) then 1 else 0 end as choice_flag2
        from
            (select ord_id,
                count(distinct future_store_id) as f1,
                count(distinct is_selected) as f2,
                count(distinct future_store_id,is_selected) as g1,
                count(distinct (case when is_selected = 1 then future_store_id else null end)) as g2,
                count(distinct (case when is_selected = 0 then future_store_id else null end)) as g3
            from tb_data_all
            group by ord_id
            ) a
    """
    hc.sql(sql_choice_ord).createOrReplaceTempView("tb_choice_ord")

    # Aggregate the sales volume
    sql_result = """
        select a.sku_id,
               sum(case when b.choice_flag2 = 1 then a.sale_qtty else 0 end) as sale_in_hot,
               sum(a.sale_qtty) as sale_total
        from tb_data_all a
        left join
            tb_choice_ord b
        on a.ord_id = b.ord_id
        where a.is_selected = 1
        group by a.sku_id
    """
    hc.sql(sql_result).createOrReplaceTempView("tb_result")

    # Default allocation ratio
    sql_selected_sku_info = """
        select a.sku_id as sku_id,
               b.dc_id as dc_id,
               b.hot_sku_target_store_id as hot_sku_target_store_id,
               b.future_source_store_id as future_source_store_id,
               b.default_ratio as default_ratio
        from app.app_wil_hot_sku_selected a
        left join 
            (select sku_id,
                    dc_id,
                    hot_sku_target_store_id,
                    future_source_store_id,
                    avg(hot_sku_out_store_rate) as default_ratio
            from app.app_wil_hot_sku_all
            group by sku_id,dc_id,hot_sku_target_store_id,future_source_store_id
            ) b
        on a.sku_id = b.sku_id
    """
    hc.sql(sql_selected_sku_info).createOrReplaceTempView(
        "tb_selected_sku_info")

    # Generate the final hot-item ratio result
    sql_final_result = """
            select
                a.sku_id,
                a.dc_id,
                a.hot_sku_target_store_id,
                a.future_source_store_id,
                coalesce(round(b.sale_in_hot/b.sale_total,4),a.default_ratio) as hot_sku_ratio
            from tb_selected_sku_info a
            left join tb_result b
            on a.sku_id = b.sku_id
    """
    hc.sql(sql_final_result).createOrReplaceTempView("tb_final_result")

    # Consolidate the results into the app_wil_hot_sku_ratio_result table, partitioned by dt, pid, ts
    partition = "dt='" + str(param["date"]) + "', pid='" + str(
        param["pid"]) + "', ts='" + str(param["ts"]) + "'"
    table_output = 'app.app_wil_hot_sku_ratio_result'
    sql_save_sd_result = """
                insert overwrite table """ + table_output + """ partition (""" + partition + """)
                select
                    sku_id,
                    dc_id,
                    hot_sku_target_store_id,
                    future_source_store_id,
                    hot_sku_ratio
                from tb_final_result
            """
    hc.sql(sql_save_sd_result)
Example #23
0
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import rowNumber
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pyspark.mllib.linalg import VectorUDT
import csv
import string
import re
import numpy as np
from pyspark.mllib.stat import Statistics

conf = SparkConf().setAppName("NLP on Reddit Data")
conf.set("spark.driver.maxResultSize", "3g")
sc = SparkContext(conf=conf)

# notice here we use HiveContext(sc) because window functions require HiveContext
sqlContext = HiveContext(sc)
stopwordsList = stopwords.words('english')


def parse_csv(x):
    x = x.replace('\n', '')
    d = csv.reader([x])
    return next(d)


def isInt(s):
    try:
        int(s)
        return True
    except ValueError:
        return False
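
# A minimal sketch (hypothetical DataFrame, not part of the original script) of a window
# function built with rowNumber(), which is why a HiveContext is required in Spark 1.x:
from pyspark.sql import Window

demo = sqlContext.createDataFrame(
    [('askreddit', 10), ('askreddit', 7), ('python', 5)], ['subreddit', 'score'])
w = Window.partitionBy('subreddit').orderBy(demo['score'].desc())
demo.select('subreddit', 'score', rowNumber().over(w).alias('rank')).show()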
Example #24
0
# BAREBONES dayidx # only partitioned by DAY
#day_bb = [x for x in beeline.show_partitions('mapper.barebones').split('\n') if '=%s' % (day_idx) in x]
# MAPPOINTS dayidx # partitioned by DAY and UUID (pick the last uuid)
#mappoints_data = sorted([x for x in beeline.show_partitions('mapper.mappoints').split('\n') if '=%s' % (day_idx) in x])[-1].split('/')
#[day_mps, uuid_idx] = [x.split('=')[1] for x in mappoints_data]


day_idx = '20160819'
uuid_idx = '14863360-65fc-11e6-a709-300ed5c5f881'

getting_mappoint_data = ''' select b1.mpgid mpgid, b1.lat lat, b1.lon lon, b1.country country, b1.mpgload mpgload, b1.allowed_private_regions allowed_private_regions, b2.asnum asnum, b2.ip ip from (select mpgid, lat, lon, country, mpgload, allowed_private_regions from mapper.mappoints where day=%s and uuid="%s" and lat is not NULL and lon is not NULL and ghostonly=0 ) b1 left outer join (select collect_set(ns_ip) ip, collect_set(asnum) asnum, mpgid from (select ns_ip, mpd_uuid, mpgid, asnum, demand, day from mapper.nsjoin where day=%s and mpd_uuid="%s" and demand>0.01 order by demand desc) a group by mpgid) b2 on b2.mpgid=b1.mpgid ''' % (day_idx, uuid_idx, day_idx, uuid_idx)
geo_total_cap_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='private' group by country, network) a ''' % day_idx
geo_total_cap_public_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='public' group by country, network) a ''' % day_idx

sc = SparkContext()
hiveCtx = HiveContext(sc)

rows = hiveCtx.sql(getting_mappoint_data)

#regInfoRows = hiveCtx.sql('select a.*, b.region_capacity, b.ecor_capacity, b.prp, case b.peak_bitcap_mbps when null then 0 else b.peak_bitcap_mbps end peak_bitcap_mbps, case b.peak_flitcap_mfps when null then 0 else b.peak_flitcap_mfps end peak_flitcap_mfps from (select * from mapper.barebones where day=%s and latitude is not NULL and longitude is not NULL and ghost_services in ("W","S","KS","JW")) a join (select * from mapper.regioncapday where day=%s) b on a.region=b.region' % (day_idx, day_idx))
regInfoRows = hiveCtx.sql('select * from mapper.regioncapday where day=%s and peak_bitcap_mbps is not null and peak_flitcap_mfps is not null' % (day_idx))
geo_total_cap = hiveCtx.sql(geo_total_cap_query)
geo_total_cap_p = hiveCtx.sql(geo_total_cap_public_query)


# rdd format: [regionid, [mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load, mpg-asnum, mpg-nsip]]
region_mpginfo_pair = rows.map(lambda x: [[x.mpgid,
                                           x.lat,
                                           x.lon,
                                           x.country,
                                           x.mpgload,
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import HiveContext
import mysql.connector
import math

from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

sc = SparkContext("local", "videoDifficulty")
sqlContext = HiveContext(sc)

timeFrameSize = 4
# timeFrameBuckets = [ [0,0.0] for x in range(0, int(math.ceil(duration/timeFrameSize)) ) ]


def get_partial_keys(dictionary, field_id, field_value):
    """
	takes a dictionary with a complex key tuple, a list of the ids of the fields in the tuple,
	and a partial key tuple; returns the list of full keys in the dictionary that match the partial key
	input: 	field_id must be a list, eg: [1], [0,2,3], etc.
			field_value must be a tuple, eg: ('a', 1.0, 1)
	"""
    field_id.sort()
    keylist = list()
    for key in dictionary.keys():
        if (
            tuple([key[x] for x in field_id]) == field_value
        ):  # Get the value of the partial key from the list of keys forthe dict, and
            keylist.append(key)  # add it if it matches with the key value being sought.
    return keylist
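
# A small usage sketch (hypothetical data, not in the original) for get_partial_keys:
# find every full key whose fields 0 and 2 match ('video1', 3).
stats = {('video1', 0, 3): 1.5, ('video1', 4, 3): 2.0, ('video2', 0, 3): 0.7}
print(get_partial_keys(stats, [0, 2], ('video1', 3)))
# -> [('video1', 0, 3), ('video1', 4, 3)]  (order not guaranteed)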
Example #26
0
        for record in records:
            writer.writerow(record)
        return [output.getvalue()]

    if not os.path.isdir('output/csv'):
        fullFileData.mapPartitions(writeRecords).saveAsTextFile('output/csv')
    sc.stop()

    print('\n\nCreate sql table with txt file')
    sc = SparkContext(conf=conf)
    url = 'https://raw.githubusercontent.com/apache/spark/master/examples/src/main/resources/kv1.txt'
    inputFile = 'input/kv1.txt'
    inputTable = 'myTable'
    if not os.path.isfile(inputFile):
        wget.download(url, inputFile)
    hiveCtx = HiveContext(sc)
    if not os.path.isdir(f'spark-warehouse/{inputTable}'):
        hiveCtx.sql(
            f"CREATE TABLE IF NOT EXISTS {inputTable} (key INT, value STRING)")
        hiveCtx.sql(f"LOAD DATA LOCAL INPATH '{inputFile}' INTO TABLE myTable")
    input = hiveCtx.sql(f"FROM {inputTable} SELECT key, value")
    print(f'myTable query: {input.rdd.take(10)}')
    print(f'myTable key: {input.rdd.map(lambda row: row[0]).take(10)}')
    sc.stop()

    print('\n\nRead json file into table')
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)
    tweets = hiveCtx.read.json('input/tweets.json')
    tweets.registerTempTable("tweets")
    results = hiveCtx.sql("SELECT user.name, text FROM tweets")
Example #27
0
# Creates a Hive table and loads an input file into it
# For input you can use examples/src/main/resources/kv1.txt from the spark
# distribution
from pyspark import SparkContext
from pyspark.sql import HiveContext
import json
import sys

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Error usage: LoadHive [sparkmaster] [inputFile] [inputtable]"
        sys.exit(-1)
    master = sys.argv[1]
    inputFile = sys.argv[2]
    inputTable = sys.argv[3]
    sc = SparkContext(master, "LoadHive")
    hiveCtx = HiveContext(sc)
    # Load some data into hive
    hiveCtx.sql(
        "CREATE TABLE IF NOT EXISTS " +
        inputTable +
        " (key INT, value STRING)")
    hiveCtx.sql(
        "LOAD DATA LOCAL INPATH '" + inputFile + "' INTO TABLE " + inputTable)
class Unittest_HWM_Allocation_MulDays3(unittest.TestCase):
    def setUp(self):
        warnings.simplefilter("ignore", ResourceWarning)
        fpath = os.path.abspath(os.path.join(os.path.dirname(__file__),".."))
        with open(fpath + '/data_source/bookings_fully_overlapped.json') as bookings_source:
            self.bookings = json.load(bookings_source)
        with open(fpath + '/data_source/cfg.json') as cfg_source:
            self.cfg = json.load(cfg_source)
        today = '20180402'
        self.days = optimizer.util.get_days_from_bookings(today, self.bookings)
        self.sc = SparkContext.getOrCreate()
        self.hive_context = HiveContext(self.sc)
        self.schema = optimizer.util.get_common_pyspark_schema()

    def compare_two_dfs(self, pandas_df_expected, df_to_test_rows):
        df = self.hive_context.createDataFrame(df_to_test_rows, self.schema)
        df_allocated = optimizer.algo.hwm.hwm_allocation(df, self.bookings, self.days)
        pandas_df_allocated = df_allocated.select("*").toPandas()
        print(pandas_df_expected)
        print(pandas_df_allocated)

        return self.assertTrue(assert_frame_equal(pandas_df_expected, pandas_df_allocated, check_dtype=False) == None)

    def test_hwm_allocation_case1(self):
        # Print out the name of the current testing function to track logs
        # Testcase type: 1 booking bucket with 1 booking_id in ands
        # testcase 1: booking bucket ['20180402', ['b80'], ['b60'], {}, 3239]
        pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = ['20180402', ['b80'], ['b60'], 3239, {'b80': 18}]

        df_to_test_rows = [(['20180402', ['b80'], ['b60'], {}, 3239])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case2(self):
        # bk_id: b11, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 11
        # bk_id: b12, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 12
        # Testcase type: 1 booking bucket with 2 booking_id in ands
        pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = ['20180402', ['b120', 'b110'], [], 8900, {'b120': 2, 'b110': 2}]

        df_to_test_rows = [(['20180402', ['b120', 'b110'], [], {}, 8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case3(self):
        # Testcase type: 1 booking bucket with 3 booking_id in ands
        pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = ['20180402', ['b60', 'b70', 'b100'], [], 8900, {'b60': 13, 'b70': 15, 'b100': 20}]

        df_to_test_rows = [(['20180402', ['b60', 'b70', 'b100'], [], {}, 8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case4(self):
        # Testcase type: 2 booking buckets with 4 bookings included.
        pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = ['20180402', ['b80'], ['b60', 'b70', 'b90'], 3239, {'b80': 18}]
        pandas_df_expected.loc[1] = ['20180402', ['b60', 'b70'], ['b80', 'b90'], 8900, {'b60': 13, 'b70': 15}]

        df_to_test_rows = [(['20180402', ['b80'], ['b60', 'b70', 'b90'], {}, 3239]), (['20180402', ['b60', 'b70'], ['b80', 'b90'], {}, 8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case5(self):
        # Testcase type: 3 booking buckets with 5 bookings included.
        pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = ['20180402', ['b60', 'b70', 'b100', 'b110', 'b120'], ['b80', 'b90'], 
        8900, {'b60': 13, 'b70': 15, 'b100': 20, 'b110': 2, 'b120': 2}]

        df_to_test_rows = [(['20180402', ['b60', 'b70', 'b100', 'b110', 'b120'], ['b80', 'b90'],  {}, 8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case6(self):
        # bk_id: b13, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 130000
        # bk_id: b12, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 12
        # Testcase type: 3 booking buckets with 5 bookings included.
        pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = ['20180402', ['b130', 'b120'], [], 
        8900, {'b130': 8900}]

        df_to_test_rows = [(['20180402', ['b130', 'b120'], [],  {}, 8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)
 
    def test_hwm_allocation_case7(self):
        # bk_id: b15, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 8900
        # bk_id: b14, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 8900
        # Testcase type: 3 booking buckets with 5 bookings included.
        pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = ['20180402', ['b150', 'b140'], [], 8900, {'b150': 1780, 'b140': 1780}]

        df_to_test_rows = [(['20180402', ['b150', 'b140'], [],  {}, 8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case8(self):
        # bk_id: b17, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 4450
        # bk_id: b16, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 4450
        # Testcase type: 3 booking buckets with 5 bookings included.
        pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = ['20180402', ['b170', 'b160'], [], 8900, {'b170': 890, 'b160': 890}]

        df_to_test_rows = [(['20180402', ['b170', 'b160'], [],  {}, 8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case9(self):
        # bk_id: b18, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 4451
        # bk_id: b17, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 4450
        # Testcase type: 3 booking buckets with 5 bookings included.
        pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = ['20180402', ['b180', 'b170'], [], 8900, {'b180': 890, 'b170': 890}]

        df_to_test_rows = [(['20180402', ['b180', 'b170'], [],  {}, 8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case10(self):
        # Testcase type: 3 booking buckets with 5 bookings included.
        pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = ['20180402', ['b60', 'b70', 'b100', 'b120', 'b160', 'b170', 'b180'], ['b80', 'b90'], 
        8900, {'b120': 2, 'b100': 20, 'b160': 890, 'b170': 890, 'b180': 890, 'b70': 15, 'b60': 13}] # b6, b7, b10, b12, b16, b17, b18 have the same attributes.

        df_to_test_rows = [(['20180402', ['b60', 'b70', 'b100', 'b120', 'b160', 'b170', 'b180'], ['b80', 'b90'],  {}, 8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)
    return str_time




if __name__ == '__main__':
    if (len(sys.argv) != 1):
        print "Usage: spark-submit <Python Code File>"
        sys.exit(1)

    #App name which shows up in the Spark UI
    sc = SparkContext(appName='User Recommendation')


    #Context provides connection to Hive metastore
    sqlContext = HiveContext(sc)
    

    '''
    Pulling data out of Hive.  I created a replication of the 'watson_bisum_purchases' table locally to test.
    '''
    
    rdd = sqlContext.sql("SELECT person_id,deal_id,aasm_state FROM watson_bisum_purchases")


    '''
    Creating datasets.  Formatting the data and also creating sample datasets in order to create and test the model.
    '''
    
    #Formatting all the data using the 'parse_rating' method above
    all_data = rdd.map(parse_rating)
Example #30
0
import atexit
import os
import platform
import py4j
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext, HiveContext
from pyspark.storagelevel import StorageLevel
from pyspark.sql.types import *

os.environ['PYSPARK_SUBMIT_ARGS'] = "--master yarn pyspark-shell"

sc = SparkContext(appName="pySpark_barker", pyFiles='')
atexit.register(lambda: sc.stop())
sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
sqlCtx = HiveContext(sc)

print("""Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version %s
      /_/
""" % sc.version)
print("Using Python version %s (%s, %s)" %
      (platform.python_version(), platform.python_build()[0],
       platform.python_build()[1]))
print("SparkContext available as sc, %s available as sqlCtx." %
      sqlCtx.__class__.__name__)
print("TODO: add specific arg settings")
Example #31
0
	
	print(""" 
		Error: This program takes 2 arguments
		Usage: bin/spark-submit --master <spark-master> nrem.py <input dir> <output dir>
	""")
	sys.exit(1)

#Parallelism
partitions = 1
#Output Directories
matched_output = os.path.join(sys.argv[2],"matched")
eliminated_output = os.path.join(sys.argv[2], "eliminated")

conf = SparkConf().setAppName("Non Redundant Entity Matching")
sc = SparkContext(conf=conf)
sqlCtx = HiveContext(sc)

def attr_key(l):
	"""
		[obj, attr1, attr2, attr3 ...] -> [(attr1, obj), (attr2, obj), (attr3, obj) ...]
	"""
	a = []
	for attr in l[1:]:
		a.append((attr, l[0]))
	return a
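
# A small usage sketch (hypothetical row, not in the original) for attr_key:
print(attr_key(['obj1', 'red', 'round']))
# -> [('red', 'obj1'), ('round', 'obj1')]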

"""
	Assuming input file(s) are tsv, with the first field as the object and the remaining fields as attributes
"""
#Read input
inRDD = sc.textFile(sys.argv[1], partitions)
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

conf = SparkConf()
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

sqlContext.sql("DROP TABLE people")
sqlContext.sql(
    "CREATE TABLE IF NOT EXISTS people(name STRING, age INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY \",\""
)

rdd = sc.textFile("/user/cloudera/spark/people.txt") \
        .map(lambda x: x.split(",")) \
        .map(lambda x: Row( name=x[0], age=int(x[1]) ))

df = sqlContext.createDataFrame(rdd)
df.select(df.name, df.age).write.insertInto("people")

df2 = sqlContext.sql("select * from people").show()
    def _extract_logs(self):

        sql_context = HiveContext()
        logs = sql_context.table(self.source_table)

        return logs
Example #34
0
from pyspark.sql import HiveContext

# sc is an existing SparkContext.
sqlContext = HiveContext(sc)

# A JSON dataset is pointed to by path.
# The path can be either a single text file or a directory storing text files.

shiguang = sqlContext.read.json("/user/1707500/shiguangwang.json")

# root
#  |-- age: integer (nullable = true)
#  |-- name: string (nullable = true)
shiguang.printSchema()

# Register this DataFrame as a table.
shiguang.registerTempTable("shiguangwang")

# SQL statements can be run by using the sql methods provided by `sqlContext`.
sqlContext.sql("use sparktest")
teenager = sqlContext.sql("CREATE TABLE shiguang AS SELECT * FROM shiguangwang")

Example #35
0
from pyspark import SparkContext
from pyspark.sql import HiveContext

from pyspark.mllib.classification import SVMWithSGD, SVMModel, LogisticRegressionWithSGD

from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.functions import col, sum

from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics

from copy import deepcopy

sc = SparkContext()
sqlContext = HiveContext(sc)
qry = """SELECT *,white/population as white_percent,
         black/population as black_percent,
         asian/population as asian_percent,
         pacific_islander/population as pi_percent,
         other_race/population as other_race_percent,
         multiple_race/population as multiple_percent,
         hispanic/population as hispanic_percent
         FROM census_rest_success where days_open > 365"""

df = sqlContext.sql(qry)


## Lets train a Support Vector Classifier on this data
#CITATION:
#http://stackoverflow.com/questions/33900726/count-number-of-non-nan-entries-in-each-column-of-spark-dataframe-with-pyspark
def count_not_null(c):
    return sum(col(c).isNotNull().cast("integer")).alias(c)
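
# A small usage sketch that applies count_not_null to every column of the df loaded above:
df.agg(*[count_not_null(c) for c in df.columns]).show()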
Example #36
0
        ('spark.executor.instances', '10'),  # number of executors in total
        ('spark.yarn.am.memory', '10g')
    ])  # memory for spark driver (application master)
    spark = SparkSession.builder \
    .master("yarn") \
    .appName("name") \
    .enableHiveSupport() \
    .config(conf = conf) \
    .getOrCreate()

    return spark


spark = get_Spark()
spark_context = spark.sparkContext
hc = HiveContext(spark_context)

normaltimeFormat = "yyyyMMddHHmmss"


def run_cmd(args_list):
    proc = subprocess.Popen(args_list,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    proc.communicate()
    return proc.returncode


def build_schema(model):
    types = {
        "string": StringType(),
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

# Initialize Spark
SparkContext.setSystemProperty("spark.executor.memory", "4g")
conf = SparkConf()
conf.set("spark.executor.instances", 20)
sc = SparkContext("yarn-client", "kdd99", conf=conf)
hc = HiveContext(sc)

kdd = hc.table("kdd99")

(trainData, testData) = kdd.randomSplit([0.7, 0.3], seed=42)
trainData.cache()
services = trainData.withColumnRenamed("service", "srvc").select("srvc").distinct()
testData = testData.join(services, testData.service == services.srvc)
# filter out any rows with a service not trained upon
testData.cache()

print "training set has " + str(trainData.count()) + " instances"
print "test set has " + str(testData.count()) + " instances"

# Build model
inx1 = StringIndexer(inputCol="protocol", outputCol="protocol-cat")
inx2 = StringIndexer(inputCol="service", outputCol="service-cat")
inx3 = StringIndexer(inputCol="flag", outputCol="flag-cat")
inx4 = StringIndexer(inputCol="is_anomaly", outputCol="label")
ohe2 = OneHotEncoder(inputCol="service-cat", outputCol="service-ohe")
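
# A hedged sketch (an assumption, not the original continuation of this snippet) of how the
# indexers and encoder above would typically feed a VectorAssembler and the RandomForestClassifier:
ohe1 = OneHotEncoder(inputCol="protocol-cat", outputCol="protocol-ohe")
assembler = VectorAssembler(inputCols=["protocol-ohe", "service-ohe", "flag-cat"], outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=50)
pipeline = Pipeline(stages=[inx1, inx2, inx3, inx4, ohe1, ohe2, assembler, rf])
model = pipeline.fit(trainData)
predictions = model.transform(testData)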
Example #38
0
def setupHive(sc):
    from pyspark.sql import HiveContext
    hc = HiveContext(sc)
    return hc
def main(args):
    """ Main code for relevance computation """
    
    start_time = time.time()
    
    # iq (code snippets that set below properties have been removed)
    driver   = 
    url      = 
    username = 
    password = 
    inputs = [driver, url, username, password]

    
    filename = str(args[0])
    if os.path.exists(filename):
        pass
    else:
        sys.exit("Input file %s not found" % filename)
    file = open(filename, 'r')
    for line in file:
        key, val = line.split(",")
        if str(key).strip() == "dbalias":
            dbalias = str(val).strip()
        elif str(key).strip() == "numpartitions":
            numpartitions = int(val)
        elif str(key).strip() == "datadir":
            datadir = str(val).strip()
        else:
            print("Invalid key not set: %s" % str(key))
    # Need to make sure that the datadir variable is set.
    try:
        print("datadir = '%s' " % datadir)
    except NameError:
        sys.exit("'datadir' variable not set. Check inputfile '%s'"
                 % filename)
            
    # Spark and Hive contexts
    conf = SparkConf()
    sc = SparkContext(conf = conf)
    sqlContext = HiveContext(sc)

    
    df = utils.returnSparkDF(SQLContext(sc), inputs, "traffic")
    if df is None: sys.exit("'traffic' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "uniquedata")
    df = None
    
    df = utils.returnSparkDF(SQLContext(sc), inputs, "fbtraffic")
    if df is None: sys.exit("'fbtraffic' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "uniqueFBdata")
    df = None    

    statement = "Select ud.loginid, ud.adid, ud.Type, ufd.Type as FBType "\
                "from uniquedata ud left outer join uniqueFBdata ufd "\
                "on ud.loginid = ufd.loginid and ud.adid = ufd.adid"
    adswithFBjoined = sqlContext.sql(statement)
    adswithFBjoined_cleaned = adswithFBjoined[adswithFBjoined['FBType'].isNull()]
    adswithFBjoined_cleaned = adswithFBjoined_cleaned.drop('FBType')

    sqlContext.registerDataFrameAsTable(adswithFBjoined_cleaned, "data")

    statement = "Select loginid, count(loginid) as viewcount from data group by loginid"
    temp = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(temp, "viewdata")
    
    statement = "Select d.* from data d, viewdata vd where d.loginid = vd.loginid and vd.viewcount > 1"
    temp2 = sqlContext.sql(statement)
    
    sqlContext.sql("drop table data")
    sqlContext.registerDataFrameAsTable(temp2, "data")
        
    temp, temp2  = (None, None)

    df = utils.returnSparkDF(SQLContext(sc), inputs, "agent")
    if df is None: sys.exit("'agent' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "agentdata")

    statement = "select loginid, adid, Type, count(adid) as counter from agentdata group by loginid, adid, Type"
    unique_adid_per_loginid = sqlContext.sql(statement)
    unique_adid_per_loginid = unique_adid_per_loginid.drop('counter')
    sqlContext.registerDataFrameAsTable(unique_adid_per_loginid, "agentdata")
    
    df = utils.returnSparkDF(SQLContext(sc), inputs, "favorite")
    if df is None: sys.exit("'favorite' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "favdata")
    df = None
    
    statement = "select * from data union all select * from agentdata union all select * from favdata"
    df2 = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(df2, "uniondata")
    df2 = None
    
    statement = "select loginid, max(Type) as UserMaxConversion from uniondata group by loginid"
    maxtype = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(maxtype, "maxconversiondata")

    statement = "select uniondata.loginid, uniondata.adid, uniondata.Type "\
                "from uniondata, maxconversiondata where uniondata.loginid = maxconversiondata.loginid "\
                "and uniondata.Type = maxconversiondata.UserMaxConversion"
    data = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(data, "data")
       
    # Delete tables
    tables = ["uniquedata", "FBdata", "uniqueFBdata", "agentdata", 
              "favdata", "uniondata", "maxconversiondata"]
    for table in tables:
        sqlContext.sql("drop table if exists %s" % str(table))

    df = utils.returnSparkDF(SQLContext(sc), inputs, "adclassified")
    if df is None: sys.exit("'adclassified' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "addata")
    df = None
    
    df = utils.returnSparkDF(SQLContext(sc), inputs, "geo")
    if df is None: sys.exit("'geo' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "geodata")
    df = None
    
    statement = "select addata.adid, addata.AskingPrice, addata.CollectiveDebt, "\
                "addata.PageViewCount, geodata.Municipal, geodata.CityPart "\
                "from addata, geodata where addata.locationkey = geodata.locationkey"
    addata_for_join = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(addata_for_join, "adtemp")

    statement = "select * from adtemp where PageViewCount < 10000"
    addata_for_join = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(addata_for_join, "addata_for_join")                   
    
    data, addata_for_join = (None, None)
    sqlContext.sql("drop table if exists addata")

    statement = "select a.*, b.AskingPrice, b.CollectiveDebt, b.PageViewCount, b.Municipal, b.CityPart "\
                "from data a, addata_for_join b where a.adid = b.adid"
    data = sqlContext.sql(statement)
    data = data.fillna(0)
    data = data.repartition(numpartitions)

    
    # Save the files as csv using spark-csv from databricks
    try:
        st = time.time()
        data.write.format("com.databricks.spark.csv").save(datadir, mode="overwrite", codec="bzip2")
        et = time.time()
        print("File save time was: %.2f mins." % ((et-st)/60.))
    except:
        sys.exit("Could not save files to dir '%s'. \n\nError = %s" % (datadir, sys.exc_info()[1]))
    finally:            
        end_time = time.time()    
        print("Spark ETL execution time = %.2f mins." % ((end_time-start_time)/60.))
    
    
    # Stop spark and continue using in-memory computation (another script)
    sc.stop()
        
    return
    return _out_value


def safe_concat(df, column, postfix):
    _col = sf.when(df[column] != ' ', sf.concat(df[column], sf.lit(postfix))).otherwise('').alias(column)
    return _col
#Spark setup
#Set up the Spark connection and Hive context.
conf = SparkConf() \
        .setAppName('Zeppelin') \
        .set('spark.cassandra.connection.host','10.41.49.1') \
        .set('spark.master', 'yarn-client')

sc = SparkContext.getOrCreate(conf=conf)

hive_context = HiveContext(sc)
#Define Hive queries
#By pushing a query down to Hive instead of loading the whole table into a DataFrame, we minimize the amount of bandwidth and memory that Spark needs to use.
ad_phone_query = " SELECT SAMAccountName" \
                "   , MobilePhone" \
                "   , telephoneNumber" \
                " FROM data_lake.ad_data_raw"

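#A minimal sketch (assuming the data_lake.ad_data_raw table exists) of pushing the query above
#down to Hive so only the selected columns come back as a DataFrame:
ad_phone_df = hive_context.sql(ad_phone_query)
ad_phone_df.printSchema()
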
advance_phone_query = " SELECT lower(NetID) AS NetID" \
                     "   , ID_Number" \
                     "   , xsequence" \
                     "   , telephone_type_Code" \
                     "   , telephone_number" \
                     "   , extension" \
                     "   , country_code" \
                     "   , area_code" \
Example #41
0
"""transform.py"""

from pyspark import SparkContext
sc = SparkContext("local", "Transform")
from pyspark.sql import HiveContext

from pyspark.sql.types import *
import numpy as np

sqlContext = HiveContext(sc)

##########
# Util functions

def saveAsHiveTable(df, name):
    # Save it this way so that native HIVE can read the table as well
    df.registerTempTable("df")
    sqlContext.sql("drop table if exists " + name)
    sqlContext.sql("CREATE TABLE " + name + " AS SELECT * FROM df")

def getFloat(s):
    if s is None:
        return None
    try:
        f = float(s)
        return f
    except ValueError:
        return None

##########
Example #42
0
import os
from pyspark import SparkContext
from pyspark.sql import SQLContext, HiveContext
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql import functions as F
import autofe
from autofe.window import window_aggregate, partitionby_aggregate
from autofe.aggregate import groupby_aggregate
from autofe.functions import spark_functions
sc = SparkContext.getOrCreate()
sql_context = HiveContext(sc)

data_path = os.path.join('luojisiwei.parquet')
t = sql_context.read.format('parquet').load(data_path)
print(t.show(5))
print(t.dtypes)
print(t.head())
print(t.first())
print(t.take(2))
print(t.schema)
Example #43
0
import os
import re
import glob
import pickle
import boto3

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType


class StravaLoader(object):

    def __init__(self, 
                 data_source='local', 
                 activity_directory='strava-activities-subset',
                 s3bucket='larsbk',
                 athletes=None,
                 activity_types=[
                    'Ride',
                    'Run',
                    'NordicSki'
                 ],
                 sc=None,
                 hiveContext=None,
                 conf=(SparkConf().setAppName('Strava analysis')),
                 filter_bug_inducing_rows=True
                 ):

        ''' Initialize Strava Analysis object'''


        # INPUT PARAMETERS

        self.athletes = athletes # Athletes to analyze (optional)
        self.activity_types = activity_types # Activity_types to consider (default)
        self.filter_bug_inducing_rows = filter_bug_inducing_rows


        # CONFIGURE SPARK

        if sc != None and hiveContext != None: # Both contexts were supplied by user
            print 'Info: Using supplied SparkContext and HiveContext'
            self.sc = sc
            self.hiveContext = hiveContext

        else: # Initialize new contexts
            print 'Info: Initializing SparkContext and hiveContext from (default) conf'
            self.sc = SparkContext(conf=conf)
            self.hiveContext = HiveContext(self.sc)

        self.schema = pickle.load(open('./schema.p', 'rb')) # The pre-defined schema
        self.df = None # Empty DataFrame to be populated later


        # CONFIGURE DATA SOURCE

        data_root_path = {
                's3': 's3n://%s/%s/' % (s3bucket, activity_directory), 
                'local': './%s/' % activity_directory
        }
        
        if data_source not in data_root_path.keys(): # Check if data source is valid 
            raise Exception(('Unrecognized data source "%s". '
                             'Supported sources: "%s".')
                             % (data_source, '", "'.join(data_root_path.keys())))
        
        self.data_source = data_source # This is a valid data source
        self.path = data_root_path[data_source] # This is the path to the data


        # (S3 SPECIFIC STUFF)

        if data_source == 's3':

            # Get a list of files in the activity directory
            bucket = boto3.resource('s3').Bucket(s3bucket) 
            objects = bucket.objects.filter(Prefix='%s/gpx/' % activity_directory)
            files = [obj.key for obj in objects] 

            # Make set of observed combinations of athlete and activity_type
            athlete_and_type = set([]) # Empty set to populate
            fpattern = '\/([\w]+)\/(?:[\w-]+)-([\w]+)\.gpx' # File name pattern
            for fname in files:
                match = re.match(activity_directory+'/gpx'+fpattern, fname)
                if match:
                    athlete_and_type.add((match.group(1), match.group(2)))

            self.s3_athlete_and_type = athlete_and_type # Save set for later use

        pass


    def _get_athlete_directories(self):
        '''
        Look for athlete directories in data_root_path \
        and update self.athletes
        '''

        if self.data_source in ['local']:

            self.athletes = [
                directory for directory in os.listdir(self.path+'gpx/')
                if re.match('^[\w-]+$', directory)
            ]

        else:
            print ('Warning: Automatic directory/athlete detection not yet supported for '
                   'data source %s. Using: "akrogvig", "lkrogvig", "brustad"') \
                   % self.data_source

            self.athletes = ['akrogvig', 'lkrogvig', 'brustad']

        pass


    def _activities_exist(self, athlete, activity_type):
        '''
        Checks whether activities of type <activity_type> exist for athlete <athlete>;
        returns a truthy/falsy value
        '''

        # Check local directory with glob
        if self.data_source == 'local':
            return glob.glob(self.path+'gpx/%s/*%s.gpx' % (athlete, activity_type))

        # Check if combination exists by using previously compiled sets
        elif self.data_source == 's3':
            return ((athlete, activity_type) in self.s3_athlete_and_type)

    def _load_dataset(self):
        '''
        Loads strava activities from source to DataFrame self.df
        '''

        # Get athlete list if not already set
        if not self.athletes:
            self._get_athlete_directories()

        # Initialize empty dataset
        self.df = self.hiveContext.createDataFrame(
            self.sc.emptyRDD(),
            self.schema
        )

        for athlete in self.athletes:
            for activity_type in self.activity_types:
        
                # Check that there are files of that type (or else .load fails)
                if self._activities_exist(athlete, activity_type):

                    # Read data
                    dfadd = self.hiveContext.read.format('com.databricks.spark.xml') \
                                    .options(rowTag='trkpt', treatEmptyValuesAsNulls=False) \
                                    .schema(self.schema) \
                                    .load(self.path+'gpx/%s/*%s.gpx' % (athlete, activity_type))
                
                    dfadd = dfadd.withColumn('athlete', lit(athlete)) \
                                 .withColumn('activity_type', lit(activity_type))
                
                    self.df = self.df.unionAll(dfadd)

        if self.filter_bug_inducing_rows:
            self.df = self.df.filter(self.df['extensions.gpxtpx:TrackPointExtension.#VALUE'].isNull())

        pass


    def derive_schema(self):
        '''
        Loads all data in self.path and derives the schema, saves with pickle to "schema.p"
        '''

        df = self.hiveContext.read.format('com.databricks.spark.xml') \
                    .options(rowTag='trkpt') \
                    .load(self.path+'gpx/*')

        df = df.withColumn('athlete',lit(None).cast(StringType())) \
               .withColumn('activity_type',lit(None).cast(StringType()))

        df.printSchema()
        pickle.dump(df.schema, open("schema.p", "wb"))

        pass


    def get_dataset(self):
        '''
        Returns strava activity dataset
        '''
        if not self.df:
            self._load_dataset()
        
        return self.df
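
# A minimal usage sketch (assumes the local 'strava-activities-subset' directory and the
# schema.p pickle described above are available):
loader = StravaLoader(data_source='local')
activities = loader.get_dataset()
activities.groupBy('athlete', 'activity_type').count().show()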
Example #44
0
        e = error_m(expected, predicted)[0]
        print(e * 100)
        if e < 0:
            e = 0
        errs.append(e)

    print(sum(errs) / (len(errs) * 1.0) * 100)


if __name__ == '__main__':

    cfg = {
        'log_level': 'warn',
        'trainready_table':
        'dlpm_111021_no_residency_no_mapping_trainready_test_12212021',
        'dist_table':
        'dlpm_111021_no_residency_no_mapping_tmp_distribution_test_12212021',
        'serving_url':
        'http://10.193.217.126:8503/v1/models/dl_test_1221:predict',
        'max_calls': 4,
        'model_stat_table':
        'dlpm_111021_no_residency_no_mapping_model_stat_test_12212021',
        'yesterday': 'WILL BE SET IN PROGRAM'
    }

    sc = SparkContext.getOrCreate()
    hive_context = HiveContext(sc)
    sc.setLogLevel(cfg['log_level'])

    run(cfg=cfg, hive_context=hive_context)
Example #45
0
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, HiveContext

MYSQL_USERNAME = '******'
MYSQL_PASSWORD = '******'
MYSQL_CONNECTION_URL = "jdbc:mysql://localhost:3306/employees?user=" + MYSQL_USERNAME + "&password=" + MYSQL_PASSWORD

# Configure Spark
conf = SparkConf()
conf.setMaster("spark://Box.local:7077")
conf.setAppName("MySQL_import")
conf.set("spark.executor.memory", "1g")

# Initialize a SparkContext and SQLContext
sc = SparkContext(conf=conf)
sql_ctx = SQLContext(sc)

# Initialize hive context
hive_ctx = HiveContext(sc)

# Source 1 Type: MYSQL
# Schema Name  : EMPLOYEE
# Table Name   : EMPLOYEES
# + --------------------------------------- +
# | COLUMN NAME| DATA TYPE    | CONSTRAINTS |
# + --------------------------------------- +
# | EMP_NO     | INT          | PRIMARY KEY |
# | BIRTH_DATE | DATE         |             |
# | FIRST_NAME | VARCHAR(14)  |             |
# | LAST_NAME  | VARCHAR(16)  |             |
# | GENDER     | ENUM('M'/'F')|             |
# | HIRE_DATE  | DATE         |             |
# + --------------------------------------- +
df_employees = sql_ctx.load(
Example #46
0
    Submits Hive queries to a Spark cluster to be executed as Spark jobs. This project improves
    the performance of Hive queries and gives users who have already deployed Hive or Spark a
    more flexible choice, further increasing the adoption of both Hive and Spark.
"""

import sys
reload(sys)
sys.setdefaultencoding('utf-8')


from pyspark import SparkContext
from pyspark.sql import HiveContext

if __name__ == '__main__':
    sc = SparkContext(appName = "sql_insert")
    sqlContext = HiveContext(sc)
    # driver = "com.mysql.jdbc.Driver"
    dff = sqlContext.read.format("jdbc").options(url="jdbc:mysql://192.168.32.1:3306/testdjango?user=root"
                                                     "&password=root&useUnicode=true&characterEncoding=UTF-8"
                                                     "&zeroDateTimeBehavior=convertToNull", dbtable="t_userinfo").load()
    dff.registerTempTable('t_userinfo')

    dft = sqlContext.read.format("jdbc").options(url="jdbc:mysql://192.168.32.1:3306/testdjango?user=root"
                                                     "&password=root&useUnicode=true&characterEncoding=UTF-8"
                                                     "&zeroDateTimeBehavior=convertToNull", dbtable="t_userinfo2").load()
    dft.registerTempTable('t_userinfo2')
    ds = sqlContext.sql('select username,password,email from t_userinfo')
    print ds.collect()
    # ds is a DataFrame; collect() converts it to Row objects that can be iterated over
    # for i in ds.collect():
    #     # print i.username
Example #47
0
if __name__ == "__main__":

    parser = argparse.ArgumentParser(description='Prepare data')
    parser.add_argument('config_file')
    args = parser.parse_args()

    # Load config file
    with open(args.config_file, 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
        resolve_placeholder(cfg)

    cfg_log = cfg['log']
    cfg = cfg['pipeline']

    sc = SparkContext()
    hive_context = HiveContext(sc)
    sc.setLogLevel(cfg_log['level'])

    # save table as tfrecords
    path = cfg['tfrecords']['tfrecords_hdfs_path']
    input_table_name = cfg['normalization']['output_table_name']

    command = """
            SELECT * FROM {}
            """.format(input_table_name)

    df = hive_context.sql(command)
    df.write.format("tfrecords").option("recordType",
                                        "Example").mode('overwrite').save(path)

    sc.stop()
Example #48
0
    vectorT = row['words_title']
    vectorD = row['words_desc']
    data = row.asDict()
    data['words'] = vectorT + vectorD
    w=[]
    for word in data['words']:
        w += words(word)
    data['wordsF'] = w
    newRow = Row(*data.keys())
    newRow = newRow(*data.values())
    return newRow


sc = SparkContext.getOrCreate()

sqlContext = HiveContext(sc)
print "###############"
# READ data
data = sqlContext.read.format("com.databricks.spark.csv").\
    option("header", "true").\
    option("inferSchema", "true").\
    load("train.csv").repartition(100)
print "data loaded - head:"
print data.head()
print "################"

attributes = sqlContext.read.format("com.databricks.spark.csv").\
    option("header", "true").\
    option("inferSchema", "true").\
    load("attributes.csv").repartition(100)
Example #49
0
from pyspark import SparkContext
sc = SparkContext("local", "best_hospitals")

from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

# Select the top 10 hospital by average avgscore
# Please note that we filter out those hospital not qualified for evaluation
df_top10_hospitals = sqlContext.sql("select Q.providerid as id, AVG(Q.normalizedscore) as avgscore \
from total_quality Q join hospitals_qualified H on Q.providerid = H.providerid \
where Q.normalizedscore is not null and H.qualified = true \
group by Q.providerid \
order by avgscore DESC").limit(10)

# Join with hospitals_qualified to get the hospital name and state
# Note: couldn't figure out how to do it in the above select statement (together with Group By) in one-shot! :-(
df_hospitals = sqlContext.table("hospitals_qualified")
df_top10_hospitals_full = df_top10_hospitals.join(df_hospitals, df_top10_hospitals.id == df_hospitals.providerid).\
    select(df_hospitals.providerid, df_hospitals.hospitalname, df_hospitals.state, df_top10_hospitals.avgscore)

df_top10_hospitals_full = df_top10_hospitals_full.orderBy(df_top10_hospitals_full.avgscore.desc())

# Save it as a table
df_top10_hospitals_full.registerTempTable("df")
sqlContext.sql("drop table if exists top_10_hospitals")
sqlContext.sql("CREATE TABLE top_10_hospitals AS SELECT * FROM df")

print
print "Top 10 hospitals"
print
rank = 1
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import StructType, StructField, StringType
import re

conf = SparkConf().setAppName("spark_sql_regex_specify_schema")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize(["row1_col1 row1_col2 row1_col3",
                         "row2_col1 row2_col2 row3_col3", "row3_col1 row3_col2 row3_col3"])

pattern = re.compile("(.*) (.*) (.*)")


def parse(line):
    matcher = pattern.match(line)

    if matcher:
        return matcher.groups()
    else:
        return None

columns = source.map(parse).filter(
    lambda columns: columns and len(columns) == 3)

rows = columns.map(
    lambda columns: (columns[0], columns[1], columns[2]))

schema = StructType([StructField("col1", StringType(), False), StructField(
Example #51
0
# A simple Hive demo. If you do not have a table to load from, first run MakeHiveTable.py
from pyspark import SparkContext
from pyspark.sql import HiveContext
import json
import sys

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print "Error usage: LoadHive [sparkmaster] [inputtable]"
        sys.exit(-1)
    master = sys.argv[1]
    inputTable = sys.argv[2]
    sc = SparkContext(master, "LoadHive")
    hiveCtx = HiveContext(sc)
    # Query hive
    input = hiveCtx.hql("FROM " + inputTable + " SELECT key, value")
    data = input.map(lambda x: x['key'] * x['key'])
    result = data.collect()
    for element in result:
        print "Got data " + str(element)
    sc.stop()
    print "Done!"
Example #52
0
        tfidf_cosine.write.saveAsTable('{0}.tfidf_sim_all_on_product_name'.format(database_name),mode=modex)
        tfidf_cosine=sqlContext.sql('select * from {0}.tfidf_sim_all_on_product_name'.format(database_name))
        # select the top-n rows
        windowy=Window.partitionBy('target_word').orderBy(F.desc('tfidf_similarity'))
        tfidf_cosine=tfidf_cosine.withColumn('rankx',F.row_number().over(windowy)).filter(F.col('rankx')<=topn).withColumn('rk_start',F.lit(rk_start)).drop('rankx')
        #tfidf_cosine.count() ;     print(dtm.now()-t1) 
        # write to the table batch by batch
        tfidf_cosine.write.saveAsTable('{0}.tfidf_sim_on_product_name'.format(database_name),mode=modex)
        rk_start=rk_start+rk_step
        print(dtm.now()-t1) 
    return tfidf_cosine

if __name__=='__main__':
    confx=SparkConf().setAppName('4_word_semantic_similarity_on_prod_name')
    sc=SparkContext(conf=confx)
    sqlContext=HiveContext(sc)
    sc.setLogLevel("WARN")

    ## 4 tfidf similarity
    # 4.1 generate and store the tfidf vectors: already run in a separate .py file so memory can be released
      
    # 4.2 compute similarity; it is recommended to start a new .py file here so memory can be released
    tfidf_id_vec_df=sqlContext.sql('select * from {0}.term_id_tfidf_vec_in_cat'.format(database_name))
    tk_id=sqlContext.sql('select * from {0}.term_id_name'.format(database_name))
    
    tfidf_vec_df=tk_id.join(tfidf_id_vec_df,'term_id','inner').select('term_name','tfidf_vec','tfidf_vec_pos','rankx')
    tfidf_vec=tfidf_vec_df.toPandas()
    #import sys
    #sys.getsizeof(tfidf_vec)/1024.0/1024
    sparse_val=params['tfidf']['na_val']
    topn=params['tfidf']['topn']
Example #53
0
    "/var/test/spark-2.0.1/python/lib/pyspark.zip,/var/test/spark-2.0.1/python/lib/py4j-0.10.3-src.zip",
).config(
    "spark.yarn.dist.archives",
    "/var/test/spark-2.0.1/R/lib/sparkr.zip#sparkr,hdfs:////user/grp_gdoop_admin/anaconda/anaconda2_env.zip#ANACONDA",
).config("spark.executorEnv.PYTHONPATH",
         "pyspark.zip:py4j-0.10.3-src.zip").config(
             "spark.yarn.executor.memoryOverhead", "8192").config(
                 "spark.yarn.queue", "marketing_datascience").config(
                     "spark.sql.warehouse.dir",
                     warehouse_location).enableHiveSupport().getOrCreate())

spark.conf.set("spark.sql.orc.filterPushdown", "true")

sc = SparkContext.getOrCreate()

sqlContext = HiveContext(sc)

# paragraph 2 email function and dates


def send_mail(message, to, subject):
    msg = MIMEText(message)
    msg["From"] = "[email protected]"
    msg["To"] = to
    msg["Subject"] = subject
    p = Popen(["/usr/sbin/sendmail", "-t", "-oi"], stdin=PIPE)
    p.communicate(msg.as_string())


today = datetime.date.today()
edate = datetime.date.today() - relativedelta(days=2)
Example #54
0
    # Load config file
    with open(args.config_file, 'r') as ymlfile:
        cfg = yaml.safe_load(ymlfile)

    # load zookeeper paths and start kazooClient to load configs.
    cfg_zk = cfg["zookeeper"]
    zk = KazooClient(hosts=cfg_zk["zookeeper_hosts"])
    zk.start()

    # load gucdocs and keywords from the hive.
    gucdocs_table, stat = zk.get(cfg_zk["gucdocs_loaded_table"])
    keywords_table, stat = zk.get(cfg_zk["keywords_table"])

    sc = SparkContext.getOrCreate()
    sc.setLogLevel('WARN')
    hive_context = HiveContext(sc)

    command = "SELECT * FROM {}"
    df_gucdocs = hive_context.sql(command.format(gucdocs_table))

    command = "SELECT DISTINCT keyword FROM {}"
    df_keywords = hive_context.sql(command.format(keywords_table))
    keywords = [keyword['keyword'] for keyword in df_keywords.collect()]

    # load all the es configs for saving gucdocs to es.
    es_host, stat = zk.get(cfg_zk["es_host"])
    es_port, stat = zk.get(cfg_zk["es_port"])
    es_index_prefix, stat = zk.get(cfg_zk["es_index_prefix"])
    es_index = es_index_prefix + '-' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    es_type, stat = zk.get(cfg_zk["es_type"])
    es_starting_day, stat = zk.get(cfg_zk["es_starting_day"])
                w.blowing_out = self.isBlowingOut(w.windBearing, float(game.center_azimuth))
                try:
                    w.ozone = val.ozone
                except Exception:
                    w.ozone = self.domeVal['ozone']
                print "w=", w
            updates.append(w)

        self.saveWeather(updates)
        print "updates=", updates

if __name__ == '__main__':
    print "Starting.", datetime.now()
    sc = SparkContext()
    #sqlContext = SQLContext(sc)
    sqlContext = HiveContext(sc)

    games = sqlContext.read.parquet(CreateStatsRDD.rddDir + "/" + Games.table_name + ".parquet")
    games.registerTempTable("games")
    games.cache()
    print "games=", games
    print games.take(2)

    stadium = sqlContext.load(source="com.databricks.spark.csv", header="true", path = SRCDIR + "/stadium.csv")
    stadium.registerTempTable("stadium")
    stadium.cache()
    print "stadium=", stadium.take(2)
    
    weather = UpdateWeather(sc, sqlContext, games, stadium)
    weather.update()
# A simple demo for working with SparkSQL and Tweets
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("SparkSQLTwitter")
    sc = SparkContext()
    hiveCtx = HiveContext(sc)
    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    topTweets = hiveCtx.sql(
        "SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row: row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize(
        [Row(name="holden", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("happy_people")
    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    lengthSchemaRDD = hiveCtx.sql(
        "SELECT strLenPython(text) FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()
Example #57
0
import atexit
import os

import py4j.protocol

from pyspark import SparkContext
from pyspark.sql import HiveContext, SQLContext

# this is the deprecated equivalent of ADD_JARS
add_files = None
if os.environ.get("ADD_FILES") is not None:
    add_files = os.environ.get("ADD_FILES").split(',')

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri",
                                   os.environ["SPARK_EXECUTOR_URI"])

sc = SparkContext(pyFiles=add_files)
atexit.register(lambda: sc.stop())

try:
    # Try to access HiveConf, it will raise exception if Hive is not added
    sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
    sqlContext = HiveContext(sc)
except py4j.protocol.Py4JError:
    sqlContext = SQLContext(sc)
except TypeError:
    sqlContext = SQLContext(sc)

# for compatibility
sqlCtx = sqlContext

print("""Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version %s
      /_/
""" % sc.version)
Example #58
0
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext
from pandas import *

sc = SparkContext("local[*]", "RentalData")

ssc = StreamingContext(sc, 5)
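# Note: no DStream is attached to ssc here, so ssc.start() at the bottom would
# fail because no output operations are registered; the Kafka input hinted at
# by the spark-submit line at the end presumably lives elsewhere in the job.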

ss = SparkSession.builder \
 .appName(sc.appName) \
 .config("spark.sql.warehouse.dir",
             "/user/hive/warehouse") \
 .config("hive.metastore.uris",
                      "thrift://localhost:9083") \
 .enableHiveSupport() \
 .getOrCreate()

hive_context = HiveContext(sc)
df = hive_context.table("default.rentals")

df.toPandas().to_csv('mycsv.csv')
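# Note: toPandas() collects the entire default.rentals table to the driver, so
# this CSV export only works for tables that fit in driver memory.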

print(df)

ssc.start()
ssc.awaitTermination()

##spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.4 make_map.py
Example #59
0
from datetime import date, datetime
from decimal import Decimal

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import (
    FloatType,
    DoubleType,
    DecimalType,
    StringType,
    BooleanType,
    TimestampType,
    DateType,
    ArrayType,
    MapType,
)

conf = SparkConf().setAppName("spark_sql_datatype")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize(
    [
        (
            int("127"),
            int("32767"),
            int("2147483647"),
            long("9223372036854775807"),
            float("1.1"),
            float("2.2"),
            Decimal("3.3"),
            "str",
            bool(0),
            datetime(2015, 9, 22, 9, 39, 45),
            date(2015, 9, 22),
Example #60
0
from pyspark.sql import HiveContext, Row, SQLContext


def aggregate_tags_count(new_values, total_sum):
    return sum(new_values) + (total_sum or 0)


def get_sql_context_instance(spark_context):
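    # Lazily create a single SQLContext and cache it in globals() so the same
    # instance is reused for every micro-batch handled by process_rdd below.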

    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(spark_context)

    return globals()['sqlContextSingletonInstance']


from pyspark.sql.types import *
schema = StructType([])
sql_context = HiveContext(sc)
empty = sql_context.createDataFrame(sc.emptyRDD(), schema)
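# `empty` is a zero-row DataFrame with an empty schema, presumably kept as a
# fallback result for batches with no data; `sc` is assumed to be a
# SparkContext created elsewhere in the full script.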


def process_rdd(_, rdd):
    try:
        # Get spark sql singleton context from the current context
        sql_context = get_sql_context_instance(rdd.context)

        # convert the RDD to Row RDD
        row_rdd = rdd.map(lambda w: Row(hashtag=w[0], hashtag_count=w[1]))
        # create a DF from the Row RDD
        hashtags_df = sql_context.createDataFrame(row_rdd)
        # Register the dataframe as table
        hashtags_df.registerTempTable("hashtags")
        # get the top 10 hashtags from the table using SQL and print them