def get_context_test():
    conf = SparkConf()
    sc = SparkContext('local[1]', conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex_test""")
    sql_context.setConf("spark.sql.shuffle.partitions", "1")
    return sc, sql_context
def table_schema_from_spark(hcat_table_name):
    # returns schema of table with this database.name in hcatalog
    # (spark-workaround as long as hcatweb api is not available...)
    # initialize spark
    import findspark
    findspark.init()
    import pyspark
    from pyspark.sql import HiveContext
    sc_conf = pyspark.SparkConf()
    #sc_conf.set('spark.executor.extraClassPath', '/opt/cloudera/parcels/CDH/lib/hive/lib/*')
    #sc_conf.set('spark.master', 'yarn-client')
    sc = pyspark.SparkContext(appName='ade_get_table_schema', conf=sc_conf)
    hc = HiveContext(sc)
    hive_schema = hc.table(hcat_table_name).schema.jsonValue()
    print hive_schema
    sc.stop()
    table_schema = {'columns': {}}
    col_sequence = 0
    for field in hive_schema['fields']:
        table_schema['columns'][field['name']] = {'col_sequence': col_sequence, 'type': field['type']}
        col_sequence += 1
    return table_schema
def main():
    sc = SparkContext()
    hc = HiveContext(sc)
    df = hc.sql("""{{sql}}""")
    df_writer = DataFrameWriter(df)
    df_writer.saveAsTable(name='{{tableName}}', format='json', mode='overwrite',
                          path='s3://data/{{tableName}}')
def get_context():
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName=__file__, conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex""")
    sql_context.setConf("spark.sql.shuffle.partitions", "32")
    return sc, sql_context
def gen_report_table(hc, curUnixDay):
    # note: relies on a module-level SparkContext `sc`; only the HiveContext is passed in
    rows_indoor = sc.textFile("/data/indoor/*/*") \
        .map(lambda r: r.split(",")) \
        .map(lambda p: Row(clientmac=p[0], entityid=int(p[1]), etime=int(p[2]), ltime=int(p[3]),
                           seconds=int(p[4]), utoday=int(p[5]), ufirstday=int(p[6])))
    hc.createDataFrame(rows_indoor).registerTempTable("df_indoor")
    # ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY
    sql = "select entityid,clientmac,utoday,UFIRSTDAY,seconds,"
    sql = sql + "count(1) over(partition by entityid,clientmac) as total_cnt,"
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 2505600 preceding) as day_30,"  # 2505600 is 29 days
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 518400 preceding) as day_7,"  # 518400 is 6 days
    sql = sql + "count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY range 1 preceding) as pre_mon "
    sql = sql + "from df_indoor order by entityid,clientmac,utoday"
    df_id_stat = hc.sql(sql)
    df_id_mm = df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid", "clientmac"))) \
                         .withColumn("max", func.max("utoday").over(Window.partitionBy("entityid", "clientmac")))
    # df_id_mm (the min/max df) is used to calculate first arrival and last arrival
    df_id_stat_distinct = df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
    # the distinct df is needed for the lag function to work
    df_id_prepremon = df_id_stat_distinct.withColumn(
        "prepre_mon",
        func.lag("pre_mon").over(Window.partitionBy("entityid", "clientmac").orderBy("entityid", "clientmac", "UFIRSTDAY"))
    ).drop("pre_mon").na.fill(0)
    cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac,
               df_id_mm.entityid == df_id_prepremon.entityid,
               df_id_mm.UFIRSTDAY == df_id_prepremon.UFIRSTDAY]
    df_indoor_fin_tmp = df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(
        df_id_mm.entityid, df_id_mm.clientmac, df_id_mm.utoday, df_id_mm.UFIRSTDAY, df_id_mm.seconds,
        df_id_mm.day_30, df_id_mm.day_7, df_id_mm.min, df_id_mm.max, df_id_mm.total_cnt,
        df_id_prepremon.prepre_mon)
    df_indoor_fin_tmp = df_indoor_fin_tmp.selectExpr(
        "entityid as entityid", "clientmac as clientmac", "utoday as utoday", "UFIRSTDAY as ufirstday",
        "seconds as secondsbyday", "day_30 as indoors30", "day_7 as indoors7", "min as FirstIndoor",
        "max as LastIndoor", "total_cnt as indoors", "prepre_mon as indoorsPrevMonth")
    # newly added part for indoors7 and indoors30 based on current date
    df_indoor_fin_tmp1 = df_indoor_fin_tmp.withColumn("r_day_7", func.when((curUnixDay - df_indoor_fin_tmp.utoday) / 86400 < 7, 1).otherwise(0))
    df_indoor_fin_tmp2 = df_indoor_fin_tmp1.withColumn("r_day_30", func.when((curUnixDay - df_indoor_fin_tmp1.utoday) / 86400 < 30, 1).otherwise(0))
    df_indoor_fin_tmp3 = df_indoor_fin_tmp2.withColumn("r_indoors7", func.sum("r_day_7").over(Window.partitionBy("entityid", "clientmac")))
    df_indoor_fin_tmp4 = df_indoor_fin_tmp3.withColumn("r_indoors30", func.sum("r_day_30").over(Window.partitionBy("entityid", "clientmac")))
    df_indoor_fin = df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30")
    hc.sql("drop table if exists df_indoor_fin")
    df_indoor_fin.write.saveAsTable("df_indoor_fin")

    rows_flow = sc.textFile("/data/flow/*/*") \
        .map(lambda r: r.split(",")) \
        .map(lambda p: Row(clientmac=p[0], entityid=int(p[1]), etime=int(p[2]), ltime=int(p[3]),
                           utoday=int(p[4]), ufirstday=int(p[5])))
    hc.createDataFrame(rows_flow).registerTempTable("df_flow")
    # ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday
    sql = "select entityid,clientmac,utoday,UFIRSTDAY,"
    sql = sql + "count(1) over(partition by entityid,clientmac) as total_cnt,"
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 2505600 preceding) as day_30,"  # 2505600 is 29 days
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 518400 preceding) as day_7,"  # 518400 is 6 days
    sql = sql + "count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY range 1 preceding) as pre_mon "
    sql = sql + "from df_flow order by entityid,clientmac,utoday"
    df_fl_stat = hc.sql(sql)
    df_fl_mm = df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid", "clientmac"))) \
                         .withColumn("max", func.max("utoday").over(Window.partitionBy("entityid", "clientmac")))
    # df_fl_mm (the min/max df) is used to calculate first arrival and last arrival
    df_fl_stat_distinct = df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
    # the distinct df is needed for the lag function to work
    df_fl_prepremon = df_fl_stat_distinct.withColumn(
        "prepre_mon",
        func.lag("pre_mon").over(Window.partitionBy("entityid", "clientmac").orderBy("entityid", "clientmac", "UFIRSTDAY"))
    ).drop("pre_mon").na.fill(0)
    cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac,
               df_fl_mm.entityid == df_fl_prepremon.entityid,
               df_fl_mm.UFIRSTDAY == df_fl_prepremon.UFIRSTDAY]
    df_flow_fin = df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(
        df_fl_mm.entityid, df_fl_mm.clientmac, df_fl_mm.utoday, df_fl_mm.UFIRSTDAY,
        df_fl_mm.day_30, df_fl_mm.day_7, df_fl_mm.min, df_fl_mm.max, df_fl_mm.total_cnt,
        df_fl_prepremon.prepre_mon)
    df_flow_fin = df_flow_fin.selectExpr(
        "entityid as entityid", "clientmac as clientmac", "utoday as utoday", "UFIRSTDAY as ufirstday",
        "day_30 as visits30", "day_7 as visits7", "min as FirstVisit", "max as LastVisit",
        "total_cnt as visits", "prepre_mon as visitsPrevMonth")
    hc.sql("drop table if exists df_flow_fin")
    df_flow_fin.write.saveAsTable("df_flow_fin")
def query12_no(query_name, conf=None):
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
    # SQL statements can be run by using the sql methods provided by sqlContext
    sql = "use tpcds_text_db_1_50"
    _ = sqlContext.sql(sql)
    output = execute_sql(query_name, sqlContext)
    output['describe'] = output['output'].describe().show()
    sc.stop()
    return output
def query12_input(query_name, conf=None, output_persist=False):
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
    # SQL statements can be run by using the sql methods provided by sqlContext
    sql = "use tpcds_text_db_1_50"
    _ = sqlContext.sql(sql)
    # web_sales_sql = "select * from web_sales"
    # web_sales = sqlContext.sql(web_sales_sql)
    # web_sales.persist()
    # web_sales.registerAsTable("web_sales")
    # item_sql = "select * from item"
    # item = sqlContext.sql(item_sql)
    # item.persist()
    # item.registerAsTable("item")
    # date_dim_sql = "select * from date_dim"
    # date_dim = sqlContext.sql(date_dim_sql)
    # date_dim.persist()
    # date_dim.registerAsTable("date_dim")
    sqlContext.cacheTable("web_sales")
    sqlContext.cacheTable("item")
    sqlContext.cacheTable("date_dim")
    # discard the first query
    output = execute_sql(query_name, sqlContext, output_persist)
    # check the re-run statistics
    output = execute_sql(query_name, sqlContext)
    output['describe'] = output['output'].describe().show()
    sc.stop()
    return output
def _test():
    import doctest
    import os
    import tempfile
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext, HiveContext
    import pyspark.sql.readwriter

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')

    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['hiveContext'] = HiveContext._createForTesting(sc)
    globs['df'] = \
        globs['sqlContext'].read.format('text').stream('python/test_support/sql/streaming')

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.readwriter, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
def __init__(self, sc, debug=False):
    self.export_path = os.environ['COOPERHEWITT_ROOT'] + "/export/"
    self.sc = sc
    # hive requires writable permissions: ~/ephemeral-hdfs/bin/hadoop fs -chmod 777 /tmp/hive
    self.hive_cxt = HiveContext(sc)
    self.sql_cxt = SQLContext(sc)
    if debug:
        print "{0}\n{1}\n{2}\n".format(sc.master, self.hive_cxt, self.sql_cxt)
        print sc._conf.getAll()
def run(self):
    sc = SparkContext("local", "Course Activity")
    # sqlHC is the SQLHiveContext
    sqlHC = HiveContext(sc)

    lines = sqlHC.sql("""
        select courseName, lmsUserId, createDateTime, eventType, eventName, eventNo
        from logdata
        where eventType not in ('enrollment', 'instructor', 'admin')
          and lmsUserId is not NULL
          and courseName is not NULL
          and eventNo is not NULL
        limit 10""")

    maplvl1 = lines.flatMap(lambda p: mapp(p[0], str(p[1]), p[2].strftime('%Y-%m-%d'), p[4]))
    reduceRDD = maplvl1.reduceByKey(lambda a, b: a + b)

    with self.output().open('w') as out_file:
        for line in reduceRDD.collect():
            out_file.write(line[0][0] + "\x01" + line[0][1] + "\x01" + line[0][2] + "\x01" +
                           line[0][3] + "\x01" + str(line[1]) + "\n")
def ch9_sql():
    # Import Spark SQL
    from pyspark.sql import HiveContext, Row
    # Or if you can't include the hive requirements
    from pyspark.sql import SQLContext, Row

    hiveCtx = HiveContext(sc)
    input_file = hiveCtx.read.json("testweet.json")
    # Register the input_file schema RDD
    input_file.registerTempTable("tweets")
    # Select tweets based on the retweetCount
    topTweets = hiveCtx.sql("""SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10""")
    topTweetText = topTweets.map(lambda row: row.text)
    topTweetText.collect()
    topTweets.schema
    hiveCtx.cacheTable("tweets")
def run(inpath, outpath, mode='append'):
    gc.disable()
    print("===== Checking if Log Exists =====")
    check_log(inpath)
    print("===== Pass Log Checking =====")
    # initial SparkContext
    conf = SparkConf().setAppName("Forgate Log Parser")
    sc = SparkContext(conf=conf)
    sqlCtx = HiveContext(sc)
    start_time = time.time()
    print("===== INPUT FILE PATH: %s =====" % (str(inpath)))
    print("===== OUTPUT FILE PATH: %s =====" % (str(outpath)))
    print("===== %s Reading Data From HDFS" % (now()))
    distFile = sc.textFile(inpath)
    cnt_raw = distFile.count()
    print("===== Count of Input Data: %s =====" % (str(cnt_raw)))
    print("===== %s Parsing Data" % (now()))
    parsedData = parse_data(sc, distFile)
    print("===== Count of Parsed Data: %s =====" % (str(parsedData.count())))
    print("===== %s Saving Data" % (now()))
    jsonData = sqlCtx.jsonRDD(parsedData)
    old_col = ['time', 'date']
    new_col = ['time_', 'dt']
    jsonData = rename_column(jsonData, old_col, new_col)
    jsonData.write.partitionBy('dt').parquet(outpath, mode=mode)
    print("===== %s Checking Data" % (now()))
    confirm_row(sqlCtx, outpath)
    write_log(inpath)
    print("---Total took %s seconds ---" % (time.time() - start_time))
    sc.stop()
    gc.enable()
    without this change they would appear as u'Foo' and 'Foo' in the final key
    string. Although python doesn't care about this difference, hadoop does,
    and will bucket the values separately. Which is not what we want.
    """
    # TODO: refactor this into a utility function and update jobs
    # to always UTF8 encode mapper keys.
    if len(values) > 1:
        return tuple([value.encode('utf8') for value in values])
    else:
        return values[0].encode('utf8')

sc = SparkContext("local", "Course Activity")
# sqlHC is the SQLHiveContext
sqlHC = HiveContext(sc)

lines = sqlHC.sql("""
    select courseName, lmsUserId, createDateTime, eventType, eventName, eventNo
    from logdata
    where eventType not in ('enrollment', 'instructor', 'admin')
      and lmsUserId is not NULL
      and courseName is not NULL
      and eventNo is not NULL
    limit 100""")

maplvl1 = lines.flatMap(lambda p: mapp(p[0], str(p[1]), p[2].strftime('%Y-%m-%d'), p[4]))

for linet in maplvl1.collect():
    print linet

reduceRDD = maplvl1.reduceByKey(lambda a, b: a + b)
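# Hypothetical illustration (not part of the original job): a standalone helper
# `encode_key` mirroring the encoding logic above, showing that a unicode key and a
# plain byte-string key become identical once UTF-8 encoded, so Hadoop buckets them
# together instead of separately.
def encode_key(values):
    if len(values) > 1:
        return tuple([value.encode('utf8') for value in values])
    else:
        return values[0].encode('utf8')

assert encode_key((u'Foo',)) == encode_key(('Foo',))  # both become the byte string 'Foo'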
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.mllib.classification import SVMWithSGD, SVMModel, LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.functions import col, sum
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics
from copy import deepcopy

sc = SparkContext()
sqlContext = HiveContext(sc)

qry = """SELECT *,
       white/population as white_percent,
       black/population as black_percent,
       asian/population as asian_percent,
       pacific_islander/population as pi_percent,
       other_race/population as other_race_percent,
       multiple_race/population as multiple_percent,
       hispanic/population as hispanic_percent
FROM census_rest_success
where days_open > 365"""

df = sqlContext.sql(qry)

## Let's train a Support Vector Classifier on this data

# CITATION:
# http://stackoverflow.com/questions/33900726/count-number-of-non-nan-entries-in-each-column-of-spark-dataframe-with-pyspark
def count_not_null(c):
    return sum(col(c).isNotNull().cast("integer")).alias(c)
def run(cfg, yesterday, model_name, model_version, serving_url):
    # os.environ['PYSPARK_SUBMIT_ARGS'] = \
    #     '--jars /home/reza/eshadoop/elasticsearch-hadoop-6.5.2/dist/elasticsearch-hadoop-6.5.2.jar pyspark-shell'
    es_write_conf = {"es.nodes": cfg['es_host'],
                     "es.port": cfg['es_port'],
                     "es.resource": cfg['es_predictions_index'] + '/' + cfg['es_predictions_type'],
                     "es.batch.size.bytes": "1000000",
                     "es.batch.size.entries": "100",
                     "es.input.json": "yes",
                     "es.mapping.id": "uckey",
                     "es.nodes.wan.only": "true",
                     "es.write.operation": "upsert"}

    sc = SparkContext()
    hive_context = HiveContext(sc)
    forecaster = Forecaster(cfg)
    sc.setLogLevel(cfg['log_level'])

    # Reading the max bucket_id
    bucket_size = cfg['bucket_size']
    bucket_step = cfg['bucket_step']
    factdata = cfg['factdata']
    distribution_table = cfg['distribution_table']
    norm_table = cfg['norm_table']
    traffic_dist = cfg['traffic_dist']

    model_stats = get_model_stats(cfg, model_name, model_version)

    # Read dist
    command = """
        SELECT DIST.uckey, DIST.ratio, DIST.cluster_uckey, DIST.price_cat
        FROM {} AS DIST
        """.format(distribution_table)
    df_dist = hive_context.sql(command)

    # Read norm table
    # DataFrame[uckey: string, ts: array<int>, p: float, a__n: float, a_1_n: float, a_2_n: float, a_3_n: float, a_4_n: float, a_5_n: float, a_6_n: float, t_UNKNOWN_n: float, t_3G_n: float, t_4G_n: float, t_WIFI_n: float, t_2G_n: float, g__n: float, g_g_f_n: float, g_g_m_n: float, g_g_x_n: float, price_cat_1_n: float, price_cat_2_n: float, price_cat_3_n: float, si_vec_n: array<float>, r_vec_n: array<float>, p_n: float, ts_n: array<float>]
    command = """
        SELECT uckey AS cluster_uckey, price_cat,
               a__n,a_1_n,a_2_n,a_3_n,a_4_n,a_5_n,a_6_n,
               t_UNKNOWN_n,t_3G_n,t_4G_n,t_WIFI_n,t_2G_n,
               g__n, g_g_f_n, g_g_m_n, g_g_x_n,
               price_cat_1_n, price_cat_2_n, price_cat_3_n,
               si_vec_n, r_vec_n
        FROM {}
        """.format(norm_table)
    df_norm = hive_context.sql(command)

    # df_norm = df_norm.groupBy('cluster_uckey', 'a__n', 'a_1_n', 'a_2_n', 'a_3_n', 'a_4_n', 'a_5_n', 'a_6_n', 't_UNKNOWN_n',
    #                           't_3G_n', 't_4G_n', 't_WIFI_n', 't_2G_n', 'g__n', 'g_g_f_n', 'g_g_m_n', 'g_g_x_n', 'si_vec_n').count().drop('count')

    # create day_list from yesterday for train_window
    duration = model_stats['model']['duration']
    day = datetime.strptime(yesterday, '%Y-%m-%d')
    day_list = []
    for _ in range(0, duration):
        day_list.append(datetime.strftime(day, '%Y-%m-%d'))
        day = day + timedelta(days=-1)
    day_list.sort()

    df_prediction_ready = None
    df_uckey_cluster = None
    start_bucket = 0

    while True:
        end_bucket = min(bucket_size, start_bucket + bucket_step)
        if start_bucket > end_bucket:
            break

        # Read factdata table
        command = """
        SELECT FACTDATA.count_array, FACTDATA.day, FACTDATA.hour, FACTDATA.uckey
        FROM {} AS FACTDATA
        WHERE FACTDATA.bucket_id BETWEEN {} AND {}
        """.format(factdata, str(start_bucket), str(end_bucket))

        start_bucket = end_bucket + 1

        df = hive_context.sql(command)

        # [Row(count_array=[u'1:504'], day=u'2019-11-02', hour=2, uckey=u'magazinelock,04,WIFI,g_m,1,CPM,78', hour_price_imp_map={2: [u'1:504']})]
        df = df.withColumn('hour_price_imp_map', expr("map(hour, count_array)"))

        # [Row(uckey=u'native,68bcd2720e5011e79bc8fa163e05184e,4G,g_m,2,CPM,19', day=u'2019-11-02', hour_price_imp_map_list=[{15: [u'3:3']}, {7: [u'3:5']}, {10: [u'3:3']}, {9: [u'3:1']}, {16: [u'3:2']}, {22: [u'3:11']}, {23: [u'3:3']}, {18: [u'3:7']}, {0: [u'3:4']}, {1: [u'3:2']}, {19: [u'3:10']}, {8: [u'3:4']}, {21: [u'3:2']}, {6: [u'3:1']}])]
        df = df.groupBy('uckey', 'day').agg(
            collect_list('hour_price_imp_map').alias('hour_price_imp_map_list'))

        # [Row(uckey=u'native,68bcd2720e5011e79bc8fa163e05184e,4G,g_m,2,CPM,19', day=u'2019-11-02', day_price_imp=[u'3:58'])]
        df = df.withColumn('day_price_imp', udf(
            sum_count_array, ArrayType(StringType()))(df.hour_price_imp_map_list)).drop('hour_price_imp_map_list')

        # [Row(uckey=u'native,68bcd2720e5011e79bc8fa163e05184e,4G,g_m,2,CPM,19', day=u'2019-11-02', day_price_imp=[u'3:58'], day_price_imp_map={u'2019-11-02': [u'3:58']})]
        df = df.withColumn('day_price_imp_map', expr("map(day, day_price_imp)"))

        # [Row(uckey=u'native,z041bf6g4s,WIFI,g_f,1,CPM,71', day_price_imp_map_list=[{u'2019-11-02': [u'1:2', u'2:261']}, {u'2019-11-03': [u'2:515']}])]
        df = df.groupBy('uckey').agg(collect_list(
            'day_price_imp_map').alias('day_price_imp_map_list'))

        # [Row(uckey=u'native,z041bf6g4s,WIFI,g_f,1,CPM,71', day_price_imp_map_list=[{u'2019-11-02': [u'1:2', u'2:261']}, {u'2019-11-03': [u'2:515']}], ratio=0.09467455744743347, cluster_uckey=u'892', price_cat=u'1')]
        df = df.join(df_dist, on=['uckey'], how='inner')

        # df_uckey_cluster keeps the ratio and cluster_key for only uckeys that are being processed
        if not df_uckey_cluster:
            df_uckey_cluster = df.select('uckey', 'cluster_uckey', 'ratio', 'price_cat')
            df_uckey_cluster.cache()
        else:
            df_uckey_cluster = df.select('uckey', 'cluster_uckey', 'ratio', 'price_cat').union(df_uckey_cluster)
            df_uckey_cluster.cache()

        # [Row(cluster_uckey=u'2469', price_cat=u'2', cluster_day_price_imp_list=[[{u'2019-11-02': [u'2:90']}, {u'2019-11-03': [u'2:172']}]])]
        df = df.groupBy('cluster_uckey', 'price_cat').agg(
            collect_list('day_price_imp_map_list').alias('cluster_day_price_imp_list'))

        df = df.withColumn('ts', udf(sum_day_count_array,
                                     ArrayType(MapType(StringType(), ArrayType(StringType()))))(df.cluster_day_price_imp_list))

        # [Row(cluster_uckey=u'2469', price_cat=u'2', ts=[{u'2019-11-02': [u'2:90'], u'2019-11-03': [u'2:172']}])]
        df = df.drop('cluster_day_price_imp_list')

        if not df_prediction_ready:
            df_prediction_ready = df
            df_prediction_ready.cache()
        else:
            df = df_prediction_ready.union(df)
            df = df.groupBy('cluster_uckey', 'price_cat').agg(
                collect_list('ts').alias('ts_list'))
            df = df.withColumn('ts', udf(sum_day_count_array,
                                         ArrayType(MapType(StringType(), ArrayType(StringType()))))(df.ts_list))
            df = df.drop('ts_list')

            # [Row(cluster_uckey=u'magazinelock,03,WIFI,g_f,1,CPM,60', ts=[{u'2019-11-02': [u'1:2']}])]
            df_prediction_ready = df
            df_prediction_ready.cache()

    # [Row(cluster_uckey=u'1119', price_cat=u'2', ts=[{u'2019-11-02': [u'1:862', u'3:49', u'2:1154'], u'2019-11-03': [u'1:596', u'3:67', u'2:1024']}])]
    df = df_prediction_ready

    df = df.join(df_norm, on=['cluster_uckey', 'price_cat'], how='inner')

    # [Row(cluster_uckey=u'1119', price_cat=u'2', ts=[{u'2019-11-02': [u'1:862', u'3:49', u'2:1154'], u'2019-11-03': [u'1:596', u'3:67', u'2:1024']}],
    #      a__n=-0.005224577616900206, a_1_n=0.6089736819267273, a_2_n=-0.21013110876083374, a_3_n=0.16884993016719818,
    #      a_4_n=-0.3416250944137573, a_5_n=0.15184317529201508, a_6_n=-0.16529197990894318, t_UNKNOWN_n=-0.4828081429004669,
    #      t_3G_n=1.2522615194320679, t_4G_n=-0.15080969035625458, t_WIFI_n=-0.35078370571136475, t_2G_n=1.991615653038025,
    #      g__n=-0.08197031915187836, g_g_f_n=0.010901159606873989, g_g_m_n=-0.21557298302650452, g_g_x_n=1.4449801445007324,
    #      price_cat_1_n=-1.2043436765670776, price_cat_2_n=1.885549783706665, price_cat_3_n=-0.48205748200416565,
    #      si_vec_n=[-0.20294927060604095, ..., 0.6114567518234253], r_vec_n=[0.0, ..., 0.0],
    #      uckey=u'native,66bcd2720e5011e79bc8fa163e05184e,WIFI,g_m,5,CPC,5', ratio=0.11989551782608032)]
    df = df.join(df_uckey_cluster, on=['cluster_uckey', 'price_cat'], how='inner')

    predictor_udf = udf(transform.predict_daily_uckey(days=day_list,
                                                      serving_url=serving_url,
                                                      forecaster=forecaster,
                                                      model_stats=model_stats,
                                                      columns=df.columns),
                        MapType(StringType(), FloatType()))

    df = df.withColumn('day_prediction_map',
                       predictor_udf(struct([df[name] for name in df.columns])))

    # [Row(cluster_uckey=u'1119', price_cat=u'2', day_prediction_map={u'2019-11-02': 220.0, u'2019-11-03': 305.0}, ratio=0.11989551782608032, uckey=u'native,66bcd2720e5011e79bc8fa163e05184e,WIFI,g_m,5,CPC,5')]
    df = df.select('cluster_uckey', 'price_cat', 'day_prediction_map', 'ratio', 'uckey')

    # [Row(ucdoc_elements=Row(price_cat=u'2', ratio=0.11989551782608032, day_prediction_map={u'2019-11-02': 220.0, u'2019-11-03': 305.0}), uckey=u'native,66bcd2720e5011e79bc8fa163e05184e,WIFI,g_m,5,CPC,5')]
    ucdoc_elements_type = StructType([StructField('price_cat', StringType(), False),
                                      StructField('ratio', FloatType(), False),
                                      StructField('day_prediction_map', MapType(StringType(), FloatType()), False)])
    df = df.withColumn('ucdoc_elements_pre_price_cat',
                       udf(lambda price_cat, ratio, day_prediction_map: (price_cat, ratio, day_prediction_map),
                           ucdoc_elements_type)(df.price_cat, df.ratio, df.day_prediction_map)).select('ucdoc_elements_pre_price_cat', 'uckey')

    # [Row(uckey=u'splash,d971z9825e,WIFI,g_m,1,CPT,74', ucdoc_elements=[Row(price_cat=u'1', ratio=0.5007790923118591, day_prediction_map={u'2019-11-02': 220.0, u'2019-11-03': 305.0})])]
    df = df.groupBy('uckey').agg(collect_list('ucdoc_elements_pre_price_cat').alias('ucdoc_elements'))

    df = df.withColumn('prediction_output',
                       udf(transform.generate_ucdoc(traffic_dist), StringType())(df.uckey, df.ucdoc_elements))

    df_predictions_doc = df.select('uckey', 'prediction_output')
    rdd = df_predictions_doc.rdd.map(lambda x: transform.format_data(x, 'ucdoc'))
    rdd.saveAsNewAPIHadoopFile(
        path='-',
        outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
        conf=es_write_conf)

    sc.stop()
def main():
    # set up the logger
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'mpg_cluster.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # NSJOIN dayidx # only partitioned by DAY
    day_idx = beeline.get_last_partitions('mapper.nsjoin').split('=')[1]
    # BAREBONES dayidx # only partitioned by DAY
    day_bb = [x for x in beeline.show_partitions('mapper.barebones').split('\n') if '=%s' % (day_idx) in x]
    # MAPPOINTS dayidx # partitioned by DAY and UUID (pick the last uuid)
    mappoints_data = sorted([x for x in beeline.show_partitions('mapper.mappoints').split('\n') if '=%s' % (day_idx) in x])[-1].split('/')
    [day_mps, uuid_idx] = [x.split('=')[1] for x in mappoints_data]

    if day_idx != day_mps:
        logger.error('mapper.mappoints and mapper.nsjoin different day, possible data missing in the source.')
        return

    if len(day_bb) == 0:
        logger.warning('mapper.barebone data missing for this particular day.')
        #return

    logger.info('Processing data in day=%s, uuid=%s' % (day_idx, uuid_idx))

    logger.info('begin spark process.')
    getting_mappoint_data = ''' select b1.mpgid mpgid, b1.lat lat, b1.lon lon, b1.country country, b1.mpgload mpgload, b1.allowed_private_regions allowed_private_regions, b2.asnum asnum, b2.ip ip from (select mpgid, lat, lon, country, mpgload, allowed_private_regions from mapper.mappoints where day=%s and uuid="%s" and lat is not NULL and lon is not NULL and ghostonly=0 ) b1 left outer join (select collect_set(ns_ip) ip, collect_set(asnum) asnum, mpgid from (select ns_ip, mpd_uuid, mpgid, asnum, demand, day from mapper.nsjoin where day=%s and mpd_uuid="%s" and demand>0.01 order by demand desc) a group by mpgid) b2 on b2.mpgid=b1.mpgid ''' % (day_idx, uuid_idx, day_idx, uuid_idx)

    geo_total_cap_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='private' group by country, network) a ''' % day_idx

    geo_total_cap_public_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='public' group by country, network) a ''' % day_idx

    sc = SparkContext()
    hiveCtx = HiveContext(sc)

    rows = hiveCtx.sql(getting_mappoint_data)
    regInfoRows = hiveCtx.sql('select * from mapper.regioncapday where day=%s and peak_bitcap_mbps is not null and peak_flitcap_mfps is not null' % (day_idx))
    geo_total_cap = hiveCtx.sql(geo_total_cap_query)
    geo_total_cap_p = hiveCtx.sql(geo_total_cap_public_query)

    # rdd format: [regionid, [mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load, mpg-asnum, mpg-nsip]]
    region_mpginfo_pair = rows.map(lambda x: [[x.mpgid, x.lat, x.lon, x.country, x.mpgload, x.asnum, x.ip],
                                              x.allowed_private_regions])\
        .flatMapValues(lambda x: x)\
        .map(lambda x: [x[1], x[0]])

    #region_mpginfo_pair.first()

    # rdd format: [regionid, [reg-lat, reg-lon, reg-capacity(bit mbps), reg-capacity(bit mfps), reg-country, reg-numvips, reg-service, reg-prp]]
    # ps. prp=1: private, prp=0: public
    region_latlon = regInfoRows.map(lambda x: [x.region, [x.latitude,
                                                          x.longitude,
                                                          x.peak_bitcap_mbps,
                                                          x.peak_flitcap_mfps,
                                                          x.country,
                                                          x.numvips,
                                                          'W' if x.network == 'freeflow' else ('S' if x.network == 'essl' else 'O'),
                                                          1 if x.prp == 'private' else 0]])\
        .filter(lambda x: x[1][6] == 'W' or x[1][6] == 'S')

    region_public_list = region_latlon\
        .filter(lambda x: x[1][7] == 0)\
        .map(lambda x: ('all', [[x[0]]]))\
        .reduceByKey(lambda a, b: [a[0] + b[0]])\
        .map(lambda x: x[1][0]).collect()

    region_public_list = [0] + sorted(region_public_list[0])

    # dummy region
    rdd2 = sc.parallelize([([0, [0, 0, 0.0, 0.0, 'US', 0, 'W', 1]])])
    region_latlon = region_latlon.union(rdd2)

    # perform the join into tuple of (K, (V1, V2):
    # (regionid, ([mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load], [reg-lat, reg-lon, reg-cap, reg-country, reg-numvips, reg-service]))
    # rdd = (mpgid, regionid, [lat1, lon1, lat2, lon2, distance],
    #        reg-cap-bit(gbps), reg-cap-flit(gbps), reg-country, reg-numvips, reg-services,
    #        mpg-country, mpg-load, mpg-asnum, mpg-nsip,
    #        mpg-lat, mpg-lon)
    mpgid_reg_geo = region_mpginfo_pair.join(region_latlon).map(
        lambda x: [x[1][0][0],
                   x[0],
                   geodesic_distance(x[1][0][1], x[1][0][2], x[1][1][0], x[1][1][1]),
                   round(float(x[1][1][2])/1000.0, 3),
                   round(float(x[1][1][3])/1000.0, 3),
                   x[1][1][4],  # reg-country
                   x[1][1][5],  # reg-numvips
                   x[1][1][6],  # reg-services
                   x[1][0][3],
                   x[1][0][4],
                   x[1][0][5],
                   x[1][0][6],
                   x[1][0][1],
                   x[1][0][2]])

    # filtering on mapping distance < 500 miles
    # filtering on reg-country = mpg-country
    # filtering on region capacity fbps > 1Gbps
    # rdd format = (mpgid, [[regionid], distance, [capacity-w, capacity-s], numvips, 1, mpg-country, mpg-load, mpg-asnum, mpg-nsip,
    #               mpg-lat, mpg-lon])
    #mpgid_reg_distance = mpgid_reg_geo.filter(lambda x: x[2][4] < 500)\
    #    .filter(lambda x: x[5] == x[8])\
    #    .filter(lambda x: x[3] > 1)\
    #    .map(lambda x: (x[0], [[x[1]], x[2][4], [x[3], 0] if x[7]=='W' else [0, x[3]], x[6], 1, x[8], x[9], x[10], x[11], x[12], x[13]]))

    # or this one, no-same-country constraint:
    mpgid_reg_distance = mpgid_reg_geo.filter(lambda x: (x[2][4] < 500) or (x[5] == x[8] and x[2][4] < 1000))\
        .filter(lambda x: x[3] > 1)\
        .map(lambda x: (x[0], [[x[1]], x[2][4], [x[3], 0] if x[7] == 'W' else [0, x[3]], x[6], 1, x[8], x[9], x[10], x[11], x[12], x[13]]))

    #mpgid_reg_distance.first()

    # group by mpgid
    # rdd format = (mpgid, [[reg-list],
    #                       avg_distance,
    #                       total_cap freeflow,
    #                       total_cap essl,
    #                       total num vips,
    #                       rg_count,
    #                       mpg-country,
    #                       mpg-load,
    #                       [mpg-asnum],
    #                       [mpg-nsip])
    mpgid_reglist_avgDistance_capacity_nReg = mpgid_reg_distance\
        .reduceByKey(lambda a, b: [a[0]+b[0], a[1]+b[1], [a[2][0]+b[2][0], a[2][1]+b[2][1]],
                                   a[3]+b[3], a[4]+b[4], a[5], a[6], a[7], a[8], a[9], a[10]])\
        .map(lambda x: (x[0], [sorted(x[1][0]),  # region_list
                               round(x[1][1]/x[1][4], 2),  # avg distance
                               round(x[1][2][0], 2),  # total capacity - w
                               round(x[1][2][1], 2),  # total capacity - s
                               x[1][3],  # numvips
                               x[1][4],  # total region count
                               x[1][5],  # mpg country
                               x[1][6],  # mpg load
                               x[1][7],  # mpg asnum
                               x[1][8],  # mpg nsip
                               x[1][9],  # mpg lat
                               x[1][10]]))  # mpg lon

    # disable the count
    #total_mpg_with_region = mpgid_reglist_avgDistance_capacity_nReg.count()

    # rdd format = (reg, [(reg-list), [[mpg-list], avg_distance, total_cap_w, total_cap_s, total_numvips
    #               reg-count, cluster_country, mpg-load, mpg-count, mpg-lat, mpg-lon]])
    reg_reglist_mpgid_avgDistance_capacity_nReg_country = mpgid_reglist_avgDistance_capacity_nReg\
        .map(lambda x: (tuple(x[1][0]), [[x[0]],  # mpgid list
                                         x[1][1],  # avg_distance
                                         x[1][2],  # region total capacity freeflow
                                         x[1][3],  # region total capacity essl
                                         x[1][4],  # total num vips
                                         x[1][5],  # total region count
                                         [x[1][6]],  # mpg country list
                                         x[1][7],  # mpg load
                                         1,  # mpg-count
                                         x[1][8] if x[1][8] else [],  # [mpg-asnum]
                                         x[1][9] if x[1][9] else [],  # [mpg-nsip]
                                         [x[1][10]],  # [mpg-lat] # single element array
                                         [x[1][11]],  # [mpg-lon] # single element array
                                         [x[1][7]]  # [mpg-load] # single element array
                                         ]))\
        .reduceByKey(lambda a, b: [a[0]+b[0], a[1], a[2], a[3], a[4], a[5],
                                   a[6]+b[6], a[7]+b[7], a[8]+b[8], a[9]+b[9],
                                   a[10]+b[10], a[11]+b[11], a[12]+b[12], a[13]+b[13]])\
        .filter(lambda x: sum(x[1][13]) > 0.0001)\
        .map(lambda x: (x[0], [sorted(x[1][0]),  # mpgid list
                               x[1][1],  # avg_distance
                               x[1][2],  # reg-cap-w
                               x[1][3],  # reg-cap-s
                               x[1][4],  # numvips
                               x[1][5],  # reg-count
                               [str(y) for y in sorted(list(set(x[1][6])))],  # mpg-country list
                               x[1][7],  # mpg-load
                               x[1][8],  # mpg-count
                               [str(y) for y in sorted(list(set(x[1][9])))],  # [mpg-asnum]
                               [str(y) for y in sorted(list(set(x[1][10])))],  # [mpg-nsip]
                               geo_centroid(x[1][11], x[1][12], x[1][13])  # [mpg: lat, lon, por, porsigma]
                               ]))\
        .map(lambda x: ([':'.join([str(y) for y in list(x[1][6])]),  # [mpg-country list]
                         x[1][1],  # avg_distance
                         x[1][2],  # reg-cap-w
                         x[1][3],  # reg-cap-s
                         x[1][4],  # numvips
                         x[1][5],  # reg-count
                         x[1][7],  # mpg-load
                         x[1][8],  # mpg-count
                         ':'.join([str(y) for y in x[0]]),  # [region-list]
                         ':'.join([str(y) for y in list(x[1][0])]),  # [mpg-list]
                         ':'.join([str(y) for y in x[1][9]]) if len(x[1][9]) > 0 else 'NULL',  # [mpg-asnum]
                         ':'.join([str(y) for y in x[1][10]]) if len(x[1][10]) > 0 else 'NULL',  # [mpg-nsip]
                         x[1][11]  # [mpg-lat, mpg-lon, mpg-por, mpg-porsigma]
                         ], region_public_list))\
        .flatMapValues(lambda x: x)\
        .map(lambda x: [x[1], x[0]])

    reglist_mpgid_avgDistance_capacity_nReg_country = reg_reglist_mpgid_avgDistance_capacity_nReg_country\
        .join(region_latlon)\
        .map(lambda x: [x[1][0]] + [x[1][1]] + [geodesic_distance(x[1][0][12][0],
                                                                  x[1][0][12][1],
                                                                  x[1][1][0],
                                                                  x[1][1][1])] + [x[0]] if x[0] > 0
             else [x[1][0]] + [x[1][1]] + [[x[1][0][12][0],
                                            x[1][0][12][1],
                                            x[1][1][0],
                                            x[1][1][1],
                                            0.0]] + [x[0]])\
        .filter(lambda x: x[2][4] < 500)\
        .map(lambda x: (tuple([x[0][0], x[0][1], x[0][2], x[0][3], x[0][4], x[0][5], x[0][6], x[0][7],
                               x[0][8], x[0][9], x[0][10], x[0][11],
                               x[0][12][0], x[0][12][1], x[0][12][2], x[0][12][3]]),  # mpg-information
                        [x[1][2],  # pub.region.cap.ff
                         x[1][3],  # pub.region.cap.essl
                         x[1][5],  # pub.region.vip
                         [x[3]]  # single element region id
                         ]))\
        .reduceByKey(lambda a, b: [a[0]+b[0],  # sum( pub.region.cap.ff )
                                   a[1]+b[1],  # sum( pub.region.cap.essl )
                                   a[2]+b[2],  # sum( pub.region.cap.vip )
                                   a[3]+b[3]  # [pub.regions]
                                   ])\
        .map(lambda x: [x[0][0],  # [mpg-country-list]
                        x[0][1],  # avg-distance
                        x[0][12],  # mpg-lat
                        x[0][13],  # mpg-lon
                        x[0][14],  # mpg-por
                        x[0][15],  # mpg-porsigma
                        x[0][2],  # pri.region.cap.ff (gbps)
                        x[0][3],  # pri.region.cap.essl (gbps)
                        x[0][4],  # pri.vips
                        x[0][5],  # pri.region.count
                        round(float(x[1][0])/1000.0, 3),  # pub.region.cap.ff (gbps)
                        round(float(x[1][1])/1000.0, 3),  # pub.region.cap.essl (gbps)
                        x[1][2],  # pub.vips
                        len(x[1][3])-1,  # pub.region.count
                        x[0][6],  # mpg-load
                        round(x[0][7], 6),  # mpg-count
                        x[0][8],  # [pri reg-list]
                        ':'.join([str(y) for y in sorted(x[1][3])][1:]) if len(x[1][3]) > 1 else 'NULL',  # [pub reg-list]
                        x[0][9],  # [mpg-list]
                        x[0][10],  # [mpg-assum]
                        x[0][11]  # [mpg-nsip]
                        ])

    # data exporting to local
    country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist = pd.DataFrame(
        columns=['cl_geoname', 'cl_avgDistance', 'cl_lat', 'cl_lon', 'cl_por', 'cl_porsigma',
                 'pri_cap_ff_gbps', 'pri_cap_essl_gbps', 'pri_nvips', 'pri_nReg',
                 'pub_cap_ff_gbps', 'pub_cap_essl_gbps', 'pub_nvips', 'pub_nReg',
                 'cl_mpgLoad', 'cl_nMpg', 'pri_regList', 'pub_regList', 'mpgList',
                 'mpgASList', 'mpgNSIPList'])

    geo_cluster_full_info = reglist_mpgid_avgDistance_capacity_nReg_country.collect()
    logger.info('begin write to local disk.')

    for item in range(len(geo_cluster_full_info)):
        temp = geo_cluster_full_info[item]
        country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist.loc[item] = temp  # the above should be temp[1][0] for the mpglist

    data_folder = '/home/testgrp/MRQOS/project_mpd_clustering/data'
    filename = 'geo_full_cluster_info.%s.%s.csv' % (day_idx, uuid_idx)
    fileDestination = os.path.join(data_folder, filename)
    country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist.to_csv(fileDestination,
                                                                          sep=',', index=False, header=False)

    logger.info('begin to upload to hdfs.')
    tablename = 'mrqos.mpg_cluster'
    hdfs_d = os.path.join(config.hdfs_table, 'mpg_cluster', 'datestamp=%s' % day_idx, 'uuid=%s' % uuid_idx)
    partition = '''datestamp=%s, uuid='%s' ''' % (day_idx, uuid_idx)
    processed_filename = '.'.join(filename.split('.')[0:-1]) + '.processed.csv'
    cmd_str = ''' cat %s | awk -F, '{n=split($21,a,":"); if(n>5){$21=a[1]":"a[2]":"a[3]":"a[4]":"a[5];} m=split($20,b,":"); if(m>5){$20=b[1]":"b[2]":"b[3]":"b[4]":"b[5];}print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$20,$21;}' > %s ''' % (os.path.join(data_folder, filename), os.path.join(data_folder, processed_filename))
    sp.check_call(cmd_str, shell=True)

    try:
        beeline.upload_to_hive(fileDestination, hdfs_d, partition, tablename, logger)
        # os.remove(fileDestination)
    except sp.CalledProcessError as e:
        logger.info('upload to HDFS + update Hive table failed.')
def __init__(self,
             data_source='local',
             activity_directory='strava-activities-subset',
             s3bucket='larsbk',
             athletes=None,
             activity_types=['Ride', 'Run', 'NordicSki'],
             sc=None,
             hiveContext=None,
             conf=(SparkConf().setAppName('Strava analysis')),
             filter_bug_inducing_rows=True):
    '''Initialize Strava Analysis object'''

    # INPUT PARAMETERS
    self.athletes = athletes  # Athletes to analyze (optional)
    self.activity_types = activity_types  # Activity_types to consider (default)
    self.filter_bug_inducing_rows = filter_bug_inducing_rows

    # CONFIGURE SPARK
    if sc != None and hiveContext != None:  # Both contexts were supplied by user
        print 'Info: Using supplied SparkContext and HiveContext'
        self.sc = sc
        self.hiveContext = hiveContext
    else:  # Initialize new contexts
        print 'Info: Initializing SparkContext and hiveContext from (default) conf'
        self.sc = SparkContext(conf=conf)
        self.hiveContext = HiveContext(self.sc)

    self.schema = pickle.load(open('./schema.p', 'rb'))  # The pre-defined schema
    self.df = None  # Empty DataFrame to be populated later

    # CONFIGURE DATA SOURCE
    data_root_path = {
        's3': 's3n://%s/%s/' % (s3bucket, activity_directory),
        'local': './%s/' % activity_directory
    }

    if data_source not in data_root_path.keys():  # Check if data source is valid
        raise Exception(('Unrecognized data source %s. '
                         'Supported sources: "%s".')
                        % (data_source, '", "'.join(data_root_path.keys())))

    self.data_source = data_source  # This is a valid data source
    self.path = data_root_path[data_source]  # This is the path to the data

    # (S3 SPECIFIC STUFF)
    if data_source == 's3':
        # Get a list of files in the activity_directory's bucket
        bucket = boto3.resource('s3').Bucket(s3bucket)
        objects = bucket.objects.filter(Prefix='%s/gpx/' % activity_directory)
        files = [obj.key for obj in objects]

        # Make set of observed combinations of athlete and activity_type
        athlete_and_type = set([])  # Empty set to populate
        fpattern = '\/([\w]+)\/(?:[\w-]+)-([\w]+)\.gpx'  # File name pattern
        for fname in files:
            match = re.match(activity_directory + '/gpx' + fpattern, fname)
            if match:
                athlete_and_type.add((match.group(1), match.group(2)))
        self.s3_athlete_and_type = athlete_and_type  # Save set for later use

    pass
        if i + 5 < len(x):
            dx["5day"] = int(x[i + 5].close / dx["high"])
        else:
            dx["1day"] = -1
            dx["2day"] = -1
            dx["3day"] = -1
            dx["4day"] = -1
            dx["5day"] = -1
            dx["label"] = 0
        l.append(dx)
    return l


def main(sc, sqlContext):
    dfSC = sqlContext.read.table("eod2")
    rddMat = cal(sc, sqlContext, dfSC)
    df = sqlContext.createDataFrame(rddMat)
    dfToTable(sqlContext, df, "ta_gupbreak")


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName=__file__, conf=conf)
    #sc = SparkContext("local[4]", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("use fex")
    main(sc, sql_context)
    sc.stop()
# encoding: utf-8
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import HiveContext
from pyspark import SparkContext

sc = SparkContext(appName="Eval Model App")

log4jLogger = sc._jvm.org.apache.log4j
log = log4jLogger.LogManager.getLogger(__name__)

sqlContext = HiveContext(sc)

log.warn("Reading the data")
df = sqlContext.read.json("/raw/msmk")
df.registerTempTable("jsons")

sqlContext.sql("""
    select retweet_count, count(*) as total
    from jsons
    group by retweet_count
    order by 2 desc
""").show()

log.warn("Applying the model")
model = PipelineModel.load("/models/best_model")

dataset = sqlContext.sql("""
    select id as _id,
           text,
           retweet_count,
           user.screen_name as user
from pyspark.sql.functions import *
from pyspark.sql import HiveContext
from pyspark import SparkContext
from sys import argv

##################################################################################################################################################################
##################################################################################################################################################################

# Set arguments
dicParameters = {}
dicParameters['arg1'] = argv[1]  # arg1: argument 1 passed in from the shell

# Create context
yarn_name = "NOME_QUE_IRA_APARECER_NO_YARN"
sc = SparkContext('yarn-cluster', yarn_name)
sqlContx = HiveContext(sc)
sqlContx.setConf('spark.sql.parquet.compression.codec', 'snappy')
sqlContx.setConf('hive.exec.dynamic.partition', 'true')
sqlContx.setConf('hive.exec.dynamic.partition.mode', 'nonstrict')

# print the contents of the dictionary created above
for key, value in dicParameters.items():
    if not key:
        print('Missing configuration for {0} : {1}'.format(key, value))
    else:
        print('Parameter {0} : {1}'.format(key, value))

##################################################################################################################################################################
##################################################################################################################################################################


def startProcess(dicParameters):
class SparkOperator(DBOperator):
    """SPARK operator class"""

    def __init__(self, database_cat, database_str, logger):
        '''Initialize a connection'''
        DBOperator.__init__(self, database_cat, database_str, logger)
        script_name = __name__
        conf = SparkConf().setAppName(script_name).setMaster("yarn")
        conf.set('spark.executor.instances', '5')
        conf.set('spark.executor.cores', '1')
        conf.set('spark.executor.memory', '2g')
        conf.set('spark.driver.memory', '1g')
        conf.set('spark.sql.shuffle.partitions', '5')
        conf.set("hive.exec.dynamic.partition.mode", "nonstrict")
        conf.set("hive.exec.max.dynamic.partitions", "100000")
        conf.set("hive.exec.max.dynamic.partitions.pernode", "100000")
        self.sc = SparkContext(conf=conf)
        self.hc = HiveContext(self.sc)
        self.logger = logger
        self.logger.info(u'SPARK connection established.')

    def excute_sql(self, sql_str, log_flag='1'):
        '''
        Function: execute a SQL statement in SPARK SQL
        Date    : 2018-03-12 16:01:29
        Author  : Liuym
        Notes   : send the given SQL string to SPARK SQL for execution and return a success flag and log info
        Argvs   : sql_str -- SQL statement string
        Result  : msg_code -- 0 - success, -1 - failure
                  msg_info -- log information
        '''
        DBOperator.excute_sql(self, sql_str, log_flag)
        # if the SQL statement is empty, skip the actual execution
        if self.msg_code == 9:
            return (0, self.msg_info)
        try:
            self.hc.sql(self.sql_str)
        except Exception as e:
            self.msg_code = -1
            self.msg_info = u"SQL execution error: " + str(e).replace("\'", "\"")[0:3000]
        else:
            self.msg_code = 0
            self.msg_info = u"Script executed successfully"
        return (self.msg_code, self.msg_info)

    def get_results(self, sql_str, log_flag='1'):
        '''
        Function: fetch a result set from SPARK SQL
        Date    : 2018-10-11 16:01:29
        Author  : Liuym
        Notes   : send the given SQL string to SPARK SQL for execution and return a success flag, log info and the result set
        Argvs   : sql_str -- SQL statement string
        Result  : msg_code -- 0 - success, -1 - failure
                  msg_info -- log information
                  results  -- result set
        '''
        DBOperator.get_results(self, sql_str, log_flag)
        # if the SQL statement is empty, skip the actual execution
        if self.msg_code == 9:
            return (0, self.msg_info)
        try:
            self.results = self.hc.sql(self.sql_str)  # get a DataFrame from SPARK
            #self.results.cache()  # persist the current DataFrame
            self.record_list = self.results.rdd.map(
                lambda line: line.asDict().values()).collect()
        except Exception as e:
            self.msg_code = -1
            self.msg_info = u"SQL execution error: " + str(e).replace("\'", "\"")[0:3000]
        else:
            self.msg_code = 0
            self.msg_info = u"Script executed successfully"
        return (self.msg_code, self.msg_info, self.record_list)

    def close_connection(self):
        '''
        Function: close the database connection
        Date    : 2018-10-11 16:01:29
        Author  : Liuym
        Notes   : close the database connection and release resources
        '''
        DBOperator.close_connection(self)
        self.sc.stop()
        self.logger.info(u'SPARK disconnected.')
def main():
    sc = SparkContext(conf=SparkConf().setAppName("wil_hot_sku_calc_online"))
    hc = HiveContext(sc)
    sc.addPyFile(sys.argv[1])
    from core.common import common
    param = common.init_params(sys.argv, p)
    date, pid, ts, dc_id = param["date"], param["pid"], param["ts"], param["dc_id"]
    today = dt.datetime.strptime(date, "%Y-%m-%d").date()
    yesterday = today - dt.timedelta(1)

    # candidate SKUs and the SKUs selected into the hot-sale warehouse
    sql_sku_all = """
        select distinct
            t.sku_id as sku_id,
            t.current_source_store_id as store_id,
            t.future_source_store_id as future_store_id,
            case when t.b_sku_id is not null then 1 else 0 end as is_selected
        from
            (select a.sku_id as sku_id,
                    a.current_source_store_id as current_source_store_id,
                    a.future_source_store_id as future_source_store_id,
                    b.sku_id as b_sku_id
             from app.app_wil_hot_sku_all a
             left join app.app_wil_hot_sku_selected b on a.sku_id = b.sku_id
            ) t
    """
    hc.sql(sql_sku_all).createOrReplaceTempView("tb_sku_all")

    # order data associated with all candidate SKUs
    sql_data_all = """
        select
            a.sku_id as sku_id,
            a.future_store_id as future_store_id,
            a.is_selected as is_selected,
            b.out_wh_tm as out_wh_tm,
            b.ord_id as ord_id,
            b.sale_qtty as sale_qtty
        from
            (select sku_id, store_id, future_store_id, is_selected
             from tb_sku_all
            ) a
        left join
            (select sku_id, store_id, out_wh_tm, parent_sale_ord_id as ord_id, sale_qtty
             from app.app_wil_hot_sale_store_sku_ord_sale
             where dt = '""" + str(yesterday) + """'
               and sale_qtty >= 0
            ) b
        on a.store_id = b.store_id and a.sku_id = b.sku_id
    """
    hc.sql(sql_data_all).createOrReplaceTempView("tb_data_all")

    # label the orders
    sql_choice_ord = """
        select
            a.ord_id,
            case when (a.f1=1 and a.f2=1)
                   or (a.f1=2 and a.f2=1)
                   or (a.f1=2 and a.f2=2 and (a.g1=2 or (a.g2=2 and a.g3=1)))
                 then 1 else 0 end as choice_flag2
        from
            (select ord_id,
                    count(distinct future_store_id) as f1,
                    count(distinct is_selected) as f2,
                    count(distinct future_store_id,is_selected) as g1,
                    count(distinct (case when is_selected = 1 then future_store_id else null end)) as g2,
                    count(distinct (case when is_selected = 0 then future_store_id else null end)) as g3
             from tb_data_all
             group by ord_id
            ) a
    """
    hc.sql(sql_choice_ord).createOrReplaceTempView("tb_choice_ord")

    # aggregate the sales
    sql_result = """
        select
            a.sku_id,
            sum(case when b.choice_flag2 = 1 then a.sale_qtty else 0 end) as sale_in_hot,
            sum(a.sale_qtty) as sale_total
        from tb_data_all a
        left join tb_choice_ord b on a.ord_id = b.ord_id
        where a.is_selected = 1
        group by a.sku_id
    """
    hc.sql(sql_result).createOrReplaceTempView("tb_result")

    # default allocation ratio
    sql_selected_sku_info = """
        select
            a.sku_id as sku_id,
            b.dc_id as dc_id,
            b.hot_sku_target_store_id as hot_sku_target_store_id,
            b.future_source_store_id as future_source_store_id,
            b.default_ratio as default_ratio
        from app.app_wil_hot_sku_selected a
        left join
            (select sku_id, dc_id, hot_sku_target_store_id, future_source_store_id,
                    avg(hot_sku_out_store_rate) as default_ratio
             from app.app_wil_hot_sku_all
             group by sku_id,dc_id,hot_sku_target_store_id,future_source_store_id
            ) b
        on a.sku_id = b.sku_id
    """
    hc.sql(sql_selected_sku_info).createOrReplaceTempView("tb_selected_sku_info")

    # generate the final hot-sku ratio result
    sql_final_result = """
        select
            a.sku_id,
            a.dc_id,
            a.hot_sku_target_store_id,
            a.future_source_store_id,
            coalesce(round(b.sale_in_hot/b.sale_total,4),a.default_ratio) as hot_sku_ratio
        from tb_selected_sku_info a
        left join tb_result b on a.sku_id = b.sku_id
    """
    hc.sql(sql_final_result).createOrReplaceTempView("tb_final_result")

    # write the results into the app_wil_hot_sku_ratio_result table, partitioned by dt, pid, ts
    partition = "dt='" + str(param["date"]) + "', pid='" + str(param["pid"]) + "', ts='" + str(param["ts"]) + "'"
    table_output = 'app.app_wil_hot_sku_ratio_result'
    sql_save_sd_result = """
        insert overwrite table """ + table_output + """ partition (""" + partition + """)
        select
            sku_id,
            dc_id,
            hot_sku_target_store_id,
            future_source_store_id,
            hot_sku_ratio
        from tb_final_result
    """
    hc.sql(sql_save_sd_result)
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import rowNumber
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pyspark.mllib.linalg import VectorUDT
import csv
import string
import re
import numpy as np
from pyspark.mllib.stat import Statistics

conf = SparkConf().setAppName("NLP on Reddit Data")
conf.set("spark.driver.maxResultSize", "3g")
sc = SparkContext(conf=conf)
# notice here we use HiveContext(sc) because window functions require HiveContext
sqlContext = HiveContext(sc)

stopwordsList = stopwords.words('english')


def parse_csv(x):
    x = x.replace('\n', '')
    d = csv.reader([x])
    return next(d)


def isInt(s):
    try:
        int(s)
        return True
    except ValueError:
        return False
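# Hypothetical illustration (not part of the original script): a minimal window-function
# query of the kind that requires HiveContext in Spark 1.x. The tiny `posts` DataFrame
# and its `subreddit`/`score` columns are made up for the example; `rowNumber` is the
# window function imported above.
from pyspark.sql import Window
from pyspark.sql.functions import desc

posts = sqlContext.createDataFrame([('aww', 10), ('aww', 7), ('news', 3)], ['subreddit', 'score'])
w = Window.partitionBy('subreddit').orderBy(desc('score'))
posts.withColumn('rank', rowNumber().over(w)).show()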
# BAREBONES dayidx # only partitioned by DAY
#day_bb = [x for x in beeline.show_partitions('mapper.barebones').split('\n') if '=%s' % (day_idx) in x]
# MAPPOINTS dayidx # partitioned by DAY and UUID (pick the last uuid)
#mappoints_data = sorted([x for x in beeline.show_partitions('mapper.mappoints').split('\n') if '=%s' % (day_idx) in x])[-1].split('/')
#[day_mps, uuid_idx] = [x.split('=')[1] for x in mappoints_data]

day_idx = '20160819'
uuid_idx = '14863360-65fc-11e6-a709-300ed5c5f881'

getting_mappoint_data = ''' select b1.mpgid mpgid, b1.lat lat, b1.lon lon, b1.country country, b1.mpgload mpgload, b1.allowed_private_regions allowed_private_regions, b2.asnum asnum, b2.ip ip from (select mpgid, lat, lon, country, mpgload, allowed_private_regions from mapper.mappoints where day=%s and uuid="%s" and lat is not NULL and lon is not NULL and ghostonly=0 ) b1 left outer join (select collect_set(ns_ip) ip, collect_set(asnum) asnum, mpgid from (select ns_ip, mpd_uuid, mpgid, asnum, demand, day from mapper.nsjoin where day=%s and mpd_uuid="%s" and demand>0.01 order by demand desc) a group by mpgid) b2 on b2.mpgid=b1.mpgid ''' % (day_idx, uuid_idx, day_idx, uuid_idx)

geo_total_cap_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='private' group by country, network) a ''' % day_idx

geo_total_cap_public_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='public' group by country, network) a ''' % day_idx

sc = SparkContext()
hiveCtx = HiveContext(sc)

rows = hiveCtx.sql(getting_mappoint_data)

#regInfoRows = hiveCtx.sql('select a.*, b.region_capacity, b.ecor_capacity, b.prp, case b.peak_bitcap_mbps when null then 0 else b.peak_bitcap_mbps end peak_bitcap_mbps, case b.peak_flitcap_mfps when null then 0 else b.peak_flitcap_mfps end peak_flitcap_mfps from (select * from mapper.barebones where day=%s and latitude is not NULL and longitude is not NULL and ghost_services in ("W","S","KS","JW")) a join (select * from mapper.regioncapday where day=%s) b on a.region=b.region' % (day_idx, day_idx))
regInfoRows = hiveCtx.sql('select * from mapper.regioncapday where day=%s and peak_bitcap_mbps is not null and peak_flitcap_mfps is not null' % (day_idx))
geo_total_cap = hiveCtx.sql(geo_total_cap_query)
geo_total_cap_p = hiveCtx.sql(geo_total_cap_public_query)

# rdd format: [regionid, [mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load, mpg-asnum, mpg-nsip]]
region_mpginfo_pair = rows.map(lambda x: [[x.mpgid,
                                           x.lat,
                                           x.lon,
                                           x.country,
                                           x.mpgload,
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import HiveContext
import mysql.connector
import math
from pyspark.sql import Row, StructField, StructType, StringType, IntegerType

sc = SparkContext("local", "videoDifficulty")
sqlContext = HiveContext(sc)

timeFrameSize = 4
# timeFrameBuckets = [ [0,0.0] for x in range(0, int(math.ceil(duration/timeFrameSize)) ) ]


def get_partial_keys(dictionary, field_id, field_value):
    """
    takes a dictionary with a complex key tuple, a list of the id's of the fields in the
    tuple, and a tuple for the partial key for which to find the corresponding value from
    the dictionary
    input: field_id must be a list, eg: [1], [0,2,3], etc.
           field_value must be a tuple, eg: ('a,1.0,1)
    """
    field_id.sort()
    keylist = list()
    for key in dictionary.keys():
        # Get the value of the partial key from the list of keys for the dict, and
        # add it if it matches with the key value being sought.
        if tuple([key[x] for x in field_id]) == field_value:
            keylist.append(key)
    return keylist
    for record in records:
        writer.writerow(record)
    return [output.getvalue()]

if not os.path.isdir('output/csv'):
    fullFileData.mapPartitions(writeRecords).saveAsTextFile('output/csv')
sc.stop()

print('\n\nCreate sql table with txt file')
sc = SparkContext(conf=conf)
url = 'https://raw.githubusercontent.com/apache/spark/master/examples/src/main/resources/kv1.txt'
inputFile = 'input/kv1.txt'
inputTable = 'myTable'
if not os.path.isfile(inputFile):
    wget.download(url, inputFile)
hiveCtx = HiveContext(sc)
if not os.path.isdir(f'spark-warehouse/{inputTable}'):
    hiveCtx.sql(
        f"CREATE TABLE IF NOT EXISTS {inputTable} (key INT, value STRING)")
    hiveCtx.sql(f"LOAD DATA LOCAL INPATH '{inputFile}' INTO TABLE myTable")
input = hiveCtx.sql(f"FROM {inputTable} SELECT key, value")
print(f'myTable query: {input.rdd.take(10)}')
print(f'myTable key: {input.rdd.map(lambda row: row[0]).take(10)}')
sc.stop()

print('\n\nRead json file into table')
sc = SparkContext(conf=conf)
hiveCtx = HiveContext(sc)
tweets = hiveCtx.read.json('input/tweets.json')
tweets.registerTempTable("tweets")
results = hiveCtx.sql("SELECT user.name, text FROM tweets")
# Creates a hive table and loads an input file into it
# For input you can use examples/src/main/resources/kv1.txt from the spark
# distribution
from pyspark import SparkContext
from pyspark.sql import HiveContext
import json
import sys

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Error usage: LoadHive [sparkmaster] [inputFile] [inputtable]"
        sys.exit(-1)
    master = sys.argv[1]
    inputFile = sys.argv[2]
    inputTable = sys.argv[3]

    sc = SparkContext(master, "LoadHive")
    hiveCtx = HiveContext(sc)

    # Load some data into hive
    hiveCtx.sql(
        "CREATE TABLE IF NOT EXISTS " + inputTable + " (key INT, value STRING)")
    hiveCtx.sql(
        "LOAD DATA LOCAL INPATH '" + inputFile + "' INTO TABLE " + inputTable)
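# Example invocation (hypothetical file name; the script above reads the master,
# input file and table name from sys.argv in that order):
#
#   spark-submit load_hive.py local[2] examples/src/main/resources/kv1.txt mytable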
class Unittest_HWM_Allocation_MulDays3(unittest.TestCase): def setUp(self): warnings.simplefilter("ignore", ResourceWarning) fpath = os.path.abspath(os.path.join(os.path.dirname(__file__),"..")) with open(fpath + '/data_source/bookings_fully_overlapped.json') as bookings_source: self.bookings = json.load(bookings_source) with open(fpath + '/data_source/cfg.json') as cfg_source: self.cfg = json.load(cfg_source) today = '20180402' self.days = optimizer.util.get_days_from_bookings(today, self.bookings) self.sc = SparkContext.getOrCreate() self.hive_context = HiveContext(self.sc) self.schema = optimizer.util.get_common_pyspark_schema() def compare_two_dfs(self, pandas_df_expected, df_to_test_rows): df = self.hive_context.createDataFrame(df_to_test_rows, self.schema) df_allocated = optimizer.algo.hwm.hwm_allocation(df, self.bookings, self.days) pandas_df_allocated = df_allocated.select("*").toPandas() print(pandas_df_expected) print(pandas_df_allocated) return self.assertTrue(assert_frame_equal(pandas_df_expected, pandas_df_allocated, check_dtype=False) == None) def test_hwm_allocation_case1(self): # Print out the name of the current testing function to track logs # Testcase type: 1 booking bucket with 1 booking_id in ands # testcase 1: booking bucket ['20180402', ['b80'], ['b60'], {}, 3239] pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = ['20180402', ['b80'], ['b60'], 3239, {'b80': 18}] df_to_test_rows = = [(['20180402', ['b80'], ['b60'], {}, 3239])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case2(self): # bk_id: b11, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 11 # bk_id: b12, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 12 # Testcase type: 1 booking bucket with 2 booking_id in ands pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = ['20180402', ['b120', 'b110'], [], 8900, {'b120': 2, 'b110': 2}] df_to_test_rows = = [(['20180402', ['b120', 'b110'], [], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case3(self): # Testcase type: 1 booking bucket with 3 booking_id in ands pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = ['20180402', ['b60', 'b70', 'b100'], [], 8900, {'b60': 13, 'b70': 15, 'b100': 20}] df_to_test_rows = = [(['20180402', ['b60', 'b70', 'b100'], [], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case4(self): # Testcase type: 2 booking buckets with 4 bookings included. pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = ['20180402', ['b80'], ['b60', 'b70', 'b90'], 3239, {'b80': 18}] pandas_df_expected.loc[1] = ['20180402', ['b60', 'b70'], ['b80', 'b90'], 8900, {'b60': 13, 'b70': 15}] df_to_test_rows = = [(['20180402', ['b80'], ['b60', 'b70', 'b90'], {}, 3239]), (['20180402', ['b60', 'b70'], ['b80', 'b90'], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case5(self): # Testcase type: 3 booking buckets with 5 bookings included. 
pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = ['20180402', ['b60', 'b70', 'b100', 'b110', 'b120'], ['b80', 'b90'], 8900, {'b60': 13, 'b70': 15, 'b100': 20, 'b110': 2, 'b120': 2}] df_to_test_rows = [(['20180402', ['b60', 'b70', 'b100', 'b110', 'b120'], ['b80', 'b90'], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case6(self): # bk_id: b13, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 130000 # bk_id: b12, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 12 # Testcase type: 3 booking buckets with 5 bookings included. pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = ['20180402', ['b130', 'b120'], [], 8900, {'b130': 8900}] df_to_test_rows = [(['20180402', ['b130', 'b120'], [], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case7(self): # bk_id: b15, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 8900 # bk_id: b14, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 8900 # Testcase type: 3 booking buckets with 5 bookings included. pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = ['20180402', ['b150', 'b140'], [], 8900, {'b150': 1780, 'b140': 1780}] df_to_test_rows = [(['20180402', ['b150', 'b140'], [], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case8(self): # bk_id: b17, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 4450 # bk_id: b16, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 4450 # Testcase type: 3 booking buckets with 5 bookings included. pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = ['20180402', ['b170', 'b160'], [], 8900, {'b170': 890, 'b160': 890}] df_to_test_rows = [(['20180402', ['b170', 'b160'], [], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case9(self): # bk_id: b18, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 4451 # bk_id: b17, days: ['20180401', '20180402', '20180403', '20180404', '20180405'], a: ['4'], g: ['g_f'], si: ['2'], amount: 4450 # Testcase type: 3 booking buckets with 5 bookings included. pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = ['20180402', ['b180', 'b170'], [], 8900, {'b180': 890, 'b170': 890}] df_to_test_rows = [(['20180402', ['b180', 'b170'], [], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case10(self): # Testcase type: 3 booking buckets with 5 bookings included. 
pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = ['20180402', ['b60', 'b70', 'b100', 'b120', 'b160', 'b170', 'b180'], ['b80', 'b90'], 8900, {'b120': 2, 'b100': 20, 'b160': 890, 'b170': 890, 'b180': 890, 'b70': 15, 'b60': 13}] # b6, b7, b10, b12, b16, b17, b18 have the same attributes. df_to_test_rows = [(['20180402', ['b60', 'b70', 'b100', 'b120', 'b160', 'b170', 'b180'], ['b80', 'b90'], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)
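# A minimal entry-point sketch for running the test class above (not part of the original file;
# assumes the standard unittest runner and that the module is executed directly):
if __name__ == '__main__':
    unittest.main(verbosity=2)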
return str_time if __name__ == '__main__': if (len(sys.argv) != 1): print "Usage: spark-submit <Python Code File>" sys.exit(1) #App name which shows up in the Spark UI sc = SparkContext(appName='User Recommendation') #Context provides connection to Hive metastore sqlContext = HiveContext(sc) ''' Pulling data out of Hive. I created a replication of the 'watson_bisum_purchases' table locally to test. ''' rdd = sqlContext.sql("SELECT person_id,deal_id,aasm_state FROM watson_bisum_purchases") ''' Creating datasets. Formatting the data and also creating sample datasets in order to create and test the model. ''' #Formatting all the data using the 'parse_rating' method above all_data = rdd.map(parse_rating)
import atexit import os import platform import py4j import pyspark from pyspark.context import SparkContext from pyspark.sql import SQLContext, HiveContext from pyspark.storagelevel import StorageLevel from pyspark.sql.types import * os.environ['PYSPARK_SUBMIT_ARGS'] = "--master yarn pyspark-shell" sc = SparkContext(appName="pySpark_barker", pyFiles='') atexit.register(lambda: sc.stop()) sc._jvm.org.apache.hadoop.hive.conf.HiveConf() sqlCtx = HiveContext(sc) print("""Welcome to ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ /__ / .__/\_,_/_/ /_/\_\ version %s /_/ """ % sc.version) print("Using Python version %s (%s, %s)" % (platform.python_version(), platform.python_build()[0], platform.python_build()[1])) print("SparkContext available as sc, %s available as sqlCtx." % sqlCtx.__class__.__name__) print("TODO: add specific arg settings")
print(""" Error: This program takes 2 arguments Usage: bin/spark-submit --master <spark-master> nrem.py <input dir> <output dir> """) sys.exit(1) #Parallelism partitions = 1 #Output Directories matched_output = os.path.join(sys.argv[2],"matched") eliminated_output = os.path.join(sys.argv[2], "eliminated") conf = SparkConf().setAppName("Non Redundant Entity Matching") sc = SparkContext(conf=conf) sqlCtx = HiveContext(sc) def attr_key(l): """ [obj, attr1, attr2, attr3 ...] -> [(attr1, obj), (attr2, obj), (attr3, obj) ...] """ a = [] for attr in l[1:]: a.append((attr, l[0])) return a """ Assuming input file(s) to be tsv, and first field to be object and rest of the fields as attributes """ #Read input inRDD = sc.textFile(sys.argv[1], partitions)
from pyspark import SparkConf, SparkContext from pyspark.sql import HiveContext, Row conf = SparkConf() sc = SparkContext(conf=conf) sqlContext = HiveContext(sc) sqlContext.sql("DROP TABLE IF EXISTS people") sqlContext.sql( "CREATE TABLE IF NOT EXISTS people(name STRING, age INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY \",\"" ) rdd = sc.textFile("/user/cloudera/spark/people.txt") \ .map(lambda x: x.split(",")) \ .map(lambda x: Row( name=x[0], age=int(x[1]) )) df = sqlContext.createDataFrame(rdd) df.select(df.name, df.age).write.insertInto("people") df2 = sqlContext.sql("select * from people") df2.show()
def _extract_logs(self): sql_context = HiveContext() logs = sql_context.table(self.source_table) return logs
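# Standalone usage sketch for a method like _extract_logs (not from the original; the table name is a
# placeholder and the no-argument HiveContext() call above presumably relies on project-specific setup,
# so this sketch passes the SparkContext explicitly):
from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext.getOrCreate()
logs = HiveContext(sc).table("db.raw_logs")  # "db.raw_logs" stands in for self.source_table
logs.show(5)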
from pyspark.sql import HiveContext # sc is an existing SparkContext. sqlContext = HiveContext(sc) # A JSON dataset is pointed to by path. # The path can be either a single text file or a directory storing text files. shiguang = sqlContext.read.json("/user/1707500/shiguangwang.json") # root # |-- age: integer (nullable = true) # |-- name: string (nullable = true) shiguang.printSchema() # Register this DataFrame as a table. shiguang.registerTempTable("shiguangwang") # SQL statements can be run by using the sql methods provided by `sqlContext`. sqlContext.sql("use sparktest") teenager = sqlContext.sql("CREATE TABLE shiguang AS SELECT * FROM shiguangwang")
('spark.executor.instances', '10'), # number of executors in total ('spark.yarn.am.memory', '10g') ]) # memory for spark driver (application master) spark = SparkSession.builder \ .master("yarn") \ .appName("name") \ .enableHiveSupport() \ .config(conf = conf) \ .getOrCreate() return spark spark = get_Spark() spark_context = spark.sparkContext hc = HiveContext(spark_context) normaltimeFormat = "yyyyMMddHHmmss" def run_cmd(args_list): proc = subprocess.Popen(args_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc.communicate() return proc.returncode def build_schema(model): types = { "string": StringType(),
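# Usage sketch for run_cmd (not in the original; the HDFS path is a placeholder):
rc = run_cmd(['hdfs', 'dfs', '-test', '-e', '/tmp/output/path'])
if rc != 0:
    print('HDFS path does not exist')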
from pyspark import SparkContext, SparkConf from pyspark.sql import HiveContext from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder from pyspark.ml import Pipeline from pyspark.ml.classification import RandomForestClassifier # Initialize Spark SparkContext.setSystemProperty("spark.executor.memory", "4g") conf = SparkConf() conf.set("spark.executor.instances", 20) sc = SparkContext("yarn-client", "kdd99", conf=conf) hc = HiveContext(sc) kdd = hc.table("kdd99") (trainData, testData) = kdd.randomSplit([0.7, 0.3], seed=42) trainData.cache() services = trainData.withColumnRenamed("service", "srvc").select("srvc").distinct() testData = testData.join(services, testData.service == services.srvc) # filter out any rows with a service not trained upon testData.cache() print "training set has " + str(trainData.count()) + " instances" print "test set has " + str(testData.count()) + " instances" # Build model inx1 = StringIndexer(inputCol="protocol", outputCol="protocol-cat") inx2 = StringIndexer(inputCol="service", outputCol="service-cat") inx3 = StringIndexer(inputCol="flag", outputCol="flag-cat") inx4 = StringIndexer(inputCol="is_anomaly", outputCol="label") ohe2 = OneHotEncoder(inputCol="service-cat", outputCol="service-ohe")
def setupHive(sc): from pyspark.sql import HiveContext hc = HiveContext(sc) return hc
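# Usage sketch for setupHive (the query is an example, not from the original):
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
hc = setupHive(sc)
hc.sql("SHOW TABLES").show()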
def main(args): """ Main code for relevance computation """ start_time = time.time() # iq (code snippets that set below properties have been removed) driver = url = username = password = None inputs = [driver, url, username, password] filename = str(args[0]) if os.path.exists(filename): pass else: sys.exit("Input file %s not found" % filename) file = open(filename, 'r') for line in file: key, val = line.split(",") if str(key).strip() == "dbalias": dbalias = str(val).strip() elif str(key).strip() == "numpartitions": numpartitions = int(val) elif str(key).strip() == "datadir": datadir = str(val).strip() else: print("Invalid key, not set: %s" % str(key)) # Need to make sure that the datadir variable is set. try: print("datadir = '%s' " % datadir) except NameError: sys.exit("'datadir' variable not set. Check inputfile '%s'" % filename) # Spark and Hive contexts conf = SparkConf() sc = SparkContext(conf = conf) sqlContext = HiveContext(sc) df = utils.returnSparkDF(SQLContext(sc), inputs, "traffic") if df is None: sys.exit("'traffic' query failed: SystemExit.") sqlContext.registerDataFrameAsTable(df, "uniquedata") df = None df = utils.returnSparkDF(SQLContext(sc), inputs, "fbtraffic") if df is None: sys.exit("'fbtraffic' query failed: SystemExit.") sqlContext.registerDataFrameAsTable(df, "uniqueFBdata") df = None statement = "Select ud.loginid, ud.adid, ud.Type, ufd.Type as FBType "\ "from uniquedata ud left outer join uniqueFBdata ufd "\ "on ud.loginid = ufd.loginid and ud.adid = ufd.adid" adswithFBjoined = sqlContext.sql(statement) adswithFBjoined_cleaned = adswithFBjoined[adswithFBjoined['FBType'].isNull()] adswithFBjoined_cleaned = adswithFBjoined_cleaned.drop('FBType') sqlContext.registerDataFrameAsTable(adswithFBjoined_cleaned, "data") statement = "Select loginid, count(loginid) as viewcount from data group by loginid" temp = sqlContext.sql(statement) sqlContext.registerDataFrameAsTable(temp, "viewdata") statement = "Select d.* from data d, viewdata vd where d.loginid = vd.loginid and vd.viewcount > 1" temp2 = sqlContext.sql(statement) sqlContext.sql("drop table data") sqlContext.registerDataFrameAsTable(temp2, "data") temp, temp2 = (None, None) df = utils.returnSparkDF(SQLContext(sc), inputs, "agent") if df is None: sys.exit("'agent' query failed: SystemExit.") sqlContext.registerDataFrameAsTable(df, "agentdata") statement = "select loginid, adid, Type, count(adid) as counter from agentdata group by loginid, adid, Type" unique_adid_per_loginid = sqlContext.sql(statement) unique_adid_per_loginid = unique_adid_per_loginid.drop('counter') sqlContext.registerDataFrameAsTable(unique_adid_per_loginid, "agentdata") df = utils.returnSparkDF(SQLContext(sc), inputs, "favorite") if df is None: sys.exit("'favorite' query failed: SystemExit.") sqlContext.registerDataFrameAsTable(df, "favdata") df = None statement = "select * from data union all select * from agentdata union all select * from favdata" df2 = sqlContext.sql(statement) sqlContext.registerDataFrameAsTable(df2, "uniondata") df2 = None statement = "select loginid, max(Type) as UserMaxConversion from uniondata group by loginid" maxtype = sqlContext.sql(statement) sqlContext.registerDataFrameAsTable(maxtype, "maxconversiondata") statement = "select uniondata.loginid, uniondata.adid, uniondata.Type "\ "from uniondata, maxconversiondata where uniondata.loginid = maxconversiondata.loginid "\ "and uniondata.Type = maxconversiondata.UserMaxConversion" data = sqlContext.sql(statement) sqlContext.registerDataFrameAsTable(data, "data") # Delete 
tables tables = ["uniquedata", "FBdata", "uniqueFBdata", "agentdata", "favdata", "uniondata", "maxconversiondata"] for table in tables: sqlContext.sql("drop table if exists %s" % str(table)) df = utils.returnSparkDF(SQLContext(sc), inputs, "adclassified") if df is None: sys.exit("'adclassified' query failed: SystemExit.") sqlContext.registerDataFrameAsTable(df, "addata") df = None df = utils.returnSparkDF(SQLContext(sc), inputs, "geo") if df is None: sys.exit("'geo' query failed: SystemExit.") sqlContext.registerDataFrameAsTable(df, "geodata") df = None statement = "select addata.adid, addata.AskingPrice, addata.CollectiveDebt, "\ "addata.PageViewCount, geodata.Municipal, geodata.CityPart "\ "from addata, geodata where addata.locationkey = geodata.locationkey" addata_for_join = sqlContext.sql(statement) sqlContext.registerDataFrameAsTable(addata_for_join, "adtemp") statement = "select * from adtemp where PageViewCount < 10000" addata_for_join = sqlContext.sql(statement) sqlContext.registerDataFrameAsTable(addata_for_join, "addata_for_join") data, addata_for_join = (None, None) sqlContext.sql("drop table if exists addata") statement = "select a.*, b.AskingPrice, b.CollectiveDebt, b.PageViewCount, b.Municipal, b.CityPart "\ "from data a, addata_for_join b where a.adid = b.adid" data = sqlContext.sql(statement) data = data.fillna(0) data = data.repartition(numpartitions) # Save the files as csv using spark-csv from databricks try: st = time.time() data.write.format("com.databricks.spark.csv").save(datadir, mode="overwrite", codec="bzip2") et = time.time() print("File save time was: %.2f mins." % ((et-st)/60.)) except: sys.exit("Could not save files to dir '%s'. \n\nError = %s" % (datadir, sys.exc_info()[1])) finally: end_time = time.time() print("Spark ETL execution time = %.2f mins." % ((end_time-start_time)/60.)) # Stop spark and continue using in-memory computation (another script) sc.stop() return
return _out_value def safe_concat(df, column, postfix): _col = sf.when(df[column] != ' ', sf.concat(df[column], sf.lit(postfix))).otherwise('').alias(column) return _col #Spark setup #Set up the Spark connection and Hive context. conf = SparkConf() \ .setAppName('Zeppelin') \ .set('spark.cassandra.connection.host','10.41.49.1') \ .set('spark.master', 'yarn-client') sc = SparkContext.getOrCreate(conf=conf) hive_context = HiveContext(sc) #Define Hive queries #By pushing a query down to Hive instead of loading the whole table into a DataFrame, we minimize the amount of bandwidth and memory that Spark needs to use. ad_phone_query = " SELECT SAMAccountName" \ " , MobilePhone" \ " , telephoneNumber" \ " FROM data_lake.ad_data_raw" advance_phone_query = " SELECT lower(NetID) AS NetID" \ " , ID_Number" \ " , xsequence" \ " , telephone_type_Code" \ " , telephone_number" \ " , extension" \ " , country_code" \ " , area_code" \
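# Usage sketch for safe_concat (not in the original; assumes the ad_phone_query above has been run,
# that pyspark.sql.functions is already imported as sf, and the postfix labels are only examples):
ad_phone_df = hive_context.sql(ad_phone_query)
ad_phone_df = ad_phone_df.select(
    'SAMAccountName',
    safe_concat(ad_phone_df, 'MobilePhone', ';mobile'),
    safe_concat(ad_phone_df, 'telephoneNumber', ';office'))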
"""transform.py""" from pyspark import SparkContext sc = SparkContext("local", "Transform") from pyspark.sql import HiveContext from pyspark.sql.types import * import numpy as np sqlContext = HiveContext(sc) ########## # Util functions def saveAsHiveTable(df, name): # Save it this way so that native HIVE can read the table as well df.registerTempTable("df") sqlContext.sql("drop table if exists " + name) sqlContext.sql("CREATE TABLE " + name + " AS SELECT * FROM df") def getFloat(s): if s is None: return None try: f = float(s) return f except ValueError: return None ##########
import os from pyspark import SparkContext from pyspark.sql import SQLContext, HiveContext from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType from pyspark.sql import functions as F import autofe from autofe.window import window_aggregate, partitionby_aggregate from autofe.aggregate import groupby_aggregate from autofe.functions import spark_functions sc = SparkContext.getOrCreate() sql_context = HiveContext(sc) data_path = os.path.join('luojisiwei.parquet') t = sql_context.read.format('parquet').load(data_path) t.show(5) print(t.dtypes) print(t.head()) print(t.first()) print(t.take(2)) print(t.schema)
class StravaLoader(object): def __init__(self, data_source='local', activity_directory='strava-activities-subset', s3bucket='larsbk', athletes=None, activity_types=[ 'Ride', 'Run', 'NordicSki' ], sc=None, hiveContext=None, conf=(SparkConf().setAppName('Strava analysis')), filter_bug_inducing_rows=True ): ''' Initialize Strava Analysis object''' # INPUT PARAMETERS self.athletes = athletes # Athletes to analyze (optional) self.activity_types = activity_types # Activity_types to consider (default) self.filter_bug_inducing_rows = filter_bug_inducing_rows # CONFIGURE SPARK if sc is not None and hiveContext is not None: # Both contexts were supplied by user print 'Info: Using supplied SparkContext and HiveContext' self.sc = sc self.hiveContext = hiveContext else: # Initialize new contexts print 'Info: Initializing SparkContext and hiveContext from (default) conf' self.sc = SparkContext(conf=conf) self.hiveContext = HiveContext(self.sc) self.schema = pickle.load(open('./schema.p', 'rb')) # The pre-defined schema self.df = None # Empty DataFrame to be populated later # CONFIGURE DATA SOURCE data_root_path = { 's3': 's3n://%s/%s/' % (s3bucket, activity_directory), 'local': './%s/' % activity_directory } if data_source not in data_root_path.keys(): # Check if data source is valid raise Exception(('Unrecognized data source %s. ' 'Supported sources: "%s".') \ % (data_source, '", "'.join(data_root_path.keys()))) self.data_source = data_source # This is a valid data source self.path = data_root_path[data_source] # This is the path to the data # (S3 SPECIFIC STUFF) if data_source == 's3': # Get a list of files in the activity_directory bucket = boto3.resource('s3').Bucket(s3bucket) objects = bucket.objects.filter(Prefix='%s/gpx/' % activity_directory) files = [obj.key for obj in objects] # Make set of observed combinations of athlete and activity_type athlete_and_type = set([]) # Empty set to populate fpattern = '\/([\w]+)\/(?:[\w-]+)-([\w]+)\.gpx' # File name pattern for fname in files: match = re.match(activity_directory+'/gpx'+fpattern, fname) if match: athlete_and_type.add((match.group(1), match.group(2))) self.s3_athlete_and_type = athlete_and_type # Save set for later use pass def _get_athlete_directories(self): ''' Look for athlete directories in data_root_path \ and update self.athletes ''' if self.data_source in ['local']: self.athletes = [ directory for directory in os.listdir(self.path+'gpx/') if re.match('^[\w-]+$', directory) ] else: print ('Warning: Automatic directory/athlete detection not yet supported for ' 'data source %s. 
Using: "akrogvig", "lkrogvig", "brustad"') \ % self.data_source self.athletes = ['akrogvig', 'lkrogvig', 'brustad'] pass def _activities_exist(self, athlete, activity_type): ''' Checks if there exists activities of type <activity_type> for athlete <athlete>, returns a boolean value ''' # Check local directory with glob if self.data_source == 'local': return glob.glob(self.path+'gpx/%s/*%s.gpx' % (athlete, activity_type)) # Check if combination exists by using previously compiled sets elif self.data_source == 's3': return ((athlete, activity_type) in self.s3_athlete_and_type) def _load_dataset(self): ''' Loads strava activities from source to DataFrame self.df ''' # Get athlete list if not already set if not self.athletes: self._get_athlete_directories() # Initialize empty dataset self.df = self.hiveContext.createDataFrame( self.sc.emptyRDD(), self.schema ) for athlete in self.athletes: for activity_type in self.activity_types: # Check that there are files of that type (or else .load fails) if self._activities_exist(athlete, activity_type): # Read data dfadd = self.hiveContext.read.format('com.databricks.spark.xml') \ .options(rowTag='trkpt', treatEmptyValuesAsNulls=False) \ .schema(self.schema) \ .load(self.path+'gpx/%s/*%s.gpx' % (athlete, activity_type)) dfadd = dfadd.withColumn('athlete', lit(athlete)) \ .withColumn('activity_type', lit(activity_type)) self.df = self.df.unionAll(dfadd) if self.filter_bug_inducing_rows: self.df = self.df.filter(self.df['extensions.gpxtpx:TrackPointExtension.#VALUE'].isNull()) pass def derive_schema(self): ''' Loads all data in self.path and derives the schema, saves with pickle to "schema.p" ''' df = self.hiveContext.read.format('com.databricks.spark.xml') \ .options(rowTag='trkpt') \ .load(self.path+'gpx/*') df = df.withColumn('athlete',lit(None).cast(StringType())) \ .withColumn('activity_type',lit(None).cast(StringType())) df.printSchema() pickle.dump(df.schema, open("schema.p", "wb")) pass def get_dataset(self): ''' Returns strava activity dataset ''' if not self.df: self._load_dataset() return self.df
e = error_m(expected, predicted)[0] print(e * 100) if e < 0: e = 0 errs.append(e) print(sum(errs) / (len(errs) * 1.0) * 100) if __name__ == '__main__': cfg = { 'log_level': 'warn', 'trainready_table': 'dlpm_111021_no_residency_no_mapping_trainready_test_12212021', 'dist_table': 'dlpm_111021_no_residency_no_mapping_tmp_distribution_test_12212021', 'serving_url': 'http://10.193.217.126:8503/v1/models/dl_test_1221:predict', 'max_calls': 4, 'model_stat_table': 'dlpm_111021_no_residency_no_mapping_model_stat_test_12212021', 'yesterday': 'WILL BE SET IN PROGRAM' } sc = SparkContext.getOrCreate() hive_context = HiveContext(sc) sc.setLogLevel(cfg['log_level']) run(cfg=cfg, hive_context=hive_context)
MYSQL_USERNAME = '******' MYSQL_PASSWORD = '******' MYSQL_CONNECTION_URL = "jdbc:mysql://localhost:3306/employees?user=" + MYSQL_USERNAME + "&password=" + MYSQL_PASSWORD # Configure Spark conf = SparkConf() conf.setMaster("spark://Box.local:7077") conf.setAppName("MySQL_import") conf.set("spark.executor.memory", "1g") # Initialize a SparkContext and SQLContext sc = SparkContext(conf=conf) sql_ctx = SQLContext(sc) # Initialize hive context hive_ctx = HiveContext(sc) # Source 1 Type: MYSQL # Schema Name : EMPLOYEE # Table Name : EMPLOYEES # + --------------------------------------- + # | COLUMN NAME| DATA TYPE | CONSTRAINTS | # + --------------------------------------- + # | EMP_NO | INT | PRIMARY KEY | # | BIRTH_DATE | DATE | | # | FIRST_NAME | VARCHAR(14) | | # | LAST_NAME | VARCHAR(16) | | # | GENDER | ENUM('M'/'F')| | # | HIRE_DATE | DATE | | # + --------------------------------------- + df_employees = sql_ctx.load(
Submits Hive queries to a Spark cluster as Spark jobs. This improves the performance of Hive queries and gives users who have already deployed Hive or Spark a more flexible option, further increasing the adoption of both Hive and Spark. """ import sys reload(sys) sys.setdefaultencoding('utf-8') from pyspark import SparkContext from pyspark.sql import HiveContext if __name__ == '__main__': sc = SparkContext(appName = "sql_insert") sqlContext = HiveContext(sc) # driver = "com.mysql.jdbc.Driver" dff = sqlContext.read.format("jdbc").options(url="jdbc:mysql://192.168.32.1:3306/testdjango?user=root" "&password=root&useUnicode=true&characterEncoding=UTF-8" "&zeroDateTimeBehavior=convertToNull", dbtable="t_userinfo").load() dff.registerTempTable('t_userinfo') dft = sqlContext.read.format("jdbc").options(url="jdbc:mysql://192.168.32.1:3306/testdjango?user=root" "&password=root&useUnicode=true&characterEncoding=UTF-8" "&zeroDateTimeBehavior=convertToNull", dbtable="t_userinfo2").load() dft.registerTempTable('t_userinfo2') ds = sqlContext.sql('select username,password,email from t_userinfo') print ds.collect() # ds is a DataFrame; collect() returns Row objects that can be iterated over # for i in ds.collect(): # # print i.username
if __name__ == "__main__": parser = argparse.ArgumentParser(description='Prepare data') parser.add_argument('config_file') args = parser.parse_args() # Load config file with open(args.config_file, 'r') as ymlfile: cfg = yaml.load(ymlfile, Loader=yaml.FullLoader) resolve_placeholder(cfg) cfg_log = cfg['log'] cfg = cfg['pipeline'] sc = SparkContext() hive_context = HiveContext(sc) sc.setLogLevel(cfg_log['level']) # save table as tfrecords path = cfg['tfrecords']['tfrecords_hdfs_path'] input_table_name = cfg['normalization']['output_table_name'] command = """ SELECT * FROM {} """.format(input_table_name) df = hive_context.sql(command) df.write.format("tfrecords").option("recordType", "Example").mode('overwrite').save(path) sc.stop()
vectorT = row['words_title'] vectorD = row['words_desc'] data = row.asDict() data['words'] = vectorT + vectorD w=[] for word in data['words']: w += words(word) data['wordsF'] = w newRow = Row(*data.keys()) newRow = newRow(*data.values()) return newRow sc = SparkContext.getOrCreate() sqlContext = HiveContext(sc) print "###############" # READ data data = sqlContext.read.format("com.databricks.spark.csv").\ option("header", "true").\ option("inferSchema", "true").\ load("train.csv").repartition(100) print "data loaded - head:" print data.head() print "################" attributes = sqlContext.read.format("com.databricks.spark.csv").\ option("header", "true").\ option("inferSchema", "true").\ load("attributes.csv").repartition(100)
from pyspark import SparkContext sc = SparkContext("local", "best_hospitals") from pyspark.sql import HiveContext sqlContext = HiveContext(sc) # Select the top 10 hospital by average avgscore # Please note that we filter out those hospital not qualified for evaluation df_top10_hospitals = sqlContext.sql("select Q.providerid as id, AVG(Q.normalizedscore) as avgscore \ from total_quality Q join hospitals_qualified H on Q.providerid = H.providerid \ where Q.normalizedscore is not null and H.qualified = true \ group by Q.providerid \ order by avgscore DESC").limit(10) # Join with hospitals_qualified to get the hospital name and state # Note: couldn't figure out how to do it in the above select statement (together with Group By) in one-shot! :-( df_hospitals = sqlContext.table("hospitals_qualified") df_top10_hospitals_full = df_top10_hospitals.join(df_hospitals, df_top10_hospitals.id == df_hospitals.providerid).\ select(df_hospitals.providerid, df_hospitals.hospitalname, df_hospitals.state, df_top10_hospitals.avgscore) df_top10_hospitals_full = df_top10_hospitals_full.orderBy(df_top10_hospitals_full.avgscore.desc()) # Save it as a table df_top10_hospitals_full.registerTempTable("df") sqlContext.sql("drop table if exists top_10_hospitals") sqlContext.sql("CREATE TABLE top_10_hospitals AS SELECT * FROM df") print print "Top 10 hospitals" print rank = 1
from pyspark import SparkConf, SparkContext from pyspark.sql import HiveContext from pyspark.sql.types import StructType, StructField, StringType import re conf = SparkConf().setAppName("spark_sql_regex_specify_schema") sc = SparkContext(conf=conf) hc = HiveContext(sc) source = sc.parallelize(["row1_col1 row1_col2 row1_col3", "row2_col1 row2_col2 row3_col3", "row3_col1 row3_col2 row3_col3"]) pattern = re.compile("(.*) (.*) (.*)") def parse(line): matcher = pattern.match(line) if matcher: return matcher.groups() else: return None columns = source.map(parse).filter( lambda columns: columns and len(columns) == 3) rows = columns.map( lambda columns: (columns[0], columns[1], columns[2])) schema = StructType([StructField("col1", StringType(), False), StructField(
# A simple hive demo. If you do not have a table to load from, first run MakeHiveTable.py from pyspark import SparkContext from pyspark.sql import HiveContext import json import sys if __name__ == "__main__": if len(sys.argv) != 3: print "Error usage: LoadHive [sparkmaster] [inputtable]" sys.exit(-1) master = sys.argv[1] inputTable = sys.argv[2] sc = SparkContext(master, "LoadHive") hiveCtx = HiveContext(sc) # Query hive input = hiveCtx.hql("FROM " + inputTable + " SELECT key, value") data = input.map(lambda x: x['key'] * x['key']) result = data.collect() for element in result: print "Got data " + str(element) sc.stop() print "Done!"
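# Note (not in the original): hql() is deprecated on newer PySpark releases; a sketch of the
# equivalent call with the current API would be:
#   rows = hiveCtx.sql("SELECT key, value FROM " + inputTable)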
tfidf_cosine.write.saveAsTable('{0}.tfidf_sim_all_on_product_name'.format(database_name),mode=modex) tfidf_cosine=sqlContext.sql('select * from {0}.tfidf_sim_all_on_product_name'.format(database_name)) # select the top-n per target word windowy=Window.partitionBy('target_word').orderBy(F.desc('tfidf_similarity')) tfidf_cosine=tfidf_cosine.withColumn('rankx',F.row_number().over(windowy)).filter(F.col('rankx')<=topn).withColumn('rk_start',F.lit(rk_start)).drop('rankx') #tfidf_cosine.count() ; print(dtm.now()-t1) # write to the table batch by batch tfidf_cosine.write.saveAsTable('{0}.tfidf_sim_on_product_name'.format(database_name),mode=modex) rk_start=rk_start+rk_step print(dtm.now()-t1) return tfidf_cosine if __name__=='__main__': confx=SparkConf().setAppName('4_word_semantic_similarity_on_prod_name') sc=SparkContext(conf=confx) sqlContext=HiveContext(sc) sc.setLogLevel("WARN") ## 4 tfidf similarity # 4.1 generate and store the tfidf vectors: already run in a separate .py file to free memory # 4.2 compute similarity; it is recommended to start a new .py file here so memory can be freed tfidf_id_vec_df=sqlContext.sql('select * from {0}.term_id_tfidf_vec_in_cat'.format(database_name)) tk_id=sqlContext.sql('select * from {0}.term_id_name'.format(database_name)) tfidf_vec_df=tk_id.join(tfidf_id_vec_df,'term_id','inner').select('term_name','tfidf_vec','tfidf_vec_pos','rankx') tfidf_vec=tfidf_vec_df.toPandas() #import sys #sys.getsizeof(tfidf_vec)/1024.0/1024 sparse_val=params['tfidf']['na_val'] topn=params['tfidf']['topn']
"/var/test/spark-2.0.1/python/lib/pyspark.zip,/var/test/spark-2.0.1/python/lib/py4j-0.10.3-src.zip", ).config( "spark.yarn.dist.archives", "/var/test/spark-2.0.1/R/lib/sparkr.zip#sparkr,hdfs:////user/grp_gdoop_admin/anaconda/anaconda2_env.zip#ANACONDA", ).config("spark.executorEnv.PYTHONPATH", "pyspark.zip:py4j-0.10.3-src.zip").config( "spark.yarn.executor.memoryOverhead", "8192").config( "spark.yarn.queue", "marketing_datascience").config( "spark.sql.warehouse.dir", warehouse_location).enableHiveSupport().getOrCreate()) spark.conf.set("spark.sql.orc.filterPushdown", "true") sc = SparkContext.getOrCreate() sqlContext = HiveContext(sc) # paragraph 2 email function and dates def send_mail(message, to, subject): msg = MIMEText(message) msg["From"] = "[email protected]" msg["To"] = to msg["Subject"] = subject p = Popen(["/usr/sbin/sendmail", "-t", "-oi"], stdin=PIPE) p.communicate(msg.as_string()) today = datetime.date.today() edate = datetime.date.today() - relativedelta(days=2)
# Load config file with open(args.config_file, 'r') as ymlfile: cfg = yaml.safe_load(ymlfile) # load zookeeper paths and start kazooClient to load configs. cfg_zk = cfg["zookeeper"] zk = KazooClient(hosts=cfg_zk["zookeeper_hosts"]) zk.start() # load gucdocs and keywords from the hive. gucdocs_table, stat = zk.get(cfg_zk["gucdocs_loaded_table"]) keywords_table, stat = zk.get(cfg_zk["keywords_table"]) sc = SparkContext.getOrCreate() sc.setLogLevel('WARN') hive_context = HiveContext(sc) command = "SELECT * FROM {}" df_gucdocs = hive_context.sql(command.format(gucdocs_table)) command = "SELECT DISTINCT keyword FROM {}" df_keywords = hive_context.sql(command.format(keywords_table)) keywords = [keyword['keyword'] for keyword in df_keywords.collect()] # load all the es configs for saving gucdocs to es. es_host, stat = zk.get(cfg_zk["es_host"]) es_port, stat = zk.get(cfg_zk["es_port"]) es_index_prefix, stat = zk.get(cfg_zk["es_index_prefix"]) es_index = es_index_prefix + '-' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") es_type, stat = zk.get(cfg_zk["es_type"]) es_starting_day, stat = zk.get(cfg_zk["es_starting_day"])
w.blowing_out = self.isBlowingOut(w.windBearing, float(game.center_azimuth)) try: w.ozone = val.ozone except Exception: w.ozone = self.domeVal['ozone'] print "w=", w updates.append(w) self.saveWeather(updates) print "updates=", updates if __name__ == '__main__': print "Starting.", datetime.now() sc = SparkContext() #sqlContext = SQLContext(sc) sqlContext = HiveContext(sc) games = sqlContext.read.parquet(CreateStatsRDD.rddDir + "/" + Games.table_name + ".parquet") games.registerTempTable("games") games.cache() print "games=", games print games.take(2) stadium = sqlContext.load(source="com.databricks.spark.csv", header="true", path = SRCDIR + "/stadium.csv") stadium.registerTempTable("stadium") stadium.cache() print "stadium=", stadium.take(2) weather = UpdateWeather(sc, sqlContext, games, stadium) weather.update()
# A simple demo for working with SparkSQL and Tweets from pyspark import SparkContext, SparkConf from pyspark.sql import HiveContext, Row from pyspark.sql.types import IntegerType import json import sys if __name__ == "__main__": inputFile = sys.argv[1] conf = SparkConf().setAppName("SparkSQLTwitter") sc = SparkContext(conf=conf) hiveCtx = HiveContext(sc) print "Loading tweets from " + inputFile input = hiveCtx.jsonFile(inputFile) input.registerTempTable("tweets") topTweets = hiveCtx.sql( "SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10") print topTweets.collect() topTweetText = topTweets.map(lambda row: row.text) print topTweetText.collect() # Make a happy person row happyPeopleRDD = sc.parallelize( [Row(name="holden", favouriteBeverage="coffee")]) happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD) happyPeopleSchemaRDD.registerTempTable("happy_people") # Make a UDF to tell us how long some text is hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType()) lengthSchemaRDD = hiveCtx.sql( "SELECT strLenPython('text') FROM tweets LIMIT 10") print lengthSchemaRDD.collect() sc.stop()
# this is the deprecated equivalent of ADD_JARS add_files = None if os.environ.get("ADD_FILES") is not None: add_files = os.environ.get("ADD_FILES").split(',') if os.environ.get("SPARK_EXECUTOR_URI"): SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"]) sc = SparkContext(pyFiles=add_files) atexit.register(lambda: sc.stop()) try: # Try to access HiveConf, it will raise exception if Hive is not added sc._jvm.org.apache.hadoop.hive.conf.HiveConf() sqlContext = HiveContext(sc) except py4j.protocol.Py4JError: sqlContext = SQLContext(sc) except TypeError: sqlContext = SQLContext(sc) # for compatibility sqlCtx = sqlContext print("""Welcome to ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ /__ / .__/\_,_/_/ /_/\_\ version %s /_/ """ % sc.version)
from pyspark import SparkContext from pyspark.streaming import StreamingContext from pyspark.sql import SparkSession from pyspark.sql import HiveContext from pandas import * sc = SparkContext("local[*]", "RentalData") ssc = StreamingContext(sc, 5) ss = SparkSession.builder \ .appName(sc.appName) \ .config("spark.sql.warehouse.dir", "/user/hive/warehouse") \ .config("hive.metastore.uris", "thrift://localhost:9083") \ .enableHiveSupport() \ .getOrCreate() hive_context = HiveContext(sc) df = hive_context.table("default.rentals") df.toPandas().to_csv('mycsv.csv') df.show() ssc.start() ssc.awaitTermination() ##spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.4 make_map.py
FloatType, DoubleType, DecimalType, StringType, BooleanType, TimestampType, DateType, ArrayType, MapType, ) conf = SparkConf().setAppName("spark_sql_datatype") sc = SparkContext(conf=conf) hc = HiveContext(sc) source = sc.parallelize( [ ( int("127"), int("32767"), int("2147483647"), long("9223372036854775807"), float("1.1"), float("2.2"), Decimal("3.3"), "str", bool(0), datetime(2015, 9, 22, 9, 39, 45), date(2015, 9, 22),
def aggregate_tags_count(new_values, total_sum): return sum(new_values) + (total_sum or 0) def get_sql_context_instance(spark_context): if ('sqlContextSingletonInstance' not in globals()): globals()['sqlContextSingletonInstance'] = SQLContext(spark_context) return globals()['sqlContextSingletonInstance'] from pyspark.sql.types import * schema = StructType([]) sql_context = HiveContext(sc) empty = sql_context.createDataFrame(sc.emptyRDD(), schema) def process_rdd(_, rdd): try: # Get spark sql singleton context from the current context sql_context = get_sql_context_instance(rdd.context) # convert the RDD to Row RDD row_rdd = rdd.map(lambda w: Row(hashtag=w[0], hashtag_count=w[1])) # create a DF from the Row RDD hashtags_df = sql_context.createDataFrame(row_rdd) # Register the dataframe as table hashtags_df.registerTempTable("hashtags") # get the top 10 hashtags from the table using SQL and print them
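# A sketch (not in the original) of the step described by the last comment above; the column names
# follow the Row construction inside process_rdd, and sql_context is the singleton fetched there:
top10_df = sql_context.sql(
    "SELECT hashtag, hashtag_count FROM hashtags "
    "ORDER BY hashtag_count DESC LIMIT 10")
top10_df.show()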