def get_spark_test():
    conf = SparkConf()
    sc = SparkContext("local[4]", appName="youzan-algrithm", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex_test """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")
    return sc, sql_context

def get_spark(num=4, cores=4, mem="32g"):
    conf = SparkConf()
    conf.set("spark.executor.instances", "%d" % num)
    conf.set("spark.executor.cores", "%d" % cores)
    conf.set("spark.executor.memory", "%s" % mem)
    sc = SparkContext(appName="youzan-algrithm", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex """)
    sql_context.setConf("spark.sql.shuffle.partitions", "16")
    return sc, sql_context
def main():
    if len(sys.argv) > 2:
        scriptPath = sys.argv[1]
        resultPath = sys.argv[2]
    else:
        print "Missing Arguments"
        sys.exit(1)  # without the paths there is nothing to run
    sc = SparkContext("local", "Test sql queries from pyspark")
    try:
        hsc = HiveContext(sc)
        scriptRaw = str(sc.textFile(scriptPath, use_unicode=False).cache().collect())
        print scriptRaw
        result = open(resultPath, 'w')

        def printSeparator(cols):
            print 'inside print' + str(cols)
            for j in range(0, cols):
                print j
                result.write("+----")
            result.write("+--+")

        for i in scriptRaw.split(';'):
            i = i.replace('[\'', '')
            i = i.replace('\']', '')
            print i
            if not i == "":
                df = hsc.sql(i.strip())
                df.show()
                printHeader = True
                printFooter = False
                cols = df.columns
                print cols
                for row in df.collect():
                    print str(row)
                    if printHeader:
                        print str(len(cols))
                        printSeparator(len(cols))
                        for col in cols:
                            result.write("| " + col)
                        result.write("|")
                        printSeparator(len(cols))
                        printHeader = False
                        printFooter = True
                    for v in row:
                        print str(v)
                        result.write("|" + valueToString(v))
                    result.write("|")
                if printFooter:
                    printSeparator(len(cols))
    except:
        sc.stop()
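The writer loop above relies on a valueToString helper that the snippet never defines; a minimal sketch of what it might look like (the exact formatting rules are an assumption):

# Hypothetical helper assumed by the snippet above: render a cell value as text,
# mapping None to an empty string so the formatted row stays aligned.
def valueToString(v):
    return "" if v is None else str(v)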
def setUpClass(cls):
    ReusedPySparkTestCase.setUpClass()
    cls.tempdir = tempfile.NamedTemporaryFile(delete=False)
    cls.hive_available = True
    try:
        cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
    except py4j.protocol.Py4JError:
        cls.hive_available = False
    except TypeError:
        cls.hive_available = False
    os.unlink(cls.tempdir.name)
    if cls.hive_available:
        cls.spark = HiveContext._createForTesting(cls.sc)
    cls.testData = [Row(key=i, value=str(i)) for i in range(100)]
    cls.df = cls.sc.parallelize(cls.testData).toDF()
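The fixture above creates a temporary file and a testing HiveContext but the matching cleanup is not shown; a minimal teardown sketch, assuming the usual ReusedPySparkTestCase pattern and that shutil is imported alongside tempfile:

# Sketch only: mirror of setUpClass, following the standard ReusedPySparkTestCase cleanup.
@classmethod
def tearDownClass(cls):
    ReusedPySparkTestCase.tearDownClass()
    shutil.rmtree(cls.tempdir.name, ignore_errors=True)  # requires `import shutil`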
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_SUBJECT_D003018').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)

#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_TRADE_OPPONENT').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)

#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_WBK_WXYH_PLOAN').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SQLContext, SparkConf, HiveContext
from pyspark import SparkContext

from post import post_run
from ta import mat_close
from ta import adx
from ml import diff_feature_cls as feature
from ml import diff_train_cls_pos_ml as train


def main(sc, sql_context, is_hive):
    #post_run.main(sc, sql_context, is_hive = True)
    #mat_close.main(sc, sql_context, is_hive = True)
    #adx.main(sc, sql_context, is_hive = True)
    feature.main(10, 1, sc, sql_context, is_hive=True)
    train.main("2010-01-01", "2010-04-30", "2010-05-01", "9999-99-99", sc, sql_context, is_hive=True)


if __name__ == "__main__":
    conf = SparkConf()
    #conf.set("spark.executor.instances", "4")
    #conf.set("spark.executor.cores", "4")
    #conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName="bintrade_candidate", master="local[2]", conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "1")
    sqlContext.sql("use fex_test")
    main(sc, sqlContext, is_hive=True)
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_CEN_CBOD_CMCURCUR').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")

#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_F_CI_CUST_SIMILARLST').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0

                "lateral view explode(a.rowkeys) tab as rowkey ")
get_id.cache()
users = hc.sql("select rowkey,collect_set(name) name,collect_set(phone) phone ,collect_set(mail) mail "
               "from new_type.user group by rowkey").alias('a')
contacts = hc.sql('select u_uid,collect_set(c_uid) contacts from new_type.contact group by u_uid ')
user_compare = get_id.join(users, 'rowkey', 'inner') \
    .select('a.rowkey', 'gid', 'name', 'phone', 'mail') \
    .join(contacts, 'rowkey', 'inner') \
    .select('a.rowkey', 'gid', 'name', 'phone', 'mail', 'contacts')


def func(iter):
    result = {}
    for i in range(0, iter.__len__()):
        pass


user_compare.rdd.groupBy(lambda x: x.gid).map(func)

if __name__ == '__main__':
    # user1 = {'phone': ['123'], 'name': ['zy'], 'mail': ['*****@*****.**'], 'contact_list': ['asd', 'dasdas']}
    # user2 = {'phone': ['123'], 'name': ['zy', 'zy3'], 'mail': ['*****@*****.**'], 'contact_list': ['dasdas']}
    # a = judge_similarity(user1=user1, user2=user2)
    # print a
    sc = SparkContext(conf=SparkConf())
    hc = HiveContext(sc)
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_ACRM_A_INOUTCOME').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)

#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_F_CI_CCARD').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)

#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_R_INCOME_TOP').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0

#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_F_CI_CUST_FAMILY_MEMBER').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")

"""
JSON count example using the DataFrame API; we'll write a test for this
"""
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark import HiveContext


def do_json_counts(df, target_name):
    """ count of records where name=target_name in a dataframe with column 'name' """
    return df.filter(df.name == target_name).count()


if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit("Usage: <json file>")

    sc = SparkContext(appName="PythonJsonCount")
    hc = HiveContext.getOrCreate(sc)
    df = hc.read.json(sys.argv[1])

    print("Name vikas found %d times" % do_json_counts(df, 'vikas'))
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_CRE_CUSTR2').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0
def save(lp, path, sc):
    lp.saveAsTextFile(path)


def main(sc, sql_context, is_hive=True):
    df_train = get_train(sc, sql_context, is_hive)
    df_check = get_check(sc, sql_context, is_hive)

    lp_train = cal_feature(df_train, 60, 3)
    lp_check = cal_feature(df_check, 60, 3)

    os.system("""
        source ~/.bashrc;
        hadoop fs -rm -r bintrade.ml.diff.label_point.train.cls;
        hadoop fs -rm -r bintrade.ml.diff.label_point.check.cls
    """)

    save(lp_train, "bintrade.ml.diff.label_point.train.cls", sc)
    save(lp_check, "bintrade.ml.diff.label_point.check.cls", sc)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName="bintrade.ml.diff_feature", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex """)
    main(sc, sql_context, is_hive=True)
    sc.stop()
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_CEN_CBOD_CMMISMIS').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
from pyspark import SparkContext, SparkConf
from pyspark import HiveContext

'''
Problem statement: find the 5 most expensive orders per person per day
using Data Frames
'''

conf = SparkConf().setAppName("mostExpensiveOrderPerDayPerPersonRDD")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)

# makes sure that the 'retail_db' hive database will be used
hiveContext.sql("use retail_db")

# loading the data from hive into dataframes
orders = hiveContext.sql("select order_id, order_date, order_customer_id from orders")
customers = hiveContext.sql("select customer_id, customer_fname, customer_lname from customers")
order_items = hiveContext.sql("select order_item_order_id, order_item_subtotal from order_items")

# joining the customers with orders on customer_id. Orders and customers are the smaller tables,
# so I try to join small tables with other small tables before joining to a big table.
orders_join_customers = orders.join(customers, orders.order_customer_id == customers.customer_id)

# joining on order_id so that I get rows with a customer and their purchases
orders_customers_join_order_items = \
    orders_join_customers.join(order_items,
                               orders_join_customers.order_id == order_items.order_item_order_id)

# aggregating by order_date and customer_id with the sum aggregation.
# This finds how much a person spent on a single day
aggResult = orders_customers_join_order_items.groupBy(['order_date', 'customer_id']) \
    .agg({"order_item_subtotal": "sum"}) \
    .withColumnRenamed("sum(order_item_subtotal)", "subtotal_sum")

# because in the aggregation the order_date, customer, and sum were generated in a data frame,
# I must unfortunately rejoin to the customers table to display the purchases with names
from pyspark import SparkContext, SparkConf, HiveContext

conf = SparkConf().setAppName("most expensive product using SQL")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)

sqlString = "SELECT p.product_name, p.product_price \
             FROM retail_db.products p \
             JOIN (SELECT max(products.product_price) max_id \
                   FROM retail_db.products) the_max \
             ON p.product_price = the_max.max_id"

result = hiveContext.sql(sqlString)
print("***********************\n{0}".format(str(result.take(1))))
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author [email protected]

import os
import sys

from pyspark import SQLContext, HiveContext
from pyspark import SparkContext

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")


def main(sc, sqlContext, isHive=True):
    dfSymMDate = sqlContext.sql("""
        SELECT symbol, max(date) as max, count(date) as c
        FROM eod2
        GROUP BY symbol
        ORDER BY symbol
    """)
    for each in dfSymMDate.collect():
        print "%s\t%s\t%d" % (each.symbol, each.max, each.c)


if __name__ == "__main__":
    sc = SparkContext(appName="bintrade_candidate")
    sqlContext = HiveContext(sc)
    sqlContext.sql("use fex")
    main(sc, sqlContext)
    sc.stop()
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_ZDH_ZZDH_HISTORY_LS').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
from pyspark import SparkConf, SparkContext, HiveContext

conf = SparkConf().setAppName("Revenue per category")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)
hiveContext.sql("use retail_db")

sqlString = "SELECT first_value(cat.category_name), \
             round(sum(oi.order_item_subtotal), 2) category_revenue \
             FROM categories cat, products prod, order_items oi \
             WHERE cat.category_id = prod.product_category_id \
             AND prod.product_id = oi.order_item_product_id \
             GROUP BY cat.category_id \
             ORDER BY category_revenue DESC"

result = hiveContext.sql(sqlString)
collected = result.collect()
print "*****************\n{0}".format("\n".join([str(x) for x in collected]))
fg_count = int(sys.argv[2])

print("Parameters%s" % (len(sys.argv) - 1))
while (arguments >= position):
    print("Parameter %i: %s" % (position, sys.argv[position]))
    position = position + 1

from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark import SparkContext, HiveContext
import hsfs
import numpy as np

spark = SparkSession.builder.appName("create_synthetic_fg").enableHiveSupport().getOrCreate()
sqlContext = HiveContext(spark.sparkContext)

connection = hsfs.connection()
fs = connection.get_feature_store()

size = 10
for i in list(range(0, fg_count)):
    fg_data = []
    for j in list(range(1, size)):
        fg_data.append((j, np.random.normal(), np.random.normal()))
    fg_col_1 = 'fg' + str(i) + "_col1"
    fg_col_2 = 'fg' + str(i) + "_col2"
    fg_name = fg_prefix + str(i)
    fg_spark_df = spark.createDataFrame(fg_data, ['id', fg_col_1, fg_col_2])
    fg_description = "synthetic " + fg_name
    fg = fs.create_feature_group(fg_name,
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_F_CI_GROUP_MEMBER').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)

#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_CEN_CBOD_LNLNSJRN0').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0

#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_SUBJECT_D004009').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
from pyspark import SparkContext, HiveContext

sc = SparkContext(appName="test")
sqlContext = HiveContext(sc)

sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geometry-api.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geowave-csv-driver-0.80.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geowave-driver-0.80.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geowave-geotools-datastore-0.80.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geowave-raster-driver-0.80.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geowave-share-0.80-jar-with-dependencies.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geowave-util-0.80.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/geowave-vector-driver-0.80.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/gt-api-16.0.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/json-serde-1.3.6.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/oushive.jar")
sqlContext.sql("ADD JAR hdfs:///spark/auxjar/spatial-sdk-hadoop.jar")

sqlContext.sql("set geowave.server.ip=j11.forcewave.co.kr")
sqlContext.sql("set geowave.server.port=54555")

df = sqlContext.sql("select gizscore from htmrres limit 100")
df.show()
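The ADD JAR statements above differ only in the jar name, so they could be driven from a list; a minimal sketch, reusing the same sqlContext and the hdfs:///spark/auxjar/ paths shown above:

# Sketch only: same jars as above, registered in a loop instead of one statement per jar.
aux_jars = [
    "geometry-api.jar", "geowave-csv-driver-0.80.jar", "geowave-driver-0.80.jar",
    "geowave-geotools-datastore-0.80.jar", "geowave-raster-driver-0.80.jar",
    "geowave-share-0.80-jar-with-dependencies.jar", "geowave-util-0.80.jar",
    "geowave-vector-driver-0.80.jar", "gt-api-16.0.jar", "json-serde-1.3.6.jar",
    "oushive.jar", "spatial-sdk-hadoop.jar",
]
for jar in aux_jars:
    sqlContext.sql("ADD JAR hdfs:///spark/auxjar/" + jar)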
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_LNA_XDXT_IND_INFO').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")

#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_ZDH_ZZDH_SHOP').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0
from pyspark import SparkContext, SparkConf
from pyspark import HiveContext

'''
Problem statement: find the 5 most expensive orders per person per day
using SparkSQL.
'''

conf = SparkConf().setAppName("mostExpensiveOrderPerDayPerPersonSQL")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)

# makes sure that the 'retail_db' hive database will be used
hiveContext.sql("use retail_db")

# first_value() must be used because of a bug.
# Without it, columns that are not in the group by or the aggregation part cannot be shown.
sqlString = "SELECT \
             first_value(customers.customer_fname), \
             first_value(customers.customer_lname), \
             orders.order_date, \
             ROUND(SUM(order_items.order_item_subtotal), 2) the_total \
             FROM customers, orders, order_items \
             WHERE orders.order_id = order_items.order_item_order_id \
             AND customers.customer_id = orders.order_customer_id \
             GROUP BY orders.order_date, customers.customer_id \
             ORDER BY the_total DESC"

result = hiveContext.sql(sqlString).rdd  # rdd used because this is certification practice
top_records = result.take(5)
print "*****************\n{0}".format(str(top_records))
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_R_FINANCING_TOP').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
import sys

from pyspark import SparkConf, SparkContext, HiveContext

filename = sys.argv[1]
saveto = sys.argv[2]
jobName = sys.argv[3]
allMVs = sys.argv[4]

print "=> filename %s" % filename
print "=> saveto %s" % saveto
print "=> jobName %s" % jobName
print "=> allMVs %s" % allMVs

conf = SparkConf().setAppName(jobName)
sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
sqlc = HiveContext(sc)

T_XRD_HDFS = sqlc.read.format("json").load(filename)
T_XRD_HDFS.registerTempTable("T_XRD_HDFS")
print "=> rowsCounter T_XRD_HDFS: %d" % T_XRD_HDFS.count()
# T_XRD_HDFS.write.format("com.databricks.spark.csv").option("header", "false").save(saveto + "/T_XRD_HDFS")

T_XRD_RAW_FILE = sqlc.sql("SELECT from_unixtime(end_time, 'yyyy/MM/dd') as TDay, start_time as ots, end_time as cts, file_lfn, client_host, if(server_username = '', 'unknown', server_username) as server_username, (end_time - start_time) as proctime, read_bytes_at_close as readbytes FROM T_XRD_HDFS WHERE (end_time - start_time) > 0 AND read_bytes_at_close > 0 AND `_corrupt_record` IS NULL")
T_XRD_RAW_FILE.registerTempTable("T_XRD_RAW_FILE")
print "=> rowsCounter T_XRD_RAW_FILE: %d" % T_XRD_RAW_FILE.count()
if allMVs == "1":  # sys.argv values are strings, so compare against the string "1"
    T_XRD_RAW_FILE.write.format("com.databricks.spark.csv").option("header", "false").save(saveto + "/T_XRD_RAW_FILE")

T_XRD_LFC = sqlc.read.format("com.databricks.spark.csv").load("/project/awg/cms/phedex/catalog/csv/merged/").toDF("dataset_name", "dataset_id", "dataset_is_open", "dataset_time_create", "block_name", "block_id", "block_time_create", "block_is_open", "file_lfn", "file_id", "filesize", "usernameXX", "checksum", "file_time_create")
T_XRD_LFC.registerTempTable("T_XRD_LFC")
print "=> rowsCounter T_XRD_LFC: %d" % T_XRD_LFC.count()
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_F_CI_CUST_RALE').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0
from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import LabeledPoint
import numpy as np
import string
from pyspark import HiveContext

conf = SparkConf().setMaster('local').setAppName('SparkMLib_FinalProject')
sc = SparkContext(conf=conf)

RDD = HiveContext(sc).sql('select * from finalproject_merged2')
RDD.count()
RDD.cache()


def get_mapping(rdd, idx):
    return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()


print "Mapping of the categorical feature column: %s" % get_mapping(RDD, 12)  # count from 0
print "Mapping of the categorical feature column: %s" % get_mapping(RDD, 13)  # count from 0
print "Mapping of the categorical feature column: %s" % get_mapping(RDD, 14)  # count from 0

mappings = [get_mapping(RDD, i) for i in [12, 13, 14]]


def extract_features_dt(record):
    record_num_vec = [record[1], record[2], record[3], record[4], record[5], record[6], record[7],
                      record[8], record[9], record[10], record[11], record[15], record[16],
                      record[18], record[19]]
    record_cat_vec = [record[12], record[13], record[14]]  # because we cannot directly use record[12,13,14]
    numvalues = np.array([float(field) for field in record_num_vec])
    cat_vec = np.zeros(3)
    i = 0
    for field in record_cat_vec:
        m_countrycat = mappings[i]  # instead of calling get_mapping again, reuse the precomputed dict
        idx = m_countrycat[field]
        cat_vec[i] = idx
        i = i + 1
    return np.concatenate((numvalues, cat_vec))
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_CUST_ASSIGN_COM_SAVE').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
    sqlstr = sqlstr[:len(sqlstr) - 2]
    sqlstr += "\n) stored as orc"
    print sqlstr
    sql_context.sql(sqlstr)
    df.insertInto(tableName, overwrite)


if __name__ == '__main__':
    #log.debug("debug")
    #a = eval("(1,[2,3])")
    #print "xxxxxxx",a[1][0]
    #a = {1: 1.0, 3: 5.5}
    #str_a = str(a)
    #a = eval(str_a)
    #print a[1]
    #print json.loads("""{1:1}""")
    sc = SparkContext("local[1]", appName="bintrade.ml.diff_feature")
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex_test """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")
    ldict = [{"symbol": "AAA", "date": "2010-01-01", "close": 1.0},
             {"symbol": "AAA", "date": "2010-01-01", "close": 1.0}]
    df = sql_context.createDataFrame(ldict)
    dfToTableWithPar(sql_context, df, "test_eod_AAA")
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author [email protected]

import os
import sys

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")

from pyspark import SQLContext, HiveContext
from pyspark import SparkContext

import eod

if __name__ == "__main__":
    sc = SparkContext(appName="bintrade_candidate", master="yarn-client")
    sc.setSystemProperty("spark.driver.memory", "1g")
    sc.setSystemProperty("spark.executor.memory", "8g")
    sc.setSystemProperty("spark.executor.cores", "2")
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "16")
    sqlContext.sql("use fex")
    eod.run(sc, sqlContext, isHive=True)
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_LNA_XDXT_VILLAGE_INFO').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
from pyspark.sql import SQLContext, Row
from pyspark import SparkContext
from pyspark import HiveContext
import unittest
import os

sc = SparkContext("local", "Test sql queries from pyspark")

# Change this variable to point to your spark 1.5 example resources
examplefiles_path = "/Users/gayathrimurali/spark-1.5.1/examples/src/main/resources/"

# Test 1: SQLContext and HiveContext are created
sqlContext = SQLContext(sc)
hivecontext = HiveContext(sc)

# Test 2: Read from a parquet file using sql and hive context into a dataframe.
# Display and do some filter operations on the dataframe
df_sql = sqlContext.read.load(examplefiles_path + "users.parquet")
df_hive = hivecontext.read.load(examplefiles_path + "users.parquet")

df_sql.show()
df_hive.show()
df_hive.printSchema()
df_hive.filter(df_hive['favorite_color'] == 'red').show()

# Test 3: Write selected columns from dataframe into a parquet file
if not os.path.exists(examplefiles_path + "nameAndFavColors.parquet"):
    df_hive.select("name", "favorite_color").write.save(examplefiles_path + "nameAndFavColors.parquet")
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_SUBJECT_D002021').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")
# 10-character date (YYYY-MM-DD)
V_DT10 = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8]))).strftime("%Y-%m-%d")
V_STEP = 0
#execfile('/data/w205Project/spark/getLinks.py') <-- don't use. use spark-submit instead.

from pyspark import SparkContext, HiveContext

sc = SparkContext()
sqlContext = HiveContext(sc)

from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window

sqlContext.sql("ADD JAR /data/w205Project/load/hive-serdes-1.0-SNAPSHOT.jar");
# sqlContext.sql("ADD JAR /usr/lib/hadoop/hadoop-aws.jar");
# sqlContext.sql("ADD JAR /usr/lib/hadoop/lib/aws-java-sdk-1.7.14.jar");

###############################################
#   EXTRACT ALL THE LINKS INDISCRIMINATELY    #
###############################################

'''
links = sqlContext.sql("select entities.urls.url[0] as tco, entities.urls.expanded_url[0] as link from tweets where entities.urls.url[0] IS NOT NULL");
uniqueLInks = links.dropDuplicates(['tco', 'link'])
uniqueLInks.repartition(1).save("s3n://w205twitterproject/links5","json")
'''

###############################################
#                  ANALYZE                    #
###############################################
    y_pred = lr_model.predict(train_data)
    y_prob = lr_model.predict_proba(train_data)[:, 1]
    print("evaluation model")
    pr = float(
        np.sum([
            1 if y_pred[i] == train_label[i] else 0
            for i in range(len(train_label))
        ])) / float(len(train_label))
    print("prediction precision: " + str(pr))


def train(rating_file_path, user_file_path, item_file_path, k):
    data_sample = sample(sc, rating_file_path, user_file_path, item_file_path, k)
    train_data, train_label = extract_feature_label(data_sample)
    leaf = gbdt_train(train_data, train_label)
    leaf_transform = transfromed_feature(leaf, leaf.max())
    lr_train(leaf_transform, train_label)


if __name__ == "__main__":
    sc = SparkContext('local', 'traing')
    sqlcontext = HiveContext(sc)
    sc.setLogLevel("ERROR")
    rating_file_path = "E:/data/ml-100k/u.data"
    user_file_path = "E:/data/ml-100k/u.user"
    item_file_path = "E:/data/ml-100k/u.item"
    k = 5
    train(rating_file_path, user_file_path, item_file_path, k)
from pyspark import SparkContext, SparkConf
from pyspark import HiveContext

conf = SparkConf().setAppName("revenueByDaySQL")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)
hiveContext.sql("use retail_db")

sqlString = "SELECT orders.order_date, \
             ROUND(SUM(order_items.order_item_subtotal), 2) the_sum, \
             COUNT(DISTINCT orders.order_id) the_count \
             FROM orders, order_items \
             WHERE orders.order_id = order_items.order_item_order_id \
             GROUP BY orders.order_date \
             ORDER BY the_sum"

joinded_aggregate_data = hiveContext.sql(sqlString)
print str(joinded_aggregate_data.take(5))
def load_data():
    # load data from files
    # and return query results / aggregates.
    hiveContext = HiveContext(sc)

    # 1027
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx/'
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx_fraud/train/'

    # AMAZON AWS EMR
    path = 'hdfs:///tmp/files/'  # HDFS

    # new segment files
    tx_files = [path + 'l_adults_2550_female_rural.csv', path + 'l_adults_2550_female_urban.csv',
                path + 'l_adults_2550_male_rural.csv', path + 'l_adults_2550_male_urban.csv',
                path + 'l_young_adults_female_rural.csv', path + 'l_young_adults_female_urban.csv',
                path + 'l_young_adults_male_rural.csv', path + 'l_young_adults_male_urban.csv',
                path + 'l_adults_50up_female_rural.csv', path + 'l_adults_50up_female_urban.csv',
                path + 'l_adults_50up_male_rural.csv', path + 'l_adults_50up_male_urban.csv']

    # small file for debugging
    # 1027
    # tx_files = [path + 's_l_male_30_40_smaller_cities.csv']
    # tx_files = [path + 'sorted_fraud_male_30_40_smaller_cities.csv']
    # tx_files = [path+'40_60_bigger_cities.csv',path+'40_60_smaller_cities.csv',path+'all_60_up.csv'\
    #     ,path+'female_30_40_bigger_cities.csv',path+'female_30_40_smaller_cities.csv'\
    #     ,path+'male_30_40_bigger_cities.csv',path+'male_30_40_smaller_cities.csv'\
    #     ,path+'millenials.csv',path+'young_adults.csv']
    # 1027
    # tx_files = [path+'l_40_60_bigger_cities.csv',path+'l_40_60_smaller_cities.csv',path+'l_all_60_up.csv'\
    #     ,path+'l_female_30_40_bigger_cities.csv',path+'l_female_30_40_smaller_cities.csv'\
    #     ,path+'l_male_30_40_bigger_cities.csv',path+'l_male_30_40_smaller_cities.csv'\
    #     ,path+'l_millenials.csv',path+'l_young_adults.csv']

    all_tx = sc.textFile(','.join(tx_files), 600)

    # 1027
    # txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|merchant|merch_lat|merch_long'
    txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|is_fraud|merchant|merch_lat|merch_long'
    txFields = [StructField(field_name, StringType(), True) for field_name in txSchemaString.split('|')]
    txFields[17] = StructField('trans_date', DateType(), True)
    txSchema = StructType(txFields)

    # ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|merchant|merch_lat|merch_long
    txHeader = all_tx.filter(lambda l: "ssn|" in l)
    txNoHeader = all_tx.subtract(txHeader)
    temp_tx = txNoHeader.map(lambda k: k.split("|")).map(lambda p: (
        p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14],
        p[15], p[16], datetime.datetime.strptime(p[17], '%Y-%m-%d').date(), p[18], p[19], p[20], p[21],
        p[22], p[23], p[24], p[25]))

    h_tx_df = hiveContext.createDataFrame(temp_tx, txSchema)
    h_tx_df.registerTempTable("htx")
    hiveContext.cacheTable("htx")

    # HBASE CODE HERE
    # create dataframe with all records
    # map using hbase_process to extract record into individual components
    # and create a dictionary to store in hbase
    #h_data = hiveContext.sql("SELECT * FROM htx")
    #h_data.map(hbase_process).foreachPartition(store_full_data)

    # get cust mean time between transactions
    time_lag_eval = hiveContext.sql(
        "SELECT cc_num, unix_time, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY htx.unix_time) as lag_time from htx order by cc_num, unix_time asc")
    time_lag_eval.registerTempTable("ts_lag")

    user_avg_time = hiveContext.sql(
        "SELECT cc_num, AVG(unix_time - lag_time) as time_diff, percentile_approx((unix_time - lag_time),0.1) as low_bound, percentile_approx((unix_time - lag_time),0.90) as high_bound from ts_lag where lag_time is not null group by cc_num")
    user_avg_time.registerTempTable("avg_time")

    # get cust mean per category
    mean_per_cat = hiveContext.sql(
        "SELECT cc_num, category, avg(amt) as mean_exp, (avg(amt)-2*(stddev_pop(amt))) as low_bound, (avg(amt)+2*(stddev_pop(amt))) as high_bound from htx group by cc_num, category")
    mean_per_cat.registerTempTable("mean_per_cat")

    # evaluate amount for HML and time of purchase for normal/abnormal
    test = hiveContext.sql(
        # "SELECT htx.cc_num, profile, htx.category, htx.trans_date,htx.trans_time, htx.unix_time,IF(htx.amt>(2 * m.mean_exp),'H',(IF(htx.amt<(0.5 * m.mean_exp),'L','N'))) as EXP, IF(htx.category like '%_net%','N','P') as CNP, htx.amt, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY htx.unix_time) as lag_time from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category")
        "SELECT htx.cc_num, profile, htx.category, htx.trans_date,htx.trans_time, htx.unix_time,IF(htx.amt>m.high_bound,'H',(IF(htx.amt < m.low_bound,'L','N'))) as EXP, IF(cast(SUBSTR(htx.trans_time,0,2) as int)<05,'A',IF(cast(SUBSTR(htx.trans_time,0,2) as int)>21,'A','N')) as NAT, htx.amt, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY htx.unix_time) as lag_time from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category")
    test.registerTempTable("full_table")

    # evaluate for transaction time (HML)
    full_data = hiveContext.sql(
        "SELECT full_table.cc_num, profile, category, trans_date, trans_time, unix_time,lag_time,IF(lag_time is null,100000,unix_time-lag_time) as time_since,amt, EXP,NAT,IF((unix_time-lag_time)<avg_time.low_bound,'H',IF((unix_time-lag_time)>avg_time.high_bound,'L','N')) as VEL from full_table left join avg_time on avg_time.cc_num = full_table.cc_num")
    full_data.registerTempTable("full_data")

    # return full tx data for user with reduced HML/AN/HML variables
    per_cust_transactions = hiveContext.sql(
        "SELECT cc_num as cust_id,concat(EXP,NAT, VEL) as trans_list from full_data order by cc_num, unix_time asc")

    # return full tx data for profile with reduced HML/NP/HML variables in sorted order
    #pre_sort_
    per_profile_transactions = hiveContext.sql(
        "SELECT profile as cust_id,concat(EXP,NAT,VEL) as trans_list from full_data order by profile, unix_time asc")
    #pre_sort_per_profile_transactions.registerTempTable("pre_sort")
    # we only need cust_id (really profile name here) and trans_list, but we had to include cc_num above in our sort
    #per_profile_transactions = hiveContext.sql("SELECT cust_id,trans_list from pre_sort")

    # gets pre-computed reference values for each customer and stores in redis
    # avg spent per category
    # n transactions
    # last unix time stamp
    agg_info = hiveContext.sql(
        "SELECT CONCAT(category, '_', cc_num) as cust_id, category, concat(low_bound,',',high_bound) as low_high from mean_per_cat")
    avg_cat_data = agg_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])

    agg_n_tx = hiveContext.sql(
        "SELECT CONCAT('count_', cc_num) as cust_id, count(cc_num) as tx_count from full_data group by cc_num")
    n_tx = agg_n_tx.rdd.map(lambda x: [str(x.cust_id), str(x.tx_count)])

    agg_unix_ts = hiveContext.sql(
        "SELECT CONCAT('timestamp_', cc_num) as cust_id, max(unix_time) as last_unix_time from full_data group by cc_num")
    n_ts = agg_unix_ts.rdd.map(lambda x: [str(x.cust_id), str(x.last_unix_time)])

    agg_vel_info = hiveContext.sql(
        "SELECT CONCAT('velocity_', cc_num) as cust_id, concat(low_bound,',',high_bound) as low_high from avg_time")
    avg_vel_data = agg_vel_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])

    # compile our final string per customer for all tx's
    per_cust_transactions_r = per_cust_transactions.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])

    # compile our final string per profile for all tx's
    per_profile_transactions_r = per_profile_transactions.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])

    # return tx data and aggregates
    return_dict = {}
    return_dict['profile'] = per_profile_transactions_r
    return_dict['customer'] = per_cust_transactions_r

    return avg_cat_data, n_tx, n_ts, return_dict, avg_vel_data
import sys

from pyspark import SparkContext, SparkConf, HiveContext, SQLContext

if __name__ == '__main__':
    conf = SparkConf().setAppName("Plotly Exports")
    sc = SparkContext(conf=conf)
    hive_context = HiveContext(sc)

    print '=== Creating Database ==='
    hive_context.sql('CREATE DATABASE PLOTLY')
    hive_context.sql('USE PLOTLY')

    print '=== Creating Table ==='
    hive_context.sql("CREATE TABLE ALCOHOL_CONSUMPTION_BY_COUNTRY_2010 "
                     "(LOCATION STRING, ALCOHOL FLOAT) ROW FORMAT "
                     "DELIMITED FIELDS TERMINATED BY ',' "
                     "TBLPROPERTIES (\"skip.header.line.count\"=\"1\")")

    print "=== loading data into table ==="
    hive_context.sql("LOAD DATA LOCAL INPATH "
                     "'/plotly_datasets/2010_alcohol_consumption_by_country.csv' "
                     "OVERWRITE INTO TABLE ALCOHOL_CONSUMPTION_BY_COUNTRY_2010")

    sys.exit()
    df_customers.show(2)


def write_products():
    product_sql = " SELECT od.productCode, o.orderDate, SUM(quantityOrdered) AS quantity" \
                  " FROM myorderdetails od" \
                  " JOIN myorders o ON od.orderNumber=o.orderNumber " \
                  " GROUP BY od.productCode, o.orderDate"
    df_product = hiveContext.sql(product_sql)
    df_product.show(5)
    df_product.registerTempTable('myproducts')
    hiveContext.sql("CREATE DATABASE IF NOT EXISTS " + hive_db)
    hiveContext.sql("DROP TABLE " + hive_db + "." + hive_table)
    hiveContext.sql("CREATE TABLE " + hive_db + "." + hive_table + " AS SELECT * FROM myproducts")


if __name__ == "__main__":
    conf = SparkConf().setAppName("Spark Products")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    hiveContext = HiveContext(sc)

    read_customers(hdfs_customers_path)
    read_orders(hdfs_orders_path)
    read_orders_details(hdfs_orders_details_path)
    write_products()
# author [email protected]

import os
import sys

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SQLContext, SparkConf, HiveContext
from pyspark import SparkContext

from ml import diff_feature_reg, diff_train


def run(sc, sql_context, is_hive):
    diff_feature_reg.main(sc, sql_context, is_hive=True)
    diff_train.main(sc, sql_context, is_hive=True)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "32g")
    sc = SparkContext(appName="bintrade_candidate", master="yarn-client", conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "32")
    sqlContext.sql("use fex")
    run(sc, sqlContext, is_hive=True)
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_O_CEN_CBOD_CRCUPDTL').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
    else:
        sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# Dates needed for processing
etl_date = sys.argv[1]  # ETL date
V_DT = etl_date
# Previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) + timedelta(-1)).strftime("%Y%m%d")
# First day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# Last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) + timedelta(-1)).strftime("%Y%m%d")