def get_spark_test():
    conf = SparkConf()
    sc = SparkContext("local[4]", appName="youzan-algrithm", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex_test """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")
    return sc, sql_context
def get_spark(num=4, cores=4, mem="32g"):
    conf = SparkConf()
    conf.set("spark.executor.instances", "%d" % num)
    conf.set("spark.executor.cores", "%d" % cores)
    conf.set("spark.executor.memory", "%s" % mem)
    sc = SparkContext(appName="youzan-algrithm", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex """)
    sql_context.setConf("spark.sql.shuffle.partitions", "16")
    return sc, sql_context
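# A hypothetical caller for the two factories above (sketch only; assumes the
# surrounding module imports SparkConf/SparkContext from pyspark and
# HiveContext from pyspark.sql, as the rest of this code does):
#
#   sc, sql_context = get_spark_test()                        # local[4], database fex_test
#   # sc, sql_context = get_spark(num=8, cores=4, mem="16g")  # cluster run, database fex
#   sql_context.sql("show tables").show()
#   sc.stop()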
def main():
    if len(sys.argv) > 2:
        scriptPath = sys.argv[1]
        resultPath = sys.argv[2]
    else:
        print "Missing Arguments"
        sys.exit(1)  # without the two paths there is nothing to run
    sc = SparkContext("local", "Test sql queries from pyspark")
    try:
        hsc = HiveContext(sc)
        scriptRaw = str(sc.textFile(scriptPath, use_unicode=False).cache().collect())
        print scriptRaw
        result = open(resultPath, 'w')

        def printSeparator(cols):
            print 'inside print' + str(cols)
            for j in range(0, cols):
                print j
                result.write("+----")
            result.write("+--+")

        for i in scriptRaw.split(';'):
            # strip the list markers left over from str(collect())
            i = i.replace('[\'', '')
            i = i.replace('\']', '')
            print i
            if not i == "":
                df = hsc.sql(i.strip())
                df.show()
                printHeader = True
                printFooter = False
                cols = df.columns
                print cols
                for row in df.collect():
                    print str(row)
                    if printHeader:
                        print str(len(cols))
                        printSeparator(len(cols))
                        for col in cols:
                            result.write("| " + col)
                        result.write("|")
                        printSeparator(len(cols))
                        printHeader = False
                        printFooter = True
                    for v in row:
                        print str(v)
                        result.write("|" + valueToString(v))  # valueToString defined elsewhere in the original script
                    result.write("|")
                if printFooter:
                    printSeparator(len(cols))
        result.close()
    finally:
        sc.stop()
ACRM_F_AG_AGREEMENT = sqlContext.read.parquet(hdfs + '/ACRM_F_AG_AGREEMENT/*')
ACRM_F_AG_AGREEMENT.registerTempTable("ACRM_F_AG_AGREEMENT")

# Task [21] 001-01::
V_STEP = V_STEP + 1

sql = """
SELECT CUST_ID          as CUST_ID,
       ''               as CUST_NAME,
       CUST_TYP         as CUST_TYP,
       FR_ID            as FR_ID,
       MIN(START_DATE)  as START_DATE_MIN,
       MAX(END_DATE)    as END_DATE,
       MAX(START_DATE)  as START_DATE_MAX,
       int(COUNT(1))    as COUNT,    --INTEGER
       V_DT             as ETL_DATE
  FROM ACRM_F_AG_AGREEMENT
 WHERE END_DATE >= V_DT
 GROUP BY CUST_ID, CUST_TYP, FR_ID
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
ACRM_A_AGREEMENT_TARGET = sqlContext.sql(sql)
dfn = "ACRM_A_AGREEMENT_TARGET/" + V_DT + ".parquet"
ACRM_A_AGREEMENT_TARGET.write.save(path=hdfs + '/' + dfn, mode='overwrite')
ret = os.system("hdfs dfs -rm -r /" + dbname + "/ACRM_A_AGREEMENT_TARGET/" + V_DT_LD + ".parquet")
ACRM_F_AG_AGREEMENT.unpersist()
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds)
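# The re.sub idiom above (used by every step in these jobs) splices the batch
# date into the SQL as a quoted literal before submission; the \b anchors keep
# placeholders like V_DT10 and V_DT_LD from being clobbered. A standalone
# sketch with demo names and an illustrative date (not part of the job):
_demo_sql = "SELECT CUST_ID FROM ACRM_F_AG_AGREEMENT WHERE END_DATE >= V_DT"
_demo_dt = "2016-01-01"
print re.sub(r"\bV_DT\b", "'" + _demo_dt + "'", _demo_sql)
# -> SELECT CUST_ID FROM ACRM_F_AG_AGREEMENT WHERE END_DATE >= '2016-01-01'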
              END
           )) END as DECIMAL(18,2)) AS CUR_QUARTER_AVG  --current-year quarterly average
       ,CUST_TYP
  FROM ACRM_F_RE_LENDSUMAVGINFO A
 WHERE YEAR = V_YEAR
   AND a.FR_ID is not null
   AND A.PRDT_CODE IS NOT NULL
 GROUP BY a.FR_ID, CUST_ID, PRDT_CODE, CUST_TYP
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
sql = re.sub(r"\bV_MONTH\b", V_MONTH, sql)
sql = re.sub(r"\bV_QUARTER\b", V_QUARTER, sql)
sql = re.sub(r"\bV_YEAR\b", "'" + V_YEAR + "'", sql)
print(sql)
TMP_ACRM_F_CI_GK_LOAN_01 = sqlContext.sql(sql)
TMP_ACRM_F_CI_GK_LOAN_01.registerTempTable("TMP_ACRM_F_CI_GK_LOAN_01")
dfn = "TMP_ACRM_F_CI_GK_LOAN_01/" + V_DT + ".parquet"
TMP_ACRM_F_CI_GK_LOAN_01.cache()
nrows = TMP_ACRM_F_CI_GK_LOAN_01.count()
TMP_ACRM_F_CI_GK_LOAN_01.write.save(path=hdfs + '/' + dfn, mode='overwrite')
TMP_ACRM_F_CI_GK_LOAN_01.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/TMP_ACRM_F_CI_GK_LOAN_01/" + V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert TMP_ACRM_F_CI_GK_LOAN_01 lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)

# Task [21] 001-02::
# Task [11] 001-01::
V_STEP = V_STEP + 1

sql = """
SELECT monotonically_increasing_id() AS ID
       ,REPORTNO     AS REPORTNO
       ,OBJECTTYPE   AS OBJECTTYPE
       ,OBJECTNO     AS OBJECTNO
       ,MODELNO      AS MODELNO
       ,REPORTNAME   AS REPORTNAME
       ,INPUTTIME    AS INPUTTIME
       ,ORGID        AS ORGID
       ,USERID       AS USERID
       ,UPDATETIME   AS UPDATETIME
       ,VALIDFLAG    AS VALIDFLAG
       ,ODS_ST_DATE  AS ODS_ST_DATE
  FROM F_CI_XDXT_IND_REPORT_RECORD A  --personal financial statement record table
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
OCRM_F_IND_REPORT_RECORD = sqlContext.sql(sql)
OCRM_F_IND_REPORT_RECORD.registerTempTable("OCRM_F_IND_REPORT_RECORD")
dfn = "OCRM_F_IND_REPORT_RECORD/" + V_DT + ".parquet"
OCRM_F_IND_REPORT_RECORD.cache()
nrows = OCRM_F_IND_REPORT_RECORD.count()
OCRM_F_IND_REPORT_RECORD.write.save(path=hdfs + '/' + dfn, mode='append')
OCRM_F_IND_REPORT_RECORD.unpersist()
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert OCRM_F_IND_REPORT_RECORD lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)
       ,CI_CRT_SCT_N   AS CI_CRT_SCT_N
       ,CI_CRT_OPR     AS CI_CRT_OPR
       ,CI_UPD_SYS     AS CI_UPD_SYS
       ,CI_UPD_OPR     AS CI_UPD_OPR
       ,CI_CRT_ORG     AS CI_CRT_ORG
       ,CI_UPD_ORG     AS CI_UPD_ORG
       ,CI_DB_PART_ID  AS CI_DB_PART_ID
       ,CI_INSTN_COD   AS CI_INSTN_COD
       ,FR_ID          AS FR_ID
       ,V_DT           AS ODS_ST_DATE
       ,'CEN'          AS ODS_SYS_ID
  FROM O_CI_CBOD_CICIFADR A  --personal customer address information file
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
F_CI_CBOD_CICIFADR_INNTMP1 = sqlContext.sql(sql)
F_CI_CBOD_CICIFADR_INNTMP1.registerTempTable("F_CI_CBOD_CICIFADR_INNTMP1")

F_CI_CBOD_CICIFADR = sqlContext.read.parquet(hdfs + '/F_CI_CBOD_CICIFADR/*')
F_CI_CBOD_CICIFADR.registerTempTable("F_CI_CBOD_CICIFADR")
sql = """
SELECT DST.ETLDT             --platform date: src.ETLDT
       ,DST.FK_CICIF_KEY     --FK_CICIF_KEY: src.FK_CICIF_KEY
       ,DST.CI_ADDR_COD      --address code: src.CI_ADDR_COD
       ,DST.CI_ADDR          --address: src.CI_ADDR
       ,DST.CI_POSTCOD       --postal code: src.CI_POSTCOD
       ,DST.CI_TEL_NO        --telephone number: src.CI_TEL_NO
       ,DST.CI_CNTY_COD      --country code: src.CI_CNTY_COD
       ,DST.CI_AREA_COD      --area code: src.CI_AREA_COD
       ,DST.CI_SUB_TEL       --extension: src.CI_SUB_TEL
       ,DST.CI_MOBILE_PHONE  --mobile phone: src.CI_MOBILE_PHONE
       ,''                 AS UPDATEDATE
       ,A.REMARK           AS REMARK
       ,A.REPORTLOCKED     AS REPORTLOCKED
       ,A.MODELCLASS       AS MODELCLASS
       ,''                 AS FARMILYID
       ,A.FR_ID            AS CORPORATEORGID
       ,A.ODS_ST_DATE      AS ODS_ST_DATE
       ,monotonically_increasing_id() AS ID
  FROM F_CI_XDXT_CUSTOMER_FSRECORD A   --customer financial statements (CF)
  LEFT JOIN OCRM_F_CI_SYS_RESOURCE B   --system source intermediate table
    ON A.CUSTOMERID = B.SOURCE_CUST_ID
   AND A.FR_ID = B.FR_ID
   AND B.ODS_SYS_ID = 'LNA'
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
OCRM_F_CI_CUSTOMER_FSRECORD = sqlContext.sql(sql)
OCRM_F_CI_CUSTOMER_FSRECORD.registerTempTable("OCRM_F_CI_CUSTOMER_FSRECORD")
dfn = "OCRM_F_CI_CUSTOMER_FSRECORD/" + V_DT + ".parquet"
OCRM_F_CI_CUSTOMER_FSRECORD.cache()
nrows = OCRM_F_CI_CUSTOMER_FSRECORD.count()
OCRM_F_CI_CUSTOMER_FSRECORD.write.save(path=hdfs + '/' + dfn, mode='overwrite')
OCRM_F_CI_CUSTOMER_FSRECORD.unpersist()
F_CI_XDXT_CUSTOMER_FSRECORD.unpersist()
OCRM_F_CI_SYS_RESOURCE.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/OCRM_F_CI_CUSTOMER_FSRECORD/" + V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert OCRM_F_CI_CUSTOMER_FSRECORD lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)
       ,A.MODULEID      AS MODULEID
       ,A.CHANNELID     AS CHANNELID
       ,A.LOGINTYPE     AS LOGINTYPE
       ,A.USERSEQ       AS USERSEQ
       ,A.DEPTSEQ       AS DEPTSEQ
       ,A.UNIQUEUSERID  AS UNIQUEUSERID
       ,A.USERAGENT     AS USERAGENT
       ,A.CHANNELJNLNO  AS CHANNELJNLNO
       ,A.FR_ID         AS FR_ID
       ,'IBK'           AS ODS_SYS_ID
       ,V_DT            AS ODS_ST_DATE
  FROM O_TX_WSYH_ACCESSLOGMCH A  --access log table
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
F_TX_WSYH_ACCESSLOGMCH_INNTMP1 = sqlContext.sql(sql)
F_TX_WSYH_ACCESSLOGMCH_INNTMP1.registerTempTable("F_TX_WSYH_ACCESSLOGMCH_INNTMP1")

#F_TX_WSYH_ACCESSLOGMCH = sqlContext.read.parquet(hdfs+'/F_TX_WSYH_ACCESSLOGMCH/*')
#F_TX_WSYH_ACCESSLOGMCH.registerTempTable("F_TX_WSYH_ACCESSLOGMCH")
sql = """
SELECT DST.ACCESSJNLNO   --ACCESSJNLNO: src.ACCESSJNLNO
       ,DST.ACCESSSTATE  --ACCESSSTATE: src.ACCESSSTATE
       ,DST.ACCESSDATE   --ACCESSDATE: src.ACCESSDATE
       ,DST.MACHINEID    --MACHINEID: src.MACHINEID
       ,DST.ACCESSIP     --ACCESSIP: src.ACCESSIP
       ,DST.MODULEID     --module code: src.MODULEID
       ,DST.CHANNELID    --channel ID: src.CHANNELID
       ,DST.LOGINTYPE    --login type: src.LOGINTYPE
       ,DST.USERSEQ      --user sequence number: src.USERSEQ
args = sys.argv[1:]  # accepts two arguments: run_date and the database name db_name

# if no arguments are passed, fall back to the defaults (ctprod)
if len(args) == 0:
    run_date = compute_date(time.strftime("%Y%m%d"), -1)  # default run_date
    db_name = 'ctprod'  # default database name
elif len(args) == 1:
    run_date = args[0]
    db_name = 'ctprod'  # default database name
else:
    run_date = args[0]
    db_name = args[1]
today = run_date

### partitioned-table check
cust_diagnosis_cols = hqlContext.sql(
    'desc tag_stat.cust_diagnosis').toPandas()
if 'part_date' not in cust_diagnosis_cols.col_name.values:
    raise ValueError(u'Error! The cust_diagnosis table is not a partitioned table!')

# hqlContext.sql('''
# create external table if not exists tag_stat.cust_diagnosis
# (
#     customer_no string,
#     customer_diagnosis string,
#     calendar_year string
# )
# partitioned by (part_date varchar(8))
# stored as parquet
# ''')

### generate the day's customer diagnosis and insert it into the partition
       ,A.BONUS_INTG_SUM    AS INDEX_VALUE
       ,SUBSTR(V_DT, 1, 7)  AS YEAR_MONTH
       ,V_DT                AS ETL_DATE
       ,A.CUST_TYPE         AS CUST_TYPE
       ,A.FR_ID             AS FR_ID
  FROM OCRM_F_CI_CUSTLNAINFO A  --customer credit information
 WHERE EXISTS (SELECT 1
                 FROM OCRM_F_CI_CUSTLNAINFO B
                WHERE A.FR_ID = B.FR_ID
                  AND A.CUST_ID = B.CUST_ID
                  AND A.EXP_DATE = B.EXP_DATE
                  AND B.EXP_DATE > V_DT)
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
ACRM_A_TARGET_D002018 = sqlContext.sql(sql)
ACRM_A_TARGET_D002018.registerTempTable("ACRM_A_TARGET_D002018")
dfn = "ACRM_A_TARGET_D002018/" + V_DT + ".parquet"
ACRM_A_TARGET_D002018.cache()
nrows = ACRM_A_TARGET_D002018.count()
ACRM_A_TARGET_D002018.write.save(path=hdfs + '/' + dfn, mode='overwrite')
ACRM_A_TARGET_D002018.unpersist()
OCRM_F_CI_CUSTLNAINFO.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/ACRM_A_TARGET_D002018/" + V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert ACRM_A_TARGET_D002018 lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)
def load_data():
    # load data from files and return query results / aggregates
    hiveContext = HiveContext(sc)

    # 1027
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx/'
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx_fraud/train/'
    # AMAZON AWS EMR
    path = 'hdfs:///tmp/files/'  # HDFS

    # new segment files
    tx_files = [path + 'l_adults_2550_female_rural.csv', path + 'l_adults_2550_female_urban.csv',
                path + 'l_adults_2550_male_rural.csv', path + 'l_adults_2550_male_urban.csv',
                path + 'l_young_adults_female_rural.csv', path + 'l_young_adults_female_urban.csv',
                path + 'l_young_adults_male_rural.csv', path + 'l_young_adults_male_urban.csv',
                path + 'l_adults_50up_female_rural.csv', path + 'l_adults_50up_female_urban.csv',
                path + 'l_adults_50up_male_rural.csv', path + 'l_adults_50up_male_urban.csv']

    # small file for debugging
    # 1027
    # tx_files = [path + 's_l_male_30_40_smaller_cities.csv']
    # tx_files = [path + 'sorted_fraud_male_30_40_smaller_cities.csv']
    # tx_files = [path+'40_60_bigger_cities.csv', path+'40_60_smaller_cities.csv', path+'all_60_up.csv',
    #             path+'female_30_40_bigger_cities.csv', path+'female_30_40_smaller_cities.csv',
    #             path+'male_30_40_bigger_cities.csv', path+'male_30_40_smaller_cities.csv',
    #             path+'millenials.csv', path+'young_adults.csv']
    # 1027
    # tx_files = [path+'l_40_60_bigger_cities.csv', path+'l_40_60_smaller_cities.csv', path+'l_all_60_up.csv',
    #             path+'l_female_30_40_bigger_cities.csv', path+'l_female_30_40_smaller_cities.csv',
    #             path+'l_male_30_40_bigger_cities.csv', path+'l_male_30_40_smaller_cities.csv',
    #             path+'l_millenials.csv', path+'l_young_adults.csv']

    all_tx = sc.textFile(','.join(tx_files), 600)

    # 1027
    # txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|merchant|merch_lat|merch_long'
    txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|is_fraud|merchant|merch_lat|merch_long'
    txFields = [StructField(field_name, StringType(), True) for field_name in txSchemaString.split('|')]
    txFields[17] = StructField('trans_date', DateType(), True)
    txSchema = StructType(txFields)

    # strip the pipe-delimited header rows before parsing
    txHeader = all_tx.filter(lambda l: "ssn|" in l)
    txNoHeader = all_tx.subtract(txHeader)
    temp_tx = txNoHeader.map(lambda k: k.split("|")).map(lambda p: (
        p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12],
        p[13], p[14], p[15], p[16],
        datetime.datetime.strptime(p[17], '%Y-%m-%d').date(),
        p[18], p[19], p[20], p[21], p[22], p[23], p[24], p[25]))

    h_tx_df = hiveContext.createDataFrame(temp_tx, txSchema)
    h_tx_df.registerTempTable("htx")
    hiveContext.cacheTable("htx")

    # HBASE CODE HERE
    # create dataframe with all records, map using hbase_process to extract each
    # record into individual components and create a dictionary to store in hbase
    # h_data = hiveContext.sql("SELECT * FROM htx")
    # h_data.map(hbase_process).foreachPartition(store_full_data)

    # get cust mean time between transactions
    time_lag_eval = hiveContext.sql(
        "SELECT cc_num, unix_time, "
        "LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY htx.unix_time) as lag_time "
        "from htx order by cc_num, unix_time asc")
    time_lag_eval.registerTempTable("ts_lag")

    user_avg_time = hiveContext.sql(
        "SELECT cc_num, AVG(unix_time - lag_time) as time_diff, "
        "percentile_approx((unix_time - lag_time),0.1) as low_bound, "
        "percentile_approx((unix_time - lag_time),0.90) as high_bound "
        "from ts_lag where lag_time is not null group by cc_num")
    user_avg_time.registerTempTable("avg_time")

    # get cust mean per category
    mean_per_cat = hiveContext.sql(
        "SELECT cc_num, category, avg(amt) as mean_exp, "
        "(avg(amt)-2*(stddev_pop(amt))) as low_bound, "
        "(avg(amt)+2*(stddev_pop(amt))) as high_bound "
        "from htx group by cc_num, category")
    mean_per_cat.registerTempTable("mean_per_cat")

    # evaluate amount for HML and time of purchase for normal/abnormal
    # earlier variant kept for reference:
    # "SELECT htx.cc_num, profile, htx.category, htx.trans_date,htx.trans_time, htx.unix_time,
    #  IF(htx.amt>(2 * m.mean_exp),'H',(IF(htx.amt<(0.5 * m.mean_exp),'L','N'))) as EXP,
    #  IF(htx.category like '%_net%','N','P') as CNP, htx.amt,
    #  LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY htx.unix_time) as lag_time
    #  from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category"
    test = hiveContext.sql(
        "SELECT htx.cc_num, profile, htx.category, htx.trans_date, htx.trans_time, htx.unix_time, "
        "IF(htx.amt>m.high_bound,'H',(IF(htx.amt < m.low_bound,'L','N'))) as EXP, "
        "IF(cast(SUBSTR(htx.trans_time,0,2) as int)<05,'A',"
        "IF(cast(SUBSTR(htx.trans_time,0,2) as int)>21,'A','N')) as NAT, "
        "htx.amt, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY htx.unix_time) as lag_time "
        "from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category")
    test.registerTempTable("full_table")

    # evaluate for transaction time (HML)
    full_data = hiveContext.sql(
        "SELECT full_table.cc_num, profile, category, trans_date, trans_time, unix_time, lag_time, "
        "IF(lag_time is null,100000,unix_time-lag_time) as time_since, amt, EXP, NAT, "
        "IF((unix_time-lag_time)<avg_time.low_bound,'H',"
        "IF((unix_time-lag_time)>avg_time.high_bound,'L','N')) as VEL "
        "from full_table left join avg_time on avg_time.cc_num = full_table.cc_num")
    full_data.registerTempTable("full_data")

    # return full tx data for user with reduced HML/AN/HML variables
    per_cust_transactions = hiveContext.sql(
        "SELECT cc_num as cust_id, concat(EXP,NAT,VEL) as trans_list "
        "from full_data order by cc_num, unix_time asc")

    # return full tx data for profile with reduced HML/NP/HML variables in sorted order
    #pre_sort_
    per_profile_transactions = hiveContext.sql(
        "SELECT profile as cust_id, concat(EXP,NAT,VEL) as trans_list "
        "from full_data order by profile, unix_time asc")
    #pre_sort_per_profile_transactions.registerTempTable("pre_sort")
    # we only need cust_id (really profile name here) and trans_list,
    # but we had to include cc_num above in our sort
    #per_profile_transactions = hiveContext.sql("SELECT cust_id,trans_list from pre_sort")

    # gets pre-computed reference values for each customer and stores in redis:
    # avg spent per category, n transactions, last unix time stamp
    agg_info = hiveContext.sql(
        "SELECT CONCAT(category, '_', cc_num) as cust_id, category, "
        "concat(low_bound,',',high_bound) as low_high from mean_per_cat")
    avg_cat_data = agg_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])

    agg_n_tx = hiveContext.sql(
        "SELECT CONCAT('count_', cc_num) as cust_id, count(cc_num) as tx_count "
        "from full_data group by cc_num")
    n_tx = agg_n_tx.rdd.map(lambda x: [str(x.cust_id), str(x.tx_count)])

    agg_unix_ts = hiveContext.sql(
        "SELECT CONCAT('timestamp_', cc_num) as cust_id, max(unix_time) as last_unix_time "
        "from full_data group by cc_num")
    n_ts = agg_unix_ts.rdd.map(lambda x: [str(x.cust_id), str(x.last_unix_time)])

    agg_vel_info = hiveContext.sql(
        "SELECT CONCAT('velocity_', cc_num) as cust_id, "
        "concat(low_bound,',',high_bound) as low_high from avg_time")
    avg_vel_data = agg_vel_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])

    # compile our final string per customer for all tx's
    per_cust_transactions_r = per_cust_transactions.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])

    # compile our final string per profile for all tx's
    per_profile_transactions_r = per_profile_transactions.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])

    # return tx data and aggregates
    return_dict = {}
    return_dict['profile'] = per_profile_transactions_r
    return_dict['customer'] = per_cust_transactions_r
    return avg_cat_data, n_tx, n_ts, return_dict, avg_vel_data
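# A hypothetical driver for load_data() (sketch only, not from the source):
# unpack the aggregate RDDs and peek at a few of the reduced per-customer
# transaction strings before they are pushed to the downstream store.
#
#   avg_cat_data, n_tx, n_ts, tx_dict, avg_vel_data = load_data()
#   for rec in tx_dict['customer'].take(3):   # e.g. "cc_num,HNN,LAN,NNN"
#       print rec
#   for rec in avg_cat_data.take(3):          # e.g. ['category_ccnum', 'low,high']
#       print rec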
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext

'''
Problem statement: find the 5 most expensive orders per person per day using SparkSQL.
'''

conf = SparkConf().setAppName("mostExpensiveOrderPerDayPerPersonSQL")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)

# makes sure that the 'retail_db' hive database will be used
hiveContext.sql("use retail_db")

# first_value() must be used because of a bug.
# Without it, columns that are not in the group by or the aggregation part cannot be shown.
sqlString = "SELECT \
                first_value(customers.customer_fname), \
                first_value(customers.customer_lname), \
                orders.order_date, \
                ROUND(SUM(order_items.order_item_subtotal), 2) the_total \
             FROM customers, orders, order_items \
             WHERE orders.order_id = order_items.order_item_order_id \
             AND customers.customer_id = orders.order_customer_id \
             GROUP BY orders.order_date, customers.customer_id \
             ORDER BY the_total DESC"

result = hiveContext.sql(sqlString).rdd  # rdd used because this is certification practice
top_records = result.take(5)
print "*****************\n{0}".format(str(top_records))
sql = """ SELECT CUST_ID as CUST_ID, CUST_TYP as CUST_TYP, FR_ID, CHANNEL_FLAG as CHANNEL_FLAG, CAST(COUNT(1) AS DECIMAL(22,2)) as INDEX_VALUE_C, --DECIMAL(22,2) CAST(COALESCE(SUM(SA_DR_AMT),0) - COALESCE(SUM(SA_CR_AMT),0) AS DECIMAL(22,2)) as SA_AMT, CAST(SUM(SA_TVARCHAR_AMT) AS DECIMAL(22,2)) as SA_TX_AMT, '' as SA_TX_DT, V_DT10 as ETL_DATE FROM ACRM_F_CI_NIN_TRANSLOG A WHERE --A.FR_ID = V_FR_ID A.SA_TVARCHAR_DT = V_DT GROUP BY FR_ID,CUST_ID,CUST_TYP,CHANNEL_FLAG """ sql = re.sub(r"\bV_DT\b", "'"+V_DT+"'", sql) sql = re.sub(r"\bV_DT10\b", "'"+V_DT10+"'", sql) ACRM_A_TRANSLOG_TARGET = sqlContext.sql(sql) ACRM_A_TRANSLOG_TARGET.registerTempTable("ACRM_A_TRANSLOG_TARGET") dfn="ACRM_A_TRANSLOG_TARGET/"+V_DT+".parquet" ACRM_A_TRANSLOG_TARGET.cache() nrows = ACRM_A_TRANSLOG_TARGET.count() ACRM_A_TRANSLOG_TARGET.write.save(path=hdfs + '/' + dfn, mode='overwrite') ACRM_A_TRANSLOG_TARGET.unpersist() #全量表保存后需要删除前一天数据 ret = os.system("hdfs dfs -rm -r /"+dbname+"/ACRM_A_TRANSLOG_TARGET/"+V_DT_LD+".parquet") et = datetime.now() print("Step %d start[%s] end[%s] use %d seconds, insert ACRM_A_TRANSLOG_TARGET lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et-st).seconds, nrows)
    sqlstr = sqlstr[:len(sqlstr) - 2]
    sqlstr += "\n) stored as orc"
    print sqlstr
    sql_context.sql(sqlstr)
    df.insertInto(tableName, overwrite)


if __name__ == '__main__':
    #log.debug("debug")
    #a = eval("(1,[2,3])")
    #print "xxxxxxx", a[1][0]
    #a = {1: 1.0, 3: 5.5}
    #str_a = str(a)
    #a = eval(str_a)
    #print a[1]
    #print json.loads("""{1:1}""")
    sc = SparkContext("local[1]", appName="bintrade.ml.diff_feature")
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex_test """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")
    ldict = [{"symbol": "AAA", "date": "2010-01-01", "close": 1.0},
             {"symbol": "AAA", "date": "2010-01-01", "close": 1.0}]
    df = sql_context.createDataFrame(ldict)
    dfToTableWithPar(sql_context, df, "test_eod_AAA")
def save(lp, path, sc):
    lp.saveAsTextFile(path)


def main(sc, sql_context, is_hive=True):
    df_train = get_train(sc, sql_context, is_hive)
    df_check = get_check(sc, sql_context, is_hive)

    lp_train = cal_feature(df_train, 60, 3)
    lp_check = cal_feature(df_check, 60, 3)

    os.system("""
        source ~/.bashrc;
        hadoop fs -rm -r bintrade.ml.diff.label_point.train.cls;
        hadoop fs -rm -r bintrade.ml.diff.label_point.check.cls
    """)

    save(lp_train, "bintrade.ml.diff.label_point.train.cls", sc)
    save(lp_check, "bintrade.ml.diff.label_point.check.cls", sc)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName="bintrade.ml.diff_feature", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex """)
    main(sc, sql_context, is_hive=True)
    sc.stop()
    CREATE TABLE IF NOT EXISTS %s(
        symbol   string,
        date1    string,
        date2    string,
        date3    string,
        close1   float,
        spxopen1 float,
        close2   float,
        spxopen2 float,
        act_diff float,
        prob     float,
        label    float
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    """ % "check_pred_ext")
    df.insertInto("check_pred_ext", overwrite=True)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "8")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "32g")
    sc = SparkContext(appName=__file__, conf=conf)
    sql_context = HiveContext(sc)
    sql_context.setConf("spark.sql.shuffle.partitions", "32")
    sql_context.sql("use fex")
    main(sc, sql_context)
    sc.stop()
sys.path.append(local_path + "/../") sys.path.append(local_path) from pyspark import SQLContext, SparkConf, HiveContext from pyspark import SparkContext from post import post_run from ta import mat_close from ta import adx from ml import diff_feature_cls as feature from ml import diff_train_cls_pos_ml as train def main(sc, sql_context, is_hive): #post_run.main(sc, sql_context, is_hive = True) #mat_close.main(sc, sql_context, is_hive = True) #adx.main(sc, sql_context, is_hive = True) feature.main(10, 1, sc, sql_context, is_hive = True) train.main("2010-01-01", "2010-04-30", "2010-05-01", "9999-99-99", sc, sql_context, is_hive=True) if __name__ == "__main__": conf = SparkConf() #conf.set("spark.executor.instances", "4") #conf.set("spark.executor.cores", "4") #conf.set("spark.executor.memory", "8g") sc = SparkContext(appName="bintrade_candidate", master="local[2]", conf=conf) sqlContext = HiveContext(sc) sqlContext.setConf("spark.sql.shuffle.partitions", "1") sqlContext.sql("use fex_test") main(sc, sqlContext, is_hive=True)
print "=> filename %s" % filename print "=> saveto %s" % saveto print "=> jobName %s" % jobName print "=> allMVs %s" % allMVs conf = SparkConf().setAppName(jobName) sc = SparkContext(conf=conf) sc.setLogLevel("WARN") sqlc = HiveContext(sc) T_XRD_HDFS = sqlc.read.format("json").load(filename) T_XRD_HDFS.registerTempTable("T_XRD_HDFS") print "=> rowsCounter T_XRD_HDFS: %d" % T_XRD_HDFS.count() # T_XRD_HDFS.write.format("com.databricks.spark.csv").option("header", "false").save(saveto + "/T_XRD_HDFS") T_XRD_RAW_FILE = sqlc.sql("SELECT from_unixtime(end_time, 'yyyy/MM/dd') as TDay, start_time as ots, end_time as cts, file_lfn, client_host, if(server_username = '', 'unknown', server_username) as server_username, (end_time - start_time) as proctime, read_bytes_at_close as readbytes FROM T_XRD_HDFS WHERE (end_time - start_time) > 0 AND read_bytes_at_close > 0 AND `_corrupt_record` IS NULL") T_XRD_RAW_FILE.registerTempTable("T_XRD_RAW_FILE") print "=> rowsCounter T_XRD_RAW_FILE: %d" % T_XRD_RAW_FILE.count() if allMVs == 1: T_XRD_RAW_FILE.write.format("com.databricks.spark.csv").option("header", "false").save(saveto + "/T_XRD_RAW_FILE") T_XRD_LFC = sqlc.read.format("com.databricks.spark.csv").load("/project/awg/cms/phedex/catalog/csv/merged/").toDF("dataset_name", "dataset_id", "dataset_is_open", "dataset_time_create", "block_name", "block_id", "block_time_create", "block_is_open", "file_lfn", "file_id", "filesize", "usernameXX", "checksum", "file_time_create") T_XRD_LFC.registerTempTable("T_XRD_LFC") print "=> rowsCounter T_XRD_LFC: %d" % T_XRD_LFC.count() if allMVs == 1: T_XRD_LFC.write.format("com.databricks.spark.csv").option("header", "false").save(saveto + "/T_XRD_LFC") V_XRD_LFC_aggr1 = sqlc.sql("SELECT distinct block_name, dataset_name FROM T_XRD_LFC") V_XRD_LFC_aggr1.registerTempTable("V_XRD_LFC_aggr1") print "=> rowsCounter V_XRD_LFC_aggr1: %d" % V_XRD_LFC_aggr1.count() if allMVs == 1:
V_STEP = 0

O_CI_WSYH_EGROUPRELATION = sqlContext.read.parquet(hdfs + '/O_CI_WSYH_EGROUPRELATION/*')
O_CI_WSYH_EGROUPRELATION.registerTempTable("O_CI_WSYH_EGROUPRELATION")

# Task [21] 001-01::
V_STEP = V_STEP + 1

sql = """
SELECT A.CIFSEQ       AS CIFSEQ
       ,A.ASSOCIFSEQ  AS ASSOCIFSEQ
       ,A.ASSOTYPE    AS ASSOTYPE
       ,A.ASSOSTATE   AS ASSOSTATE
       ,A.FR_ID       AS FR_ID
       ,V_DT          AS ODS_ST_DATE
       ,'IBK'         AS ODS_SYS_ID
  FROM O_CI_WSYH_EGROUPRELATION A  --group relationship table
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
F_CI_WSYH_EGROUPRELATION = sqlContext.sql(sql)
F_CI_WSYH_EGROUPRELATION.registerTempTable("F_CI_WSYH_EGROUPRELATION")
dfn = "F_CI_WSYH_EGROUPRELATION/" + V_DT + ".parquet"
F_CI_WSYH_EGROUPRELATION.cache()
nrows = F_CI_WSYH_EGROUPRELATION.count()
F_CI_WSYH_EGROUPRELATION.write.save(path=hdfs + '/' + dfn, mode='overwrite')
F_CI_WSYH_EGROUPRELATION.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/F_CI_WSYH_EGROUPRELATION/" + V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert F_CI_WSYH_EGROUPRELATION lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext

'''
Problem statement: find the 5 most expensive orders per person per day using Data Frames
'''

conf = SparkConf().setAppName("mostExpensiveOrderPerDayPerPersonRDD")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)

# makes sure that the 'retail_db' hive database will be used
hiveContext.sql("use retail_db")

# loading the data from hive into dataframes
orders = hiveContext.sql("select order_id, order_date, order_customer_id from orders")
customers = hiveContext.sql("select customer_id, customer_fname, customer_lname from customers")
order_items = hiveContext.sql("select order_item_order_id, order_item_subtotal from order_items")

# joining the customers with orders on customer_id. Orders and customers are the smaller tables,
# so I try to join small tables with other small tables before joining to a big table.
orders_join_customers = orders.join(customers, orders.order_customer_id == customers.customer_id)

# joining on order_id so that I get rows with a customer and their purchases
orders_customers_join_order_items = \
    orders_join_customers.join(order_items,
                               orders_join_customers.order_id == order_items.order_item_order_id)

# aggregating by order_date and customer_id with the sum aggregation.
# This finds how much a person spent on a single day
aggResult = orders_customers_join_order_items \
    .groupBy(['order_date', 'customer_id']) \
    .agg({"order_item_subtotal": "sum"}) \
    .withColumnRenamed("sum(order_item_subtotal)", "subtotal_sum")

# because in the aggregation the order_date, customer, and sum were generated in a data frame,
# I must unfortunately rejoin to the customers table to display the purchases with names
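# A hedged completion of that remaining step (not in the original snippet):
# rejoin the aggregate to customers for the names, order by the day's spend,
# and take the top 5, mirroring the SparkSQL variant of this exercise.
final = aggResult.join(customers, aggResult.customer_id == customers.customer_id) \
    .select('order_date', 'customer_fname', 'customer_lname', 'subtotal_sum') \
    .orderBy(aggResult.subtotal_sum.desc())
print "*****************\n{0}".format(str(final.take(5)))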
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

conf = SparkConf().setAppName("Revenue per category")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)
hiveContext.sql("use retail_db")

sqlString = "SELECT first_value(cat.category_name), \
                round(sum(oi.order_item_subtotal), 2) category_revenue \
             FROM categories cat, products prod, order_items oi \
             WHERE cat.category_id = prod.product_category_id \
             AND prod.product_id = oi.order_item_product_id \
             GROUP BY cat.category_id \
             ORDER BY category_revenue DESC"

result = hiveContext.sql(sqlString)
collected = result.collect()
print "*****************\n{0}".format("\n".join([str(x) for x in collected]))
df_hive = hivecontext.read.load(examplefiles_path + "users.parquet")
df_sql.show()
df_hive.show()
df_hive.printSchema()
df_hive.filter(df_hive['favorite_color'] == 'red').show()

# Test 3: Write selected columns from dataframe into a parquet file
if not os.path.exists(examplefiles_path + "nameAndFavColors.parquet"):
    df_hive.select("name", "favorite_color").write.save(examplefiles_path + "nameAndFavColors.parquet")

# Test 4: Save dataframe as persistent hive table using hivecontext
hivecontext.sql("DROP TABLE IF EXISTS users")
df_hive.write.saveAsTable("users")

# Test 5: Read from the hive table into a parquet file
colorRed = hivecontext.sql("SELECT * FROM users WHERE favorite_color=='red' ")
colorRed.show()
if not os.path.exists(examplefiles_path + "red.parquet"):
    colorRed.write.parquet(examplefiles_path + "red.parquet")

# Test 6: Create Parquet hive table. Read data from a parquet file and store it in the table
filepath = examplefiles_path + "users.parquet"
load_query = "LOAD DATA LOCAL INPATH '" + filepath + "' OVERWRITE INTO TABLE parquetTests"
hivecontext.sql("CREATE TABLE IF NOT EXISTS parquetTests(name STRING, favorite_color STRING, favorite_numbers ARRAY<INT>) STORED AS PARQUET")
hivecontext.sql(load_query)
results = hivecontext.sql("SELECT * FROM parquetTests")
       ,PD_DEP_TYP_DESC     AS PD_DEP_TYP_DESC
       ,PD_ODRW_CASH_AMT    AS PD_ODRW_CASH_AMT
       ,PD_TDRW_CASH_AMT    AS PD_TDRW_CASH_AMT
       ,PD_IDEP_CASH_AMT    AS PD_IDEP_CASH_AMT
       ,PD_IDEP_TRAN_AMT    AS PD_IDEP_TRAN_AMT
       ,PD_IDRW_CASH_AMT    AS PD_IDRW_CASH_AMT
       ,PD_IDRW_TRAN_AMT    AS PD_IDRW_TRAN_AMT
       ,PD_TRAN_FLG         AS PD_TRAN_FLG
       ,PD_INFDRW_SAME_FLG  AS PD_INFDRW_SAME_FLG
       ,FR_ID               AS FR_ID
       ,V_DT                AS ODS_ST_DATE
       ,'CEN'               AS ODS_SYS_ID
  FROM O_CM_CBOD_PDPRTTDP A  --time-deposit interest component
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
F_CM_CBOD_PDPRTTDP = sqlContext.sql(sql)
F_CM_CBOD_PDPRTTDP.registerTempTable("F_CM_CBOD_PDPRTTDP")
dfn = "F_CM_CBOD_PDPRTTDP/" + V_DT + ".parquet"
F_CM_CBOD_PDPRTTDP.cache()
nrows = F_CM_CBOD_PDPRTTDP.count()
F_CM_CBOD_PDPRTTDP.write.save(path=hdfs + '/' + dfn, mode='overwrite')
F_CM_CBOD_PDPRTTDP.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/F_CM_CBOD_PDPRTTDP/" + V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert F_CM_CBOD_PDPRTTDP lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)
from textblob import TextBlob
from unidecode import unidecode
import csv
import json

# Note - SparkContext available as sc, HiveContext available as sqlCtx.
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="AnaliseSentimentosTwitter")
sqlCtx = HiveContext(sc)

tweets = sqlCtx.sql("select text from trabalho_translated_tweets")

# counters
numPos = 0
numNeg = 0
numNul = 0
total = 0

tweetTexts = tweets.map(lambda p: "text: " + p.text)

# should be replaced by the number of tweetTexts; set this way only for testing
qtd = len(tweetTexts.collect())

for text in tweetTexts.collect():
    # analyzing the sentiment
    sentiment = TextBlob(text)
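    # A hedged completion of the loop body (not in the original; the polarity
    # thresholds are an assumption): classify each tweet by TextBlob polarity.
    polarity = sentiment.sentiment.polarity
    if polarity > 0:
        numPos += 1
    elif polarity < 0:
        numNeg += 1
    else:
        numNul += 1
    total += 1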
    ON A.ID = B.CUST_BASE_ID
   AND B.FR_ID = A.FR_ID
  LEFT JOIN ACRM_F_DP_SAVE_INFO C        --liability agreements
    ON B.CUST_ID = C.CUST_ID
   AND C.ACCT_STATUS = '01'
   AND C.FR_ID = A.FR_ID
  LEFT JOIN ACRM_F_CI_ASSET_BUSI_PROTO D --asset agreements
    ON B.CUST_ID = D.CUST_ID
   AND D.LN_APCL_FLG = 'N'
   AND D.FR_ID = A.FR_ID
 GROUP BY A.FR_ID
       ,A.ID
       ,A.CUST_BASE_CREATE_NAME
       ,A.CUST_BASE_CREATE_ORG
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
ACRM_A_BASE_DEP_CRE = sqlContext.sql(sql)
ACRM_A_BASE_DEP_CRE.registerTempTable("ACRM_A_BASE_DEP_CRE")
dfn = "ACRM_A_BASE_DEP_CRE/" + V_DT + ".parquet"
ACRM_A_BASE_DEP_CRE.cache()
nrows = ACRM_A_BASE_DEP_CRE.count()
ACRM_A_BASE_DEP_CRE.write.save(path=hdfs + '/' + dfn, mode='append')
ACRM_A_BASE_DEP_CRE.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/ACRM_A_BASE_DEP_CRE/" + V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert ACRM_A_BASE_DEP_CRE lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)
from pyspark.sql.functions import lit, row_number, monotonically_increasing_id, isnan, when, count, col, avg, udf
from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType
# assumed imports for the context objects used below
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

conf = SparkConf().setMaster("yarn").set('spark.sql.warehouse.dir', '/user/hive/warehouse')
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

print('1. Cleaning Data')
print('1.1 Importing Raw Data')
In_Df = sqlContext.read.csv('/Input/loan.csv', header=True, inferSchema=True)

# Create the database for working with
sqlContext.sql("""create database work_db""")

# Add an ID variable
print('1.1 Creating ID variable')
In_Df = In_Df.withColumn('id', monotonically_increasing_id())
In_Df = In_Df.drop('member_id')  # Drop Member ID because it is blank

# Drop columns we don't need
print('1.2 Dropping unnecessary columns')
In_Df = In_Df.drop('url')       # A cryptic column with only one value
In_Df = In_Df.drop('desc')      # A text column, not much can be extracted from this
In_Df = In_Df.drop('zip_code')  # Unrelated to default rates

# Create binary default variable
print('1.3 Creating binary Default variable')
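# A hedged sketch of what step 1.3 might look like (not from the original file;
# assumes the Lending Club 'loan_status' column and its usual labels):
In_Df = In_Df.withColumn(
    'Default',
    when(col('loan_status').isin('Charged Off', 'Default'), 1).otherwise(0))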
.toDF(["sku","query"]).registerTempTable("tmp_table") insert_sql = """ insert overwrite table {table_name} partition(dt='{dt}') select * from tmp_table """.format(table_name=table_name, dt=dt_str) print("insert_sql:\n" + insert_sql) hiveCtx.sql(insert_sql) if __name__ == "__main__": conf = SparkConf() sc = SparkContext(conf=conf, appName="sp_ind") sc.setLogLevel("WARN") hiveCtx = HiveContext(sc) hiveCtx.setConf('spark.shuffle.consolidateFiles', 'true') hiveCtx.setConf('spark.shuffle.memoryFraction', '0.4') hiveCtx.setConf('spark.sql.shuffle.partitions', '1000') if len(sys.argv) == 1: dt = datetime.datetime.now() + datetime.timedelta(-1) else: dt = datetime.datetime.strptime(sys.argv[1], "%Y%m%d").date() dt_str = dt.strftime("%Y-%m-%d") yest_dt = dt + datetime.timedelta(-30) yest_str = yest_dt.strftime("%Y-%m-%d") hiveCtx.sql("use app") create_table(hiveCtx) getQuery(hiveCtx)
       ,V_DT          AS ODS_ST_DATE
       ,A.ALL_COUNT   AS ALL_COUNT
       ,A.ALL_MONEY   AS OLD_YEAR_BAL_SUM
       ,CAST(A.ALL_COUNT AS DECIMAL(24,6)) AS OLD_YEAR_BAL
       ,A.AMOUNT      AS AMOUNT
       ,A.COUNT       AS COUNT
       ,A.LAST_COUNT  AS LAST_COUNT
       ,B.FR_ID       AS FR_ID
  FROM ACRM_F_RE_INSUSUMINFO A  --insurance account accumulation table
  LEFT JOIN ADMIN_AUTH_ORG B    --organization table
    ON A.ORG_NO = B.ORG_ID
 WHERE A.YEAR = SUBSTR(V_DT,1,4) - 1
   AND V_DT = CONCAT(SUBSTR(V_DT,1,4),'-01-01')
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
ACRM_F_RE_INSUSUMINFO = sqlContext.sql(sql)
ACRM_F_RE_INSUSUMINFO.registerTempTable("ACRM_F_RE_INSUSUMINFO")
dfn = "ACRM_F_RE_INSUSUMINFO/" + V_DT + ".parquet"
ACRM_F_RE_INSUSUMINFO.cache()
nrows = ACRM_F_RE_INSUSUMINFO.count()
ACRM_F_RE_INSUSUMINFO.write.save(path=hdfs + '/' + dfn, mode='append')
ACRM_F_RE_INSUSUMINFO.unpersist()
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert ACRM_F_RE_INSUSUMINFO lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)

# Task [12] 001-02::
V_STEP = V_STEP + 1
sql = """ SELECT BRNO AS BRNO ,BANKNOTYPE AS BANKNOTYPE ,STATUS AS STATUS ,BANKNO AS BANKNO ,OBANKNO AS OBANKNO ,EFCTDAT AS EFCTDAT ,NOTE1 AS NOTE1 ,NOTE2 AS NOTE2 ,FR_ID AS FR_ID ,V_DT AS ODS_ST_DATE ,'AFA' AS ODS_SYS_ID FROM O_CM_AFA_BRNOBANKMAP A --机构行号对应表 """ sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql) F_CM_AFA_BRNOBANKMAP = sqlContext.sql(sql) F_CM_AFA_BRNOBANKMAP.registerTempTable("F_CM_AFA_BRNOBANKMAP") dfn = "F_CM_AFA_BRNOBANKMAP/" + V_DT + ".parquet" F_CM_AFA_BRNOBANKMAP.cache() nrows = F_CM_AFA_BRNOBANKMAP.count() F_CM_AFA_BRNOBANKMAP.write.save(path=hdfs + '/' + dfn, mode='overwrite') F_CM_AFA_BRNOBANKMAP.unpersist() ret = os.system("hdfs dfs -rm -r /" + dbname + "/F_CM_AFA_BRNOBANKMAP/" + V_DT_LD + ".parquet") et = datetime.now() print( "Step %d start[%s] end[%s] use %d seconds, insert F_CM_AFA_BRNOBANKMAP lines %d" ) % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)
       ,V_DT     AS ODS_ST_DATE
       ,A.FR_ID  AS FR_ID
  FROM OCRM_F_CI_CUST_DESC A               --unified customer information
  LEFT JOIN ACRM_F_CI_CUST_CONTRIBUTION B  --
    ON A.CUST_ID = B.CUST_ID
   AND A.FR_ID = B.FR_ID
   AND B.ODS_DATE = V_DT
  LEFT JOIN ACRM_F_CI_LOYALTY_INFO C       --
    ON A.CUST_ID = C.CUST_ID
   AND A.FR_ID = C.FR_ID
   AND C.ETL_DATE = V_DT
 GROUP BY A.CUST_ID
       ,A.OBJ_RATING
       ,A.SUB_RATING
       ,A.FR_ID
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
ACRM_A_ANALYSIS_INFO = sqlContext.sql(sql)
ACRM_A_ANALYSIS_INFO.registerTempTable("ACRM_A_ANALYSIS_INFO")
dfn = "ACRM_A_ANALYSIS_INFO/" + V_DT + ".parquet"
ACRM_A_ANALYSIS_INFO.cache()
nrows = ACRM_A_ANALYSIS_INFO.count()
ACRM_A_ANALYSIS_INFO.write.save(path=hdfs + '/' + dfn, mode='overwrite')
ACRM_A_ANALYSIS_INFO.unpersist()
OCRM_F_CI_CUST_DESC.unpersist()
ACRM_F_CI_CUST_CONTRIBUTION.unpersist()
ACRM_F_CI_LOYALTY_INFO.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/ACRM_A_ANALYSIS_INFO/" + V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert ACRM_A_ANALYSIS_INFO lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)
import os
import warnings
warnings.filterwarnings('ignore')

import pyspark.sql.functions as F
import pandas as pd
import tensorflow as tf
# assumed imports for the context objects used below
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

os.environ['SPARK_HOME'] = r"C:\Software\spark\spark-2.4.4-bin-hadoop2.7"
os.environ['PYSPARK_PYTHON'] = r"D:\Python_Tools\Anaconda\python.exe"

sparkConf = SparkConf().setMaster("local[*]").setAppName("ContentRecall").set(
    "spark.executor.memory", "1g")
sc = SparkContext(conf=sparkConf)
sqlContext = HiveContext(sc)

sqlContext.sql("use profile")
user_article_basic = sqlContext.sql(
    "select user_id, article_id, clicked from user_article_basic")
user_profile = sqlContext.sql(
    "select user_id, topic, weights from user_profile_topic")
user_profile = user_profile.groupBy(['user_id']).agg(
    F.collect_list('weights')).withColumnRenamed('collect_list(weights)', 'user_weights')

train_data = user_article_basic.join(user_profile, on=['user_id'], how='left')

sqlContext.sql("use article")
article_vector = sqlContext.sql("select * from article_vector")
train_data = train_data.join(article_vector, on=['article_id'], how='left')
# train_data.show()
       ,A.ISRELATVIE            AS ISRELATVIE
       ,A.CUSTOMERCLASSIFY      AS CUSTOMERCLASSIFY
       ,A.CUSTOMERCLASSIFYDATE  AS CUSTOMERCLASSIFYDATE
       ,A.ENTINDUSTRIALSCALE    AS ENTINDUSTRIALSCALE
       ,A.ENTRETAILSCALE        AS ENTRETAILSCALE
       ,A.ORGCREDITCODE         AS ORGCREDITCODE
       ,A.GROUPTYPE             AS GROUPTYPE
       ,A.CORPIDASDATE          AS CORPIDASDATE
       ,A.FR_ID                 AS FR_ID
       ,V_DT                    AS ODS_ST_DATE
       ,'LNA'                   AS ODS_SYS_ID
  FROM O_CI_XDXT_ENT_INFO A  --enterprise basic information
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
F_CI_XDXT_ENT_INFO_INNTMP1 = sqlContext.sql(sql)
F_CI_XDXT_ENT_INFO_INNTMP1.registerTempTable("F_CI_XDXT_ENT_INFO_INNTMP1")

#F_CI_XDXT_ENT_INFO = sqlContext.read.parquet(hdfs+'/F_CI_XDXT_ENT_INFO/*')
#F_CI_XDXT_ENT_INFO.registerTempTable("F_CI_XDXT_ENT_INFO")
sql = """
SELECT DST.CUSTOMERID         --customer ID: src.CUSTOMERID
       ,DST.CORPID            --legal person or organization code: src.CORPID
       ,DST.ENTERPRISENAME    --enterprise name: src.ENTERPRISENAME
       ,DST.ENGLISHNAME       --customer English name: src.ENGLISHNAME
       ,DST.FICTITIOUSPERSON  --legal representative: src.FICTITIOUSPERSON
       ,DST.ORGNATURE         --organization type: src.ORGNATURE
       ,DST.FINANCETYPE       --financial institution type: src.FINANCETYPE
       ,DST.ENTERPRISEBELONG  --enterprise affiliation: src.ENTERPRISEBELONG
       ,DST.INDUSTRYTYPE      --industry type: src.INDUSTRYTYPE
       ,DST.INDUSTRYTYPE1     --industry type 1: src.INDUSTRYTYPE1
          AND B.INDEX_VALUE < C.INDEX_VALUE THEN 0
          ELSE 1 END        AS INDEX_VALUE
       ,SUBSTR(V_DT, 1, 7)  AS YEAR_MONTH
       ,V_DT                AS ETL_DATE
       ,'1'                 AS CUST_TYPE
       ,A.FR_ID             AS FR_ID
  FROM ACRM_A_TARGET_D004006 A       --products held two months ago
 INNER JOIN ACRM_A_TARGET_D004007 B  --products held one month ago
    ON A.CUST_ID = B.CUST_ID
   AND A.FR_ID = B.FR_ID
 INNER JOIN ACRM_A_TARGET_D004008 C  --products held in the current month
    ON A.CUST_ID = C.CUST_ID
   AND A.FR_ID = C.FR_ID
 WHERE A.CUST_TYPE = '1'
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
ACRM_A_TARGET_D004029 = sqlContext.sql(sql)
ACRM_A_TARGET_D004029.registerTempTable("ACRM_A_TARGET_D004029")
dfn = "ACRM_A_TARGET_D004029/" + V_DT + ".parquet"
ACRM_A_TARGET_D004029.cache()
nrows = ACRM_A_TARGET_D004029.count()
ACRM_A_TARGET_D004029.write.save(path=hdfs + '/' + dfn, mode='overwrite')
ACRM_A_TARGET_D004029.unpersist()
ACRM_A_TARGET_D004006.unpersist()
ACRM_A_TARGET_D004007.unpersist()
ACRM_A_TARGET_D004008.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/ACRM_A_TARGET_D004029/" + V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert ACRM_A_TARGET_D004029 lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)
       ,A.FORESIGHT        AS FORESIGHT
       ,A.STATUS           AS STATUS
       ,A.INDUSTRY         AS INDUSTRY
       ,A.PROSECUTION      AS PROSECUTION
       ,A.FIRSTINVESTSUM   AS FIRSTINVESTSUM
       ,A.FIRSTINVESTDATE  AS FIRSTINVESTDATE
       ,A.LASTINVESTSUM    AS LASTINVESTSUM
       ,A.LASTINVESTDATE   AS LASTINVESTDATE
       ,A.DEADLINE         AS DEADLINE
       ,A.FR_ID            AS FR_ID
       ,V_DT               AS ODS_ST_DATE
       ,'LNA'              AS ODS_SYS_ID
  FROM O_CI_XDXT_CUSTOMER_RELATIVE A  --customer related-party information
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
F_CI_XDXT_CUSTOMER_RELATIVE = sqlContext.sql(sql)
F_CI_XDXT_CUSTOMER_RELATIVE.registerTempTable("F_CI_XDXT_CUSTOMER_RELATIVE")
dfn = "F_CI_XDXT_CUSTOMER_RELATIVE/" + V_DT + ".parquet"
F_CI_XDXT_CUSTOMER_RELATIVE.cache()
nrows = F_CI_XDXT_CUSTOMER_RELATIVE.count()
F_CI_XDXT_CUSTOMER_RELATIVE.write.save(path=hdfs + '/' + dfn, mode='overwrite')
F_CI_XDXT_CUSTOMER_RELATIVE.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/F_CI_XDXT_CUSTOMER_RELATIVE/" + V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert F_CI_XDXT_CUSTOMER_RELATIVE lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)
       ,ORG_ID
       ,CUST_ID
       ,CUST_ZH_NAME
       ,CUS_CONTRIBUTION_RMB
       ,CRE_CONTRIBUTION_RMB
       ,DEP_CONTRIBUTION_RMB
       ,DEP_AMOUNT_RMB
       ,CRE_AMOUNT_RMB
       ,MAIN_TYPE
       ,REPORT_DATE
       ,CUST_TYP
       ,MGR_ID
       ,MGR_NAME
       ,M_MAIN_TYPE
  FROM MCRM_COR_CUS_CON_ORG_REPORT_BE A --
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
MCRM_COR_CUS_CON_ORG_REPORT = sqlContext.sql(sql)
MCRM_COR_CUS_CON_ORG_REPORT.registerTempTable("MCRM_COR_CUS_CON_ORG_REPORT")
dfn = "MCRM_COR_CUS_CON_ORG_REPORT/" + V_DT + ".parquet"
MCRM_COR_CUS_CON_ORG_REPORT.cache()
nrows = MCRM_COR_CUS_CON_ORG_REPORT.count()
MCRM_COR_CUS_CON_ORG_REPORT.write.save(path=hdfs + '/' + dfn, mode='overwrite')
MCRM_COR_CUS_CON_ORG_REPORT.unpersist()
MCRM_COR_CUS_CON_ORG_REPORT_BE.unpersist()
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert MCRM_COR_CUS_CON_ORG_REPORT lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)
#execfile('/data/w205Project/spark/getLinks.py') <-- don't use. use spark-submit instead.

from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext()
sqlContext = HiveContext(sc)

from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window

sqlContext.sql("ADD JAR /data/w205Project/load/hive-serdes-1.0-SNAPSHOT.jar")
# sqlContext.sql("ADD JAR /usr/lib/hadoop/hadoop-aws.jar")
# sqlContext.sql("ADD JAR /usr/lib/hadoop/lib/aws-java-sdk-1.7.14.jar")

###############################################
#    EXTRACT ALL THE LINKS INDISCRIMINATELY   #
###############################################

'''
links = sqlContext.sql("select entities.urls.url[0] as tco, entities.urls.expanded_url[0] as link from tweets where entities.urls.url[0] IS NOT NULL")
uniqueLInks = links.dropDuplicates(['tco', 'link'])
uniqueLInks.repartition(1).save("s3n://w205twitterproject/links5","json")
'''

###############################################
#                   ANALYZE                   #
###############################################
       ,A.ODS_ST_DATE  AS ODS_ST_DATE
       ,A.ODS_SYS_ID   AS ODS_SYS_ID
  FROM F_DP_CBOD_LNLNSLNS A --
 INNER JOIN (SELECT MAX(ETLDT) AS ETLDT
                   ,LN_LN_ACCT_NO
                   ,FR_ID
               FROM F_DP_CBOD_LNLNSLNS
              GROUP BY LN_LN_ACCT_NO, FR_ID) B --
    ON A.LN_LN_ACCT_NO = B.LN_LN_ACCT_NO
   AND A.FR_ID = B.FR_ID
   AND A.ETLDT = B.ETLDT
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
MID_DP_CBOD_LNLNSLNS = sqlContext.sql(sql)
MID_DP_CBOD_LNLNSLNS.registerTempTable("MID_DP_CBOD_LNLNSLNS")
dfn = "MID_DP_CBOD_LNLNSLNS/" + V_DT + ".parquet"
MID_DP_CBOD_LNLNSLNS.cache()
nrows = MID_DP_CBOD_LNLNSLNS.count()
MID_DP_CBOD_LNLNSLNS.write.save(path=hdfs + '/' + dfn, mode='overwrite')
MID_DP_CBOD_LNLNSLNS.unpersist()
# the full table was saved; the previous day's data must be deleted
ret = os.system("hdfs dfs -rm -r /" + dbname + "/MID_DP_CBOD_LNLNSLNS/" + V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert MID_DP_CBOD_LNLNSLNS lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)
# Task [21] 001-01::
V_STEP = V_STEP + 1

sql = """
select A.ODS_CUST_TYPE  as CUST_TYPE,
       A.ODS_CUST_ID    as CUST_ID,
       A.FR_ID          as FR_ID,
       CAST(COUNT(1) AS DECIMAL(22,2)) as INDEX_VALUE_COUNT,
       CAST(YEAR(V_DT) - YEAR(MIN(A.BEGIN_DATE)) AS DECIMAL(22,2)) as INDEX_VALUE_YEAR,
       V_YEAR_MONTH     as YEAR_MONTH,
       V_DT             as ETL_DATE,
       MIN(BEGIN_DATE)  as BEGIN_DATE
  FROM OCRM_F_CI_SYS_RESOURCE A
 WHERE (A.ODS_SYS_ID = 'IBK' OR A.ODS_SYS_ID = 'MBK' OR A.ODS_SYS_ID = 'MSG' OR A.ODS_SYS_ID = 'APY')
   AND COALESCE(A.END_DATE, '2099-12-31') > V_DT
 GROUP BY A.ODS_CUST_ID, A.ODS_CUST_TYPE, A.FR_ID
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
sql = re.sub(r"\bV_YEAR_MONTH\b", "'" + V_YEAR_MONTH + "'", sql)
OCRM_A_RESOURCE_EBANK_TARGET = sqlContext.sql(sql)
dfn = "OCRM_A_RESOURCE_EBANK_TARGET/" + V_DT + ".parquet"
OCRM_A_RESOURCE_EBANK_TARGET.write.save(path=hdfs + '/' + dfn, mode='overwrite')
ret = os.system("hdfs dfs -rm -r /" + dbname + "/OCRM_A_RESOURCE_EBANK_TARGET/" + V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds)
       ,A.RELATIVEGROUPSERIALNO  AS RELATIVEGROUPSERIALNO
       ,A.LIMITATIONTERM         AS LIMITATIONTERM
       ,A.MIGRATEFLAG            AS MIGRATEFLAG
       ,A.PUTOUTORGID            AS PUTOUTORGID
       ,A.ISINSURANCE            AS ISINSURANCE
       ,A.ACTUALBAILSUM          AS ACTUALBAILSUM
       ,A.BAILEXCHANGERATE       AS BAILEXCHANGERATE
       ,A.FR_ID                  AS FR_ID
       ,V_DT                     AS ODS_ST_DATE
       ,'LNA'                    AS ODS_SYS_ID
       ,A.SUBBILLTYPE            AS SUBBILLTYPE
  FROM O_LN_XDXT_BUSINESS_CONTRACT A  --business contract information
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
F_LN_XDXT_BUSINESS_CONTRACT_INNTMP1 = sqlContext.sql(sql)
F_LN_XDXT_BUSINESS_CONTRACT_INNTMP1.registerTempTable("F_LN_XDXT_BUSINESS_CONTRACT_INNTMP1")

#F_LN_XDXT_BUSINESS_CONTRACT = sqlContext.read.parquet(hdfs+'/F_LN_XDXT_BUSINESS_CONTRACT/*')
#F_LN_XDXT_BUSINESS_CONTRACT.registerTempTable("F_LN_XDXT_BUSINESS_CONTRACT")
sql = """
SELECT DST.SERIALNO           --serial number: src.SERIALNO
       ,DST.RELATIVESERIALNO  --related application serial number: src.RELATIVESERIALNO
       ,DST.ARTIFICIALNO      --manual number: src.ARTIFICIALNO
       ,DST.OCCURDATE         --occurrence date: src.OCCURDATE
       ,DST.CUSTOMERID        --customer ID: src.CUSTOMERID
       ,DST.CUSTOMERNAME      --customer name: src.CUSTOMERNAME
       ,DST.BUSINESSTYPE      --business type: src.BUSINESSTYPE
       ,DST.OCCURTYPE         --occurrence type: src.OCCURTYPE
       ,DST.CREDITCYCLE       --whether the credit line revolves: src.CREDITCYCLE
       ,DST.CREDITAGGREEMENT  --credit agreement number used: src.CREDITAGGREEMENT
       ,CI_CER_NO_1     AS CI_CER_NO_1
       ,CI_FULL_NAM     AS CI_FULL_NAM
       ,CI_NEW_CUST_NO  AS CI_NEW_CUST_NO
       ,CI_CANCEL_FLG   AS CI_CANCEL_FLG
       ,CI_CUST_TYP     AS CI_CUST_TYP
       ,CI_CRT_SYS      AS CI_CRT_SYS
       ,CI_CRT_SCT_N    AS CI_CRT_SCT_N
       ,CI_CRT_ORG      AS CI_CRT_ORG
       ,CI_CRT_OPR      AS CI_CRT_OPR
       ,CI_UPD_SYS      AS CI_UPD_SYS
       ,CI_UPD_ORG      AS CI_UPD_ORG
       ,FR_ID           AS FR_ID
       ,V_DT            AS ODS_ST_DATE
       ,'CEN'           AS ODS_SYS_ID
  FROM O_CI_CBOD_CICIFUNN A  --personal consolidated file
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
F_CI_CBOD_CICIFUNN = sqlContext.sql(sql)
F_CI_CBOD_CICIFUNN.registerTempTable("F_CI_CBOD_CICIFUNN")
dfn = "F_CI_CBOD_CICIFUNN/" + V_DT + ".parquet"
F_CI_CBOD_CICIFUNN.cache()
nrows = F_CI_CBOD_CICIFUNN.count()
F_CI_CBOD_CICIFUNN.write.save(path=hdfs + '/' + dfn, mode='overwrite')
F_CI_CBOD_CICIFUNN.unpersist()
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert F_CI_CBOD_CICIFUNN lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)
sys.path.append(local_path + "/../") sys.path.append(local_path) from pyspark import SQLContext, SparkConf from pyspark import HiveContext from pyspark import SparkContext from eod import msft,candidate,download,merge,summary,spx def run(sc, sql_context, is_hive): msft.main(sc, sql_context, is_hive) candidate.main(sc, sql_context, is_hive) download.main(sc, sql_context, is_hive) merge.main(sc, sql_context, is_hive) spx.main(sc, sql_context, is_hive) print "------------------summary--------------------" summary.main(sc, sql_context, is_hive) if __name__ == '__main__': conf = SparkConf() conf.set("spark.executor.instances", "4") conf.set("spark.executor.cores", "4") conf.set("spark.executor.memory", "32g") sc = SparkContext(appName="bintrade_candidate.eod.eod_run", master="yarn-client", conf=conf) sqlContext = HiveContext(sc) sqlContext.setConf("spark.sql.shuffle.partitions", "32") sqlContext.sql("use fex") run(sc, sqlContext, is_hive=True)
# Task [21] 001-01::
V_STEP = V_STEP + 1

sql = """
SELECT CUST_ID      AS CUST_ID
       ,PRODUCT_ID  AS PRODUCT_ID
       ,'DK'        AS TYPE
       ,FR_ID       AS FR_ID
  FROM ACRM_F_CI_ASSET_BUSI_PROTO A  --asset agreement table
 WHERE BAL > 0
 GROUP BY CUST_ID
       ,PRODUCT_ID
       ,FR_ID
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
ACRM_A_CI_CUST_PROD = sqlContext.sql(sql)
ACRM_A_CI_CUST_PROD.registerTempTable("ACRM_A_CI_CUST_PROD")
dfn = "ACRM_A_CI_CUST_PROD/" + V_DT + ".parquet"
ACRM_A_CI_CUST_PROD.cache()
nrows = ACRM_A_CI_CUST_PROD.count()
ACRM_A_CI_CUST_PROD.write.save(path=hdfs + '/' + dfn, mode='overwrite')
ACRM_A_CI_CUST_PROD.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/ACRM_A_CI_CUST_PROD/" + V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert ACRM_A_CI_CUST_PROD lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows)

# Task [11] 001-02::
import sys
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext

if __name__ == '__main__':
    conf = SparkConf().setAppName("Plotly Exports")
    sc = SparkContext(conf=conf)
    hive_context = HiveContext(sc)

    print '=== Creating Database ==='
    hive_context.sql('CREATE DATABASE PLOTLY')
    hive_context.sql('USE PLOTLY')

    print '=== Creating Table ==='
    hive_context.sql("CREATE TABLE ALCOHOL_CONSUMPTION_BY_COUNTRY_2010 "
                     "(LOCATION STRING, ALCOHOL FLOAT) ROW FORMAT "
                     "DELIMITED FIELDS TERMINATED BY ',' "
                     "TBLPROPERTIES (\"skip.header.line.count\"=\"1\")")

    print "=== loading data into table ==="
    hive_context.sql("LOAD DATA LOCAL INPATH "
                     "'/plotly_datasets/2010_alcohol_consumption_by_country.csv' "
                     "OVERWRITE INTO TABLE ALCOHOL_CONSUMPTION_BY_COUNTRY_2010")

    sys.exit()
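    # A hypothetical sanity check one might add before the sys.exit() above
    # (not part of the original script):
    # for row in hive_context.sql("SELECT * FROM ALCOHOL_CONSUMPTION_BY_COUNTRY_2010 LIMIT 5").collect():
    #     print row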
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext

conf = SparkConf().setAppName("revenueByDaySQL")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)
hiveContext.sql("use retail_db")

sqlString = "SELECT orders.order_date, \
                ROUND(SUM(order_items.order_item_subtotal), 2) the_sum, \
                COUNT(DISTINCT orders.order_id) the_count \
             FROM orders, order_items \
             WHERE orders.order_id = order_items.order_item_order_id \
             GROUP BY orders.order_date \
             ORDER BY the_sum"

joinded_aggregate_data = hiveContext.sql(sqlString)
print str(joinded_aggregate_data.take(5))
# author [email protected]
import os
import sys

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, HiveContext

from ml import diff_feature_reg, diff_train


def run(sc, sql_context, is_hive):
    diff_feature_reg.main(sc, sql_context, is_hive=True)
    diff_train.main(sc, sql_context, is_hive=True)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "32g")
    sc = SparkContext(appName="bintrade_candidate", master="yarn-client", conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "32")
    sqlContext.sql("use fex")
    run(sc, sqlContext, is_hive=True)
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext

conf = SparkConf().setAppName("most expensive product using SQL")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)

sqlString = "SELECT p.product_name, p.product_price \
             FROM retail_db.products p \
             JOIN (SELECT max(products.product_price) max_id \
                   FROM retail_db.products) the_max \
             ON p.product_price = the_max.max_id"

result = hiveContext.sql(sqlString)
print("***********************\n{0}".format(str(result.take(1))))