Example #1
def get_spark_test():
    conf = SparkConf()
    sc = SparkContext("local[4]", appName="youzan-algrithm", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex_test """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")

    return sc, sql_context
Example #2
def get_spark(num=4, cores=4, mem="32g"):
    conf = SparkConf()
    conf.set("spark.executor.instances", "%d" % num)
    conf.set("spark.executor.cores", "%d" % cores)
    conf.set("spark.executor.memory", "%s" % mem)
    sc = SparkContext(appName="youzan-algrithm", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex """)
    sql_context.setConf("spark.sql.shuffle.partitions", "16")

    return sc, sql_context
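A minimal usage sketch for the two helpers above (the caller and the query are assumptions, not part of the original):

# hypothetical caller, assuming the helpers above are importable from this module
if __name__ == "__main__":
    sc, sql_context = get_spark(num=8, cores=4, mem="16g")
    try:
        df = sql_context.sql("SELECT COUNT(1) AS n FROM some_table")  # placeholder query
        df.show()
    finally:
        sc.stop()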
Example #3
def main():
    if len(sys.argv) > 2:
        scriptPath = sys.argv[1]
        resultPath = sys.argv[2]
    else:
        print "Missing arguments: usage <script path> <result path>"
        return

    sc = SparkContext("local", "Test sql queries from pyspark")
    try:
        hsc = HiveContext(sc)
        # read the whole SQL script as a single string; statements are separated by ';'
        scriptRaw = "\n".join(sc.textFile(scriptPath, use_unicode=False).collect())
        print scriptRaw
        result = open(resultPath, 'w')

        def printSeparator(cols):
            # write a +----+----+...+ separator row
            for j in range(0, cols):
                result.write("+----")
            result.write("+\n")

        for statement in scriptRaw.split(';'):
            statement = statement.strip()
            if statement == "":
                continue
            print statement
            df = hsc.sql(statement)
            df.show()

            cols = df.columns
            printHeader = True
            printFooter = False
            for row in df.collect():
                if printHeader:
                    printSeparator(len(cols))
                    for col in cols:
                        result.write("| " + col)
                    result.write("|\n")
                    printSeparator(len(cols))
                    printHeader = False
                    printFooter = True
                for v in row:
                    # valueToString is assumed to be defined elsewhere in this script
                    result.write("|" + valueToString(v))
                result.write("|\n")
            if printFooter:
                printSeparator(len(cols))
        result.close()
    finally:
        sc.stop()
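valueToString is referenced above but not defined in this snippet; a plausible sketch of such a helper (an assumption, not the original implementation):

def valueToString(v):
    # hypothetical helper: render None as an empty cell, everything else via str()
    if v is None:
        return ""
    return str(v)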
Example #4
ACRM_F_AG_AGREEMENT = sqlContext.read.parquet(hdfs+'/ACRM_F_AG_AGREEMENT/*')
ACRM_F_AG_AGREEMENT.registerTempTable("ACRM_F_AG_AGREEMENT")

# Task [21] 001-01::
V_STEP = V_STEP + 1

sql = """
	SELECT 
				 CUST_ID 	as CUST_ID,
				 ''			as CUST_NAME,
				 CUST_TYP	as CUST_TYP,
				 FR_ID		as FR_ID,
				 MIN(START_DATE) as START_DATE_MIN,
				 MAX(END_DATE)   as END_DATE,
				 MAX(START_DATE) as START_DATE_MAX,
				 int(COUNT(1)) 		 as COUNT, --INTEGER
				 V_DT		 as ETL_DATE			
	  FROM ACRM_F_AG_AGREEMENT        
	 WHERE END_DATE >= V_DT
	 GROUP BY CUST_ID,CUST_TYP,FR_ID
"""
sql = re.sub(r"\bV_DT\b", "'"+V_DT10+"'", sql)
ACRM_A_AGREEMENT_TARGET = sqlContext.sql(sql)
dfn="ACRM_A_AGREEMENT_TARGET/"+V_DT+".parquet"
ACRM_A_AGREEMENT_TARGET.write.save(path=hdfs + '/' + dfn, mode='overwrite')
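# full-refresh target: after writing today's parquet, drop the previous day's partition directory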
ret = os.system("hdfs dfs -rm -r /"+dbname+"/ACRM_A_AGREEMENT_TARGET/"+V_DT_LD+".parquet")
ACRM_F_AG_AGREEMENT.unpersist()
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et-st).seconds)
Example #5
		END 
		))                 
		END  as DECIMAL(18,2))       AS CUR_QUARTER_AVG --本年季均
	,CUST_TYP                                                   
FROM ACRM_F_RE_LENDSUMAVGINFO A 
WHERE YEAR=V_YEAR 
	AND a.FR_ID is not null AND A.PRDT_CODE IS NOT NULL 
GROUP BY a.FR_ID,CUST_ID,PRDT_CODE,CUST_TYP
"""

sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
sql = re.sub(r"V_MONTH", V_MONTH, sql)
sql = re.sub(r"\bV_QUARTER\b", V_QUARTER, sql)
sql = re.sub(r"\bV_YEAR\b", "'" + V_YEAR + "'", sql)
print(sql)
TMP_ACRM_F_CI_GK_LOAN_01 = sqlContext.sql(sql)
TMP_ACRM_F_CI_GK_LOAN_01.registerTempTable("TMP_ACRM_F_CI_GK_LOAN_01")
dfn = "TMP_ACRM_F_CI_GK_LOAN_01/" + V_DT + ".parquet"
TMP_ACRM_F_CI_GK_LOAN_01.cache()
nrows = TMP_ACRM_F_CI_GK_LOAN_01.count()
TMP_ACRM_F_CI_GK_LOAN_01.write.save(path=hdfs + '/' + dfn, mode='overwrite')
TMP_ACRM_F_CI_GK_LOAN_01.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/TMP_ACRM_F_CI_GK_LOAN_01/" +
                V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert TMP_ACRM_F_CI_GK_LOAN_01 lines %d"
      % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows))

# Task [21] 001-02::
Example #6
# Task [11] 001-01::
V_STEP = V_STEP + 1

sql = """
 SELECT monotonically_increasing_id() AS ID 
       ,REPORTNO                AS REPORTNO 
       ,OBJECTTYPE              AS OBJECTTYPE 
       ,OBJECTNO                AS OBJECTNO 
       ,MODELNO                 AS MODELNO 
       ,REPORTNAME              AS REPORTNAME 
       ,INPUTTIME               AS INPUTTIME 
       ,ORGID                   AS ORGID 
       ,USERID                  AS USERID 
       ,UPDATETIME              AS UPDATETIME 
       ,VALIDFLAG               AS VALIDFLAG 
       ,ODS_ST_DATE             AS ODS_ST_DATE 
   FROM F_CI_XDXT_IND_REPORT_RECORD A                          --个人财务报表数据记录表
"""

sql = re.sub(r"\bV_DT\b", "'"+V_DT10+"'", sql)
OCRM_F_IND_REPORT_RECORD = sqlContext.sql(sql)
OCRM_F_IND_REPORT_RECORD.registerTempTable("OCRM_F_IND_REPORT_RECORD")
dfn="OCRM_F_IND_REPORT_RECORD/"+V_DT+".parquet"
OCRM_F_IND_REPORT_RECORD.cache()
nrows = OCRM_F_IND_REPORT_RECORD.count()
OCRM_F_IND_REPORT_RECORD.write.save(path=hdfs + '/' + dfn, mode='append')
OCRM_F_IND_REPORT_RECORD.unpersist()
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert OCRM_F_IND_REPORT_RECORD lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et-st).seconds, nrows)
Example #7
       ,CI_CRT_SCT_N            AS CI_CRT_SCT_N 
       ,CI_CRT_OPR              AS CI_CRT_OPR 
       ,CI_UPD_SYS              AS CI_UPD_SYS 
       ,CI_UPD_OPR              AS CI_UPD_OPR 
       ,CI_CRT_ORG              AS CI_CRT_ORG 
       ,CI_UPD_ORG              AS CI_UPD_ORG 
       ,CI_DB_PART_ID           AS CI_DB_PART_ID 
       ,CI_INSTN_COD            AS CI_INSTN_COD 
       ,FR_ID                   AS FR_ID 
       ,V_DT                    AS ODS_ST_DATE 
       ,'CEN'                   AS ODS_SYS_ID 
   FROM O_CI_CBOD_CICIFADR A                                   --对私客户地址信息档
"""

sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
F_CI_CBOD_CICIFADR_INNTMP1 = sqlContext.sql(sql)
F_CI_CBOD_CICIFADR_INNTMP1.registerTempTable("F_CI_CBOD_CICIFADR_INNTMP1")

F_CI_CBOD_CICIFADR = sqlContext.read.parquet(hdfs + '/F_CI_CBOD_CICIFADR/*')
F_CI_CBOD_CICIFADR.registerTempTable("F_CI_CBOD_CICIFADR")
sql = """
 SELECT DST.ETLDT                                               --平台日期:src.ETLDT
       ,DST.FK_CICIF_KEY                                       --FK_CICIF_KEY:src.FK_CICIF_KEY
       ,DST.CI_ADDR_COD                                        --地址编码:src.CI_ADDR_COD
       ,DST.CI_ADDR                                            --地址:src.CI_ADDR
       ,DST.CI_POSTCOD                                         --邮政编码:src.CI_POSTCOD
       ,DST.CI_TEL_NO                                          --电话号码:src.CI_TEL_NO
       ,DST.CI_CNTY_COD                                        --国家代码:src.CI_CNTY_COD
       ,DST.CI_AREA_COD                                        --地区代码:src.CI_AREA_COD
       ,DST.CI_SUB_TEL                                         --分机:src.CI_SUB_TEL
       ,DST.CI_MOBILE_PHONE                                    --移动电话:src.CI_MOBILE_PHONE
Example #8
       ,''                    AS UPDATEDATE 
       ,A.REMARK                AS REMARK 
       ,A.REPORTLOCKED          AS REPORTLOCKED 
       ,A.MODELCLASS            AS MODELCLASS 
       ,''                    AS FARMILYID 
       ,A.FR_ID                 AS CORPORATEORGID 
       ,A.ODS_ST_DATE           AS ODS_ST_DATE 
       ,monotonically_increasing_id() AS ID 
   FROM F_CI_XDXT_CUSTOMER_FSRECORD A                          --客户财务报表(CF)
   LEFT JOIN OCRM_F_CI_SYS_RESOURCE B                          --系统来源中间表
     ON A.CUSTOMERID            = B.SOURCE_CUST_ID 
    AND A.FR_ID                 = B.FR_ID 
    AND B.ODS_SYS_ID            = 'LNA' """

sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
OCRM_F_CI_CUSTOMER_FSRECORD = sqlContext.sql(sql)
OCRM_F_CI_CUSTOMER_FSRECORD.registerTempTable("OCRM_F_CI_CUSTOMER_FSRECORD")
dfn = "OCRM_F_CI_CUSTOMER_FSRECORD/" + V_DT + ".parquet"
OCRM_F_CI_CUSTOMER_FSRECORD.cache()
nrows = OCRM_F_CI_CUSTOMER_FSRECORD.count()
OCRM_F_CI_CUSTOMER_FSRECORD.write.save(path=hdfs + '/' + dfn, mode='overwrite')
OCRM_F_CI_CUSTOMER_FSRECORD.unpersist()
F_CI_XDXT_CUSTOMER_FSRECORD.unpersist()
OCRM_F_CI_SYS_RESOURCE.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname +
                "/OCRM_F_CI_CUSTOMER_FSRECORD/" + V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert OCRM_F_CI_CUSTOMER_FSRECORD lines %d"
      % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows))
       ,A.MODULEID              AS MODULEID 
       ,A.CHANNELID             AS CHANNELID 
       ,A.LOGINTYPE             AS LOGINTYPE 
       ,A.USERSEQ               AS USERSEQ 
       ,A.DEPTSEQ               AS DEPTSEQ 
       ,A.UNIQUEUSERID          AS UNIQUEUSERID 
       ,A.USERAGENT             AS USERAGENT 
       ,A.CHANNELJNLNO          AS CHANNELJNLNO 
       ,A.FR_ID                 AS FR_ID 
       ,'IBK'                   AS ODS_SYS_ID 
       ,V_DT                    AS ODS_ST_DATE 
   FROM O_TX_WSYH_ACCESSLOGMCH A                               --访问日志表
"""

sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
F_TX_WSYH_ACCESSLOGMCH_INNTMP1 = sqlContext.sql(sql)
F_TX_WSYH_ACCESSLOGMCH_INNTMP1.registerTempTable(
    "F_TX_WSYH_ACCESSLOGMCH_INNTMP1")

#F_TX_WSYH_ACCESSLOGMCH = sqlContext.read.parquet(hdfs+'/F_TX_WSYH_ACCESSLOGMCH/*')
#F_TX_WSYH_ACCESSLOGMCH.registerTempTable("F_TX_WSYH_ACCESSLOGMCH")
sql = """
 SELECT DST.ACCESSJNLNO                                         --ACCESSJNLNO:src.ACCESSJNLNO
       ,DST.ACCESSSTATE                                        --ACCESSSTATE:src.ACCESSSTATE
       ,DST.ACCESSDATE                                         --ACCESSDATE:src.ACCESSDATE
       ,DST.MACHINEID                                          --MACHINEID:src.MACHINEID
       ,DST.ACCESSIP                                           --ACCESSIP:src.ACCESSIP
       ,DST.MODULEID                                           --模块代号:src.MODULEID
       ,DST.CHANNELID                                          --渠道编号:src.CHANNELID
       ,DST.LOGINTYPE                                          --登陆类型:src.LOGINTYPE
       ,DST.USERSEQ                                            --用户顺序号:src.USERSEQ
Example #10
    args = sys.argv[1:]  # accepts two arguments: run_date and the database name db_name
    # no arguments passed: use the defaults (ctprod)
    if len(args) == 0:
        run_date = compute_date(time.strftime("%Y%m%d"), -1)  # default run_date
        db_name = 'ctprod'  # default database name
    elif len(args) == 1:
        run_date = args[0]
        db_name = 'ctprod'  # default database name
    else:
        run_date = args[0]
        db_name = args[1]

    today = run_date

    ### check that the target table is partitioned
    cust_diagnosis_cols = hqlContext.sql(
        'desc tag_stat.cust_diagnosis').toPandas()
    if 'part_date' not in cust_diagnosis_cols.col_name.values:
        raise ValueError(u'Error: table cust_diagnosis is not partitioned!')

    # hqlContext.sql('''
    # create external table if not exists tag_stat.cust_diagnosis
    # (
    # customer_no string,
    # customer_diagnosis string,
    # calendar_year string
    # )
    # partitioned by (part_date varchar(8))
    # stored as parquet
    # ''')

    ### generate today's customer diagnosis and insert it into the partition
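compute_date is used above but not shown in this snippet; a plausible sketch of such a helper (an assumption), shifting a yyyymmdd string by a number of days:

from datetime import datetime, timedelta

def compute_date(date_str, offset_days):
    # hypothetical helper: 'yyyymmdd' string plus/minus offset_days, returned in the same format
    d = datetime.strptime(date_str, "%Y%m%d") + timedelta(days=offset_days)
    return d.strftime("%Y%m%d")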
Example #11
       ,A.BONUS_INTG_SUM        AS INDEX_VALUE 
       ,SUBSTR(V_DT, 1, 7)                       AS YEAR_MONTH 
       ,V_DT                    AS ETL_DATE 
       ,A.CUST_TYPE             AS CUST_TYPE 
       ,A.FR_ID                 AS FR_ID 
   FROM OCRM_F_CI_CUSTLNAINFO A                                --客户信用信息
   WHERE    EXISTS
   (SELECT 1 FROM 
   OCRM_F_CI_CUSTLNAINFO B 
   WHERE A.FR_ID=B.FR_ID 
   AND A.CUST_ID = B.CUST_ID 
   AND A.EXP_DATE =B.EXP_DATE 
   AND B.EXP_DATE > V_DT) """

sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
ACRM_A_TARGET_D002018 = sqlContext.sql(sql)
ACRM_A_TARGET_D002018.registerTempTable("ACRM_A_TARGET_D002018")
dfn = "ACRM_A_TARGET_D002018/" + V_DT + ".parquet"
ACRM_A_TARGET_D002018.cache()
nrows = ACRM_A_TARGET_D002018.count()
ACRM_A_TARGET_D002018.write.save(path=hdfs + '/' + dfn, mode='overwrite')
ACRM_A_TARGET_D002018.unpersist()
OCRM_F_CI_CUSTLNAINFO.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/ACRM_A_TARGET_D002018/" +
                V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert ACRM_A_TARGET_D002018 lines %d"
      % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows))
Example #12
def load_data():
    # load data from files
    # and return query results / aggregates.

    hiveContext = HiveContext(sc)
    # 1027
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx/'
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx_fraud/train/'

    # AMAZON AWS EMR
    path = 'hdfs:///tmp/files/'    #HDFS


    # new segement files

    tx_files = [path + 'l_adults_2550_female_rural.csv', path + 'l_adults_2550_female_urban.csv', path + 'l_adults_2550_male_rural.csv', \
    path + 'l_adults_2550_male_urban.csv', path + 'l_young_adults_female_rural.csv', path + 'l_young_adults_female_urban.csv',\
    path + 'l_young_adults_male_rural.csv', path + 'l_young_adults_male_urban.csv', path + 'l_adults_50up_female_rural.csv', \
    path + 'l_adults_50up_female_urban.csv', path + 'l_adults_50up_male_rural.csv', path + 'l_adults_50up_male_urban.csv' ]

    # small file for debugging
    # 1027

    # tx_files = [path + 's_l_male_30_40_smaller_cities.csv']
    # tx_files = [path + 'sorted_fraud_male_30_40_smaller_cities.csv']

    # tx_files = [path+'40_60_bigger_cities.csv',path+'40_60_smaller_cities.csv',path+'all_60_up.csv'\
    #         ,path+'female_30_40_bigger_cities.csv',path+'female_30_40_smaller_cities.csv'\
    #         ,path+'male_30_40_bigger_cities.csv',path+'male_30_40_smaller_cities.csv'\
    #         ,path+'millenials.csv',path+'young_adults.csv']

    # 1027
    # tx_files = [path+'l_40_60_bigger_cities.csv',path+'l_40_60_smaller_cities.csv',path+'l_all_60_up.csv'\
    #         ,path+'l_female_30_40_bigger_cities.csv',path+'l_female_30_40_smaller_cities.csv'\
    #         ,path+'l_male_30_40_bigger_cities.csv',path+'l_male_30_40_smaller_cities.csv'\
    #         ,path+'l_millenials.csv',path+'l_young_adults.csv']



    all_tx = sc.textFile(','.join(tx_files),600)

    # 1027
    # txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|merchant|merch_lat|merch_long'
    txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|is_fraud|merchant|merch_lat|merch_long'
    txFields = [StructField(field_name, StringType(), True) for field_name in txSchemaString.split('|')]
    txFields[17] = StructField('trans_date', DateType(), True)

    txSchema = StructType(txFields)
    # ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|merchant|merch_lat|merch_long
    txHeader = all_tx.filter(lambda l: "ssn|" in l)
    txNoHeader = all_tx.subtract(txHeader)

    temp_tx = txNoHeader.map(lambda k: k.split("|")).map(lambda p: (
    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], p[16],
    datetime.datetime.strptime(p[17], '%Y-%m-%d').date(), p[18], p[19], p[20], p[21], p[22], p[23], p[24], p[25]))

    h_tx_df = hiveContext.createDataFrame(temp_tx, txSchema)
    h_tx_df.registerTempTable("htx")

    hiveContext.cacheTable("htx")

    # HBASE CODE HERE
    # create dataframe with all records
    # map using hbase_process to extract each record into individual components
    # and create a dictionary to store in hbase
    #h_data = hiveContext.sql("SELECT * FROM htx")
    #h_data.map(hbase_process).foreachPartition(store_full_data)

    # get cust mean time between transactions
    time_lag_eval = hiveContext.sql(
    "SELECT cc_num, unix_time, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY  htx.unix_time) as lag_time from htx order by cc_num, unix_time asc")
    time_lag_eval.registerTempTable("ts_lag")

    user_avg_time = hiveContext.sql("SELECT cc_num, AVG(unix_time - lag_time) as time_diff, percentile_approx((unix_time - lag_time),0.1) as low_bound, percentile_approx((unix_time - lag_time),0.90) as high_bound from ts_lag where lag_time is not null group by cc_num")
    user_avg_time.registerTempTable("avg_time")


    # get cust mean per category
    mean_per_cat = hiveContext.sql("SELECT cc_num, category, avg(amt) as mean_exp, (avg(amt)-2*(stddev_pop(amt))) as low_bound, (avg(amt)+2*(stddev_pop(amt))) as high_bound from htx group by cc_num, category")
    mean_per_cat.registerTempTable("mean_per_cat")

    # evaluate amount for HML and time of purchase for normal/abnormal
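    # the per-transaction state is encoded as three letters that are later
    # concatenated into one string per customer/profile:
    #   EXP - amount vs. the customer's per-category bounds: H(igh) / L(ow) / N(ormal)
    #   NAT - time of day: A(bnormal, before 05h or after 21h) / N(ormal)
    #   VEL - gap since the previous transaction vs. the customer's bounds:
    #         H (unusually short gap) / L (unusually long gap) / N(ormal)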
    test = hiveContext.sql(
    # #    "SELECT htx.cc_num, profile, htx.category, htx.trans_date,htx.trans_time, htx.unix_time,IF(htx.amt>(2 * m.mean_exp),'H',(IF(htx.amt<(0.5 * m.mean_exp),'L','N'))) as EXP, IF(htx.category like '%_net%','N','P') as CNP, htx.amt, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY  htx.unix_time) as lag_time from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category")
    "SELECT htx.cc_num, profile, htx.category, htx.trans_date,htx.trans_time, htx.unix_time,IF(htx.amt>m.high_bound,'H',(IF(htx.amt < m.low_bound,'L','N'))) as EXP, IF(cast(SUBSTR(htx.trans_time,0,2) as int)<05,'A',IF(cast(SUBSTR(htx.trans_time,0,2) as int)>21,'A','N')) as NAT, htx.amt, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY  htx.unix_time) as lag_time from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category")
    test.registerTempTable("full_table")

    # evaluate for transaction time (HML)
    full_data = hiveContext.sql(
         "SELECT full_table.cc_num, profile, category, trans_date, trans_time, unix_time,lag_time,IF(lag_time is null,100000,unix_time-lag_time) as time_since,amt, EXP,NAT,IF((unix_time-lag_time)<avg_time.low_bound,'H',IF((unix_time-lag_time)>avg_time.high_bound,'L','N')) as VEL from full_table left join avg_time on avg_time.cc_num = full_table.cc_num")
    full_data.registerTempTable("full_data")


    # return full tx data for user with reduced HML/AN/HML variables
    per_cust_transactions = hiveContext.sql(
        "SELECT cc_num as cust_id,concat(EXP,NAT, VEL) as trans_list from full_data order by cc_num, unix_time asc")

    # return full tx data for profile with reduced HML/NP/HML variables in sorted order
    #pre_sort_
    per_profile_transactions = hiveContext.sql(
        "SELECT profile as cust_id,concat(EXP,NAT,VEL) as trans_list from full_data order by profile, unix_time asc")
    #pre_sort_per_profile_transactions.registerTempTable("pre_sort")



    # we only need cust_id (really profile name here) and trans_list, but we had to include cc_num above in our sort
    #per_profile_transactions = hiveContext.sql("SELECT cust_id,trans_list from pre_sort")

    # gets pre-computed reference values for each customer and stores in redis
    #   avg spent per category
    #   n transactions
    #   last unix time stamp
    agg_info = hiveContext.sql(
        "SELECT CONCAT(category, '_', cc_num) as cust_id, category, concat(low_bound,',',high_bound) as low_high from mean_per_cat")
    avg_cat_data = agg_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])


    agg_n_tx = hiveContext.sql(
        "SELECT CONCAT('count_', cc_num) as cust_id, count(cc_num) as tx_count from full_data group by cc_num")
    n_tx = agg_n_tx.rdd.map(lambda x: [str(x.cust_id), str(x.tx_count)])

    agg_unix_ts = hiveContext.sql(
        "SELECT CONCAT('timestamp_', cc_num) as cust_id, max(unix_time) as last_unix_time from full_data group by cc_num")
    n_ts = agg_unix_ts.rdd.map(lambda x: [str(x.cust_id), str(x.last_unix_time)])

    agg_vel_info = hiveContext.sql(
        "SELECT CONCAT('velocity_', cc_num) as cust_id, concat(low_bound,',',high_bound) as low_high from avg_time")
    avg_vel_data = agg_vel_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])




    # compile our final string per customer for all tx's
    per_cust_transactions_r = per_cust_transactions.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])



    # compile our final string per profile for all tx's
    per_profile_transactions_r = per_profile_transactions.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])


    # return tx data and aggregates
    return_dict = {}
    return_dict['profile'] = per_profile_transactions_r
    return_dict['customer'] = per_cust_transactions_r

    return avg_cat_data, n_tx, n_ts, return_dict, avg_vel_data
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
'''
Problem statement: find the 5 most expensive orders per person per day using SparkSQL.
'''
conf = SparkConf().setAppName("mostExpensiveOrderPerDayPerPersonSQL")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)
#makes sure that the 'retail_db' hive database will be used
hiveContext.sql("use retail_db")

#first_value() is used as a workaround: columns that appear neither in the GROUP BY
#nor inside an aggregate cannot be selected directly.
sqlString = "SELECT \
			first_value(customers.customer_fname), \
			first_value(customers.customer_lname), \
			orders.order_date, \
			ROUND(SUM(order_items.order_item_subtotal), 2) the_total\
		FROM customers, orders, order_items \
		WHERE 	orders.order_id = order_items.order_item_order_id \
			AND \
			customers.customer_id = orders.order_customer_id \
		GROUP BY orders.order_date, customers.customer_id \
		ORDER BY the_total DESC"

result = hiveContext.sql(sqlString).rdd #rdd used because this is certification practice
top_records = result.take(5)
print "*****************\n{0}".format(str(top_records))


Example #14
sql = """
	   SELECT 
		 CUST_ID as CUST_ID,
		 CUST_TYP as CUST_TYP,
		 FR_ID,
		 CHANNEL_FLAG as CHANNEL_FLAG,
		 CAST(COUNT(1) AS DECIMAL(22,2)) as INDEX_VALUE_C,   --DECIMAL(22,2)
		 CAST(COALESCE(SUM(SA_DR_AMT),0) - COALESCE(SUM(SA_CR_AMT),0)  AS DECIMAL(22,2)) as SA_AMT,
		 CAST(SUM(SA_TVARCHAR_AMT) AS DECIMAL(22,2)) as SA_TX_AMT,
		 '' 		as SA_TX_DT,
		 V_DT10		as ETL_DATE				
	  FROM ACRM_F_CI_NIN_TRANSLOG A
	 WHERE  --A.FR_ID = V_FR_ID
	    A.SA_TVARCHAR_DT = V_DT
	 GROUP BY FR_ID,CUST_ID,CUST_TYP,CHANNEL_FLAG
"""
sql = re.sub(r"\bV_DT\b", "'"+V_DT+"'", sql)
sql = re.sub(r"\bV_DT10\b", "'"+V_DT10+"'", sql)
ACRM_A_TRANSLOG_TARGET = sqlContext.sql(sql)
ACRM_A_TRANSLOG_TARGET.registerTempTable("ACRM_A_TRANSLOG_TARGET")
dfn="ACRM_A_TRANSLOG_TARGET/"+V_DT+".parquet"
ACRM_A_TRANSLOG_TARGET.cache()
nrows = ACRM_A_TRANSLOG_TARGET.count()
ACRM_A_TRANSLOG_TARGET.write.save(path=hdfs + '/' + dfn, mode='overwrite')
ACRM_A_TRANSLOG_TARGET.unpersist()
# full-refresh table: after saving, the previous day's data must be removed
ret = os.system("hdfs dfs -rm -r /"+dbname+"/ACRM_A_TRANSLOG_TARGET/"+V_DT_LD+".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert ACRM_A_TRANSLOG_TARGET lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et-st).seconds, nrows)
Example #15
    sqlstr = sqlstr[: len(sqlstr)-2]
    sqlstr += "\n) stored as orc"
    print sqlstr

    sql_context.sql(sqlstr)
    df.insertInto(tableName, overwrite)



if __name__ == '__main__':
    #log.debug("debug")
    #a = eval("(1,[2,3])")
    #print "xxxxxxx",a[1][0]
    #a = {1: 1.0, 3: 5.5}
    #str_a = str(a)
    #a = eval(str_a)
    #print a[1]

    #print json.loads("""{1:1}""")
    sc = SparkContext("local[1]", appName="bintrade.ml.diff_feature")
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex_test """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")


    ldict = [{"symbol":"AAA", "date":"2010-01-01", "close":1.0}, {"symbol":"AAA","date":"2010-01-01", "close":1.0}]

    df = sql_context.createDataFrame(ldict)
    dfToTableWithPar(sql_context, df,  "test_eod_AAA")

Example #16

def save(lp,path, sc):
    lp.saveAsTextFile(path)


def main(sc, sql_context, is_hive = True):
    df_train =  get_train(sc, sql_context, is_hive)
    df_check = get_check(sc, sql_context, is_hive)
    lp_train = cal_feature(df_train, 60,3)
    lp_check = cal_feature(df_check, 60,3)

    os.system("""
    source ~/.bashrc; hadoop fs -rm -r bintrade.ml.diff.label_point.train.cls; hadoop fs -rm -r bintrade.ml.diff.label_point.check.cls
    """)
    save(lp_train, "bintrade.ml.diff.label_point.train.cls", sc)
    save(lp_check, "bintrade.ml.diff.label_point.check.cls", sc)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName="bintrade.ml.diff_feature", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql(""" use fex """)
    main(sc, sql_context, is_hive=True)
    sc.stop()

Example #17
        CREATE TABLE IF NOT EXISTS %s(
           symbol string,
           date1 string,
           date2 string,
           date3 string,
           close1 float,
           spxopen1 float, 
           close2 float,
           spxopen2 float, 
           act_diff float,
           prob float,
           label float
         )
         ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
         """ % "check_pred_ext")

    df.insertInto("check_pred_ext", overwrite = True)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "8")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "32g")
    sc = SparkContext(appName=__file__, conf=conf)
    sql_context = HiveContext(sc)
    sql_context.setConf("spark.sql.shuffle.partitions", "32")
    sql_context.sql("use fex")
    main(sc, sql_context)
    sc.stop()
Example #18
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SparkConf
from pyspark.sql import SQLContext, HiveContext
from pyspark import SparkContext

from post import post_run
from ta import mat_close
from ta import adx
from ml import diff_feature_cls as feature
from ml import diff_train_cls_pos_ml as train


def main(sc, sql_context, is_hive):
    #post_run.main(sc, sql_context, is_hive = True)
    #mat_close.main(sc, sql_context, is_hive = True)
    #adx.main(sc, sql_context, is_hive = True)
    feature.main(10, 1, sc, sql_context, is_hive = True)
    train.main("2010-01-01", "2010-04-30", "2010-05-01", "9999-99-99", sc, sql_context, is_hive=True)
if __name__ == "__main__":
    conf = SparkConf()
    #conf.set("spark.executor.instances", "4")
    #conf.set("spark.executor.cores", "4")
    #conf.set("spark.executor.memory", "8g")

    sc = SparkContext(appName="bintrade_candidate", master="local[2]", conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "1")
    sqlContext.sql("use fex_test")
    main(sc, sqlContext, is_hive=True)
Example #19
print "=> filename %s" % filename
print "=> saveto %s" % saveto
print "=> jobName %s" % jobName
print "=> allMVs %s" % allMVs

conf = SparkConf().setAppName(jobName)
sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
sqlc = HiveContext(sc)

T_XRD_HDFS = sqlc.read.format("json").load(filename)
T_XRD_HDFS.registerTempTable("T_XRD_HDFS")
print "=> rowsCounter T_XRD_HDFS: %d" % T_XRD_HDFS.count()
# T_XRD_HDFS.write.format("com.databricks.spark.csv").option("header", "false").save(saveto + "/T_XRD_HDFS")

T_XRD_RAW_FILE = sqlc.sql("SELECT from_unixtime(end_time, 'yyyy/MM/dd') as TDay, start_time as ots, end_time as cts, file_lfn, client_host, if(server_username = '', 'unknown', server_username) as server_username, (end_time - start_time) as proctime, read_bytes_at_close as readbytes FROM T_XRD_HDFS WHERE (end_time - start_time) > 0 AND read_bytes_at_close > 0 AND `_corrupt_record` IS NULL")
T_XRD_RAW_FILE.registerTempTable("T_XRD_RAW_FILE")
print "=> rowsCounter T_XRD_RAW_FILE: %d" % T_XRD_RAW_FILE.count()
if allMVs == 1:
    T_XRD_RAW_FILE.write.format("com.databricks.spark.csv").option("header", "false").save(saveto + "/T_XRD_RAW_FILE")
 
T_XRD_LFC = sqlc.read.format("com.databricks.spark.csv").load("/project/awg/cms/phedex/catalog/csv/merged/").toDF("dataset_name", "dataset_id", "dataset_is_open", "dataset_time_create", "block_name", "block_id", "block_time_create", "block_is_open", "file_lfn", "file_id", "filesize", "usernameXX", "checksum", "file_time_create")
T_XRD_LFC.registerTempTable("T_XRD_LFC")
print "=> rowsCounter T_XRD_LFC: %d" % T_XRD_LFC.count()
if allMVs == 1:
    T_XRD_LFC.write.format("com.databricks.spark.csv").option("header", "false").save(saveto + "/T_XRD_LFC")
 
V_XRD_LFC_aggr1 = sqlc.sql("SELECT distinct block_name, dataset_name FROM T_XRD_LFC")
V_XRD_LFC_aggr1.registerTempTable("V_XRD_LFC_aggr1")
print "=> rowsCounter V_XRD_LFC_aggr1: %d" % V_XRD_LFC_aggr1.count()
if allMVs == 1:
Example #20
V_STEP = 0

O_CI_WSYH_EGROUPRELATION = sqlContext.read.parquet(hdfs+'/O_CI_WSYH_EGROUPRELATION/*')
O_CI_WSYH_EGROUPRELATION.registerTempTable("O_CI_WSYH_EGROUPRELATION")

# Task [21] 001-01::
V_STEP = V_STEP + 1

sql = """
 SELECT A.CIFSEQ                AS CIFSEQ 
       ,A.ASSOCIFSEQ            AS ASSOCIFSEQ 
       ,A.ASSOTYPE              AS ASSOTYPE 
       ,A.ASSOSTATE             AS ASSOSTATE 
       ,A.FR_ID                 AS FR_ID 
       ,V_DT                    AS ODS_ST_DATE 
       ,'IBK'                   AS ODS_SYS_ID 
   FROM O_CI_WSYH_EGROUPRELATION A                             --集团关系表
"""

sql = re.sub(r"\bV_DT\b", "'"+V_DT10+"'", sql)
F_CI_WSYH_EGROUPRELATION = sqlContext.sql(sql)
F_CI_WSYH_EGROUPRELATION.registerTempTable("F_CI_WSYH_EGROUPRELATION")
dfn="F_CI_WSYH_EGROUPRELATION/"+V_DT+".parquet"
F_CI_WSYH_EGROUPRELATION.cache()
nrows = F_CI_WSYH_EGROUPRELATION.count()
F_CI_WSYH_EGROUPRELATION.write.save(path=hdfs + '/' + dfn, mode='overwrite')
F_CI_WSYH_EGROUPRELATION.unpersist()
ret = os.system("hdfs dfs -rm -r /"+dbname+"/F_CI_WSYH_EGROUPRELATION/"+V_DT_LD+".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert F_CI_WSYH_EGROUPRELATION lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et-st).seconds, nrows)
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
'''
Problem statement: find the 5 most expensive orders per person per day using Data Frames
'''

conf = SparkConf().setAppName("mostExpensiveOrderPerDayPerPersonRDD")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)
#makes sure that the 'retail_db' hive database will be used
hiveContext.sql("use retail_db")

#loading the data from hive into dataframes
orders = hiveContext.sql("select order_id, order_date, order_customer_id from orders")
customers = hiveContext.sql("select customer_id, customer_fname, customer_lname from customers")
order_items = hiveContext.sql("select order_item_order_id, order_item_subtotal from order_items")

#joining the customers with orders on customer_id.  Orders and customers are the smaller tables
#so I try to join small tables with other small tables before joining to a big table.
orders_join_customers = orders.join(customers, orders.order_customer_id == customers.customer_id)

#joining on order_id so that I get rows with a customer and their purchases
orders_customers_join_order_items = \
	orders_join_customers.join(order_items, orders_join_customers.order_id == \
							order_items.order_item_order_id)
#aggregating by order_date and customer_id with the sum aggregation.
#This finds how much a person spent on a single day
aggResult = orders_customers_join_order_items.groupBy(['order_date','customer_id']).agg({"order_item_subtotal": "sum"}).withColumnRenamed("sum(order_item_subtotal)", "subtotal_sum")

#because in the aggregation the order_date, customer, and sum were generated in a data frame,
#I must unfortunately rejoin to the customers table to display the purchases with names
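The snippet stops before that final join; a hedged sketch of what it might look like, reusing the dataframes defined above:

# hypothetical final step: re-attach the names and take the 5 largest day totals
final = aggResult.join(customers, aggResult.customer_id == customers.customer_id) \
    .select('order_date', 'customer_fname', 'customer_lname', 'subtotal_sum') \
    .orderBy(aggResult.subtotal_sum.desc())
print(final.take(5))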
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

conf = SparkConf().setAppName("Revenue per category")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)

hiveContext.sql("use retail_db")

sqlString = 	"SELECT	first_value(cat.category_name), \
			round(sum(oi.order_item_subtotal), 2)  category_revenue\
		FROM 	categories cat, products prod, order_items oi \
		WHERE 	cat.category_id = prod.product_category_id  \
			AND \
			prod.product_id = oi.order_item_product_id \
		GROUP BY cat.category_id \
		ORDER BY category_revenue DESC"

result = hiveContext.sql(sqlString)
collected = result.collect()
print "*****************\n{0}".format("\n".join([str(x) for x in collected]))










df_hive=hivecontext.read.load(examplefiles_path + "users.parquet")

df_sql.show()
df_hive.show()

df_hive.printSchema()

df_hive.filter(df_hive['favorite_color']=='red').show()

#Test 3: Write selected columns from dataframe into a parquet file

if not os.path.exists(examplefiles_path + "nameAndFavColors.parquet"):
   df_hive.select("name","favorite_color").write.save(examplefiles_path + "nameAndFavColors.parquet")

#Test 4: Save dataframe as persistent hive table using hivecontext
hivecontext.sql("DROP TABLE IF EXISTS users")
df_hive.write.saveAsTable("users")


#Test 5: Read from the hive table into a parquet file
colorRed=hivecontext.sql("SELECT * FROM users WHERE favorite_color=='red' ")
colorRed.show()
if not os.path.exists(examplefiles_path + "red.parquet"):
   colorRed.write.parquet(examplefiles_path + "red.parquet")

#Test 6: Create Parquet hive table. Read data from a parquet file and store it in the table
filepath=examplefiles_path + "users.parquet"
load_query = "LOAD DATA LOCAL INPATH '" + filepath + "' OVERWRITE INTO TABLE parquetTests"
hivecontext.sql("CREATE TABLE IF NOT EXISTS  parquetTests(name STRING,favorite_color STRING,favorite_numbers ARRAY<INT>) STORED AS PARQUET")
hivecontext.sql(load_query)
results=hivecontext.sql("SELECT * FROM parquetTests")
Example #24
       ,PD_DEP_TYP_DESC         AS PD_DEP_TYP_DESC 
       ,PD_ODRW_CASH_AMT        AS PD_ODRW_CASH_AMT 
       ,PD_TDRW_CASH_AMT        AS PD_TDRW_CASH_AMT 
       ,PD_IDEP_CASH_AMT        AS PD_IDEP_CASH_AMT 
       ,PD_IDEP_TRAN_AMT        AS PD_IDEP_TRAN_AMT 
       ,PD_IDRW_CASH_AMT        AS PD_IDRW_CASH_AMT 
       ,PD_IDRW_TRAN_AMT        AS PD_IDRW_TRAN_AMT 
       ,PD_TRAN_FLG             AS PD_TRAN_FLG 
       ,PD_INFDRW_SAME_FLG      AS PD_INFDRW_SAME_FLG 
       ,FR_ID                   AS FR_ID 
       ,V_DT                    AS ODS_ST_DATE 
       ,'CEN'                   AS ODS_SYS_ID 
   FROM O_CM_CBOD_PDPRTTDP A                                   --定期利息部件
"""

sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
F_CM_CBOD_PDPRTTDP = sqlContext.sql(sql)
F_CM_CBOD_PDPRTTDP.registerTempTable("F_CM_CBOD_PDPRTTDP")
dfn = "F_CM_CBOD_PDPRTTDP/" + V_DT + ".parquet"
F_CM_CBOD_PDPRTTDP.cache()
nrows = F_CM_CBOD_PDPRTTDP.count()
F_CM_CBOD_PDPRTTDP.write.save(path=hdfs + '/' + dfn, mode='overwrite')
F_CM_CBOD_PDPRTTDP.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/F_CM_CBOD_PDPRTTDP/" +
                V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert F_CM_CBOD_PDPRTTDP lines %d"
      % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows))
Example #25
from textblob import TextBlob
from unidecode import unidecode
import csv
import json

# Note - SparkContext available as sc, HiveContext available as sqlCtx.
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="AnaliseSentimentosTwitter")
sqlCtx = HiveContext(sc)

tweets = sqlCtx.sql("select text from trabalho_translated_tweets")

# Counters
numPos = 0
numNeg = 0
numNul = 0
total = 0

tweetTexts = tweets.map(lambda p: "text: " + p.text)

# should be replaced by the count of tweetTexts; done this way only for testing.
qtd = len(tweetTexts.collect())

for text in tweetTexts.collect():

    # analyzing the sentiment
    sentiment = TextBlob(text)
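    # (not in the original) a plausible continuation of the truncated loop:
    # classify by TextBlob polarity and update the counters defined above
    polarity = sentiment.sentiment.polarity
    if polarity > 0:
        numPos += 1
    elif polarity < 0:
        numNeg += 1
    else:
        numNul += 1
    total += 1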
Example #26
     ON A.ID                    = B.CUST_BASE_ID 
    AND B.FR_ID                 = A.FR_ID 
   LEFT JOIN ACRM_F_DP_SAVE_INFO C                             --负债协议
     ON B.CUST_ID               = C.CUST_ID 
    AND C.ACCT_STATUS           = '01' 
    AND C.FR_ID                 = A.FR_ID 
   LEFT JOIN ACRM_F_CI_ASSET_BUSI_PROTO D                      --资产协议
     ON B.CUST_ID               = D.CUST_ID 
    AND D.LN_APCL_FLG           = 'N' 
    AND D.FR_ID                 = A.FR_ID 
  GROUP BY A.FR_ID 
       ,A.ID 
       ,A.CUST_BASE_CREATE_NAME 
       ,A.CUST_BASE_CREATE_ORG """

sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
ACRM_A_BASE_DEP_CRE = sqlContext.sql(sql)
ACRM_A_BASE_DEP_CRE.registerTempTable("ACRM_A_BASE_DEP_CRE")
dfn = "ACRM_A_BASE_DEP_CRE/" + V_DT + ".parquet"
ACRM_A_BASE_DEP_CRE.cache()
nrows = ACRM_A_BASE_DEP_CRE.count()
ACRM_A_BASE_DEP_CRE.write.save(path=hdfs + '/' + dfn, mode='append')
ACRM_A_BASE_DEP_CRE.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/ACRM_A_BASE_DEP_CRE/" +
                V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert ACRM_A_BASE_DEP_CRE lines %d"
      % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows))
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import lit, row_number, monotonically_increasing_id, isnan, when, count, col, avg, udf
from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType

conf = SparkConf().setMaster("yarn").set('spark.sql.warehouse.dir',
                                         '/user/hive/warehouse')
sc = SparkContext(conf=conf)

sqlContext = HiveContext(sc)

print('1. Cleaning Data')
print('1.1 Importing Raw Data')
In_Df = sqlContext.read.csv('/Input/loan.csv', header=True, inferSchema=True)

#Create the database for working with
sqlContext.sql("""create database work_db""")

#Add an ID variable
print('1.1 Creating ID variable')
In_Df = In_Df.withColumn('id', monotonically_increasing_id())
In_Df = In_Df.drop('member_id')  #Drop Member ID because it is Blank

#Drop columns we don't need
print('1.2 Dropping unnecessary columns')
In_Df = In_Df.drop('url')  #A cryptic column with only one value
In_Df = In_Df.drop(
    'desc')  # A Text column, not much can be extracted from this
In_Df = In_Df.drop('zip_code')  #Unrelated to default rates

#Create binary default variable
print('1.3 Creating binary Default variable')
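The snippet is cut off here; a hedged sketch of the step the last print announces, assuming a LendingClub-style loan_status column (the column name and the set of 'bad' statuses are assumptions, not from the original):

bad_statuses = ['Charged Off', 'Default']  # assumed label values
In_Df = In_Df.withColumn('default', when(col('loan_status').isin(bad_statuses), 1).otherwise(0))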
Example #28
        .toDF(["sku","query"]).registerTempTable("tmp_table")

    insert_sql = """
        insert overwrite table {table_name} partition(dt='{dt}')
        select  * from tmp_table
        """.format(table_name=table_name, dt=dt_str)
    print("insert_sql:\n" + insert_sql)
    hiveCtx.sql(insert_sql)


if __name__ == "__main__":
    conf = SparkConf()
    sc = SparkContext(conf=conf, appName="sp_ind")
    sc.setLogLevel("WARN")
    hiveCtx = HiveContext(sc)
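    # shuffle tuning for the insert below: consolidate map-side shuffle files,
    # give the shuffle a larger memory fraction, and use 1000 shuffle partitions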
    hiveCtx.setConf('spark.shuffle.consolidateFiles', 'true')
    hiveCtx.setConf('spark.shuffle.memoryFraction', '0.4')
    hiveCtx.setConf('spark.sql.shuffle.partitions', '1000')
    if len(sys.argv) == 1:
        dt = datetime.datetime.now() + datetime.timedelta(-1)
    else:
        dt = datetime.datetime.strptime(sys.argv[1], "%Y%m%d").date()

    dt_str = dt.strftime("%Y-%m-%d")
    yest_dt = dt + datetime.timedelta(-30)
    yest_str = yest_dt.strftime("%Y-%m-%d")

    hiveCtx.sql("use app")
    create_table(hiveCtx)
    getQuery(hiveCtx)
Example #29
       ,V_DT                    AS ODS_ST_DATE 
       ,A.ALL_COUNT             AS ALL_COUNT 
       ,A.ALL_MONEY             AS OLD_YEAR_BAL_SUM 
       ,CAST(A.ALL_COUNT AS DECIMAL(24,6))            AS OLD_YEAR_BAL 
       ,A.AMOUNT                AS AMOUNT 
       ,A.COUNT                 AS COUNT 
       ,A.LAST_COUNT            AS LAST_COUNT 
       ,B.FR_ID                 AS FR_ID 
   FROM ACRM_F_RE_INSUSUMINFO A                                --保险账户累计表
   LEFT JOIN ADMIN_AUTH_ORG B                                  --机构表
     ON A.ORG_NO                = B.ORG_ID 
  WHERE A.YEAR                  = SUBSTR(V_DT,1,4) - 1 
    AND V_DT                    = CONCAT(SUBSTR(V_DT,1,4),'-01-01') """

sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
ACRM_F_RE_INSUSUMINFO = sqlContext.sql(sql)
ACRM_F_RE_INSUSUMINFO.registerTempTable("ACRM_F_RE_INSUSUMINFO")
dfn = "ACRM_F_RE_INSUSUMINFO/" + V_DT + ".parquet"
ACRM_F_RE_INSUSUMINFO.cache()
nrows = ACRM_F_RE_INSUSUMINFO.count()
ACRM_F_RE_INSUSUMINFO.write.save(path=hdfs + '/' + dfn, mode='append')
ACRM_F_RE_INSUSUMINFO.unpersist()
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert ACRM_F_RE_INSUSUMINFO lines %d"
      % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows))

# Task [12] 001-02::
V_STEP = V_STEP + 1
Example #30
sql = """
 SELECT BRNO                    AS BRNO 
       ,BANKNOTYPE              AS BANKNOTYPE 
       ,STATUS                  AS STATUS 
       ,BANKNO                  AS BANKNO 
       ,OBANKNO                 AS OBANKNO 
       ,EFCTDAT                 AS EFCTDAT 
       ,NOTE1                   AS NOTE1 
       ,NOTE2                   AS NOTE2 
       ,FR_ID                   AS FR_ID 
       ,V_DT                    AS ODS_ST_DATE 
       ,'AFA'                   AS ODS_SYS_ID 
   FROM O_CM_AFA_BRNOBANKMAP A                                 --机构行号对应表
"""

sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
F_CM_AFA_BRNOBANKMAP = sqlContext.sql(sql)
F_CM_AFA_BRNOBANKMAP.registerTempTable("F_CM_AFA_BRNOBANKMAP")
dfn = "F_CM_AFA_BRNOBANKMAP/" + V_DT + ".parquet"
F_CM_AFA_BRNOBANKMAP.cache()
nrows = F_CM_AFA_BRNOBANKMAP.count()
F_CM_AFA_BRNOBANKMAP.write.save(path=hdfs + '/' + dfn, mode='overwrite')
F_CM_AFA_BRNOBANKMAP.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/F_CM_AFA_BRNOBANKMAP/" +
                V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert F_CM_AFA_BRNOBANKMAP lines %d"
      % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows))
Example #31
       ,V_DT                    AS ODS_ST_DATE 
       ,A.FR_ID                 AS FR_ID 
   FROM OCRM_F_CI_CUST_DESC A                                  --统一客户信息
   LEFT JOIN ACRM_F_CI_CUST_CONTRIBUTION B                     --
     ON A.CUST_ID               = B.CUST_ID 
    AND A.FR_ID                 = B.FR_ID 
    AND B.ODS_DATE              = V_DT 
   LEFT JOIN ACRM_F_CI_LOYALTY_INFO C                          --
     ON A.CUST_ID               = C.CUST_ID 
    AND A.FR_ID                 = C.FR_ID 
    AND C.ETL_DATE              = V_DT 
  GROUP BY A.CUST_ID 
       ,A.OBJ_RATING 
       ,A.SUB_RATING 
       ,A.FR_ID """

sql = re.sub(r"\bV_DT\b", "'"+V_DT10+"'", sql)
ACRM_A_ANALYSIS_INFO = sqlContext.sql(sql)
ACRM_A_ANALYSIS_INFO.registerTempTable("ACRM_A_ANALYSIS_INFO")
dfn="ACRM_A_ANALYSIS_INFO/"+V_DT+".parquet"
ACRM_A_ANALYSIS_INFO.cache()
nrows = ACRM_A_ANALYSIS_INFO.count()
ACRM_A_ANALYSIS_INFO.write.save(path=hdfs + '/' + dfn, mode='overwrite')
ACRM_A_ANALYSIS_INFO.unpersist()
OCRM_F_CI_CUST_DESC.unpersist()
ACRM_F_CI_CUST_CONTRIBUTION.unpersist()
ACRM_F_CI_LOYALTY_INFO.unpersist()
ret = os.system("hdfs dfs -rm -r /"+dbname+"/ACRM_A_ANALYSIS_INFO/"+V_DT_LD+".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert ACRM_A_ANALYSIS_INFO lines %d") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et-st).seconds, nrows)
Example #32
import os
import warnings
warnings.filterwarnings('ignore')
import pyspark.sql.functions as F
import pandas as pd
import tensorflow as tf
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

os.environ['SPARK_HOME'] = r"C:\Software\spark\spark-2.4.4-bin-hadoop2.7"
os.environ['PYSPARK_PYTHON'] = r"D:\Python_Tools\Anaconda\python.exe"

sparkConf = SparkConf().setMaster("local[*]").setAppName("ContentRecall").set(
    "spark.executor.memory", "1g")
sc = SparkContext(conf=sparkConf)
sqlContext = HiveContext(sc)

sqlContext.sql("use profile")
user_article_basic = sqlContext.sql(
    "select user_id, article_id, clicked from user_article_basic")
user_profile = sqlContext.sql(
    "select user_id, topic, weights from user_profile_topic")
user_profile = user_profile.groupBy(['user_id']).agg(
    F.collect_list('weights')).withColumnRenamed('collect_list(weights)',
                                                 'user_weights')

train_data = user_article_basic.join(user_profile, on=['user_id'], how='left')

sqlContext.sql("use article")
article_vector = sqlContext.sql("select * from article_vector")

train_data = train_data.join(article_vector, on=['article_id'], how='left')
# train_data.show()
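# at this point each row pairs a (user_id, article_id, clicked) label with the
# user's collected topic weights and the article vector; downstream code (not
# shown in this snippet) would feed these features to the TensorFlow model imported above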
Example #33
       ,A.ISRELATVIE            AS ISRELATVIE 
       ,A.CUSTOMERCLASSIFY      AS CUSTOMERCLASSIFY 
       ,A.CUSTOMERCLASSIFYDATE  AS CUSTOMERCLASSIFYDATE 
       ,A.ENTINDUSTRIALSCALE    AS ENTINDUSTRIALSCALE 
       ,A.ENTRETAILSCALE        AS ENTRETAILSCALE 
       ,A.ORGCREDITCODE         AS ORGCREDITCODE 
       ,A.GROUPTYPE             AS GROUPTYPE 
       ,A.CORPIDASDATE          AS CORPIDASDATE 
       ,A.FR_ID                 AS FR_ID 
       ,V_DT                    AS ODS_ST_DATE 
       ,'LNA'                   AS ODS_SYS_ID 
   FROM O_CI_XDXT_ENT_INFO A                                   --企业基本信息
"""

sql = re.sub(r"\bV_DT\b", "'"+V_DT10+"'", sql)
F_CI_XDXT_ENT_INFO_INNTMP1 = sqlContext.sql(sql)
F_CI_XDXT_ENT_INFO_INNTMP1.registerTempTable("F_CI_XDXT_ENT_INFO_INNTMP1")

#F_CI_XDXT_ENT_INFO = sqlContext.read.parquet(hdfs+'/F_CI_XDXT_ENT_INFO/*')
#F_CI_XDXT_ENT_INFO.registerTempTable("F_CI_XDXT_ENT_INFO")
sql = """
 SELECT DST.CUSTOMERID                                          --客户编号:src.CUSTOMERID
       ,DST.CORPID                                             --法人或组织机构代码:src.CORPID
       ,DST.ENTERPRISENAME                                     --企业名称:src.ENTERPRISENAME
       ,DST.ENGLISHNAME                                        --客户英文名称:src.ENGLISHNAME
       ,DST.FICTITIOUSPERSON                                   --法人代表:src.FICTITIOUSPERSON
       ,DST.ORGNATURE                                          --机构类型:src.ORGNATURE
       ,DST.FINANCETYPE                                        --金融机构类型:src.FINANCETYPE
       ,DST.ENTERPRISEBELONG                                   --企业隶属:src.ENTERPRISEBELONG
       ,DST.INDUSTRYTYPE                                       --行业类型:src.INDUSTRYTYPE
       ,DST.INDUSTRYTYPE1                                      --行业类型1:src.INDUSTRYTYPE1
Example #34
    AND B.INDEX_VALUE < C.INDEX_VALUE THEN 0 ELSE 1 END                     AS INDEX_VALUE 
       ,SUBSTR(V_DT, 1, 7)                       AS YEAR_MONTH 
       ,V_DT                    AS ETL_DATE 
       ,'1'                     AS CUST_TYPE 
       ,A.FR_ID                 AS FR_ID 
   FROM ACRM_A_TARGET_D004006 A                                --2个月前持有产品数
  INNER JOIN ACRM_A_TARGET_D004007 B                           --1个月前持有产品数
     ON A.CUST_ID               = B.CUST_ID 
    AND A.FR_ID                 = B.FR_ID 
  INNER JOIN ACRM_A_TARGET_D004008 C                           --当月持有产品数
     ON A.CUST_ID               = C.CUST_ID 
    AND A.FR_ID                 = C.FR_ID 
  WHERE A.CUST_TYPE             = '1' """

sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
ACRM_A_TARGET_D004029 = sqlContext.sql(sql)
ACRM_A_TARGET_D004029.registerTempTable("ACRM_A_TARGET_D004029")
dfn = "ACRM_A_TARGET_D004029/" + V_DT + ".parquet"
ACRM_A_TARGET_D004029.cache()
nrows = ACRM_A_TARGET_D004029.count()
ACRM_A_TARGET_D004029.write.save(path=hdfs + '/' + dfn, mode='overwrite')
ACRM_A_TARGET_D004029.unpersist()
ACRM_A_TARGET_D004006.unpersist()
ACRM_A_TARGET_D004007.unpersist()
ACRM_A_TARGET_D004008.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/ACRM_A_TARGET_D004029/" +
                V_DT_LD + ".parquet")
et = datetime.now()
print(
    "Step %d start[%s] end[%s] use %d seconds, insert ACRM_A_TARGET_D004029 lines %d"
) % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"),
Example #35
       ,A.FORESIGHT             AS FORESIGHT 
       ,A.STATUS                AS STATUS 
       ,A.INDUSTRY              AS INDUSTRY 
       ,A.PROSECUTION           AS PROSECUTION 
       ,A.FIRSTINVESTSUM        AS FIRSTINVESTSUM 
       ,A.FIRSTINVESTDATE       AS FIRSTINVESTDATE 
       ,A.LASTINVESTSUM         AS LASTINVESTSUM 
       ,A.LASTINVESTDATE        AS LASTINVESTDATE 
       ,A.DEADLINE              AS DEADLINE 
       ,A.FR_ID                 AS FR_ID 
       ,V_DT                    AS ODS_ST_DATE 
       ,'LNA'                   AS ODS_SYS_ID 
   FROM O_CI_XDXT_CUSTOMER_RELATIVE A                          --客户关联信息
"""

sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
F_CI_XDXT_CUSTOMER_RELATIVE = sqlContext.sql(sql)
F_CI_XDXT_CUSTOMER_RELATIVE.registerTempTable("F_CI_XDXT_CUSTOMER_RELATIVE")
dfn = "F_CI_XDXT_CUSTOMER_RELATIVE/" + V_DT + ".parquet"
F_CI_XDXT_CUSTOMER_RELATIVE.cache()
nrows = F_CI_XDXT_CUSTOMER_RELATIVE.count()
F_CI_XDXT_CUSTOMER_RELATIVE.write.save(path=hdfs + '/' + dfn, mode='overwrite')
F_CI_XDXT_CUSTOMER_RELATIVE.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname +
                "/F_CI_XDXT_CUSTOMER_RELATIVE/" + V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert F_CI_XDXT_CUSTOMER_RELATIVE lines %d"
      % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows))
Example #36
		,ORG_ID
		,CUST_ID
		,CUST_ZH_NAME
		,CUS_CONTRIBUTION_RMB
		,CRE_CONTRIBUTION_RMB
		,DEP_CONTRIBUTION_RMB
		,DEP_AMOUNT_RMB
		,CRE_AMOUNT_RMB
		,MAIN_TYPE
		,REPORT_DATE
		,CUST_TYP
		,MGR_ID
		,MGR_NAME
		,M_MAIN_TYPE
   FROM MCRM_COR_CUS_CON_ORG_REPORT_BE A                       --
"""
sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
MCRM_COR_CUS_CON_ORG_REPORT = sqlContext.sql(sql)
MCRM_COR_CUS_CON_ORG_REPORT.registerTempTable("MCRM_COR_CUS_CON_ORG_REPORT")
dfn = "MCRM_COR_CUS_CON_ORG_REPORT/" + V_DT + ".parquet"
MCRM_COR_CUS_CON_ORG_REPORT.cache()
nrows = MCRM_COR_CUS_CON_ORG_REPORT.count()
MCRM_COR_CUS_CON_ORG_REPORT.write.save(path=hdfs + '/' + dfn, mode='overwrite')
MCRM_COR_CUS_CON_ORG_REPORT.unpersist()
MCRM_COR_CUS_CON_ORG_REPORT_BE.unpersist()
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert MCRM_COR_CUS_CON_ORG_REPORT lines %d"
      % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows))
Example #37
#execfile('/data/w205Project/spark/getLinks.py') <-- don't use. use spark-submit instead.

from pyspark import SparkContext
from pyspark.sql import HiveContext
sc = SparkContext()
sqlContext = HiveContext(sc)


from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window

sqlContext.sql("ADD JAR /data/w205Project/load/hive-serdes-1.0-SNAPSHOT.jar");
# sqlContext.sql("ADD JAR /usr/lib/hadoop/hadoop-aws.jar");
# sqlContext.sql("ADD JAR /usr/lib/hadoop/lib/aws-java-sdk-1.7.14.jar");

###############################################
#    EXTRACT ALL THE LINKS INDISCRIMINATELY   #
###############################################

'''
links = sqlContext.sql("select entities.urls.url[0] as tco, entities.urls.expanded_url[0] as link from tweets where entities.urls.url[0] IS NOT NULL");
uniqueLInks = links.dropDuplicates(['tco', 'link'])
uniqueLInks.repartition(1).save("s3n://w205twitterproject/links5","json")
'''


###############################################
#                  ANALYZE                    #
###############################################
Example #38
       ,A.ODS_ST_DATE           AS ODS_ST_DATE 
       ,A.ODS_SYS_ID            AS ODS_SYS_ID 
   FROM F_DP_CBOD_LNLNSLNS A                                   --
  INNER JOIN(
         SELECT MAX(ETLDT)                       AS ETLDT 
               ,LN_LN_ACCT_NO 
               ,FR_ID 
           FROM F_DP_CBOD_LNLNSLNS 
          GROUP BY LN_LN_ACCT_NO 
               ,FR_ID) B                                      --
     ON A.LN_LN_ACCT_NO         = B.LN_LN_ACCT_NO 
    AND A.FR_ID                 = B.FR_ID 
    AND A.ETLDT                 = B.ETLDT """

sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
MID_DP_CBOD_LNLNSLNS = sqlContext.sql(sql)
MID_DP_CBOD_LNLNSLNS.registerTempTable("MID_DP_CBOD_LNLNSLNS")
dfn = "MID_DP_CBOD_LNLNSLNS/" + V_DT + ".parquet"
MID_DP_CBOD_LNLNSLNS.cache()
nrows = MID_DP_CBOD_LNLNSLNS.count()
MID_DP_CBOD_LNLNSLNS.write.save(path=hdfs + '/' + dfn, mode='overwrite')
MID_DP_CBOD_LNLNSLNS.unpersist()
# full-refresh table: after saving, the previous day's data must be removed
ret = os.system("hdfs dfs -rm -r /" + dbname + "/MID_DP_CBOD_LNLNSLNS/" +
                V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert MID_DP_CBOD_LNLNSLNS lines %d"
      % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows))
# Task [21] 001-01::
V_STEP = V_STEP + 1

sql = """
	select 
			A.ODS_CUST_TYPE 	as CUST_TYPE,
			A.ODS_CUST_ID 		as CUST_ID,
			A.FR_ID 			as FR_ID,
			CAST(COUNT(1) AS DECIMAL(22,2)) 	as INDEX_VALUE_COUNT,
			CAST(YEAR(V_DT)-YEAR(MIN(A.BEGIN_DATE)) AS DECIMAL(22,2)) as INDEX_VALUE_YEAR,
			V_YEAR_MONTH 						as YEAR_MONTH,
			V_DT 								as ETL_DATE,
			MIN(BEGIN_DATE) 					as BEGIN_DATE
		FROM OCRM_F_CI_SYS_RESOURCE A 
		  WHERE (A.ODS_SYS_ID = 'IBK'
						OR A.ODS_SYS_ID = 'MBK' 
						OR A.ODS_SYS_ID = 'MSG' 
						OR A.ODS_SYS_ID = 'APY' 
						) AND COALESCE(A.END_DATE,'2099-12-31') > V_DT
				GROUP BY A.ODS_CUST_ID,A.ODS_CUST_TYPE,A.FR_ID
	   """

sql = re.sub(r"\bV_DT\b", "'"+V_DT10+"'", sql)
sql = re.sub(r"\bV_YEAR_MONTH\b", "'"+V_YEAR_MONTH+"'", sql)
OCRM_A_RESOURCE_EBANK_TARGET = sqlContext.sql(sql)
dfn="OCRM_A_RESOURCE_EBANK_TARGET/"+V_DT+".parquet"
OCRM_A_RESOURCE_EBANK_TARGET.write.save(path=hdfs + '/' + dfn, mode='overwrite')
ret = os.system("hdfs dfs -rm -r /"+dbname+"/OCRM_A_RESOURCE_EBANK_TARGET/"+V_DT_LD+".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds") % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et-st).seconds)
       ,A.RELATIVEGROUPSERIALNO AS RELATIVEGROUPSERIALNO 
       ,A.LIMITATIONTERM        AS LIMITATIONTERM 
       ,A.MIGRATEFLAG           AS MIGRATEFLAG 
       ,A.PUTOUTORGID           AS PUTOUTORGID 
       ,A.ISINSURANCE           AS ISINSURANCE 
       ,A.ACTUALBAILSUM         AS ACTUALBAILSUM 
       ,A.BAILEXCHANGERATE      AS BAILEXCHANGERATE 
       ,A.FR_ID                 AS FR_ID 
       ,V_DT                    AS ODS_ST_DATE 
       ,'LNA'                   AS ODS_SYS_ID 
       ,A.SUBBILLTYPE           AS SUBBILLTYPE 
   FROM O_LN_XDXT_BUSINESS_CONTRACT A                          --业务合同信息
"""

sql = re.sub(r"\bV_DT\b", "'"+V_DT10+"'", sql)
F_LN_XDXT_BUSINESS_CONTRACT_INNTMP1 = sqlContext.sql(sql)
F_LN_XDXT_BUSINESS_CONTRACT_INNTMP1.registerTempTable("F_LN_XDXT_BUSINESS_CONTRACT_INNTMP1")

#F_LN_XDXT_BUSINESS_CONTRACT = sqlContext.read.parquet(hdfs+'/F_LN_XDXT_BUSINESS_CONTRACT/*')
#F_LN_XDXT_BUSINESS_CONTRACT.registerTempTable("F_LN_XDXT_BUSINESS_CONTRACT")
sql = """
 SELECT DST.SERIALNO                                            --流水号:src.SERIALNO
       ,DST.RELATIVESERIALNO                                   --相关申请流水号:src.RELATIVESERIALNO
       ,DST.ARTIFICIALNO                                       --人工编号:src.ARTIFICIALNO
       ,DST.OCCURDATE                                          --发生日期:src.OCCURDATE
       ,DST.CUSTOMERID                                         --客户编号:src.CUSTOMERID
       ,DST.CUSTOMERNAME                                       --客户名称:src.CUSTOMERNAME
       ,DST.BUSINESSTYPE                                       --业务品种:src.BUSINESSTYPE
       ,DST.OCCURTYPE                                          --发生类型:src.OCCURTYPE
       ,DST.CREDITCYCLE                                        --额度是否循环:src.CREDITCYCLE
       ,DST.CREDITAGGREEMENT                                   --使用授信协议号:src.CREDITAGGREEMENT
Example #41
       ,CI_CER_NO_1             AS CI_CER_NO_1 
       ,CI_FULL_NAM             AS CI_FULL_NAM 
       ,CI_NEW_CUST_NO          AS CI_NEW_CUST_NO 
       ,CI_CANCEL_FLG           AS CI_CANCEL_FLG 
       ,CI_CUST_TYP             AS CI_CUST_TYP 
       ,CI_CRT_SYS              AS CI_CRT_SYS 
       ,CI_CRT_SCT_N            AS CI_CRT_SCT_N 
       ,CI_CRT_ORG              AS CI_CRT_ORG 
       ,CI_CRT_OPR              AS CI_CRT_OPR 
       ,CI_UPD_SYS              AS CI_UPD_SYS 
       ,CI_UPD_ORG              AS CI_UPD_ORG 
       ,FR_ID                   AS FR_ID 
       ,V_DT                    AS ODS_ST_DATE 
       ,'CEN'                   AS ODS_SYS_ID 
   FROM O_CI_CBOD_CICIFUNN A                                   --对私合并档
"""

sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
F_CI_CBOD_CICIFUNN = sqlContext.sql(sql)
F_CI_CBOD_CICIFUNN.registerTempTable("F_CI_CBOD_CICIFUNN")
dfn = "F_CI_CBOD_CICIFUNN/" + V_DT + ".parquet"
F_CI_CBOD_CICIFUNN.cache()
nrows = F_CI_CBOD_CICIFUNN.count()
F_CI_CBOD_CICIFUNN.write.save(path=hdfs + '/' + dfn, mode='overwrite')
F_CI_CBOD_CICIFUNN.unpersist()
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert F_CI_CBOD_CICIFUNN lines %d"
      % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows))
Example #42
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SparkConf
from pyspark.sql import SQLContext, HiveContext
from pyspark import SparkContext

from eod import msft,candidate,download,merge,summary,spx

def run(sc, sql_context, is_hive):
    msft.main(sc, sql_context, is_hive)
    candidate.main(sc, sql_context, is_hive)
    download.main(sc, sql_context, is_hive)
    merge.main(sc, sql_context, is_hive)
    spx.main(sc, sql_context, is_hive)
    print "------------------summary--------------------"
    summary.main(sc, sql_context, is_hive)

if __name__ == '__main__':
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores",     "4")
    conf.set("spark.executor.memory",    "32g")

    sc = SparkContext(appName="bintrade_candidate.eod.eod_run", master="yarn-client", conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "32")

    sqlContext.sql("use fex")

    run(sc, sqlContext, is_hive=True)
Example #43
# Task [21] 001-01::
V_STEP = V_STEP + 1

sql = """
 SELECT CUST_ID                 AS CUST_ID 
       ,PRODUCT_ID              AS PRODUCT_ID 
       ,'DK'                    AS TYPE 
       ,FR_ID                   AS FR_ID 
   FROM ACRM_F_CI_ASSET_BUSI_PROTO A                           --资产协议表
  WHERE BAL > 0 
  GROUP BY CUST_ID 
       ,PRODUCT_ID 
       ,FR_ID """

sql = re.sub(r"\bV_DT\b", "'" + V_DT10 + "'", sql)
ACRM_A_CI_CUST_PROD = sqlContext.sql(sql)
ACRM_A_CI_CUST_PROD.registerTempTable("ACRM_A_CI_CUST_PROD")
dfn = "ACRM_A_CI_CUST_PROD/" + V_DT + ".parquet"
ACRM_A_CI_CUST_PROD.cache()
nrows = ACRM_A_CI_CUST_PROD.count()
ACRM_A_CI_CUST_PROD.write.save(path=hdfs + '/' + dfn, mode='overwrite')
ACRM_A_CI_CUST_PROD.unpersist()
ret = os.system("hdfs dfs -rm -r /" + dbname + "/ACRM_A_CI_CUST_PROD/" +
                V_DT_LD + ".parquet")
et = datetime.now()
print("Step %d start[%s] end[%s] use %d seconds, insert ACRM_A_CI_CUST_PROD lines %d"
      % (V_STEP, st.strftime("%H:%M:%S"), et.strftime("%H:%M:%S"), (et - st).seconds, nrows))

# Task [11] 001-02::
import sys
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, SQLContext

if __name__ == '__main__':
    conf = SparkConf().setAppName("Plotly Exports")
    sc = SparkContext(conf=conf)
    hive_context = HiveContext(sc)
    print '=== Creating Database ==='
    hive_context.sql('CREATE DATABASE PLOTLY')
    hive_context.sql('USE PLOTLY')

    print '=== Creating Table ==='
    hive_context.sql("CREATE TABLE ALCOHOL_CONSUMPTION_BY_COUNTRY_2010 "
                     "(LOCATION STRING, ALCOHOL FLOAT) ROW FORMAT "
                     "DELIMITED FIELDS TERMINATED BY ',' "
                     "TBLPROPERTIES (\"skip.header.line.count\"=\"1\")")
    print "=== loading data into table ==="
    hive_context.sql("LOAD DATA LOCAL INPATH "
                     "'/plotly_datasets/2010_alcohol_consumption_by_country.csv' "
                     "OVERWRITE INTO TABLE ALCOHOL_CONSUMPTION_BY_COUNTRY_2010")
    sys.exit()
Example #45
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext

conf = SparkConf().setAppName("revenueByDaySQL")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)
hiveContext.sql("use retail_db")
sqlString = "SELECT 	orders.order_date, \
			ROUND(SUM(order_items.order_item_subtotal), 2) the_sum, \
			COUNT(DISTINCT orders.order_id) the_count\
		FROM orders, order_items \
		WHERE orders.order_id = order_items.order_item_order_id \
		GROUP BY orders.order_date \
		ORDER BY the_sum"

joinded_aggregate_data = hiveContext.sql(sqlString)

print str(joinded_aggregate_data.take(5))












Example #46
# author [email protected]
import os
import sys

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SparkConf
from pyspark.sql import SQLContext, HiveContext
from pyspark import SparkContext

from ml import  diff_feature_reg,diff_train

def run(sc, sql_context, is_hive):
    diff_feature_reg.main(sc, sql_context, is_hive = True)
    diff_train.main(sc, sql_context, is_hive = True)

if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "32g")

    sc = SparkContext(appName="bintrade_candidate", master="yarn-client", conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "32")

    sqlContext.sql("use fex")

    run(sc, sqlContext, is_hive=True)
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext

conf = SparkConf().setAppName("most expensive product using SQL")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)

sqlString = 	"SELECT p.product_name, p.product_price \
		FROM retail_db.products p \
		JOIN 	(SELECT max(products.product_price) max_id \
			FROM retail_db.products) the_max \
			ON \
			p.product_price = the_max.max_id"

result = hiveContext.sql(sqlString)

print("***********************\n{0}".format(str(result.take(1))))