class SparkConnection: def __init__(self, ip="local[*]", dir_root="", folds=5): self._spark_ip = ip # "192.168.240.10" if "local[*]" == ip: self._spark_url = ip if dir_root: self._dir_root = dir_root else: current_file = os.path.dirname(os.path.abspath(__file__)) self._dir_root = os.path.dirname(current_file) else: self._spark_url = "spark://" + self._spark_ip + ":7077" self._dir_root = "hdfs://" + self._spark_ip self._folds = folds conf = SparkConf() # conf.set("spark.executor.memory", "8g") conf.set("spark.network.timeout", "2000") conf.set("spark.sql.broadcastTimeout", "300000") conf.set("spark.driver.memory", "6g") # conf.set("spark.sql.shuffle.partitions", "400") # conf.set("spark.yarn.executor.memoryOverhead", "256m") # conf.set("spark.dynamicAllocation.enabled","true") # conf.set("spark.shuffle.service.enabled", "true") # conf.set("spark.local.dir", "/yelp-dataset/spark-tmp") # conf.set("spark.driver.maxResultSize","10g") # sc = SparkContext("local[*]", "Simple App", conf=conf) # sc.setCheckpointDir('/tmp') # os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.6' os.environ['PYSPARK_PYTHON'] = 'C:/Python36/python.exe' conf.setMaster(self._spark_url) self.sc = SparkContext(conf=conf) self.sc.setLogLevel("ERROR") self.sql_sc = SQLContext(self.sc) def read_json(self, path): return self.sql_sc.read.json(os.path.join(self._dir_root, path)) def read_parquet(self, path): return self.sql_sc.read.parquet(os.path.join(self._dir_root, path)) def write_parquet(self, df, path, overwrite=True): if overwrite: df.write.mode("overwrite").parquet(os.path.join(self._dir_root, path)) else: df.write.parquet(os.path.join(self._dir_root, path)) def clear(self): self.sql_sc.clearCache() def read_csv(self, path): return self.sql_sc.read.csv(os.path.join(self._dir_root, path), header=True, multiLine=True, escape="\"", quote="\"") def write_csv(self, df, path, overwrite=True): if overwrite: df.write.mode("overwrite").csv(os.path.join(self._dir_root, path), header=True) else: df.write.csv(os.path.join(self._dir_root, path), header=True)
class Credit: def __init__(self): self.conf = (SparkConf() .setAppName("CREDIT") .set("spark.cores.max", "2") .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar')) self.sc = SparkContext(conf=self.conf) self.sqlctx = SQLContext(self.sc) self.mysql_helper = MySQLHelper('core', host='10.9.29.212') self.base = 'hdfs://master:9000/gmc/' def load_from_mysql(self, table, database='core'): url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table, driver="com.mysql.jdbc.Driver").load() return df def sql_operate(self, sql, rdd, once_size=1000): temp = [] for row in rdd.collect(): # print(row) if len(temp) >= once_size: self.mysql_helper.executemany(sql, temp) temp.clear() temp.append(row) if len(temp) != 0: self.mysql_helper.executemany(sql, temp) temp.clear() def prepare_fpgrowth_data(self): tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').filter("BILL_AMTFLAG = '+'").select('ACCTNBR', 'MER_CAT_CD') \ .filter("MER_CAT_CD != 0").filter("MER_CAT_CD != 6013") result = tran_df.map(lambda x: (str(int(x['ACCTNBR'])), [str(int(x['MER_CAT_CD'])), ])).groupByKey() def m(x): k = x[0] l = list(x[1]) v = set() for i in l: v.add(i[0]) return set(v) result = result.map(m) for i in result.take(10): print(i) model = FPGrowth.train(result, minSupport=0.05, numPartitions=10) result = model.freqItemsets().collect() for r in result: print(r) def cycle_credit(self): ''' 信用卡聚类数据预处理 :return: ''' print('---------------------------信用卡-Start--------------------------') # 交易流水 credit_tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').select('ACCTNBR', 'MONTH_NBR', 'BILL_AMT', 'BILL_AMTFLAG').filter( "BILL_AMTFLAG ='-'").cache() # 卡账户信息 credit_acct_df = self.load_from_mysql('ACCT_D').select('ACCTNBR', 'MONTH_NBR', 'STM_MINDUE') # 还款计算 return_amt = credit_tran_df.groupBy('ACCTNBR', 'MONTH_NBR').sum('BILL_AMT') return_amt = return_amt.select('ACCTNBR', 'MONTH_NBR', return_amt['sum(BILL_AMT)'].alias('RETURNED')) # 去除0最低还款额,即未消费的账单月 join = credit_acct_df.join(return_amt, ['ACCTNBR', 'MONTH_NBR'], 'outer').filter('STM_MINDUE != 0') # 清除缓存 self.sqlctx.clearCache() def which_cycle_type(line): mindue = line['STM_MINDUE'] returned = line['RETURNED'] ''' 0:normal,all returned 1:cycle credit 2:overdue,don't return money ''' if mindue is not None and returned is None: flag = 2 elif returned >= mindue * 10: flag = 0 elif returned > mindue and returned < mindue * 10: flag = 1 else: flag = 9 return Row(ACCTNBR=int(line['ACCTNBR']), MONTH_NBR=line['MONTH_NBR'], DUE_FLAG=flag, STM_MINDUE=line['STM_MINDUE']) # 返回为PipelinedRDD join = join.map(which_cycle_type) # 转为DataFrame join = self.sqlctx.createDataFrame(join) ''' +-------+--------+-----+ | ACCTNBR | DUE_FLAG | count | +-------+--------+-----+ | 608126 | 2 | 1 | | 608126 | 0 | 6 | | 608868 | 0 | 4 | ''' # 按还款类型分类 each_type = join.groupBy(['ACCTNBR', 'DUE_FLAG']) # 计算每种还款情况数量 each_type_count = each_type.count() # 计算每种还款情况的最低还款额之和 each_type_mindue_sum = each_type.sum('STM_MINDUE') # 计算还款情况总数 all_type_count = each_type_count.groupBy('ACCTNBR').sum('count') # join 上述三表 rate = each_type_count.join(each_type_mindue_sum, ['ACCTNBR', 'DUE_FLAG'], 'outer').join(all_type_count, 'ACCTNBR', 'outer') # print(rate.columns) # ['ACCTNBR', 'DUE_FLAG', 'count', 'sum(STM_MINDUE)', 'sum(count)'] # 筛选出循环信用的数据 # TODO 暂时只取了循环信用的 rate = rate.filter(rate['DUE_FLAG'] == 1) # 计算进入循环信用的比例 rate = rate.select('ACCTNBR', (rate['sum(STM_MINDUE)'] * 10).alias('CYCLE_AMT'), rate['count'].alias('CYCLE_TIMES'), (rate['count'] / rate['sum(count)']).alias('CYCLE_RATE')) # rate.show() # print(rate.count()) def m(line): return line['CYCLE_TIMES'], line['CYCLE_AMT'], line['CYCLE_RATE'], line['ACCTNBR'] sql = "update t_CMMS_TEMP_KMEANS_CREDIT set CYCLE_TIMES=%s,CYCLE_AMT=%s,CYCLE_RATE=%s where ACCTNBR=%s" df = rate.map(m) print('将数据更新至数据库...') self.sql_operate(sql, df) # 将未进入循环的 设为0 print('将未进入循环的 设为0...') self.mysql_helper.execute( "update t_CMMS_TEMP_KMEANS_CREDIT set CYCLE_TIMES=0,CYCLE_AMT=0,CYCLE_RATE=0 where CYCLE_TIMES is null ") def losing_warn(self,year,month): # #计算月份 # if season == 1: # months_now = [1,2,3] # months_before = [10,11,12] # year_before = year - 1 # else: # months_now = [season * 3 - 2, season * 3 - 1, season * 3] # months_before = [season * 3 - 5, season * 3 - 4, season * 3-3] # year_before = year # # # # # 抽取每个季度数据 # # for m in months_now: # 最近一个月 month = '%02d' % month for i in range(2,29): day = '%02d' % i date = str(year) + month + day print(date) sql = "select MONTH_NBR from t_CMMS_CREDIT_TRAN where INP_DATE = %s limit 1" try: month_nbr = self.mysql_helper.fetchone(sql,(date,)) if month_nbr is None: continue else: month_nbr = int(month_nbr[0]) break except Exception: continue if month_nbr is None: raise Exception("There is no data in database for month:%s" % month) else: print('the latest month_nbr is %s' % month_nbr) months_now = [month_nbr-2,month_nbr-1,month_nbr] months_before = [month_nbr-5,month_nbr-4,month_nbr-3] print('months_now',months_now) print('months_before',months_before) # 交易流水 credit_tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').select('ACCTNBR', 'INP_DATE', 'MONTH_NBR', 'BILL_AMT', 'BILL_AMTFLAG').filter("BILL_AMTFLAG ='+'").cache() # 筛选出两个季度对应的流水 months_now_filter = None months_before_filter = None for i in months_now: f = credit_tran_df.filter(credit_tran_df['MONTH_NBR'] == i) if months_now_filter is None: months_now_filter = f else: months_now_filter = months_now_filter.unionAll(f) for i in months_before: f = credit_tran_df.filter(credit_tran_df['MONTH_NBR'] == i) if months_before_filter is None: months_before_filter = f else: months_before_filter = months_before_filter.unionAll(f) months_now_filter.groupBy('MONTH_NBR').count().show() months_before_filter.groupBy('MONTH_NBR').count().show() months_now_count = months_now_filter.groupBy('ACCTNBR').count() months_before_count = months_before_filter.groupBy('ACCTNBR').count() months_now_count.show() months_before_count.show() join = months_now_count.select('ACCTNBR',months_now_count['count'].alias('NOW_COUNT')).join( months_before_count.select('ACCTNBR',months_before_count['count'].alias('BEFORE_COUNT')),'ACCTNBR','outer' ) join.show() def m(line): ncount = line['NOW_COUNT'] bcount = line['BEFORE_COUNT'] ''' 计算增长率 9999:两季度均无数据 8888:仅第一季度有数据,流失客户 7777:仅第二季度有数据,新增客户 其他数字:第二个季度较第一季度增长率 (s2-s1)/s1*100 ''' if ncount is None: if bcount is None:# n none,b none pass else:# n none, b not none increment = -9999 else: if bcount is None:# n not none, b none increment = 8888 else:# n not none,b not none increment = round((ncount-bcount)/bcount*100) ''' 计算信用卡生命周期(以增长率计算) 100+ 快速增长 50-100 增长 -50-50 稳定 -50- 衰退 9999 流失 ''' if increment > 100 and increment != -9999: life = 1 # fast growing elif increment <=100 and increment >50: life = 2 # growing elif increment <= 50 and increment > -50: life = 3 # stable elif increment <= -50: life = 4 # losing else: life = 9 # no more tran completely lost return line['ACCTNBR'],month_nbr,increment,life sql = "replace into t_CMMS_ANALYSE_CREDIT(ACCTNBR, MONTH_NBR, INCREMENT,LIFE, UPDATE_TIME) values(%s,%s,%s,%s,now())" rdd = join.map(m) print(type(rdd)) self.sql_operate(sql,rdd)
class DataHandler: def __init__(self): self.conf = (SparkConf() .setAppName("BandCard") .set("spark.cores.max", "2") .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar')) self.sc = SparkContext(conf=self.conf) self.sqlctx = SQLContext(self.sc) self.mysql_helper = MySQLHelper('core', host='10.9.29.212') def load_from_mysql(self, table, database='core'): url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table, driver="com.mysql.jdbc.Driver").load() return df def prepare_life_cycle(self, year, season): ''' 准备生命周期数据 从t_CMMS_ASSLIB_ASSET中获取每日AUM数据 prepare data saum1 (last season sum aum) saum2 (current season sum aum) aum_now account_age (months) last_tr_date (days) :param year: :param season: 1,2,3,4 :return: ''' # 计算月份 print('----------------------生命周期-Start----------------------') print('开始准备生命周期数据...') print('开始计算月份') if season == 1: # date1 当前季度月份 date1 = [str(year) + '-01', str(year) + '-02', str(year) + '-03'] # date2 上一季月份 date2 = [str(year - 1) + '-10', str(year - 1) + '-11', str(year - 1) + '-12'] elif season == 4: date1 = [str(year) + '-10', str(year) + '-11', str(year) + '-12'] date2 = [str(year) + '-07', str(year) + '-08', str(year) + '-9'] else: date1 = [str(year) + '-0' + str(3 * season - 2), str(year) + '-0' + str(3 * season - 1), str(year) + '-0' + str(3 * season)] date2 = [str(year) + '-0' + str(3 * season - 5), str(year) + '-0' + str(3 * season - 4), str(year) + '-0' + str(3 * season - 3)] print('当前季度月份 new:', date1) print('上一季度月份 old:', date2) # 加载AUM表 aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').cache() # 拼接每季度三个月断数据 season_new = aum.filter(aum.STAT_DAT == date1[0]).unionAll(aum.filter(aum.STAT_DAT == date1[1])).unionAll( aum.filter(aum.STAT_DAT == date1[2])) season_old = aum.filter(aum.STAT_DAT == date2[0]).unionAll(aum.filter(aum.STAT_DAT == date2[1])).unionAll( aum.filter(aum.STAT_DAT == date2[2])) # 计算每季度AUM aum_season_old = season_old.select('CUST_NO', season_old.AUM.alias('AUM1')).groupBy('CUST_NO').sum('AUM1') aum_season_new = season_new.select('CUST_NO', season_new.AUM.alias('AUM2')).groupBy('CUST_NO').sum('AUM2') # 两个季度进行外联接 ''' +-----------+---------+---------+ | CUST_NO|sum(AUM2)|sum(AUM1)| +-----------+---------+---------+ |81005329523| null|294844.59| |81011793167| null| 365.20| |81015319088| null| 9640.96| +-----------+---------+---------+ ''' union_season = aum_season_old.join(aum_season_new, 'CUST_NO', 'outer') # 筛选当前AUM temp_result = aum.select('CUST_NO', 'AUM', 'STAT_DAT').groupBy('CUST_NO', 'STAT_DAT').sum('AUM').sort( 'CUST_NO').sort(aum.STAT_DAT.desc()) temp_result.select('CUST_NO', temp_result['sum(AUM)'].alias('AUM'), 'STAT_DAT').registerTempTable('group_in') aum_now_sql = "select CUST_NO,first(AUM) as AUM_NOW from group_in group by CUST_NO" aum_now = self.sqlctx.sql(aum_now_sql) # 清除缓存表 self.sqlctx.dropTempTable('group_in') # 联合 union_season_aumnow = union_season.join(aum_now, 'CUST_NO', 'outer') # 计算用户开户至今时间(months) # 载入账户表 account = self.load_from_mysql('t_CMMS_ACCOUNT_LIST').cache() account.select('CUST_NO', 'OPEN_DAT').registerTempTable('account') account_age_aql = "select CUST_NO, first(ACCOUNT_AGE) as ACCOUNT_AGE from " \ "(select CUST_NO, round(datediff(now(), OPEN_DAT) / 30) as ACCOUNT_AGE " \ "from account order by CUST_NO, ACCOUNT_AGE desc ) as t group by CUST_NO" account_age = self.sqlctx.sql(account_age_aql) # calculate last tran date account_1 = account.select('CUST_NO', 'ACC_NO15') detail = self.load_from_mysql('t_CMMS_ACCOUNT_DETAIL').select('ACC_NO15', 'TRAN_DAT') a_d = account_1.join(detail, 'ACC_NO15', 'outer') a_d.filter(a_d.CUST_NO != '').registerTempTable('adtable') last_tr_date_sql = "select CUST_NO,first(TRAN_DAT) as LAST_TR_DATE from (select CUST_NO,TRAN_DAT from adtable order by TRAN_DAT desc) as t group by CUST_NO" last_tr_date = self.sqlctx.sql(last_tr_date_sql) # 联合 season aum_now account_age last_tr_date unions = union_season_aumnow.join(account_age, 'CUST_NO', 'outer').join(last_tr_date, 'CUST_NO', 'outer') # 清除缓存表 self.sqlctx.dropTempTable('account') self.sqlctx.dropTempTable('adtable') self.sqlctx.clearCache() # 结果插入表 print('结果插入临时表:t_CMMS_TEMP_LIFECYCLE...') insert_lifecycle_sql = "replace into t_CMMS_TEMP_LIFECYCLE(CUST_NO,SAUM1,SAUM2,INCREASE,ACCOUNT_AGE,AUM_NOW,LAST_TR_DATE) values(%s,%s,%s,%s,%s,%s,%s)" # 缓冲区 temp = [] for row in unions.collect(): row_dic = row.asDict() if len(temp) >= 1000: # 批量写入数据库 self.mysql_helper.executemany(insert_lifecycle_sql, temp) temp.clear() # 加载数据到缓冲区 try: # 计算增长率 increase = (row_dic['sum(AUM2)'] - row_dic['sum(AUM1)']) / row_dic['sum(AUM1)'] except Exception: increase = 0 # 计算开户时长(月份数) 若无则视为6个月以上 if row_dic['ACCOUNT_AGE'] is None: row_dic['ACCOUNT_AGE'] = 7 # 最后交易日期 ltd = row_dic['LAST_TR_DATE'] if ltd is not None: try: ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d') except Exception: ltd = ltd[:4] + '-' + ltd[4:6] + '-' + ltd[6:] ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d') days = (datetime.datetime.now() - ltd).days else: days = 366 temp.append((row_dic['CUST_NO'], row_dic['sum(AUM1)'], row_dic['sum(AUM2)'], increase, row_dic['ACCOUNT_AGE'], row_dic['AUM_NOW'], days)) if len(temp) != 0: self.mysql_helper.executemany(insert_lifecycle_sql, temp) temp.clear() def calculate_life_cycle(self): ''' 根据AUM变化情况计算生命周期阶段 calculate life cycle period :return: ''' print('开始计算生命周期...') life_cycle = self.load_from_mysql('t_CMMS_TEMP_LIFECYCLE').cache() def clcmap(line): cust_no = line['CUST_NO'] account_age = line['ACCOUNT_AGE'] last_tr_date = line['LAST_TR_DATE'] aum_now = line['AUM_NOW'] increase = line['INCREASE'] period = 0 if aum_now is None: period = 9 # 未知 elif aum_now < 1000 and last_tr_date > 365: period = 3 # 流失期 else: if increase > 20 or account_age < 6: period = 0 # 成长期 elif increase >= -20 and increase <= 20: period = 1 # 成熟期 else: period = 2 # 稳定期 return period, cust_no map_result = life_cycle.map(clcmap).collect() # clear the life_cycle cache self.sqlctx.clearCache() temp = [] print('结果更新到临时表:t_CMMS_TEMP_LIFECYCLE...') update_life_period_sql = "update t_CMMS_TEMP_LIFECYCLE set PERIOD = %s where CUST_NO = %s" for row in map_result: if len(temp) >= 1000: self.mysql_helper.executemany(update_life_period_sql, temp) temp.clear() temp.append(row) if len(temp) != 1000: self.mysql_helper.executemany(update_life_period_sql, temp) temp.clear() def lifecycle_to_real_table(self, year, season): ''' 将生命周期数据写入正式表中 put life_cycle tmp table to real table :return: ''' print('开始将生命周期数据写入正式表中...') life_cycle = self.load_from_mysql('t_CMMS_TEMP_LIFECYCLE').select('CUST_NO', 'PERIOD') cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER').select('CUST_NO', 'CUST_ID', 'CUST_NAM') union = life_cycle.join(cust_info, 'CUST_NO', 'left_outer').cache() temp = [] sql = "replace into t_CMMS_ANALYSE_LIFE(CUST_NO,CUST_ID,CUST_NM,LIFE_CYC,QUARTER,UPDATE_TIME) values(%s,%s,%s,%s,%s,now())" quarter = str(year) + '-' + str(season) for row in union.collect(): if len(temp) >= 1000: self.mysql_helper.executemany(sql, temp) temp.clear() cust_id = row['CUST_ID'] if row['CUST_ID'] is not None else '0' temp.append((row['CUST_NO'], cust_id, row['CUST_NAM'], row['PERIOD'], quarter)) if len(temp) != 1000: self.mysql_helper.executemany(sql, temp) temp.clear() self.sqlctx.clearCache() def run_life_cycle(self,year,season): ''' 运行完整的生命周期流程 1 准备生命周期数据,计算AUM及其变化幅度 2 根据变化幅度计算生命周期阶段 3 将数据从缓存表放到实际表 :param year: :param season: :return: ''' self.prepare_life_cycle(year,season) self.calculate_life_cycle() self.lifecycle_to_real_table(year,season) #------------------------------------------------------------------------生命周期结束------------------------------------------------------------------------# def customer_value(self, year, half_year): ''' 计算客户价值 calculate customer value :param year: which year to calculate :param half_year: 0 for month 1-6 , 1 for month 7-12 :return: ''' print('---------------------------客户价值-Start--------------------------') cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER').select('CUST_NO', 'CUST_ID', 'CUST_NAM').cache() aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').select('CUST_NO', 'STAT_DAT', 'AUM', 'ASS_TYPE').cache() base = half_year * 6 aum_slot_filter = None for i in range(1, 7): i = base + i if i < 10: i = '0' + str(i) else: i = str(i) slot = str(year) + '-' + i slot_filter = aum.filter(aum.STAT_DAT == slot) if aum_slot_filter is None: aum_slot_filter = slot_filter else: aum_slot_filter = aum_slot_filter.unionAll(slot_filter) # CUST_NO sum(AUM) huoqi_aum = aum_slot_filter.select('CUST_NO', 'ASS_TYPE', aum_slot_filter['AUM'].alias('AUM_HQ')).filter( aum_slot_filter.ASS_TYPE == '1').groupBy('CUST_NO').sum('AUM_HQ') dingqi_aum = aum_slot_filter.select('CUST_NO', 'ASS_TYPE', (aum_slot_filter.AUM * 0.8).alias('AUM_DQ')).filter( aum_slot_filter.ASS_TYPE == '2').groupBy('CUST_NO').sum('AUM_DQ') # 定期活期已计算好,sum(AUM_HQ),sum(AUM_DQ) j = huoqi_aum.join(dingqi_aum, 'CUST_NO', 'outer') # j.show() # 清除原有数据 self.mysql_helper.execute('truncate core.t_CMMS_ANALYSE_VALUE') # 开始联合其他表 all_col = j.join(cust_info, 'CUST_NO', 'outer') print(j.count(), cust_info.count()) # all_col.show() #根据客户价值计算客户等级 def calculate_rank(value): if value < 1000: return 0 elif value < 10000: return 1 elif value < 100000: return 2 elif value < 500000: return 3 elif value < 2000000: return 4 elif value < 5000000: return 5 else: return 6 temp = [] print('将数据replace到正式表...') update_value_sql = "replace into t_CMMS_ANALYSE_VALUE(CUST_ID,CUST_NO,CUST_NM,CUST_VALUE,CUST_RANK,SLOT,UPDATE_TIME) values(%s,%s,%s,%s,%s,%s,now())" for row in all_col.collect(): if len(temp) >= 1000: self.mysql_helper.executemany(update_value_sql, temp) temp.clear() val_dq = row['sum(AUM_DQ)'] if row['sum(AUM_DQ)'] is not None else 0 val_hq = row['sum(AUM_HQ)'] if row['sum(AUM_HQ)'] is not None else 0 cust_val = float(val_dq) + float(val_hq) cust_rank = calculate_rank(cust_val) slot = str(year) + '-' + str(half_year) cust_id = row['CUST_ID'] if row['CUST_ID'] is not None else 1 temp.append((cust_id, row['CUST_NO'], row['CUST_NAM'], cust_val, cust_rank, slot)) if len(temp) != 1000: self.mysql_helper.executemany(update_value_sql, temp) temp.clear() def aum_total(self): ''' 计算AUM总和 data for t_CMMS_ASSLIB_ASSTOT :return: ''' print('---------------------------总资产-Start--------------------------') # TODO t_CMMS_ASSLIB_ASSET_c 要改成正式表t_CMMS_ASSLIB_ASSET df_asset = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').select('CUST_NO', 'CUST_ID', 'STAT_DAT', 'AUM', 'CUR', 'ACC_NAM').cache() # print(df_asset.count(), df_asset.columns) other_col = df_asset.select('CUST_NO', 'CUST_ID', 'CUR', 'ACC_NAM').distinct() # print(other_col.count(),other_col.columns) aum = df_asset.select('CUST_NO', 'STAT_DAT', 'AUM') # print(aum.count(), aum.columns) aum = aum.select('CUST_NO', 'STAT_DAT', 'AUM').groupBy(['CUST_NO', 'STAT_DAT']).sum('AUM').sort( ['CUST_NO', aum.STAT_DAT.desc()]) \ .groupBy('CUST_NO').agg({'sum(AUM)': 'first', 'STAT_DAT': 'first'}) # print(aum.count(), aum.columns) total = aum.select('CUST_NO', aum['first(sum(AUM))'].alias('AUM'), aum['first(STAT_DAT)'].alias('STAT_DAT')). \ join(other_col, 'CUST_NO', 'left_outer').distinct() # total.filter(total.STAT_DAT == '2016-06-') .show() # prepare params def list_map(line): return line['CUST_ID'], line['CUST_NO'], line['ACC_NAM'], line['STAT_DAT'], line['CUR'], line['AUM'] df = total.map(list_map) # clear old data self.mysql_helper.execute('truncate t_CMMS_ASSLIB_ASSTOT') sql = "insert into t_CMMS_ASSLIB_ASSTOT(CUST_ID,CUST_NO,ACC_NAM,STAT_DAT,CUR,AUM) values(%s,%s,%s,%s,%s,%s)" # execute sql self.mysql_helper.batch_operate(sql, df, 100) def debt_total(self): ''' prepare data for total debt :return: ''' print('---------------------------总负债-Start--------------------------') df_debt = self.load_from_mysql('t_CMMS_ASSLIB_DEBT').select('LOAN_ACC', 'CUST_NO', 'CUST_ID', 'CUST_NAM', 'BAL_AMT', 'GRANT_AMT', 'CUR') df_debt = df_debt.filter(df_debt.LOAN_ACC != '') df_sum = df_debt.groupBy('CUST_NO').sum('GRANT_AMT', 'BAL_AMT') df_other = df_debt.groupBy('CUST_NO').agg({'CUST_ID': 'first', 'CUST_NAM': 'first', 'CUR': 'first'}) df_total = df_sum.join(df_other, 'CUST_NO', 'left_outer').distinct() stat_dat = datetime.datetime.now().strftime('%Y%m%d') def m(line): return line['CUST_NO'], line['first(CUST_ID)'], line['first(CUST_NAM)'], line['first(CUR)'], line[ 'sum(GRANT_AMT)'], line['sum(BAL_AMT)'], stat_dat df = df_total.map(m) sql = "replace into t_CMMS_ASSLIB_DEBTOT(CUST_NO,CUST_ID,ACC_NAM,CUR,LOAN_AMT,BAL_AMT,STAT_DAT) values(%s,%s,%s,%s,%s,%s,%s)" self.mysql_helper.batch_operate(sql, df) def run(self): # 生命周期 年份 季度1,2,3,4 dh.run_life_cycle(2016, 2) # 客户价值 上半年:0,下半年:1 dh.customer_value(2016, 0) # 总资产 dh.aum_total() # 总负债 dh.debt_total()
# In[6]: ########################################################### # Crear Folds Con Ratings Originales ########################################################### if CREATE_FOLDS: for fold in range(5): print("Building Fold {}".format(fold)) ratings, test_ratings = train_test_split_randomized( all_ratings, fold, FOLDS) ratings.write.mode("overwrite").parquet(DIR_ROOT + "/fold_" + str(fold) + "/train") test_ratings.write.mode("overwrite").parquet(DIR_ROOT + "/fold_" + str(fold) + "/test") sql_sc.clearCache() # # Basados en Rating # In[7]: def ratingBasedMetrics(ratings): ratings_quad = ratings.select( "user_id", "business_id", "stars").withColumn( "stars_quad", col("stars") * col("stars")).alias("user_business_rating") sum_stars = ratings_quad.groupBy("user_id").agg( sum_sql("stars_quad").alias("sum_quad_stars"), count(lit(1)).alias("nr") ) \