def daily_compute(self, startdate=None, enddate=None): sql = """ select trade_id,pre_asset_val,pre_debt_val,now_asset_val, now_debt_val,capital_in,capital_out, case when qty_exception>return_rate_exception then qty_exception else return_rate_exception end exception_label, long_return,short_return,total_return, long_return/{3} long_return_rate, short_return/{3} short_return_rate, total_return/{3} total_return_rate, int_tax_in, int_tax_out, '{0}' busi_date, pmod(hash(trade_id),{4}) part from ( select trade_id, sum(case when trd_type='long_related' then pre_mkt_val else 0 end ) pre_asset_val, sum(case when trd_type='short_related' then pre_mkt_val else 0 end ) pre_debt_val, sum(case when trd_type='long_related' then now_mkt_val else 0 end ) now_asset_val, sum(case when trd_type='short_related' then now_mkt_val else 0 end ) now_debt_val, sum(capital_in) capital_in, sum(capital_out) capital_out, max(qty_exception) qty_exception, max(return_rate_exception) return_rate_exception, sum(case when trd_type='long_related' and qty_exception=return_rate_exception and qty_exception=0 then return else 0 end ) long_return, sum(case when trd_type='short_related' and qty_exception=return_rate_exception and qty_exception=0 then return else 0 end ) short_return, sum(case when qty_exception=return_rate_exception and qty_exception=0 then return else 0 end) total_return, sum(int_tax_in) int_tax_in, sum(int_tax_out) int_tax_out from {1}.{2} where busi_date='{0}' group by trade_id,busi_date ) a """ selectSql = sql.format( enddate, self.fdata, self.stock_daily_check_data, """ greatest(greatest(pre_asset_val+pre_debt_val,-1 * pre_debt_val), greatest(pre_asset_val+pre_debt_val,-1 * pre_debt_val)+capital_in+ int_tax_in, greatest(pre_asset_val+pre_debt_val,-1 * pre_debt_val)+capital_in+ capital_out+int_tax_in+int_tax_out) """, self.part_numbers) dfLong = self.sparkSession.sql(selectSql).repartition(20) save_data(self.sparkSession, self.adata, self.stock_cust_daily_return, enddate, dfLong, partitonByName=["busi_date", "part"])
def _cal_detail(self, finalDf, enddate): finalDfData = finalDf.where(""" pl_r_sum_exception>0 or pl_trd_sum_exception>0 or rr_sum_exception>0 """) save_data(self.sparkSession, self.adata, self.taget_detail_table, enddate, finalDfData)
def daily_compute(self, startdate=None, enddate=None):
    sqlAllCount = """
        select count(distinct trade_id) all_num
        from {1}.{2}
        where busi_date= '{0}'
    """.format(enddate, self.adata, self.stock_cust_daily_return)
    allNum = self.sparkSession.sql(sqlAllCount).first().all_num
    sqlTmp = """
        select busi_date,
               sum(ac_exception_label) ac_exception_uv,
               cast(sum(ac_exception_label)*1.0/{3} as double) ac_exception_rate,
               sum(pos_neg_exception) pos_neg_exception_uv,
               cast(sum(pos_neg_exception)*1.0/{3} as double) pos_neg_exception_rate,
               sum(detail_sum_exception) detail_sum_exception_uv,
               cast(sum(detail_sum_exception)*1.0/{3} as double) detail_sum_exception_rate,
               sum(rr_outlier_exception) rr_outlier_exception_uv,
               cast(sum(rr_outlier_exception)*1.0/{3} as double) rr_outlier_exception_rate,
               sum(rr_sp_exception) rr_sp_exception_uv,
               cast(sum(rr_sp_exception)*1.0/{3} as double) rr_sp_exception_rate
        from {1}.{2}
        where busi_date= '{0}'
        group by busi_date
    """.format(enddate, self.adata, self.stock_dr_check_exception_data, allNum)
    df = self.sparkSession.sql(sqlTmp).repartition(5)
    save_data(self.sparkSession, self.adata,
              self.stock_dr_check_exception_report, enddate, df)
def daily_compute(self, startdate=None, enddate=None): sql = """ select trade_id, secu_acc_id, prd_no, busi_date, pre_qty, trd_qty, now_qty, pre_mkt_val, now_mkt_val, trd_cash_flow, pos_cash_flow, capital_in, capital_out, nvl(busi_flag_code,'') busi_flag_code, case when qty_exception>0 and return_rate_exception>0 then 'both' when qty_exception>0 then 'qty' when return_rate_exception>0 then 'return_rate' end exception_type, trd_type from {1}.{2} where busi_date='{0}' and (qty_exception>0 or return_rate_exception>0) """ sqlCmd = sql.format(enddate, self.fdata, self.stock_daily_check_data) df = self.sparkSession.sql(sqlCmd).repartition(5) save_data(self.sparkSession, self.adata, self.stock_ac_check_exception_data, enddate, df)
def _cal_rank_raterank_ratio(self, preFinalDf, enddate, save_db, save_table,
                             compute, dropParition=True):
    """
    Compute the return ranking, return-rate ranking, and profit share.
    The logic is shared, so it can be reused across computation periods.
    :param preFinalDf:
    :param enddate:
    :param save_db:
    :param save_table:
    :param compute:
    :param dropParition:
    :return:
    """
    if preFinalDf is None:
        return
    # With time-weighted returns, return and return_rate may have opposite signs.
    profitDf = self._get_profit_percent(preFinalDf, "return >= 0")
    notProfitDf = self._get_profit_percent(preFinalDf, "return < 0")
    # calc profit percent
    tmp1 = self._cal_return_ratio(preFinalDf, profitDf, " return>=0")
    tmp1.persist(StorageLevel.DISK_ONLY).count()
    tmp2 = self._cal_return_ratio(preFinalDf, notProfitDf, " return <0")
    tmp2.persist(StorageLevel.DISK_ONLY).count()
    finalDf1 = self._create_rank_temp_view(tmp1, "stock_cust_return_by_prd_tmp1", compute)
    finalDf2 = self._create_rank_temp_view(tmp2, "stock_cust_return_by_prd_tmp2", compute)
    dropPartitionSql = \
        "alter table {0}.{1} drop if exists partition(busi_date= '{2}',compute='{3}')".format(
            save_db, save_table, enddate, compute)
    save_data(self.sparkSession, save_db, save_table, enddate,
              finalDf1.union(finalDf2).repartition(30),
              partitonByName=["busi_date", "compute"],
              dropPartitonSql=dropPartitionSql,
              defaultDropPartition=dropParition)
def daily_compute(self, startdate=None, enddate=None): """ daily_compute """ currentMonthDate = enddate[0:7] + '-01' print '>>>>get current month date between [{}] and [{}]'.format( currentMonthDate, enddate) currentMonthDataDf, currentDateUnCloseDf = self._cal_current_month_data( currentMonthDate, enddate) month = enddate[0:7] closeMonthDataDf = self._cal_data(currentMonthDataDf, enddate).withColumn( "month", fn.lit(month)) dropTablePartitionSql = "alter table {0}.{1} drop if exists partition(month='{2}')".format( self.adata, self.stock_close_tradebyday_by_month, month) save_data(self.sparkSession, self.adata, self.stock_close_tradebyday_by_month, enddate, closeMonthDataDf.repartition(20), partitonByName=["month"], dropPartitonSql=dropTablePartitionSql) startdate = str(get_natural_date(enddate, -360 + 1)) print '>>>>get last year date between [{}] and [{}]'.format( startdate, enddate) df = self._cal_current_data(startdate, enddate, currentDateUnCloseDf) save_data(self.sparkSession, self.adata, self.stock_tradebyday_data, enddate, df.repartition(20))
def daily_compute(self, startdate=None, enddate=None): """ daily_compute :param startdate: :param enddate: :return: """ # 月度计算,需要从一号开始 dfBase = self._get_base_data(startdate, enddate) # 计算long_return,short_return,total_return dfBaseRdd = dfBase.rdd.map(lambda row: _travel_row2(row, enddate)) if dfBaseRdd is None or dfBaseRdd.isEmpty(): print "dfBaseRdd empty,after _travel_row2 " return preFinalDF = self.sparkSession.createDataFrame(dfBaseRdd) tempFinalTable = "pre_final_month_table" preFinalDF.createOrReplaceTempView(tempFinalTable) profit_num = preFinalDF.where( preFinalDF.stock_long_return >= 0).count() all_num = preFinalDF.count() finalDf = self.sparkSession.sql(""" select *,rank() over(order by stock_total_return desc)/{2} stock_total_return_rank_percentage, {0} stock_profit_user_ratio from {1} """.format(profit_num * 1.0 / all_num, tempFinalTable, all_num)).repartition(20) # drop parition if exists dropPartitionSql = "alter table {0}.{1} drop if exists partition(month='{2}')".format( self.adata, self.stock_return_by_month, enddate[0:7]) save_data(self.sparkSession, self.adata, self.stock_return_by_month, enddate[0:7], finalDf, "month", dropPartitionSql) if self.logLevel != 'debug': self.sparkSession.catalog.dropTempView(tempFinalTable)
def daily_compute(self, startdate=None, enddate=None):
    dfPrd = self._get_all_num(enddate, self.adata,
                              self.stock_cust_return_by_prd_by_month)
    dfMon = self._get_all_num(enddate, self.adata, self.stock_return_by_month)
    allMonNum = dfMon.first().all_user
    allPrdNum = dfPrd.first().all_user
    month = enddate[0:7]
    sqlTmp = """
        select busi_date, month,
               sum(mon_exception_label) mon_exception_uv,
               cast(sum(mon_exception_label)*1.0/{4} as double) mon_exception_rate,
               sum(prd_exception_label) prd_exception_uv,
               cast(sum(prd_exception_label)*1.0/{5} as double) prd_exception_rate,
               sum(lr_equal_exception) lr_exception_uv,
               cast(sum(lr_equal_exception)*1.0/{5} as double) lr_exception_rate,
               sum(sr_equal_exception) sr_exception_uv,
               cast(sum(sr_equal_exception)*1.0/{5} as double) sr_exception_rate,
               sum(mon_pos_neg_exception) mon_pos_neg_exception_uv,
               cast(sum(mon_pos_neg_exception)*1.0/{4} as double) mon_pos_neg_exception_rate
        from {1}.{2}
        where busi_date= '{0}' and month='{3}'
        group by busi_date, month
    """.format(enddate, self.adata, self.stock_prd_mon_check_exception_data,
               month, allMonNum, allPrdNum)
    df = self.sparkSession.sql(sqlTmp).persist(
        StorageLevel.DISK_ONLY).repartition(5)
    save_data(self.sparkSession, self.adata,
              self.stock_prd_mon_check_exception_report, enddate, df)
def _stat_report(self, df, enddate): allCount = df.count() finalDf = df.groupBy("busi_date").agg( fn.sum("tbd_exception_label").alias("tbd_exception_uv"), fn.sum("o_exception_label").alias("o_exception_uv"), fn.sum( fn.when(df.o_exception_label != df.o_exception_label, 1).otherwise(0)).alias("o_tbd_exception_unequal_uv"), fn.sum("lr_equal_exception").alias("lr_equal_exception_uv"), fn.sum("lmv_euqal_exception").alias("lmv_equal_exception_uv"), fn.sum("sr_equal_exception").alias("sr_equal_exception_uv"), fn.sum("smv_equal_exception").alias("smv_equal_exception_uv"), (fn.sum("tbd_exception_label") / allCount).alias("tbd_exception_rate"), (fn.sum("o_exception_label") / allCount).alias("o_exception_rate"), (fn.sum( fn.when(df.o_exception_label != df.o_exception_label, 1).otherwise(0)) / allCount).alias("o_tbd_exception_unequal_rate"), (fn.sum("lr_equal_exception") / allCount).alias("lr_equal_exception_rate"), (fn.sum("lmv_euqal_exception") / allCount).alias("lmv_equal_exception_rate"), (fn.sum("sr_equal_exception") / allCount).alias("sr_equal_exception_rate"), (fn.sum("smv_equal_exception") / allCount).alias("smv_equal_exception_rate")) save_data(self.sparkSession, self.adata, self.taget_report_table, enddate, finalDf)
def save_rank_table(self, compute, enddate, df, table):
    dropTablePartitionSql = \
        "alter table {0}.{1} drop if exists partition(busi_date= '{2}',compute_term='{3}')" \
        .format(self.adata, table, enddate, compute)
    save_data(self.sparkSession, self.adata, table, enddate, df,
              partitonByName=["busi_date", "compute_term"],
              dropPartitonSql=dropTablePartitionSql)
def daily_compute(self, startdate=None, enddate=None):
    dfPrd = self._get_all_num(enddate, self.adata, self.stock_cust_return_by_prd)
    dfInd = self._get_all_num(enddate, self.adata,
                              self.stock_cust_return_by_prd_ind)
    sqlTmp = """
        select busi_date, compute_term,
               sum(prd_exception_label) prd_exception_uv,
               sum(ind_exception_label) ind_exception_uv,
               sum(re_equal_exception) re_equal_exception_uv,
               sum(prd_pos_neg_exception) prd_pos_neg_exception_uv,
               sum(ind_pos_neg_exception) ind_pos_neg_exception_uv
        from {1}.{2}
        where busi_date= '{0}'
        group by busi_date, compute_term
    """.format(enddate, self.adata, self.stock_prd_ind_check_exception_data)
    df = self.sparkSession.sql(sqlTmp)
    df = df.join(dfPrd, ["compute_term"], "inner")\
        .select(df["*"],
                (df.prd_exception_uv / dfPrd.all_num).alias("prd_exception_rate"),
                (df.re_equal_exception_uv / dfPrd.all_num).alias("re_equal_exception_rate"),
                (df.prd_pos_neg_exception_uv / dfPrd.all_num)
                .alias("prd_pos_neg_exception_rate"))\
        .persist(StorageLevel.DISK_ONLY)
    df = df.join(dfInd, ["compute_term"], "inner")\
        .select(df["*"],
                (df.ind_exception_uv / dfInd.all_num).alias("ind_exception_rate"),
                (df.ind_pos_neg_exception_uv / dfInd.all_num)
                .alias("ind_pos_neg_exception_rate"))\
        .persist(StorageLevel.DISK_ONLY).repartition(5)
    save_data(self.sparkSession, self.adata,
              self.stock_prd_ind_check_exception_report, enddate, df)
def daily_compute(self, startdate=None, enddate=None): sql = """ select trade_id,pre_asset_val,pre_debt_val,now_asset_val, now_debt_val,capital_in,capital_out, nvl(qty_exception, 0) exception_label, long_return,short_return,total_return, nvl(long_return/{3},0.0) long_return_rate, nvl(short_return/{3},0.0) short_return_rate, nvl(total_return/{3},0.0) total_return_rate, int_tax_in, int_tax_out, busi_date from ( select trade_id, sum(case when trd_type='long_related' then pre_mkt_val else 0 end ) pre_asset_val, sum(case when trd_type='short_related' then pre_mkt_val else 0 end ) pre_debt_val, sum(case when trd_type='long_related' then now_mkt_val else 0 end ) now_asset_val, sum(case when trd_type='short_related' then now_mkt_val else 0 end ) now_debt_val, sum(capital_in) capital_in, sum(capital_out) capital_out, max(qty_exception) qty_exception, max(return_rate_exception) return_rate_exception, sum(case when trd_type='long_related' and qty_exception=0 then return else 0 end ) long_return, sum(case when trd_type='short_related' and qty_exception=0 then return else 0 end ) short_return, sum(case when qty_exception=0 then return else 0 end) total_return, sum(nvl(int_tax_in,0)) int_tax_in, sum(nvl(int_tax_out,0)) int_tax_out, busi_date from {1}.{2} where busi_date >='{0}' and busi_date <='{4}' group by trade_id,busi_date ) a """ selectSql = sql.format( startdate, self.tdata, self.stock_daily_check_data, """ greatest(greatest(pre_asset_val+pre_debt_val,-1 * pre_debt_val), greatest(pre_asset_val+pre_debt_val,-1 * pre_debt_val) +capital_in+int_tax_in, greatest(pre_asset_val+pre_debt_val,-1 * pre_debt_val) +capital_in+capital_out+int_tax_in+int_tax_out) """, enddate) dfLong = self.sparkSession.sql(selectSql) self.batch_drop_partition(startdate, enddate, self.tdata, self.stock_cust_daily_return) save_data(self.sparkSession, self.tdata, self.stock_cust_daily_return, None, dfLong, defaultDropPartition=False)
def save_trade_close_data(self, data, spark, busi_date):
    tempTable = "temp_cctr_long"
    data = data.filter(lambda x: len(x["close_detail"]) > 0)
    if data.count() > 0:
        data.toDF().createOrReplaceTempView(tempTable)
        sqlCmd = """
            select trade_id, secu_acc_id, prd_no,
                   cd['open_date'] open_date,
                   cd['close_type'] close_type,
                   cd['close_date'] close_date,
                   cast(cd['close_timestamp'] as bigint) close_timestamp,
                   cd['busi_date'] busi_date,
                   cast(cd['open_qty'] as bigint) open_qty,
                   cast(cd['open_amt'] as double) open_amt,
                   cast(cd['close_qty'] as bigint) close_qty,
                   cast(cd['close_amt'] as double) close_amt,
                   cast(cd['return'] as double) return,
                   cast(cd['return_rate'] as double) return_rate,
                   cast(cd['weighted_term'] as double) weighted_term,
                   cast(cd['exception_label'] as bigint) exception_label
            from temp_cctr_long
            lateral view explode(close_detail) cd as cd
        """
        findDf = spark.sql(sqlCmd).withColumn('busi_date', fn.lit(busi_date)).persist(
            StorageLevel.DISK_ONLY)
        save_data(spark, self.fdata, self.close_table, busi_date, findDf)
        spark.catalog.dropTempView(tempTable)
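# ---------------------------------------------------------------------------
# Illustrative sketch only: the assumed shape of one close_detail entry that the
# lateral view explode above unpacks. The keys and the string-to-typed casts
# mirror the SQL; the values themselves are hypothetical.
_example_close_detail_entry = {
    "open_date": "2020-01-02",
    "close_type": "close",            # hypothetical value
    "close_date": "2020-01-10",
    "close_timestamp": "1578614400",  # cast to bigint in the SQL
    "busi_date": "2020-01-10",        # later overwritten by fn.lit(busi_date)
    "open_qty": "1000",               # cast to bigint
    "open_amt": "10500.0",            # cast to double
    "close_qty": "1000",              # cast to bigint
    "close_amt": "11200.0",           # cast to double
    "return": "700.0",                # cast to double
    "return_rate": "0.0667",          # cast to double
    "weighted_term": "8.0",           # cast to double
    "exception_label": "0",           # cast to bigint
}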
def _save_result(self, df, compute, enddate):
    dropPartitionSql = """alter table {0}.{1} drop if exists partition(busi_date= '{2}',compute_term='{3}')"""\
        .format(self.adata, self.stock_cust_pl_analysis, enddate, compute)
    save_data(self.sparkSession, self.adata, self.stock_cust_pl_analysis,
              enddate, df.repartition(30),
              partitonByName=["busi_date", "compute_term"],
              dropPartitonSql=dropPartitionSql)
def daily_compute(self, startdate=None, enddate=None): df = self.sparkSession.sql(""" select trade_id,secu_acc_id,prd_no, concat_ws('-',collect_list(busi_flag_code)) busi_flag_code, concat_ws('-',collect_list(busi_flag_name)) busi_flag_name, sum(trd_qty) trd_qty, sum(case when trd_cash_flow >=0 then trd_qty else 0 end) pos_trd_qty, sum(case when trd_cash_flow <0 then trd_qty else 0 end) neg_trd_qty, sum(trd_cash_flow) trd_cash_flow, sum(case when trd_cash_flow >=0 then trd_cash_flow else 0 end) pos_cash_flow, sum(case when trd_cash_flow <0 then trd_cash_flow else 0 end) neg_cash_flow, sum(case when trd_capital_type='capital_in' then trd_cash_flow else 0 end) capital_in, sum(case when trd_capital_type='capital_out' then trd_cash_flow else 0 end) capital_out, sum(trd_cash_flow-trd_amt) trd_fee, sum(nvl(int_tax_in,0)) int_tax_in, sum(nvl(int_tax_out,0)) int_tax_out, max(cash_flow_modi_label) cash_flow_modi_label, trd_type, '{0}' busi_date from {1}.{2} where busi_date = '{0}' group by trade_id,secu_acc_id,prd_no,trd_type """.format(enddate, self.odata, self.conf['cash_flow_table'])) save_data(self.sparkSession, self.fdata, self.conf['stock_cash_flow_merge_table'], enddate, df)
def daily_compute(self, startdate=None, enddate=None): sqlTmp = """ select exception_pv,exception_uv, exception_pv/b.all_num exception_pv_rate, exception_uv/b.all_user exception_uv_rate, max_return,min_return, max_return_rate,min_return_rate, exception_type, busi_date from ( select busi_date, count(1) exception_pv, count(distinct trade_id) exception_uv, max(return) max_return, min(return) min_return, max(return_rate) max_return_rate, min(return_rate) min_return_rate, case when qty_exception =1 and return_rate_exception=0 then 'qty' when qty_exception =0 and return_rate_exception=1 then 'return_rate' else 'both' end exception_type from {1}.{2} where busi_date= '{0}' and (qty_exception<>0 or return_rate_exception<>0) group by case when qty_exception =1 and return_rate_exception=0 then 'qty' when qty_exception =0 and return_rate_exception=1 then 'return_rate' else 'both' end,busi_date ) a cross join ( select count(1) all_num,count(distinct trade_id) all_user from {1}.{2} where busi_date= '{0}' ) b """.format(enddate, self.fdata, self.conf['check_data_table']) df = self.sparkSession.sql(sqlTmp).repartition(5) save_data(self.sparkSession, self.adata, self.stock_ac_check_exception_report, enddate, df)
def _cal_detail(self, finalDf, enddate): finalDfData = finalDf.where(""" o_exception_label>0 or c_exception_label>0 or lr_equal_exception>0 or lmv_equal_exception>0 or sr_equal_exception>0 or smv_equal_exception>0 """) save_data(self.sparkSession, self.adata, self.taget_detail_table, enddate, finalDfData)
def save_unclose_cal(self, data, busi_date, filter_func, sort_func, unclose_cal_table):
    """Filter each record's open_detail, keep non-empty ones, sort, and save."""
    data = data.map(lambda x: filter_data(x, filter_func))
    data = data.filter(lambda x: len(x["open_detail"]) > 0)
    data = data.map(lambda x: Row(**x))
    if data.count() > 0:
        data = data.toDF()
        data = sort_func(data)
        save_data(self.sparkSession, self.fdata, unclose_cal_table, busi_date, data)
def save_unclose_data(self, data, busi_date, filter_func, sort_func):
    """Filter each record's open_detail, expand unclosed positions, sort, and save."""
    data = data.map(lambda x: filter_data(x, filter_func))
    data = data.filter(lambda x: len(x["open_detail"]) > 0)
    if data.count() > 0:
        data = data.flatMap(self._trans_unclose).toDF()
        data = sort_func(data)
        save_data(self.spark, self.fdata, self.unclose_table, busi_date, data)
def daily_compute(self, startdate=None, enddate=None):
    month = enddate[0:7]
    # NOTE: both queries below are restricted to the hard-coded account
    # trade_id='11305', so this check only covers that single account.
    sqlPrd = """
        select busi_date, trade_id, month,
               sum(short_return) short_return,
               sum(long_return) long_return,
               max(exception_label) prd_exception_label
        from {1}.{2}
        where month='{0}' and prd_no!='0.0' and trade_id='11305'
        group by trade_id, month, busi_date
    """.format(month, self.adata, self.stock_cust_return_by_prd_by_month)
    dfPrd = self.sparkSession.sql(sqlPrd)\
        .persist(StorageLevel.DISK_ONLY)
    dfPrd.registerTempTable("temp_return_prd_month_1")
    sqlMon = """
        select trade_id, month,
               sum(stock_short_return) short_return,
               sum(stock_long_return) long_return,
               max(case when stock_total_return>=0 and stock_total_return_rate>=0 then 0
                        when stock_total_return<0 and stock_total_return_rate<0 then 0
                        else 1 end) mon_pos_neg_exception,
               max(exception_label) mon_exception_label
        from {1}.{2}
        where month='{0}' and trade_id='11305'
        group by trade_id, month
    """.format(month, self.adata, self.stock_return_by_month)
    dfMon = self.sparkSession.sql(sqlMon)\
        .persist(StorageLevel.DISK_ONLY)
    dfMon.registerTempTable("temp_return_prd_1")
    sql = """
        select nvl(a.trade_id,b.trade_id) trade_id,
               nvl(a.month,b.month) month,
               nvl(mon_pos_neg_exception,0) mon_pos_neg_exception,
               nvl(prd_exception_label,0) prd_exception_label,
               nvl(mon_exception_label,0) mon_exception_label,
               case when abs(nvl(a.short_return,0)-nvl(b.short_return,0)) <= 0.01
                    then 0 else 1 end sr_equal_exception,
               case when abs(nvl(a.long_return,0)-nvl(b.long_return,0)) <= 0.01
                    then 0 else 1 end lr_equal_exception
        from temp_return_prd_month_1 a
        full outer join temp_return_prd_1 b
        on a.trade_id=b.trade_id and a.month=b.month
    """
    df = self.sparkSession.sql(sql).persist(StorageLevel.DISK_ONLY)\
        .where("""
            prd_exception_label>0 or mon_pos_neg_exception>0
            or mon_exception_label>0 or sr_equal_exception>0
            or lr_equal_exception>0
        """)\
        .withColumn("busi_date", fn.lit(enddate)).repartition(5)
    save_data(self.sparkSession, self.adata,
              self.stock_prd_mon_check_exception_data, enddate, df)
def save_trade_unclose_data(self, data, spark, busi_date, filter_func):
    data = data.map(lambda x: filter_data(x, filter_func))
    data = data.filter(lambda x: len(x["open_detail"]) > 0)
    if data.count() > 0:
        dataCal = data.toDF().persist(StorageLevel.DISK_ONLY).repartition(20)
        save_data(spark, self.fdata, self.unclose_cal, busi_date, dataCal)
        data = data.flatMap(self._trans_unclose)\
            .toDF().persist(StorageLevel.DISK_ONLY).repartition(20)
        data = data.select("trade_id", "secu_acc_id", "prd_no", "open_date",
                           "open_timestamp", "open_type", "busi_date",
                           "orig_trd_qty", "orig_trd_amt", "trd_qty", "trd_amt",
                           "close_qty", "close_amt", "unclose_qty", "unclose_amt",
                           "return", "return_rate", "weighted_term",
                           "exception_label")
        save_data(spark, self.fdata, self.unclose_table, busi_date, data)
def _cal_final_result(self, dfBase, averageDf, compute, enddate):
    prefinalDf = self._return_rate_rank_perc(self.sparkSession, dfBase,
                                             averageDf, compute, enddate)
    prefinalDf.persist(StorageLevel.DISK_ONLY).count()
    dropPartitionSql = """alter table {0}.{1} drop if exists partition(busi_date= '{2}',compute_term='{3}')"""\
        .format(self.adata, self.stock_cust_investment_rank_score, enddate, compute)
    save_data(self.sparkSession, self.adata,
              self.stock_cust_investment_rank_score, enddate, prefinalDf,
              partitonByName=["busi_date", "compute_term"],
              dropPartitonSql=dropPartitionSql)
def _stat_report(self, df, enddate): allCount = df.count() finalDf = df.groupBy(["trade_id", "busi_date", "compute_term"]).agg( fn.sum("rr_sum_exception").alias("rr_sum_exception_uv"), fn.sum("pl_trd_sum_exception").alias("pl_trd_sum_exception_uv"), fn.sum("pl_r_sum_exception").alias("pl_r_sum_exception_uv"), (fn.sum("rr_sum_exception") / allCount).alias("rr_sum_exception_rate"), (fn.sum("pl_trd_sum_exception") / allCount).alias("pl_trd_sum_exception_rate"), (fn.sum("pl_r_sum_exception") / allCount).alias("pl_t_sum_exception_rate")) save_data(self.sparkSession, self.adata, self.taget_report_table, enddate, finalDf)
def _cal_final_result(self, dfBaseRdd, compute, enddate, dropParition):
    prefinalDf = self.sparkSession.createDataFrame(dfBaseRdd).withColumn(
        "compute_term", F.lit(compute))
    prefinalDf.persist(StorageLevel.DISK_ONLY).count()
    dropPartitionSql = "alter table {0}.{1} drop " \
        "if exists partition(busi_date= '{2}',compute_term='{3}')"\
        .format(self.adata, self.stock_cust_investment_ability, enddate, compute)
    save_data(self.sparkSession, self.adata,
              self.stock_cust_investment_ability, enddate, prefinalDf,
              partitonByName=["busi_date", "compute_term"],
              dropPartitonSql=dropPartitionSql,
              defaultDropPartition=dropParition)
def daily_compute(self, startdate=None, enddate=None): sqlPrd = """ select trade_id,prd_ind, busi_date,compute compute_term, max(prd_name) prd_name, sum(return) prd_return, max(case when return>=0 and return_rate>=0 then 0 when return<0 and return_rate<0 then 0 else 1 end) prd_pos_neg_exception, max(exception_label) prd_exception_label from {1}.{2} where busi_date='{0}' and prd_no!='0.0' group by trade_id,prd_ind,compute,busi_date """.format(enddate, self.adata, self.stock_cust_return_by_prd) dfPrd = self.sparkSession.sql(sqlPrd) sqlInd = """ select trade_id b_trade_id,prd_ind b_prd_ind, compute b_compute_term, sum(return) ind_return, max(case when return>=0 and return_rate>=0 then 0 when return<0 and return_rate<0 then 0 else 1 end) ind_pos_neg_exception, max(exception_label) ind_exception_label from {1}.{2} where busi_date='{0}' group by trade_id,prd_ind,compute """.format(enddate, self.adata, self.stock_cust_return_by_prd_ind) dfInd = self.sparkSession.sql(sqlInd) cond = [ dfPrd.trade_id == dfInd.b_trade_id, dfPrd.prd_ind == dfInd.b_prd_ind, dfPrd.compute_term == dfInd.b_compute_term ] df = dfPrd.join(dfInd, cond, "full_outer")\ .select(dfPrd.busi_date, dfPrd.trade_id, dfPrd.prd_ind, dfPrd.prd_name, dfPrd.compute_term, dfPrd.prd_pos_neg_exception, dfInd.ind_pos_neg_exception, dfPrd.prd_exception_label, dfInd.ind_exception_label, fn.when(dfPrd.prd_return-dfInd.ind_return <= 0.01, 0) .otherwise(1).alias("re_equal_exception") )\ .persist(StorageLevel.DISK_ONLY) df = df.where(""" prd_pos_neg_exception>0 or ind_pos_neg_exception>0 or prd_exception_label>0 or ind_exception_label>0 or re_equal_exception>0 """).repartition(5) save_data(self.sparkSession, self.adata, self.stock_prd_ind_check_exception_data, enddate, df)
def daily_compute(self, startdate=None, enddate=None):
    asset_debt_table = self._cal_asset_debt(startdate, enddate)
    # Needs the data from the day before the computation date.
    capital_in_out_table = self._cal_capital_in_out(
        asset_debt_table, startdate, enddate)
    cust_daily_return_inv_table = self._cal_return_inv(
        capital_in_out_table, startdate, enddate)
    finalDf = self._cal_final_res(cust_daily_return_inv_table, startdate, enddate)
    self.batch_drop_partition(startdate, enddate, self.tdata,
                              self.asset_cust_daily_return)
    save_data(self.sparkSession, self.tdata, self.asset_cust_daily_return,
              None, finalDf, defaultDropPartition=False)
def daily_compute(self, startdate=None, enddate=None): sqlTmp = """ select exception_pv,exception_uv, nvl(exception_pv/b.all_num,0.0) exception_pv_rate, nvl(exception_uv/b.all_user,0.0) exception_uv_rate, max_return,min_return, max_return_rate,min_return_rate, exception_type, a.busi_date from ( select count(1) exception_pv, count(distinct trade_id) exception_uv, max(return) max_return, min(return) min_return, max(return_rate) max_return_rate, min(return_rate) min_return_rate, case when qty_exception =1 and return_rate_exception=0 then 'qty' when qty_exception =0 and return_rate_exception=1 then 'return_rate' else 'both' end exception_type, busi_date from {2}.{3} where busi_date>= '{0}' and busi_date<='{1}' and (qty_exception<>0 or return_rate_exception<>0) group by case when qty_exception =1 and return_rate_exception=0 then 'qty' when qty_exception =0 and return_rate_exception=1 then 'return_rate' else 'both' end,busi_date ) a inner join ( select busi_date,count(1) all_num,count(distinct trade_id) all_user from {2}.{3} where busi_date>= '{0}' and busi_date<='{1}' GROUP BY busi_date ) b on a.busi_date=b.busi_date """.format(startdate, enddate, self.tdata, self.conf['check_data_table']) df = self.sparkSession.sql(sqlTmp).persist(StorageLevel.DISK_ONLY) self.batch_drop_partition(startdate, enddate, self.tdata, self.target_table) save_data(self.sparkSession, self.tdata, self.target_table, None, df, defaultDropPartition=False)
def daily_compute(self, startdate=None, enddate=None): """ daily_compute """ dfBase = self._get_base_info(startdate, enddate) dfBase.persist(StorageLevel.DISK_ONLY).count() dfMarketValue = self._get_market_value(startdate, enddate) dfMarketValue.persist(StorageLevel.DISK_ONLY).count() dfBase = dfBase\ .withColumn("market_chg_3d", F.lit(dfMarketValue.first().re_p3d))\ .withColumn("market_chg_5d", F.lit(dfMarketValue.first().re_p5d))\ .withColumn("market_chg_10d", F.lit(dfMarketValue.first().re_p10d)) dfIndustryPrdNo = self._get_industry_prd_no(startdate, enddate) dfIndustryPrdNo.persist(StorageLevel.DISK_ONLY).count() finalDf = self._merge_market_value(dfBase, dfIndustryPrdNo).repartition(20) save_data(self.sparkSession, self.adata, self.stock_cust_trd_quant, enddate, finalDf)
def daily_compute(self, startdate=None, enddate=None):
    self.get_base(startdate, enddate)
    checkSqlTmp = """
        select ft.trade_id, ft.secu_acc_id, ft.prd_no,
               ft.pre_qty, ft.trd_qty, ft.now_qty,
               ft.pre_mkt_val, ft.now_mkt_val,
               ft.trd_cash_flow, ft.pos_cash_flow, ft.neg_cash_flow,
               capital_in, capital_out,
               ft.busi_flag_code,
               ft.int_tax_in, ft.int_tax_out,
               case when ft.prd_no='0.0' then 0 else ft.return end return,
               case when ft.prd_no='0.0' then 0 else ft.return_rate end return_rate,
               case when ft.prd_no='0.0' then 0 else ft.qty_exception end qty_exception,
               case when ft.prd_no='0.0' or rg.prd_no is null
                         or (ft.return_rate>{2} and ft.return_rate<{3})
                    then 0 else 1 end return_rate_exception,
               ft.trd_type,
               ft.busi_date
        from {5} ft
        left outer join (
            select * from {4}
            where busi_date >= '{0}' and busi_date<='{1}'
        ) rg
        on ft.prd_no=rg.prd_no and ft.busi_date=rg.busi_date
    """
    longCheckSql = checkSqlTmp.format(
        startdate, enddate, "long_lower_limit", "long_upper_limit",
        "{0}.{1}".format(self.fdata, self.return_range), self.longTempTable)
    dfLong = self.sparkSession.sql(longCheckSql).persist(StorageLevel.DISK_ONLY)
    shortCheckSql = checkSqlTmp.format(
        startdate, enddate, "short_lower_limit", "short_upper_limit",
        "{0}.{1}".format(self.fdata, self.return_range), self.shortTempTable)
    dfShort = self.sparkSession.sql(shortCheckSql).persist(StorageLevel.DISK_ONLY)
    self.batch_drop_partition(startdate, enddate, self.tdata, self.check_data)
    save_data(self.sparkSession, self.tdata, self.check_data, None, dfLong,
              defaultDropPartition=False)
    save_data(self.sparkSession, self.tdata, self.check_data, None, dfShort,
              defaultDropPartition=False)
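# ---------------------------------------------------------------------------
# Illustrative sketch only: the return_rate_exception flag above marks a holding
# whose daily return_rate falls outside the per-product band read from the
# return_range table ({2}/{3} become long_lower_limit/long_upper_limit for the
# long leg and short_lower_limit/short_upper_limit for the short leg). Cash rows
# (prd_no='0.0') and products with no band row are never flagged. The band
# values below are made up.
def _example_return_rate_exception(return_rate, lower_limit, upper_limit,
                                   is_cash=False):
    if is_cash or lower_limit is None or upper_limit is None:
        return 0
    return 0 if lower_limit < return_rate < upper_limit else 1

# _example_return_rate_exception(0.05, -0.12, 0.12) -> 0 (inside the band)
# _example_return_rate_exception(0.30, -0.12, 0.12) -> 1 (outside the band)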
def daily_compute(self, startdate=None, enddate=None):
    # Look back over a 20-trading-day window (the end date plus the 19 before it).
    startdate = get_date(self.date_order, self.order_date, enddate, -19)
    sql = """
        select busi_date, trade_id, long_return, short_return, total_return,
               nvl(total_return_rate,0.0) total_return_rate,
               exception_label ac_exception_label,
               case when total_return>=0 and total_return_rate>=0 then 0
                    when total_return<0 and total_return_rate<0 then 0
                    else 1 end pos_neg_exception,
               case when abs(long_return+short_return-total_return)<0.01 then 0
                    else 1 end detail_sum_exception,
               case when abs(total_return_rate)<0.223 then 0
                    else 1 end rr_outlier_exception
        from {1}.{2}
        where busi_date='{0}'
    """
    sqlCmd = sql.format(enddate, self.adata, self.stock_cust_daily_return)
    df = self.sparkSession.sql(sqlCmd)
    # std_return_rate is a "mean + 3 * stddev" band of the daily return rate
    # over the trailing window, per account.
    sqllast20 = """
        select trade_id,
               avg(total_return_rate)+3*stddev(total_return_rate) std_return_rate,
               count(busi_date) num_data
        from {2}.{3}
        where busi_date>='{0}' and busi_date<='{1}'
        group by trade_id
    """.format(startdate, enddate, self.adata, self.stock_cust_daily_return)
    dflast20 = self.sparkSession.sql(sqllast20)
    df = df.join(dflast20, "trade_id", "inner")\
        .select(df["*"],
                fn.when((dflast20.num_data == 20) &
                        (dflast20.std_return_rate <= df.total_return_rate), 0)
                .when(dflast20.num_data < 20, 0)
                .when(dflast20.std_return_rate == 0, 0)
                .otherwise(1).alias("rr_sp_exception"))\
        .where("""
            ac_exception_label!=0 or pos_neg_exception!=0
            or detail_sum_exception!=0 or rr_outlier_exception!=0
            or rr_sp_exception!=0
        """)\
        .persist(StorageLevel.DISK_ONLY).repartition(5)
    save_data(self.sparkSession, self.adata,
              self.stock_dr_check_exception_data, enddate, df)
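# ---------------------------------------------------------------------------
# Illustrative sketch only: std_return_rate in the sqllast20 query above is a
# "mean + 3 * stddev" band over (up to) the last 20 business days of
# total_return_rate per trade_id. stddev here uses the sample standard
# deviation, matching Spark SQL's stddev; the daily rates below are made up.
import math

def _example_upper_band(rates):
    n = len(rates)
    mean = sum(rates) / n
    sample_var = sum((x - mean) ** 2 for x in rates) / (n - 1)
    return mean + 3 * math.sqrt(sample_var)

# _example_upper_band([0.01, -0.005, 0.002, 0.0, 0.013]) -> about 0.026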