def get_new_stock_info(self, xnms, xinx):
    """Build a 0/1 mask flagging stocks with at least 40 trading days of history.

    The listing-state file is forward-filled onto the trading-day calendar
    taken from the daily-return file, then shifted 40 days so that a stock
    only counts once it has been listed for 40 trading days.
    """
    list_state = bt.AZ_Load_csv(f'{self.root_path}/EM_Funda/CDSY_SECUCODE/LISTSTATE.csv')
    list_state.fillna(method='ffill', inplace=True)
    # Use the daily-return index as the authoritative trading-day calendar.
    aadj_r = bt.AZ_Load_csv(f'{self.root_path}/EM_Funda/DERIVED_14/aadj_r.csv').astype(float)
    list_state = list_state.reindex(index=aadj_r.index).fillna(method='ffill')
    # shift(40): entries become non-null (-> 1) only after 40 days of listing.
    mask = list_state.shift(40).notnull().astype(int)
    return mask.reindex(columns=xnms, index=xinx)
def load_vsMCap_factor(self, file_name):
    """Scale a daily factor by the 60-day moving average of free-float market
    cap, then clip cross-sectional extremes (30% tails) within the sector."""
    daily_dir = '/mnt/mfs/DAT_EQT/EM_Funda/daily/'
    factor_df = bt.AZ_Load_csv(os.path.join(daily_dir, file_name + '.csv')) \
        .reindex(index=self.xinx, columns=self.xnms)
    mcap = bt.AZ_Load_csv('/mnt/mfs/DAT_EQT/EM_Funda/LICO_YS_STOCKVALUE/AmarketCapExStri.csv') \
        .reindex(index=self.xinx, columns=self.xnms)
    # Zeros would blow up the ratio; treat them as missing before smoothing.
    mcap_ma = bt.AZ_Rolling_mean(mcap.replace(0, np.nan), 60)
    scaled = factor_df / mcap_ma
    return self.row_extre(scaled, self.sector_df, 0.3)
def load_change_factor(self, file_name):
    """Normalise a 'change' factor by the 60-day rolling mean of |QTTM| for
    the same base name, then clip row-wise extremes (20% tails)."""
    daily_dir = f'{self.root_path}/EM_Funda/daily/'
    change_df = bt.AZ_Load_csv(os.path.join(daily_dir, file_name + '.csv')) \
        .reindex(index=self.xinx, columns=self.xnms)
    # e.g. "NAME_PART_suffix" -> "NAME_PART_QTTM.csv" (drop the last segment).
    qttm_name = '_'.join(file_name.split('_')[:-1]) + '_QTTM.csv'
    qttm_df = bt.AZ_Load_csv(os.path.join(daily_dir, qttm_name)) \
        .reindex(index=self.xinx, columns=self.xnms)
    # abs(): normaliser must be a magnitude; zeros treated as missing.
    qttm_ma = bt.AZ_Rolling_mean(qttm_df.abs().replace(0, np.nan), 60)
    return self.row_extre(change_df / qttm_ma, self.sector_df, 0.2)
def load_locked_data(self):
    """Return (suspendday_df, limit_buy_sell_df) tradability masks.

    suspendday_df      -- 1 where the stock is NOT suspended, NaN otherwise
                          (a null suspension reason means "trading").
    limit_buy_sell_df  -- 1 where |daily return| < 9.5% (not price-limit
                          locked), NaN otherwise.
    """
    suspend_reason = bt.AZ_Load_csv(
        os.path.join(self.root_path, 'EM_Funda/TRAD_TD_SUSPENDDAY/SUSPENDREASON.csv'))
    suspendday_df = suspend_reason.isnull().astype(int)
    # Stocks/dates absent from the file are assumed tradable.
    suspendday_df = suspendday_df.reindex(columns=self.xnms, index=self.xinx, fill_value=True)
    suspendday_df.replace(0, np.nan, inplace=True)

    aadj_r = bt.AZ_Load_csv(
        os.path.join(self.root_path, 'EM_Funda/DERIVED_14/aadj_r.csv')).astype(float)
    # |return| >= 9.5% is treated as hitting the daily price limit.
    limit_buy_sell_df = (aadj_r.abs() < 0.095).astype(int)
    limit_buy_sell_df = limit_buy_sell_df.reindex(columns=self.xnms, index=self.xinx, fill_value=1)
    limit_buy_sell_df.replace(0, np.nan, inplace=True)
    return suspendday_df, limit_buy_sell_df
def load_index_weight_data(self, index_name):
    """Approximate index weights: sqrt of the 250-day mean market cap,
    restricted to index members and normalised to sum to 1 each day."""
    members = bt.AZ_Load_csv(
        self.root_path + f'/EM_Funda/IDEX_YS_WEIGHT_A/SECURITYNAME_{index_name}.csv')
    members = self.reindex_fun(members)
    # 1 for index members, NaN for everything else.
    member_mask = (members.notnull() * 1).replace(0, np.nan)
    mkt_cap = bt.AZ_Load_csv(
        os.path.join(self.root_path, 'EM_Funda/LICO_YS_STOCKVALUE/AmarketCapExStri.csv'))
    smooth_cap = self.reindex_fun(mkt_cap.rolling(250, min_periods=0).mean())
    # sqrt dampens mega-cap dominance before normalising row-wise.
    raw_weight = np.sqrt(smooth_cap) * member_mask
    return raw_weight.div(raw_weight.sum(axis=1), axis=0)
def rzrq_create_factor(index_root_path, sector_df):
    """Generate margin-trading (rzrq) factors for each raw table.

    For every table in ``name_list``, the raw data and its 5-day percentage
    change are passed through the rolling-mean / column-extreme / row-extreme /
    continuous-up-down factor builders, which save their output under
    ``index_root_path`` aligned to ``sector_df``'s axes.
    """
    # Margin trading / securities lending data root.
    rzrq_root_path = '/mnt/mfs/DAT_EQT/EM_Funda/TRAD_MT_MARGIN'
    name_list = [
        'RZRQYE', 'RZMRE', 'RZYE', 'RQMCL', 'RQYE', 'RQYL', 'RQCHL', 'RZCHE'
    ]
    # Rolling-mean windows (trading days) and extreme/updn thresholds.
    rolling_mean_list = [5, 10, 20, 60]
    limit_list = [1, 1.5, 2]
    updn_list = [3, 4, 5]
    # Per-table: simple z-score style factor construction.
    for tab_name in name_list:
        print(tab_name)
        data = bt.AZ_Load_csv(os.path.join(rzrq_root_path, tab_name + '.csv'))
        data = data.reindex(index=sector_df.index, columns=sector_df.columns)
        # Zero balances are treated as missing, not as real observations.
        data.replace(0, np.nan, inplace=True)
        pnd_roll_mean_row_extre_fun(tab_name, data, rolling_mean_list, limit_list, index_root_path, sector_df)
        pnd_col_extre_fun(tab_name, data, rolling_mean_list, limit_list, index_root_path, sector_df)
        pnd_row_extre_fun(tab_name, data, limit_list, index_root_path, sector_df)
        pnd_continue_up_dn_fun(tab_name, data, updn_list, index_root_path, sector_df)
        # 5-day percentage change; division with fill_value=0 can produce inf.
        data_p5d_chg = data.div(data.shift(5), fill_value=0) - 1
        # NOTE(review): only +inf is zeroed here; -inf (if it can occur for
        # negative balances) passes through — confirm intended.
        data_p5d_chg.replace(np.inf, 0, inplace=True)
        pnd_col_extre_fun(tab_name + '_chg5', data_p5d_chg, rolling_mean_list, limit_list, index_root_path, sector_df)
        pnd_row_extre_fun(tab_name + '_chg5', data_p5d_chg, limit_list, index_root_path, sector_df)
        pnd_continue_up_dn_fun(tab_name + '_chg5', data_p5d_chg, updn_list, index_root_path, sector_df)
def load_ratio_factor(self, file_name):
    """Load a daily ratio factor aligned to the sector axes and clip
    cross-sectional extremes (30% tails)."""
    daily_dir = '/mnt/mfs/DAT_EQT/EM_Funda/daily/'
    ratio_df = bt.AZ_Load_csv(os.path.join(daily_dir, file_name + '.csv')) \
        .reindex(index=self.xinx, columns=self.xnms)
    return self.row_extre(ratio_df, self.sector_df, 0.3)
def load_raw_data(root_path, raw_data_path):
    """Load each CSV named in *raw_data_path* and return the DataFrames as a list.

    Bug fix: the original did ``raw_data_list += tmp_data``. ``+=`` on a list
    iterates the right-hand side, and iterating a DataFrame yields its COLUMN
    LABELS — so the list was filled with column names, not DataFrames. Use
    ``append`` to collect each loaded frame.
    """
    raw_data_list = []
    for target_path in raw_data_path:
        # NOTE(review): root_path appears to be a path wrapper exposing
        # .str() — kept as-is; confirm against the caller.
        tmp_data = bt.AZ_Load_csv(
            os.path.join(root_path.str(), target_path + '.csv'))
        raw_data_list.append(tmp_data)  # was: raw_data_list += tmp_data
    return raw_data_list
def create_sector(root_path, name_list, sector_name, begin_date):
    """Intersect a market-top universe with the union of the Level-1 industry
    masks listed in *name_list* and save the result as a '|'-separated CSV.

    Prints an error instead of saving if any cell of the summed industry
    masks exceeds 1 (i.e. the industries are not disjoint).
    """
    universe = bt.AZ_Load_csv(os.path.join(root_path, 'EM_Funda/DERIVED_10/' + sector_name + '.csv'))
    universe = universe[(universe.index >= begin_date)]
    industry_sum = pd.DataFrame()
    for n in name_list:
        level1 = bt.AZ_Load_csv('/mnt/mfs/DAT_EQT/EM_Funda/LICO_IM_INCHG/Global_Level1_{}.csv'.format(n))
        industry_sum = industry_sum.add(level1[(level1.index >= begin_date)], fill_value=0)
    # Overlapping industry membership would double-count stocks.
    if industry_sum[industry_sum > 1].sum().sum() != 0:
        print('error', name_list)
    else:
        sector = universe.mul(industry_sum)
        sector.dropna(how='all', axis='columns', inplace=True)
        sector.to_csv('/mnt/mfs/dat_whs/data/sector_data/{}_industry_{}.csv'
                      .format(sector_name, '_'.join([str(x) for x in name_list])), sep='|')
def get_st_stock_info(self, xnms, xinx):
    """Mask flagging special-treatment stocks: 0 where the (forward-filled)
    name record contains 'ST' or 'PT', else 1."""
    change_info = bt.AZ_Load_csv(os.path.join(self.root_path, 'EM_Funda/CDSY_CHANGEINFO/CHANGEA.csv'))
    change_info = change_info.reindex(columns=xnms, index=xinx)
    change_info.fillna(method='ffill', inplace=True)
    # NaN stringifies to 'nan', which contains neither 'ST' nor 'PT' -> 1.
    change_info = change_info.astype(str)
    return change_info.applymap(lambda name: 0 if 'ST' in name or 'PT' in name else 1)
def load_notice_factor(self, file_name):
    """Load a notice factor and clip row extremes (30% tails); in long-only
    mode keep only the positive signals."""
    notice_dir = '/mnt/mfs/dat_whs/EM_Funda/my_data_test'
    notice_df = bt.AZ_Load_csv(os.path.join(notice_dir, file_name + '.csv')) \
        .reindex(index=self.xinx, columns=self.xnms)
    target_df = self.row_extre(notice_df, self.sector_df, 0.3)
    return target_df[target_df > 0] if self.if_only_long else target_df
def load_index_data(xinx, index_name):
    """Return the index's daily return series on *xinx*, converted from
    percent to fraction."""
    chg = bt.AZ_Load_csv(
        '/mnt/mfs/DAT_EQT/EM_Tab09/INDEX_TD_DAILYSYS/CHG.csv')
    # CHG is quoted in percent.
    return chg[index_name].reindex(index=xinx) * 0.01
def load_sector_data(self):
    """Build the sector universe: membership mask times the new-stock and
    non-ST filters, with zeros replaced by NaN."""
    if self.sector_name.startswith('index'):
        # 'index_XXX' sectors come from index membership; any value -> 1.
        index_name = self.sector_name.split('_')[-1]
        universe = bt.AZ_Load_csv(f'{self.root_path}/EM_Funda/IDEX_YS_WEIGHT_A/SECURITYNAME_{index_name}.csv')
        universe[universe == universe] = 1
    else:
        universe = bt.AZ_Load_csv(f'{self.root_path}/EM_Funda/DERIVED_10/{self.sector_name}.csv')
    universe = universe.reindex(index=self.xinx)
    universe.dropna(how='all', axis='columns', inplace=True)
    new_stock_df = self.get_new_stock_info(universe.columns, universe.index)
    st_stock_df = self.get_st_stock_info(universe.columns, universe.index)
    sector_df = universe * new_stock_df * st_stock_df
    sector_df.replace(0, np.nan, inplace=True)
    return sector_df
def load_whs_factor(self, file_name):
    """Load a whs factor and clip row extremes (30% tails); in long-only
    mode keep only the positive signals."""
    whs_dir = f'{self.root_path}/EM_Funda/dat_whs/'
    factor_df = bt.AZ_Load_csv(os.path.join(whs_dir, file_name + '.csv'))
    factor_df = factor_df.reindex(index=self.xinx, columns=self.xnms)
    target_df = self.row_extre(factor_df, self.sector_df, 0.3)
    return target_df[target_df > 0] if self.if_only_long else target_df
def load_jerry_factor(self, file_name):
    """Load a jerry signal; if its latest row takes more than 5 distinct
    values (continuous signal) clip row extremes, otherwise use it as-is.
    Long-only mode keeps only positive signals."""
    factor_path = '/mnt/mfs/temp/dat_jerry/signal'
    raw_df = bt.AZ_Load_csv(f'{factor_path}/{file_name}')
    # Distinct values among the last row's first 100 columns decide whether
    # the signal is continuous or already discretised.
    distinct_vals = list(set(raw_df.iloc[-1, :100].dropna().values))
    aligned = raw_df.reindex(index=self.xinx, columns=self.xnms)
    if len(distinct_vals) > 5:
        target_df = self.row_extre(aligned, self.sector_df, 0.3)
    else:
        target_df = aligned
    if self.if_only_long:
        target_df = target_df[target_df > 0]
    return target_df
def load_ic_if_diff(self, file_name):
    """Broadcast a market-regime flag: True when HS300's 10-day mean return
    exceeds ZZ500's, replicated across every stock column."""
    chg = bt.AZ_Load_csv(os.path.join('/mnt/mfs/DAT_EQT/EM_Funda/INDEX_TD_DAILYSYS/CHG.csv'))
    # Index returns, percent -> fraction.
    hs300 = chg['000300'].reindex(index=self.xinx) * 0.01
    zz500 = chg['000905'].reindex(index=self.xinx) * 0.01
    spread = bt.AZ_Rolling_mean(hs300, 10, 0) - bt.AZ_Rolling_mean(zz500, 10, 0)
    regime = spread > 0
    # Tile the boolean column across all stocks.
    return pd.DataFrame(np.array([regime.values.ravel()] * len(self.xnms)).T,
                        index=self.xinx, columns=self.xnms)
def load_whs_factor(self, file_name):
    """Load a whs factor; clip row extremes when the latest row looks
    continuous (>5 distinct values), otherwise use the raw values.
    Long-only mode keeps only positive signals."""
    load_dir = f'{self.root_path}/EM_Funda/dat_whs'
    raw_df = bt.AZ_Load_csv(f'{load_dir}/{file_name}.csv')
    distinct_vals = list(set(raw_df.iloc[-1, :100].dropna().values))
    aligned = raw_df.reindex(index=self.xinx, columns=self.xnms)
    if len(distinct_vals) > 5:
        target_df = self.row_extre(aligned, self.sector_df, 0.3)
    else:
        target_df = aligned
    if self.if_only_long:
        target_df = target_df[target_df > 0]
    return target_df
def load_sector_data(self):
    """Sector universe windowed to [begin_date, end_date), filtered for new
    stocks and ST/PT names; zeros set to NaN."""
    universe = bt.AZ_Load_csv(os.path.join(self.root_path, 'EM_Funda/DERIVED_10/' + self.sector_name + '.csv'))
    in_window = (universe.index >= self.begin_date) & (universe.index < self.end_date)
    universe = universe[in_window]
    universe.dropna(how='all', axis='columns', inplace=True)
    new_stock_df = self.get_new_stock_info(universe.columns, universe.index)
    st_stock_df = self.get_st_stock_info(universe.columns, universe.index)
    sector_df = (universe * new_stock_df * st_stock_df).replace(0, np.nan)
    return sector_df
def load_remy_factor(self, file_name):
    """Load a DERIVED_F1 factor; clip row extremes for continuous signals
    (>5 distinct latest values); long-only keeps positive signals."""
    load_dir = '/mnt/mfs/DAT_EQT/EM_Funda/DERIVED_F1'
    raw_df = bt.AZ_Load_csv(f'{load_dir}/{file_name}')
    distinct_vals = list(set(raw_df.iloc[-1, :100].dropna().values))
    aligned = raw_df.reindex(index=self.xinx, columns=self.xnms)
    if len(distinct_vals) > 5:
        target_df = self.row_extre(aligned, self.sector_df, 0.3)
    else:
        target_df = aligned
    if self.if_only_long:
        target_df = target_df[target_df > 0]
    return target_df
def __init__(self, sector_name):
    """Load the sector universe (from 2005-05-05 to now) and the adjusted
    return/price frames aligned to its axes."""
    begin_date = pd.to_datetime('20050505')
    end_date = datetime.now()
    self.sector_name = sector_name
    universe = bt.AZ_Load_csv(
        os.path.join('/mnt/mfs/DAT_EQT/EM_Funda/DERIVED_10/' + sector_name + '.csv'))
    self.sector_df = universe[(universe.index >= begin_date) & (universe.index < end_date)]
    xinx = self.sector_df.index
    xnms = self.sector_df.columns
    # Adjusted daily returns and prices, aligned to the sector axes.
    self.aadj_r = bt.AZ_Load_csv(
        '/mnt/mfs/DAT_EQT/EM_Funda/DERIVED_14/aadj_r.csv').reindex(index=xinx, columns=xnms)
    self.aadj_p = bt.AZ_Load_csv(
        '/mnt/mfs/DAT_EQT/EM_Funda/DERIVED_14/aadj_p.csv').reindex(index=xinx, columns=xnms)
    self.save_path = '/mnt/mfs/dat_whs/data/sector_data'
def industry(self, file_list):
    """Union the Level-1 industry masks in *file_list*, intersect with the
    sector universe, save to DERIVED_10 and return the resulting frame."""
    combined = pd.DataFrame()
    for file_name in file_list:
        level1 = bt.AZ_Load_csv(
            f'/mnt/mfs/DAT_EQT/EM_Funda/LICO_IM_INCHG/Global_Level1_{file_name}.csv'
        )
        combined = combined.add(level1, fill_value=0)
    # Restrict to the sector; zeros (non-members) become NaN and empty
    # columns are dropped.
    combined = self.sector_df.mul(combined, fill_value=0).replace(0, np.nan)\
        .dropna(how='all', axis='columns')
    combined.to_csv(
        '/mnt/mfs/DAT_EQT/EM_Funda/DERIVED_10/{}_industry_{}.csv'.format(
            self.sector_name, '_'.join([str(x) for x in file_list])))
    return combined
def load_sector_data(self):
    """Sector universe from the dat_whs sector files, date-windowed and
    filtered for new stocks and ST/PT names; zeros set to NaN."""
    universe = bt.AZ_Load_csv(
        f'/mnt/mfs/dat_whs/data/sector_data/{self.sector_name}.csv')
    in_window = (universe.index >= self.begin_date) & (universe.index < self.end_date)
    universe = universe[in_window]
    universe.dropna(how='all', axis='columns', inplace=True)
    new_stock_df = self.get_new_stock_info(universe.columns, universe.index)
    st_stock_df = self.get_st_stock_info(universe.columns, universe.index)
    sector_df = (universe * new_stock_df * st_stock_df).replace(0, np.nan)
    return sector_df
def load_sector_data(begin_date, end_date, sector_name):
    """Lagged sector universe on [begin_date, end_date).

    The membership mask is shifted one day so only information available at
    t-1 is used. Fix: the original re-applied ``index >= begin_date`` a
    second time to a frame that was already windowed by that exact condition
    — a redundant no-op filter, removed here.
    """
    market_top_n = bt.AZ_Load_csv(
        os.path.join(stock_data_path, 'EM_Funda/DERIVED_10/' + sector_name + '.csv'))
    market_top_n = market_top_n.shift(1)[(market_top_n.index >= begin_date)
                                        & (market_top_n.index < end_date)]
    market_top_n.dropna(how='all', axis='columns', inplace=True)
    xnms = market_top_n.columns
    xinx = market_top_n.index
    # NOTE(review): new-stock info lags 1 day while ST info lags 2 — confirm
    # this asymmetry is intentional.
    new_stock_df = get_new_stock_info(xnms, xinx).shift(1)
    st_stock_df = get_st_stock_info(xnms, xinx).shift(2)
    sector_df = market_top_n * new_stock_df * st_stock_df
    sector_df.replace(0, np.nan, inplace=True)
    return sector_df
def load_remy_factor(self, file_name):
    """Load a DERIVED_F1 factor; email an alert if the file lacks the latest
    trading day; clip row extremes for continuous signals (>5 distinct
    latest values); long-only keeps positive signals."""
    load_dir = f'{self.root_path}/EM_Funda/DERIVED_F1'
    raw_df = bt.AZ_Load_csv(f'{load_dir}/{file_name}')
    distinct_vals = list(set(raw_df.iloc[-1, :100].dropna().values))
    # Stale-data alert: the most recent trading day is missing from the file.
    if self.xinx[-1] not in raw_df.index:
        send_email.send_email(file_name + self.sector_name, ['*****@*****.**'], [], '[LOADDATA]error')
    aligned = raw_df.reindex(index=self.xinx, columns=self.xnms)
    if len(distinct_vals) > 5:
        target_df = self.row_extre(aligned, self.sector_df, 0.3)
    else:
        target_df = aligned
    if self.if_only_long:
        target_df = target_df[target_df > 0]
    return target_df
def load_remy_factor(self, file_name, sector_name):
    """Load a DERIVED_F3 factor from the folder matching the sector family
    (T300P for top-300-plus, T500P otherwise); clip row extremes for
    continuous signals (>5 distinct latest values); long-only keeps
    positive signals."""
    if sector_name.startswith('market_top_300plus'):
        factor_path = '/mnt/mfs/DAT_EQT/EM_Funda/DERIVED_F3/T300P'
    elif sector_name.startswith('market_top_300to800plus'):
        factor_path = '/mnt/mfs/DAT_EQT/EM_Funda/DERIVED_F3/T500P'
    else:
        # All remaining sector families fall back to T500P.
        factor_path = '/mnt/mfs/DAT_EQT/EM_Funda/DERIVED_F3/T500P'
    raw_df = bt.AZ_Load_csv(f'{factor_path}/{file_name}')
    distinct_vals = list(set(raw_df.iloc[-1, :100].dropna().values))
    aligned = raw_df.reindex(index=self.xinx, columns=self.xnms)
    if len(distinct_vals) > 5:
        target_df = self.row_extre(aligned, self.sector_df, 0.3)
    else:
        target_df = aligned
    if self.if_only_long:
        target_df = target_df[target_df > 0]
    return target_df
def __init__(self, root_path, if_save, if_new_program, begin_date, cut_date, end_date, time_para_dict,
             sector_name, index_name, hold_time, lag, return_file, if_hedge, if_only_long):
    """Set up the backtest context: sector universe, aligned returns,
    tradability masks (suspension / price-limit) and the hedge index.

    The load order matters: sector_df defines the stock (xnms) and date
    (xinx) axes that every later frame is reindexed to.
    """
    self.root_path = root_path
    self.if_save = if_save
    self.if_new_program = if_new_program
    self.begin_date = begin_date
    self.cut_date = cut_date
    self.end_date = end_date
    self.time_para_dict = time_para_dict
    self.sector_name = sector_name
    self.index_name = index_name
    self.hold_time = hold_time
    self.lag = lag
    self.return_file = return_file
    self.if_hedge = if_hedge
    self.if_only_long = if_only_long
    # Sector membership fixes the stock/date axes used below.
    self.sector_df = self.load_sector_data()
    print('Loaded sector DataFrame!')
    self.xnms = self.sector_df.columns
    self.xinx = self.sector_df.index
    return_choose = bt.AZ_Load_csv(os.path.join(root_path, 'EM_Funda/DERIVED_14/aadj_r.csv'))
    self.return_choose = return_choose.reindex(index=self.xinx, columns=self.xnms)
    print('Loaded return DataFrame!')
    suspendday_df, limit_buy_sell_df = self.load_locked_data()
    # NOTE(review): shift(-1) applies the NEXT day's tradability to today's
    # row (presumably trades execute at t+1 — confirm); the last day is
    # assumed tradable (set to 1) since no next day exists.
    limit_buy_sell_df_c = limit_buy_sell_df.shift(-1)
    limit_buy_sell_df_c.iloc[-1] = 1
    suspendday_df_c = suspendday_df.shift(-1)
    suspendday_df_c.iloc[-1] = 1
    self.suspendday_df_c = suspendday_df_c
    self.limit_buy_sell_df_c = limit_buy_sell_df_c
    print('Loaded suspendday_df and limit_buy_sell DataFrame!')
    self.index_df = self.load_index_data()
    print('Loaded index DataFrame!')
# self.sector_df industry_df_sum = pd.DataFrame() for file_name in file_list: industry_df = bt.AZ_Load_csv( f'/mnt/mfs/DAT_EQT/EM_Funda/LICO_IM_INCHG/Global_Level1_{file_name}.csv' ) industry_df_sum = industry_df_sum.add(industry_df, fill_value=0) industry_df_sum = self.sector_df.mul(industry_df_sum, fill_value=0).replace(0, np.nan)\ .dropna(how='all', axis='columns') industry_df_sum.to_csv( '/mnt/mfs/DAT_EQT/EM_Funda/DERIVED_10/{}_industry_{}.csv'.format( self.sector_name, '_'.join([str(x) for x in file_list]))) return industry_df_sum if __name__ == '__main__': sector_split = SectorSplit('market_top_2000') for file_list in [[10, 15], [20, 25, 30, 35], [40], [45, 50], [55]]: industry_df_sum = sector_split.industry(file_list) market_top_n = bt.AZ_Load_csv( '/mnt/mfs/dat_whs/data/sector_data/market_top_2000_industry_{}.csv' .format('_'.join([str(x) for x in file_list]))) a = industry_df_sum.loc[pd.to_datetime('20180829')].dropna() b = market_top_n.loc[pd.to_datetime('20180829')].dropna() # print((industry_df_sum.loc[pd.to_datetime('20100829'):pd.to_datetime('20180829')] # != market_top_n.loc[pd.to_datetime('20100829'):pd.to_datetime('20180829')]).sum().sum()) # print((industry_df_sum > 1).sum().sum()) # print((industry_df_sum == 0).sum().sum()) # print((market_top_n > 1).sum().sum()) # print((market_top_n == 0).sum().sum())
def load_index_data(self, index_name):
    """Daily return series of *index_name* on the sector calendar,
    converted from percent to fraction."""
    chg = bt.AZ_Load_csv(
        os.path.join(self.root_path, 'EM_Funda/INDEX_TD_DAILYSYS/CHG.csv'))
    # CHG is quoted in percent.
    return chg[index_name].reindex(index=self.xinx) * 0.01
def load_return_data(self):
    """Adjusted daily returns restricted to [begin_date, end_date)."""
    aadj_r = bt.AZ_Load_csv(
        os.path.join(self.root_path, 'EM_Funda/DERIVED_14/aadj_r.csv'))
    in_window = (aadj_r.index >= self.begin_date) & (aadj_r.index < self.end_date)
    return aadj_r[in_window]
# intra_data = intra_data[sorted(intra_data.columns)] # limit_list = [1, 1.5, 2] # para_list = [10, 20, 60] # # pnd_row_extre_fun(tab_name, intra_data, limit_list, index_root_path) # pnd_col_extre_fun(tab_name, intra_data, para_list, limit_list, factor_save_path, sector_df) #################################################################################################################### if __name__ == '__main__': sector_data_path = '/mnt/mfs/DAT_EQT/EM_Funda/DERIVED_10' base_data_path = '/mnt/mfs/DAT_EQT/EM_Tab14/TRAD_SK_DAILY_JC' factor_save_path = '/media/hdd1/dat_whs/data/new_factor_data' EQA_open = bt.AZ_Load_csv(os.path.join(base_data_path, 'OPEN.csv')) EQA_high = bt.AZ_Load_csv(os.path.join(base_data_path, 'HIGH.csv')) EQA_low = bt.AZ_Load_csv(os.path.join(base_data_path, 'LOW.csv')) EQA_close = bt.AZ_Load_csv(os.path.join(base_data_path, 'NEW.csv')) EQA_volume = bt.AZ_Load_csv(os.path.join(base_data_path, 'TVOL.csv')) EQA_amount = bt.AZ_Load_csv(os.path.join(base_data_path, 'TVALCNY.csv')) EQA_adj_r = bt.AZ_Load_csv( '/mnt/mfs/DAT_EQT/EM_Funda/DERIVED_14/aadj_r.csv') begin_str = '20100101' end_str = '20180401' # pool = Pool(20) # intraday_open_1_hour_vwap(begin_str, end_str) # intraday_create_factor(begin_str, end_str, factor_save_path) for sector in ['market_top_500']: