# NOTE(review): the two '# """' marker lines below bracket the
# descriptive-statistics section. Presumably removing the leading '#' from
# both markers turns the section into a string literal and disables it.
# """
# Descriptive statistics of the factors.
factor_describe = {}
for fac in fac_data:
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    # describe() is evaluated once and reused for both the stored table and
    # the printed summary.
    fac_stats = fac_data[fac].T.describe()
    # Save the factor's descriptive statistics.
    factor_describe[fac] = fac_stats.T
    # Print the mean of each descriptive statistic.
    print(fac, fac_stats.mean(axis=1))
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
# """

# Prediction target (author's note): the return from the next day's
# first-hour TVWAP to the following day's first-hour TVWAP.
# uc.ts_delay is defined outside this excerpt; negative lags are assumed to
# bring future rows back to the current date -- TODO confirm.
begin = '2017-01-01'
end = '2021-03-02'
tvwap = fetch_data.fetch(begin, end, ['stock_twap_0930_1030'])  # adjtvwap?
fac_data['next_re'] = uc.ts_delay(tvwap['stock_twap_0930_1030'], -2) / uc.ts_delay(
    tvwap['stock_twap_0930_1030'], -1) - 1
fac_data['next_re'] = fac_data['next_re'].dropna(how='all')

# Merge the per-day data: stack every table into a single column named after
# its key, then place the columns side by side.
new_f = {}
for k, v in fac_data.items():
    new_v = pd.DataFrame(v.stack())
    new_v.columns = [k]
    new_f[k] = new_v
new_f = pd.concat(new_f.values(), axis=1)

# The context manager closes the file even if pickling raises.
# A negative protocol number selects the highest available pickle protocol.
with open(data_pat + '/fac_reshape.pkl', 'wb') as f:  # remember to change the path
    pickle.dump(new_f, f, -1)
# new_f.reset_index().to_csv(data_pat + '/fac_reshape.csv',index=False,encoding='gbk')
mine_summary = query_data.get_alphafactors_info(user='******')

# Adjust the sign of each factor: multiply the factor values by the sign of
# its 1-day IC. Summaries whose factor is not in factor_value are skipped.
factor_value_adj = {}
for summa in mine_summary:
    factor_name = summa['factor_name']
    # Direct membership test; no throwaway list of keys per iteration.
    if factor_name not in factor_value:
        continue
    perf_1d = summa['perf']['1_d']
    # Some summaries store the 1-day IC under 'IC', the others under 'ic-mean'.
    ic_key = 'IC' if 'IC' in perf_1d else 'ic-mean'
    factor_value_adj[factor_name] = factor_value[factor_name] * uc.sign(
        perf_1d[ic_key])

# Build up/down labels for each stock over the next n days.
# pred_window, begin and end are defined outside this excerpt.
oc_data = fetch_data.fetch(begin, end, ['stock_adjopen', 'stock_adjclose'])
ud_tag = uc.ts_delay(oc_data['stock_adjclose'], -pred_window) / uc.ts_delay(
    oc_data['stock_adjopen'], -1) - 1  # buy at the next day's open
# Positive returns become 1 and negative returns become 0; entries matching
# neither condition (exact zeros, missing values) are left unchanged.
ud_tag = ud_tag.mask(ud_tag > 0, 1)
ud_tag = ud_tag.mask(ud_tag < 0, 0)

# Reshape the factor values: stack every table into a single column named
# after its factor, then place the columns side by side.
new_f = {}
for k, v in factor_value_adj.items():
    new_v = pd.DataFrame(v.stack())
    new_v.columns = [k]
    new_f[k] = new_v
new_f = pd.concat(new_f.values(), axis=1)

# Rolling generation of the up-probability predictions.
prediction = {}
bay = GaussianNB()
def _monthly_mean(frame):
    """Average *frame* within each calendar month of its index.

    Rows are grouped by the index formatted as 'YYYY-MM', so the result is
    indexed by those month strings.
    """
    month_labels = frame.index.strftime('%Y-%m')
    return frame.groupby(month_labels).mean()


# One OLS fit per trade day: the last column of the day's slice is regressed
# on all preceding columns plus a constant, dropping rows with missing values.
# coef and R_sq are containers created outside this excerpt.
for date in trade_days:
    sub_data = new_f.loc[date, ]
    target = sub_data.iloc[:, -1]
    regressors = sm.add_constant(sub_data.iloc[:, 0:-1])
    model = sm.OLS(target, regressors, missing='drop').fit()
    coef[date] = model.params
    R_sq[date] = model.rsquared_adj
    print(date)

# Collect the per-day parameter vectors as columns keyed by date ...
coef_param = pd.concat(coef.values(), axis=1, keys=coef.keys())
# ... then transpose so that each row is a date and each column a regressor.
coef_param = pd.DataFrame(
    coef_param.values.T,
    index=coef_param.columns,
    columns=coef_param.index,
)
r2_param = pd.DataFrame(
    R_sq.values(),
    index=R_sq.keys(),
    columns=['R_square_adj'],
)

# Author's note: the estimated parameters only become usable 2 days later.
coef_param = uc.ts_delay(coef_param, 2)
r2_param = uc.ts_delay(r2_param, 2)

# Author's note: the weights are updated once a month.
coef_param = _monthly_mean(coef_param)
r2_param = _monthly_mean(r2_param)

# Plot the monthly adjusted R-squared.
plt.figure()
plt.plot(r2_param.index, r2_param['R_square_adj'])
plt.show()

# Export both monthly tables.
output_dir = data_pat + '/linear_regress_m'
coef_param.to_csv(output_dir + '/coef_param.csv', encoding='gbk')
r2_param.to_csv(output_dir + '/r2_param.csv', encoding='gbk')

# Plot the factor exposure time series (the plotting itself follows this
# excerpt). le is the number of rows of coef_param; la is
# ceil(4 * (le / 100) ** (2 / 9)) and is consumed outside this excerpt.
le = coef_param.shape[0]
la = math.ceil(4 * (le / 100)**(2 / 9))
# Lag count passed to the HAC covariance below; le is defined outside this
# excerpt.
la = math.ceil(4*(le/100)**(2/9))

# For every coefficient series: plot it together with its 20-row rolling
# mean, then regress it on a constant only, using a HAC covariance with `la`
# lags, and print the regression summary.
for coef_name in coef_param.columns:
    coef_series = coef_param[coef_name]
    plt.figure()
    plt.plot(coef_param.index, coef_series)
    plt.plot(coef_param.index, coef_series.rolling(20).mean())
    plt.title(coef_name, fontproperties="SimSun")
    plt.show()
    model = sm.OLS(coef_series, [1] * le).fit(cov_type='HAC',
                                              cov_kwds={'maxlags': la})
    # Author's question: some factor coefficients are significantly
    # negative -- an effect of the multi-factor regression?
    print(model.summary())

# Predicted returns, using only the coefficients from the most recent
# cross-sectional regression.
fac = {}
new_f['const'] = 1
new_f = new_f.drop(['next_re'], axis=1)
# The first column after reset_index(level=1) is concatenated with the lagged
# coefficients, then the index is rebuilt from the current index plus the
# 'level_1' column so the result can be multiplied with new_f.
coef_param2 = pd.concat([new_f.reset_index(level=1).iloc[:, 0],
                         uc.ts_delay(coef_param, 2)],
                        axis=1)  # estimated parameters are only usable 2 days later
coef_param2 = coef_param2.set_index([coef_param2.index, 'level_1'])
# min_count=2: must include at least one variable and the const.
pred = (coef_param2 * new_f).sum(axis=1, min_count=2)
pred = pred.unstack()
pred = pred.dropna(how='all')
fac['nearest'] = pred
# The context manager closes the file even if pickling raises.
# A negative protocol number selects the highest available pickle protocol.
with open(data_pat + '/linear_regress_7/nearest/fac.pkl', 'wb') as f:  # remember to change the path
    pickle.dump(fac, f, -1)

# Predicted returns, using the mean of the coefficients from the past 20
# cross-sectional regressions.
fac = {}
coef_param3 = pd.concat([new_f.reset_index(level=1).iloc[:, 0],
                         uc.ts_delay(coef_param.rolling(20).mean(), 2)],
                        axis=1)  # estimated parameters are only usable 2 days later
coef_param3 = coef_param3.set_index([coef_param3.index, 'level_1'])
# min_count=2: must include at least one variable and the const.
pred2 = (coef_param3 * new_f).sum(axis=1, min_count=2)
plt.plot(coef_param.index, coef_param[coef_name]) plt.plot(coef_param.index, coef_param[coef_name].rolling(20).mean()) plt.title(coef_name, fontproperties="SimSun") plt.show() model = sm.OLS(coef_param[coef_name], [1 for i in range(le)]).fit(cov_type='HAC', cov_kwds={'maxlags': la}) print(model.summary()) # 有些因子的系数显著为负?多因子回归的影响 # 求收益率预测值(只用最近一次截面回归得到的系数) fac = {} new_f['const'] = 1 new_f = new_f.drop(['next_re'], axis=1) coef_param2 = pd.concat( [new_f.reset_index(level=1).iloc[:, 0], uc.ts_delay(coef_param, 2)], axis=1) # 2天后才能用估计出的参数 coef_param2 = coef_param2.set_index([coef_param2.index, 'level_1']) pred = (coef_param2 * new_f).sum(axis=1, min_count=2) # 至少包含一个变量和一个const pred = pred.unstack() pred = pred.dropna(how='all') fac['nearest'] = pred f = open(data_pat + '/linear_regress_22/nearest/fac.pkl', 'wb') # 记得修改 pickle.dump(fac, f, -1) f.close() # 求收益率预测值(用过去20日截面回归得到的系数的平均值) fac = {} coef_param3 = pd.concat([ new_f.reset_index(level=1).iloc[:, 0], uc.ts_delay(coef_param.rolling(20).mean(), 2)
from copy import deepcopy import numpy as np import time import json from collections import Counter data_pat = 'E:/FT_Users/LihaiYang/Files/factor_comb_data/fac_meaning/5group' # 记得修改 # 计算未来1、3、5、10、20日收益率,以开盘1小时tvwap为标准 begin = '2015-01-01' # 记得修改 end = '2020-02-28' end1 = '2019-12-31' data = fetch_data.fetch(begin, end, ['stock_adjtwap_0930_1030']) index_data = fetch_data.fetch(begin, end, ['index_close'], '000905') stock_re = {} stock_re['1_d'] = uc.ts_delay(data['stock_adjtwap_0930_1030'], -2) / uc.ts_delay( data['stock_adjtwap_0930_1030'], -1) - 1 stock_re['3_d'] = uc.ts_delay(data['stock_adjtwap_0930_1030'], -4) / uc.ts_delay( data['stock_adjtwap_0930_1030'], -1) - 1 stock_re['5_d'] = uc.ts_delay(data['stock_adjtwap_0930_1030'], -6) / uc.ts_delay( data['stock_adjtwap_0930_1030'], -1) - 1 stock_re['10_d'] = uc.ts_delay(data['stock_adjtwap_0930_1030'], -11) / uc.ts_delay( data['stock_adjtwap_0930_1030'], -1) - 1 stock_re['20_d'] = uc.ts_delay(data['stock_adjtwap_0930_1030'], -21) / uc.ts_delay( data['stock_adjtwap_0930_1030'], -1) - 1 trade_days = query_data.get_trade_days('d', from_trade_day=begin,
def _coefficients_per_row(coef_frame):
    """Lag *coef_frame* by 11 steps and align it with the rows of new_f.

    The first column of new_f.reset_index(level=1) is concatenated with the
    lagged coefficients, and the index is rebuilt from the current index plus
    the 'level_1' column.
    """
    first_column = new_f.reset_index(level=1).iloc[:, 0]
    # Author's note: the estimated parameters only become usable 11 days
    # later; remember to change this lag.
    merged = pd.concat([first_column, uc.ts_delay(coef_frame, 11)], axis=1)
    return merged.set_index([merged.index, 'level_1'])


# Plot the adjusted R-squared series together with its 20-row rolling mean.
r2_series = r2_param['R_square_adj']
plt.figure()
plt.plot(r2_param.index, r2_series)
plt.plot(r2_param.index, r2_series.rolling(20).mean())
plt.show()

# Export the coefficient and R-squared tables.
coef_param.to_csv(data_pat + '/ols/coef_param.csv', encoding='gbk')
r2_param.to_csv(data_pat + '/ols/r2_param.csv', encoding='gbk')

# Compute the predicted returns.
new_f['const'] = 1
new_f = new_f.drop(columns=['stock_rela'])
fac = {}

# Variant 1: only the coefficients from the most recent cross-sectional
# regression.
# NOTE(review): with min_count=1 a row whose only non-missing term is the
# constant still gets a prediction -- confirm this is intended.
coef_param2 = _coefficients_per_row(coef_param)
pred = (
    (coef_param2 * new_f)
    .sum(axis=1, min_count=1)
    .unstack()
    .dropna(how='all')
)
fac['nearest'] = pred

# Variant 2: the mean of the coefficients from the past 20 cross-sectional
# regressions.
coef_param3 = _coefficients_per_row(coef_param.rolling(20).mean())
pred2 = (coef_param3 * new_f).sum(axis=1, min_count=1)