def get_all_data(self, factor_name):
    """Load per-date factor CSVs for *factor_name*, restrict rows to the
    stock pool, and left-join the shared return table into ``self.allData``.

    Parameters
    ----------
    factor_name : str
        Sub-directory under the factor root containing ``<date>.csv`` files.

    Raises
    ------
    ValueError
        If no factor file exists for any trading date in the range
        (the original code crashed inside ``pd.concat([])`` instead).
    """
    timeline = ts.get_trading_date(self.start, self.end)
    ret = pd.read_parquet('/home/sharedFold/zhaorui/ret.parquet')
    read_path = os.path.join(r'/home/xiaonan/factor_wxn/factor/', factor_name)
    ans = []
    for date in timeline:
        path = read_path + r'/' + str(date) + '.csv'
        # Dates without a factor file are skipped (a factor may start late).
        if not os.path.exists(path):
            continue
        factor = pd.read_csv(path, index_col=None, header=None)
        factor.columns = ['code', 'nouse', 'values']
        factor.loc[:, 'date'] = date
        # Keep only codes present in the universe pool on this date.
        factor = factor[factor['code'].isin(
            self.pool.loc[date, 'code'].values)].copy()
        ans.append(factor)
    if not ans:
        # pd.concat([]) raises an opaque error; fail with context instead.
        raise ValueError('no factor files found for {} in [{}, {}]'.format(
            factor_name, self.start, self.end))
    factor = pd.concat(ans)
    factor.reset_index(inplace=True)
    factor.drop(['index', 'nouse'], axis=1, inplace=True)
    # Left-join forward returns on (code, date) == (code, dt).
    self.allData = pd.merge(factor, ret, how='left',
                            left_on=['code', 'date'],
                            right_on=['code', 'dt'])
def factorTimeInterval():
    """Scan every factor directory, print its date coverage, flag factors
    whose files skip trading dates, and write a summary CSV.

    Output: ``factorTimeInterval.csv`` with one row per factor and columns
    ``start`` / ``end`` (first/last file date as int YYYYMMDD).
    """
    ans = {}
    path = r'/home/xiaonan/factor_wxn/factor/'
    # NOTE(fix): the original reused loop name `i` inside the comprehension
    # and used `_` as a real variable; renamed for clarity (behavior same).
    for factor_name in os.listdir(path):
        if factor_name == '.directory':
            continue
        ans[factor_name] = {}
        factor_dir = os.path.join(path, factor_name)
        # File names start with YYYYMMDD; skip the KDE '.directory' entry.
        datelist = [int(f[:8]) for f in os.listdir(factor_dir)
                    if f != '.directory']
        start = np.min(datelist)
        end = np.max(datelist)
        t = ts.get_trading_date(start, end)
        ans[factor_name]['start'] = start
        ans[factor_name]['end'] = end
        print('****************************************')
        print('factor_name: ', factor_name)
        # More trading dates than files means some dates have no file.
        if len(t) != len(datelist):
            print('{}: the trading date is missing {}!!!'.format(
                factor_name, len(t) - len(datelist)))
        print('start time: ', start)
        print('end time: ', end)
        print('****************************************')
    pd.DataFrame(ans).T.to_csv('factorTimeInterval.csv')
def get_factor(self, start, end, factor_name, delay):
    """Load daily factor CSVs in parallel, stamping each row with the
    trading date shifted forward by *delay* days.

    Parameters
    ----------
    start, end : int
        Date range (YYYYMMDD) passed to ``ts.get_trading_date``.
    factor_name : str
        Sub-directory under ``self.factor_path``.
    delay : int
        Trading-day delay; each file's rows get the (delay+1)-th next
        trading date after the file date as their ``date``.

    Returns
    -------
    pandas.DataFrame
        Columns ``code``, ``values``, ``date``.
    """

    def get_one(date):
        # One CSV per date: columns are (code, unused, value).
        path = read_path + r'/' + str(date) + '.csv'
        factor = pd.read_csv(path, index_col=None, header=None)
        factor.columns = ['code', 'nouse', 'values']
        # Shift the timestamp so the factor only becomes usable after
        # `delay` trading days.
        factor.loc[:, 'date'] = ts.get_nxt_trading_dates(date, delay + 1)[-1]
        return factor

    timeline = ts.get_trading_date(start, end)
    read_path = os.path.join(self.factor_path, factor_name)
    # NOTE(fix): removed the commented-out serial-loop dead code that
    # duplicated get_one().
    ans = Parallel(10)(delayed(get_one)(date) for date in tqdm(timeline))
    factor = pd.concat(ans)
    factor.reset_index(inplace=True)
    factor.drop(['index', 'nouse'], axis=1, inplace=True)
    return factor
def regression(start, end, factor, ret):
    # Two-pass (Fama-MacBeth style) regression of returns on one factor:
    #   pass 1: per-stock time-series OLS of returns on factor values
    #           -> one beta per code
    #   pass 2: per-date cross-sectional OLS of returns on beta -> lambda
    # Writes the daily lambda series to lambda.csv and prints its mean.
    #
    # Parameters:
    #   start, end : int dates (YYYYMMDD) defining the trading-date range
    #   factor     : str, factor directory name under the factor root
    #   ret        : DataFrame with columns 'code', 'dt', 'y_close_5'
    path = r'/home/xiaonan/factor_wxn/factor/'
    path = os.path.join(path, factor)
    x = {}  # date -> Series of factor values indexed by code
    y = {}  # date -> Series of 5-day forward returns indexed by code
    for date in ts.get_trading_date(start, end):
        _ans = pd.read_csv(path + r'/' + str(date) + r'.csv',
                           index_col=0, header=None)
        _ans.columns = ['nouse', 'values']
        x[date] = _ans.loc[:, 'values']
        y[date] = ret[['code', 'y_close_5']][
            ret['dt'] == date].set_index('code')['y_close_5']
    # Panels: rows = dates, columns = codes; missing observations -> 0.
    x = pd.DataFrame(x).T.fillna(0)
    y = pd.DataFrame(y).T.fillna(0)
    code_list = x.columns
    # Align the return panel to exactly the factor's codes.
    y = y.loc[:, x.columns]
    x = x.to_dict('series')  # code -> time series of factor values
    y = y.to_dict('series')  # code -> time series of returns
    beta = {}
    # Pass 1: per-stock OLS; keep the slope (last parameter — add_constant
    # prepends the intercept, so the factor coefficient comes last).
    for code in code_list:
        _x = sm.add_constant(x[code])
        _y = y[code]
        model = sm.OLS(_y, _x)
        r = model.fit()
        beta[code] = r.params.iloc[-1]
    beta = pd.DataFrame(beta, index=['beta']).T  # index = code
    lambdai = {}
    # Re-pivot returns so keys are dates again (one cross-section per date).
    y = pd.DataFrame(y).T.to_dict('series')
    # Pass 2: cross-sectional OLS of each date's returns on beta; keep the
    # beta coefficient (the risk premium lambda for that date).
    for date in ts.get_trading_date(start, end):
        model = sm.OLS(y[date], sm.add_constant(beta))
        r = model.fit()
        lambdai[date] = r.params.iloc[-1]
    lambdai = pd.DataFrame(lambdai, index=['lambda']).T
    lambdai.to_csv('lambda.csv')
    print(lambdai.mean())
def count_nan(start=None, end=None):
    """Scan every raw-factor directory and record (factor_name, date) pairs
    whose value column contains 30 or more NaNs; save the list with np.save.

    Parameters
    ----------
    start, end : int or None
        Trading-date range (YYYYMMDD). NOTE(fix): the original read free
        variables ``start``/``end`` (module-level globals, NameError if
        absent); they are now explicit parameters, falling back to the
        module globals so existing no-argument callers keep working.
    """
    if start is None:
        start = globals()['start']  # preserve old global-lookup behaviour
    if end is None:
        end = globals()['end']
    ans = []
    timeline = ts.get_trading_date(start, end)
    path = r'/home/xiaonan/factor_wxn/rawFactor/'
    for factor_name in tqdm(os.listdir(path)):
        if factor_name == '.directory':
            continue
        for date in timeline:
            read_path = path + factor_name + r'/' + str(date) + r'.csv'
            data = pd.read_csv(read_path, index_col=0, header=None)
            data.columns = ['nouse', 'values']
            # Flag factor/date files with heavy missingness (>= 30 NaNs).
            if data['values'].isnull().sum() >= 30:
                ans.append((factor_name, date))
    np.save('list', ans)
def run(self):
    """Compute every factor in ``self.factor_list`` for each trading date
    in [self.start, self.end], saving both raw and normalized values and
    printing per-date timing.

    Attributes read
    ---------------
    start, end : int dates; factor_list : list of factor functions fed to
    ``self.pipiline`` together with the day's snapshot data.
    """
    timeline = ts.get_trading_date(self.start, self.end)
    MAX_WORKERS = 12
    # NOTE(fix): removed dead pre-initialization of x1/x3/x4 timedeltas —
    # they were overwritten (or re-zeroed) every iteration anyway.
    for date in tqdm(timeline):
        # ---- load snapshot data ----
        start = datetime.datetime.now()
        stkdata = data.data._get_snap(date)
        t_load = datetime.datetime.now() - start
        # ---- compute all factors in parallel ----
        time1 = datetime.datetime.now()
        res = Parallel(n_jobs=MAX_WORKERS)(
            delayed(self.pipiline)(func, stkdata)
            for func in tqdm(self.factor_list))
        save_list0 = []  # raw factor values
        save_list1 = []  # normalized factor values
        for factor, _factor, func in res:
            save_list0.append((factor, date, func.__name__))
            save_list1.append((_factor, date, func.__name__))
        t_calc = datetime.datetime.now() - time1
        # ---- persist both variants ----
        time5 = datetime.datetime.now()
        tools.tools.save2raw(save_list0)
        tools.tools.save2nor(save_list1)
        end = datetime.datetime.now()
        t_save = end - time5
        print('snapshot data')
        print('load data: ', t_load)
        print('cal factor: ', t_calc)
        print('save factor: ', t_save)
        print('all time: ', end - start)
def group_test(self, start, end, factor_name, n=5,
               trading_settlement='close2close', delay=0, day=1, plot=True):
    """Quantile-group backtest: split stocks into *n* factor-value buckets
    per date and track each bucket's mean forward return.

    Returns
    -------
    dict
        Maps group index (as str, '0'..'n-1') to the list of daily mean
        returns; also writes group.csv and (optionally) a demeaned
        cumulative-return plot.
    """
    self.preprocessing(start, end, factor_name, delay)
    _factor = self.factor[factor_name].copy()
    allData = pd.merge(_factor, self.ret, how='left',
                       left_on=['code', 'date'], right_on=['code', 'dt'])
    allData = allData.values
    timeline = ts.get_trading_date(start, end)
    # Bucket factor values and returns by date; NaN -> 0.
    # Column layout after the merge: 1 = factor value, 2 = date, 5 = return.
    _ret = {}
    _val = {}
    for row in allData:
        date = row[2]
        if date not in _ret:
            _ret[date] = []
            _val[date] = []
        _ret[date].append(0 if np.isnan(row[5]) else row[5])
        _val[date].append(0 if np.isnan(row[1]) else row[1])
    # NOTE(fix): the original stuffed per-group lists into
    # locals()['group_i'] — writing to locals() inside a function is
    # undefined behaviour in CPython and breaks under PEP 667
    # (Python 3.13+); a plain dict is used instead.
    groups = {j: [] for j in range(n)}
    for i in tqdm(timeline):
        _ret_i = np.array(_ret[i])
        _val_i = np.array(_val[i])
        # Bucket edges at the 0, 100/n, ..., 100th percentiles.
        percent = [np.percentile(_val_i, 100 / n * j) for j in range(n + 1)]
        percent[-1] += 1  # make the top bucket include the maximum value
        for j in range(n):
            lay = (_val_i >= percent[j]) & (_val_i < percent[j + 1])
            m = np.mean(_ret_i[lay]) if np.sum(lay) > 0 else np.nan
            if np.isnan(m):
                # Empty/degenerate bucket: fall back to the date's mean.
                groups[j].append(np.mean(_ret_i))
            else:
                groups[j].append(m)
    if not os.path.exists(self.write_path + factor_name + r'/'):
        os.mkdir(self.write_path + factor_name + r'/')
    group = {str(j): groups[j] for j in range(n)}
    _ = ((pd.DataFrame(group, index=list(map(str, timeline))) + 1)).cumprod()
    pd.DataFrame(group, index=list(map(str, timeline))).to_csv('group.csv')
    if plot:
        # Plot each group's cumulative curve demeaned across groups per day.
        self.ReportFig.groupFig(_.apply(lambda x: x - x.mean(), axis=1),
                                self.write_path + factor_name + r'/')
    return group
def ratio(self, start, end, factor_name, freq='3M',
          trading_settlement='close2close', delay=0, day=1):
    """Compute periodic performance statistics for the long/short spread
    of the top ('0') and bottom ('4') factor groups.

    Parameters
    ----------
    freq : str
        Window length in months, e.g. '3M'.

    Writes ``<write_path>/<factor_name>/ratio.csv`` and prints the table.
    """
    fre = int(freq[:-1])  # bucket length in months
    # NOTE(fix): removed the unused `fee` local and the two dead
    # commented-out plotting/compounding blocks.
    group = self.group_test(start, end, factor_name,
                            trading_settlement='close2close', delay=0,
                            day=1, plot=False)
    # Additive cumulative curves: long = group '0', short = group '4'.
    l = np.cumsum(np.array(group['0'])) + 1
    s = np.cumsum(np.array(group['4'])) + 1
    longshort = np.cumsum(-np.array(group['4']) + np.array(group['0'])) + 1
    timeline = np.array(ts.get_trading_date(start, end))
    # Split the timeline into `fre`-month buckets
    # [time_split[i], time_split[i+1]).
    time_split = [timeline[0]]
    while time_split[-1] <= timeline[-1]:
        Y, a = divmod(time_split[-1], 10000)
        M, D = divmod(a, 100)
        # NOTE(fix): advance by `fre` months via months-since-epoch
        # arithmetic. The original `(M + fre) % 12` yielded month 00
        # whenever M + fre was a multiple of 12 (e.g. Sep + 3M), which
        # misbucketed December dates into the previous window.
        months = Y * 12 + (M - 1) + fre
        Y2, M2 = divmod(months, 12)
        time_split.append(10000 * Y2 + 100 * (M2 + 1) + D)
    ans = {
        'DateRange': {},
        'TradingDays': {},
        'ReturnRatio': {},
        'LongReturnRatio': {},
        'ShortReturnRatio': {},
        'SharpeRatio': {},
        'WinRatio': {},
        'ProfitCrossRatio': {},
        'MaxDrawdown': {},
        'CalmarRatio': {},
    }
    for i in range(len(time_split) - 1):
        lay = (timeline >= time_split[i]) & (timeline < time_split[i + 1])
        time = timeline[lay]
        data = longshort[lay]
        longdata = l[lay]
        shortdata = s[lay]
        ans['DateRange'][i] = '{}~{}'.format(time_split[i], time_split[i + 1])
        ans['TradingDays'][i] = self.CRatio.dateRange(time)
        ans['ReturnRatio'][i], ans['LongReturnRatio'][i], ans[
            'ShortReturnRatio'][i] = self.CRatio.allRet(
                time, data, longdata, shortdata)
        ans['SharpeRatio'][i] = self.CRatio.sharpe(time, data)
        ans['WinRatio'][i] = self.CRatio.win_ratio(time, data)
        ans['ProfitCrossRatio'][i] = self.CRatio.pcr(time, data)
        ans['MaxDrawdown'][i] = self.CRatio.maxdrawdown(time, data)
        ans['CalmarRatio'][i] = self.CRatio.calmar(time, data)
    printDataFrame(pd.DataFrame(ans))
    pd.DataFrame(ans).to_csv(self.write_path + factor_name + r'/ratio.csv',
                             index=None)
def ic_test(self, start, end, factor_name, trading_settlement='close2close',
            delay=0, day=1):
    """Daily IC test: cumulative correlation between factor values and
    forward returns — overall and conditioned on each variable's
    above/below-median half.

    Returns
    -------
    dict
        Maps kind ('AllIC', 'xBottom', 'xTop', 'yBottom', 'yTop') to its
        cumulative daily-IC series; also writes ic.csv and a figure.
    """
    self.preprocessing(start, end, factor_name, delay)
    _factor = self.factor[factor_name].copy()
    allData = pd.merge(_factor, self.ret, how='left',
                       left_on=['code', 'date'], right_on=['code', 'dt'])
    allData = allData.values
    timeline = ts.get_trading_date(start, end)
    # Bucket factor values and returns by date; NaN -> 0.
    # Column layout after the merge: 1 = factor value, 2 = date, 5 = return.
    _ret = {}
    _val = {}
    for row in allData:
        date = row[2]
        if date not in _ret:
            _ret[date] = []
            _val[date] = []
        _ret[date].append(0 if np.isnan(row[5]) else row[5])
        _val[date].append(0 if np.isnan(row[1]) else row[1])
    kinds = ['AllIC', 'xBottom', 'xTop', 'yBottom', 'yTop']
    ic = {kind: [] for kind in kinds}

    def accumulate(series, value):
        # Append the running (cumulative) sum of daily ICs.
        series.append(series[-1] + value if series else value)

    for i in tqdm(timeline):
        ret_i = np.array(_ret[i])
        val_i = np.array(_val[i])
        accumulate(ic['AllIC'], paraCorr.one_corr(ret_i, val_i))
        # Conditional ICs within each median-split half (y = return,
        # x = factor value); same computation order as the original.
        for kind, lay in (('yTop', ret_i > np.median(ret_i)),
                          ('yBottom', ret_i < np.median(ret_i)),
                          ('xTop', val_i > np.median(val_i)),
                          ('xBottom', val_i < np.median(val_i))):
            accumulate(ic[kind], paraCorr.one_corr(ret_i[lay], val_i[lay]))
    if not os.path.exists(self.write_path + factor_name + r'/'):
        os.mkdir(self.write_path + factor_name + r'/')
    self.ReportFig.icFig(pd.DataFrame(ic, index=list(map(str, timeline))),
                         self.write_path + factor_name + r'/')
    pd.DataFrame(ic, index=list(map(str, timeline))).to_csv('ic.csv')
    return ic
def portfolio_beta(start, end, factor_list, inSample=100, outSample=10,
                   univ_name='TOP2000'):
    """Rolling-window linear-model backtest over a set of factors.

    Fits a LinearRegression of ``y_close_1`` returns on *factor_list* over
    an `inSample`-day window, predicts the next `outSample` days, and
    tracks the daily mean realized return of the top-NUM predicted names
    (in-sample, out-of-sample, and pool-neutral). Saves the three
    cumulative curves to model_beta.jpg.

    Returns
    -------
    pandas.Series
        Cumulative product of the cost-adjusted neutral returns.
    """
    NUM = 200  # portfolio size: top-ranked names held per day
    print('loading data......')
    ret = pd.read_parquet('/home/sharedFold/zhaorui/ret.parquet')
    pool = dM.load_universe(start, end, univ_name=univ_name)
    timeline = ts.get_trading_date(start, end)
    # Assemble a wide table: one column per factor, outer-joined on
    # (code, date).
    allFactor = pd.DataFrame()
    for factor_name in factor_list:
        read_path = '/home/xiaonan/factor_wxn/factor/' + factor_name + r'/'
        ans = []
        for date in timeline:
            data = pd.read_csv(read_path + str(date) + r'.csv', header=None)
            data.drop([1], inplace=True, axis=1)  # middle column unused
            data.columns = ['code', factor_name]
            data.loc[:, 'date'] = date
            ans.append(data)
        ans = pd.concat(ans)
        if allFactor.empty:
            allFactor = ans.copy()
        else:
            allFactor = pd.merge(allFactor, ans, left_on=['code', 'date'],
                                 right_on=['code', 'date'], how='outer')
    allData = pd.merge(allFactor, ret.loc[:, ['y_close_1', 'code', 'dt']],
                       left_on=['code', 'date'], right_on=['code', 'dt'],
                       how='left')
    print('loading data end\n')
    print('backtest started')
    allInSample, allOutSample, allNeutral = [], [], []
    # Refit every `outSample` days once a full in-sample window exists.
    predict_days = np.arange(inSample, len(timeline), outSample)
    for i in tqdm(predict_days):
        inPool = pool.loc[timeline[i - inSample]:timeline[i]]
        if i + outSample < len(timeline):
            outPool = pool.loc[timeline[i]:timeline[i + outSample]]
        else:
            outPool = pool.loc[timeline[i]:]
        data_inSample = allData[
            allData.code.isin(np.unique(inPool.values.flatten()))
            & allData.date.isin(timeline[i - inSample:i])].copy()
        data_outSample = allData[
            allData.code.isin(np.unique(outPool.values.flatten()))
            & allData.date.isin(timeline[i:i + outSample])].copy()
        # Mean-impute missing factor values / returns within the window.
        x = data_inSample.loc[:, factor_list].fillna(
            data_inSample.loc[:, factor_list].mean())
        y = data_inSample.y_close_1.fillna(data_inSample.y_close_1.mean())
        # NOTE(fix): removed the unused per-iteration
        # `from sklearn.ensemble import RandomForestRegressor` and the
        # redundant identical recomputation of `x` before predicting.
        model = LinearRegression()
        r = model.fit(y=y.to_frame(), X=x)
        predict_inSample = r.predict(x)
        data_inSample.loc[:, 'predict'] = predict_inSample
        x = data_outSample.loc[:, factor_list].fillna(
            data_outSample.loc[:, factor_list].mean())
        predict_outSample = r.predict(x)
        data_outSample.loc[:, 'predict'] = predict_outSample
        # Daily mean realized return of the top-NUM predicted names.
        _in = data_inSample.groupby('date').apply(
            lambda x: x[['predict', 'y_close_1']].sort_values('predict').iloc[
                -NUM:, 1].mean()).copy()
        _out = data_outSample.groupby('date').apply(
            lambda x: x[['predict', 'y_close_1']].sort_values('predict').iloc[
                -NUM:, 1].mean()).copy()
        # Neutralize against the pool's mean return each day.
        _out_neutral = _out - data_outSample.groupby('date').apply(
            lambda x: x['y_close_1'].mean()).copy()
        allInSample.append(_in.loc[timeline[i - inSample]:timeline[i]])
        allOutSample.append(_out)
        allNeutral.append(_out_neutral)
    allInSample = pd.concat(allInSample).sort_index()
    allInSample.index = allInSample.index.map(str)
    allOutSample = pd.concat(allOutSample).sort_index()
    allOutSample.index = allOutSample.index.map(str)
    allNeutral = pd.concat(allNeutral).sort_index()
    allNeutral.index = allNeutral.index.map(str)
    # 0.9997 multiplier — presumably a per-day transaction-cost haircut;
    # TODO confirm with the strategy owner.
    fig, ax = plt.subplots(3, 1)
    ax[0].plot((0.9997 * (1 + allInSample)).cumprod(), color='r')
    ax[1].plot((0.9997 * (1 + allOutSample)).cumprod(), color='b')
    ax[2].plot((0.9997 * (1 + allNeutral)).cumprod(), color='y')
    # Thin the axis ticks so the long date axis stays readable.
    for t in [0, 1, 2]:
        for i, tick in enumerate(ax[t].get_xticklabels()):
            if i % 128 == 0:
                tick.set_visible(True)
                tick.set_rotation(30)
            else:
                tick.set_visible(False)
        for i, tick in enumerate(ax[t].get_xticklines()):
            if i % 128 == 0:
                tick.set_visible(True)
            else:
                tick.set_visible(False)
    fig.savefig('model_beta.jpg')
    return (0.9997 * (1 + allNeutral)).cumprod()
def plot_IC(start, end, name, retinterval='y_close_5', plot=False):
    """Plot the daily Spearman rank-IC of factor *name* against the
    *retinterval* return over the TOP2000 universe, with the cumulative IC
    on a twin axis, and save the figure to ./rankIcFig/<name>.jpg.
    """
    plt.rcParams['figure.figsize'] = (18, 6)
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['xtick.direction'] = 'in'
    plt.rcParams['ytick.direction'] = 'in'
    timeline = ts.get_trading_date(start, end)
    ret = pd.read_parquet('/home/sharedFold/zhaorui/ret.parquet')
    ic = {'ic_values': {}}
    read_path = os.path.join(r'/home/xiaonan/factor_wxn/factor/', name)
    pool = dM.load_universe(start, end, univ_name='TOP2000')
    pool = pool.apply(lambda x: x.apply(lambda x: int(x))).copy()
    for date in timeline:
        csv_path = read_path + r'/' + str(date) + '.csv'
        # Skip dates for which the factor has no file.
        if not os.path.exists(csv_path):
            continue
        factor = pd.read_csv(csv_path, index_col=0, header=None)
        factor.columns = ['nouse', 'values']
        _ret = ret[ret['dt'] == date].set_index('code').copy()
        # NOTE(fix): removed the dead `comindex = _ret.index & factor.index`
        # (unused, and Index.__and__ is deprecated in pandas).
        # Spearman correlation restricted to the pool's codes on this date.
        _ic = factor.loc[pool.loc[date, 'code'].values, ['values']].corrwith(
            _ret.loc[pool.loc[date, 'code'].values, retinterval],
            method='spearman').iloc[0]
        ic['ic_values'][str(date)] = _ic
    ans = pd.DataFrame(ic).sort_index()
    print(ans.mean().iloc[0])
    fig, ax = plt.subplots()
    # Positive and negative daily ICs as separately colored bar layers.
    ax.bar(x=ans.index,
           height=ans['ic_values'].apply(lambda x: x if x >= 0 else 0).values,
           color='#13CCB1', label='ic(left +)')
    ax.bar(x=ans.index,
           height=ans['ic_values'].apply(lambda x: x if x < 0 else 0).values,
           color='#EACC80', label='ic(left -)')
    for i in ['top', 'bottom', 'left', 'right']:
        ax.spines[i].set_visible(True)
    ax.yaxis.grid(linestyle='--', alpha=0.3)
    ax.set_title('{} {}'.format(name, round(ans.mean().iloc[0], 3)))
    ax.legend(loc='upper left')
    ax1 = ax.twinx()
    ax1.plot(ans.cumsum(), color='darkgrey', label='acc_ic(right)')
    ax1.legend(loc='upper right')
    # Show only every 128th tick label to keep the date axis readable.
    for i, tick in enumerate(ax.get_xticklabels()):
        if i % 128 == 0:
            tick.set_visible(True)
            tick.set_rotation(30)
        else:
            tick.set_visible(False)
    for i, tick in enumerate(ax.get_xticklines()):
        if i % 128 == 0:
            tick.set_visible(True)
        else:
            tick.set_visible(False)
    plt.savefig('./rankIcFig/{}.jpg'.format(name))
    if plot:
        plt.show(fig)