def cal_factor_group_return(factor_data, periods=(20,), prices=None, group_by=None,
                            quantiles=5, freq='1d', **kwargs):
    """Build the alphalens factor / forward-return table for a factor.

    Parameters
    ----------
    factor_data : pandas object indexed by a MultiIndex that has a 'date' level
        Factor values. It is re-indexed to the index returned by
        ``stockFilter.typical`` before anything else happens.
    periods : sequence of int
        Forward-return horizons handed to alphalens. ``max(periods) + 1`` is
        also used to extend the price window past the last factor date.
    prices : pandas.DataFrame or None
        ``None`` loads 'adj_close' from ``data_source``. A DataFrame with a
        two-level index has its first column unstacked into a wide frame;
        any other DataFrame is used unchanged.
    group_by : optional
        Grouping passed to alphalens as its third positional argument. When
        it is not ``None``, ``binning_by_group`` is switched on.
    quantiles : int or sequence, default 5
        Forwarded to alphalens as ``quantiles``.
    freq : str, default '1d'
        Frequency used for the end-date offset; when it differs from '1d' the
        prices are re-indexed onto ``tc.get_trade_days(start, end, freq)``.
    **kwargs
        Extra keyword arguments forwarded to
        ``get_clean_factor_and_forward_returns``.

    Returns
    -------
    Whatever ``get_clean_factor_and_forward_returns`` returns.

    Raises
    ------
    ValueError
        If ``prices`` is neither ``None`` nor a DataFrame.
    """
    # An all-ones frame that only carries the factor index through the
    # universe filter (presumably drops untradable names; confirm in stockFilter).
    stocklist = pd.DataFrame(np.ones(len(factor_data)), index=factor_data.index,
                             columns=['stocklist'])
    stocklist = stockFilter.typical(stocklist)
    factor_data = factor_data.reindex(stocklist.index)

    # Price window: 5 offsets before the first factor date, and enough
    # offsets after the last one to cover the longest forward period.
    dates = factor_data.index.get_level_values('date')
    start = tc.tradeDayOffset(dates.min(), -5)
    end = tc.tradeDayOffset(dates.max(), max(periods) + 1, freq=freq)

    if prices is None:
        prices = data_source.load_factor(
            'adj_close', '/stocks/', start_date=start, end_date=end
        )['adj_close'].unstack()
    elif isinstance(prices, pd.DataFrame):
        if prices.index.nlevels == 2:
            # Long format -> wide (one column per second-level key).
            prices = prices.iloc[:, 0].unstack()
    else:
        raise ValueError('prices 格式非法!')

    if freq != '1d':
        date_index = tc.get_trade_days(start, end, freq, retstr=None)
        prices = prices.reindex(date_index, copy=False)

    if_groupby = group_by is not None
    # `quantiles` used to be swallowed by this signature and never reached
    # alphalens; forward it so the caller's choice actually takes effect.
    merge_data = get_clean_factor_and_forward_returns(
        factor_data, prices, group_by,
        periods=periods,
        quantiles=quantiles,
        binning_by_group=if_groupby,
        **kwargs)
    return merge_data
def test_getFactorsAlphalens(self):
    # Smoke test: load the factor/price matrices and make sure the first
    # factor can be pushed through alphalens without returning None.
    query = dict(
        instrumentList=self.instrumentList,
        ratioList=self.ratio_list,
        fromDate=self.fromDate,
        toDate=self.toDate,
    )
    factorDf, prices = self.getDataDictOfMatrixAlphalens(**query)
    self.assertIsNotNone(factorDf)
    self.assertIsNotNone(prices)

    first_ratio = self.ratio_list[0]
    factor = factorDf[first_ratio]
    self.assertIsNotNone(factor)

    # Two equal-width bins instead of quantiles, a single 3-period horizon,
    # no z-score filtering, and max_loss=1 so no amount of dropped rows fails.
    alphalens_options = dict(
        quantiles=None,
        bins=2,
        periods=[3],
        filter_zscore=None,
        max_loss=1,
    )
    factor_data = get_clean_factor_and_forward_returns(
        factor, prices, **alphalens_options)
    self.assertIsNotNone(factor_data)
def test_performance(factor, prices):
    """Print and plot IC and per-quantile return diagnostics for a factor.

    ``factor`` and ``prices`` are handed unchanged to alphalens
    ``get_clean_factor_and_forward_returns`` with 5 quantiles and the
    horizons (1, 5, 10). Every figure is shown with ``plt.show()``.
    """
    import matplotlib.pyplot as plt
    from alphalens import utils, performance, plotting

    horizons = (1, 5, 10)

    # Per-asset forward returns joined with the factor values.
    holding_returns = utils.get_clean_factor_and_forward_returns(
        factor, prices, quantiles=5, periods=horizons)

    print("因子的IC值:")
    ic_series = performance.factor_information_coefficient(holding_returns)
    print(ic_series)

    plotting.plot_ic_hist(ic_series)
    plt.show()

    plotting.plot_ic_ts(ic_series)
    plt.show()

    print("平均IC值-月:")
    monthly_ic = performance.mean_information_coefficient(
        holding_returns, by_time="M")
    plotting.plot_monthly_ic_heatmap(monthly_ic)
    plt.show()

    # Mean return per quantile and date, demeaned; element 0 of the result.
    quantile_returns = performance.mean_return_by_quantile(
        holding_returns, by_date=True, demeaned=True)[0]

    # One cumulative-return chart per holding horizon.
    for horizon in horizons:
        plotting.plot_cumulative_returns_by_quantile(
            quantile_returns, period=horizon)
        plt.show()
def cal_ic_by_alphalens(factor_data, prices=None, group_by=None, periods=(20,), **kwargs):
    """Compute the factor information coefficient (IC) through alphalens.

    ``factor_data`` is copied; a DataFrame is reduced to its first column and
    the index levels are renamed to ``['date', 'asset']``. ``prices`` may be
    ``None`` (adjusted closes are loaded from ``data_source``), a DataFrame
    with a two-level index (its first column is unstacked) or any other
    DataFrame (used as given). Anything else raises ``ValueError``.

    Returns the result of ``factor_information_coefficient`` with
    ``group_adjust=False`` and ``by_group`` set when ``group_by`` is given.
    """
    factor = factor_data.copy()
    if isinstance(factor, pd.DataFrame):
        factor = factor.iloc[:, 0]
    factor.index.names = ['date', 'asset']

    if prices is None:
        factor_dates = factor.index.get_level_values('date')
        # Window: 5 offsets before the first date, max(periods) after the last.
        window_start = tc.tradeDayOffset(factor_dates.min(), -5)
        window_end = tc.tradeDayOffset(factor_dates.max(), max(periods))
        loaded = data_source.load_factor(
            'adj_close', '/stocks/',
            start_date=window_start, end_date=window_end)
        prices = loaded['adj_close'].unstack()
    elif not isinstance(prices, pd.DataFrame):
        raise ValueError('prices 格式非法!')
    elif prices.index.nlevels == 2:
        # Long format -> wide frame.
        prices = prices.iloc[:, 0].unstack()

    merged = get_clean_factor_and_forward_returns(
        factor, prices, group_by, periods=periods, **kwargs)
    return factor_information_coefficient(
        merged, group_adjust=False, by_group=group_by is not None)
def _get_clean_factor_and_fwd_return(self, factor_score, freq):
    """Join ``factor_score`` with forward returns over the rebalance window.

    Prices are fetched with ``get_sec_price`` between the first and last
    entries of ``self._tiaocang_date`` (each truncated to its first 10
    characters) for ``self._sec_ID``, then passed to alphalens together with
    the industry-name labels and ``self._periods``.
    """
    first_day = str(self._tiaocang_date[0])[:10]
    last_day = str(self._tiaocang_date[-1])[:10]
    price = get_sec_price(
        start_date=first_day,
        end_date=last_day,
        sec_ids=self._sec_ID,
        data_source=self._data_source,
        freq=freq,
    )
    call_args = {
        'factor': factor_score,
        'prices': price,
        'groupby_labels': IndexComp.get_industry_name_dict(),
        'periods': self._periods,
    }
    return get_clean_factor_and_forward_returns(**call_args)
def get_IC_score(all_data, price):
    """Score every factor column of ``all_data`` by its information coefficient.

    For each column the factor is joined with forward returns
    (``max_loss=0.99``) and the IC is computed. All statistics use the
    *second* column (position 1) of the alphalens outputs.

    Parameters
    ----------
    all_data : pandas.DataFrame
        One factor per column.
    price : pandas.DataFrame
        Prices passed unchanged to alphalens.

    Returns
    -------
    IC_score : pandas.DataFrame
        One row per factor with 'IC_mean', 'perc_above_0.02' (share of IC
        observations above 0.02), 'average_return', 'IR' (mean IC / IC std)
        and 'factor'.
    IC_ : pandas.DataFrame
        The second-column IC series of all factors stacked vertically
        (empty when ``all_data`` has no columns).
    """
    score_rows = []
    ic_frames = []
    for factor in all_data.columns:
        factor_return = utils.get_clean_factor_and_forward_returns(
            all_data[factor], price, max_loss=0.99)
        ic = performance.factor_information_coefficient(factor_return)

        ic_second = ic.iloc[:, 1]
        ic_frames.append(ic_second.to_frame())

        # .iloc keeps the lookup positional; plain [1] on a labelled Series
        # relies on a fallback that pandas has deprecated.
        ic_mean = ic.mean().iloc[1]
        score_rows.append([
            ic_mean,
            len(ic_second[ic_second > 0.02]) / len(ic_second),
            performance.factor_returns(factor_return).iloc[:, 1].mean(),
            ic_mean / ic.std().iloc[1],
        ])

    # Build both results once instead of growing them inside the loop
    # (DataFrame.append no longer exists in pandas >= 2.0).
    IC_score = pd.DataFrame(score_rows, columns=range(4))
    # NOTE(review): the nested list yields one-level MultiIndex columns; kept
    # as-is so existing callers see the same shape. Confirm it is intended.
    IC_score.columns = [['IC_mean', 'perc_above_0.02', 'average_return', 'IR']]
    IC_score['factor'] = all_data.columns

    IC_ = pd.concat(ic_frames) if ic_frames else pd.DataFrame()
    return IC_score, IC_
def factor_and_forward_returns(self):
    """Return the alphalens factor / forward-return table for ``self.feature``.

    The feature frame is re-indexed by (UTC-localized 'date', 'code') and a
    deep copy of ``self.stock_data.closepanel`` gets a UTC-localized datetime
    index, so the original price panel is left untouched.
    """
    # Factor side: parse the date column, localize it, rebuild the MultiIndex.
    flat = self.feature.reset_index()
    by_date = flat.assign(date=pd.to_datetime(flat.date)).set_index('date')
    by_date.index = by_date.index.tz_localize('UTC')
    factor_frame = by_date.reset_index().set_index(['date', 'code'])

    # Price side: work on a copy so the stored panel keeps its own index.
    close_prices = deepcopy(self.stock_data.closepanel)
    close_prices.index = pd.to_datetime(close_prices.index).tz_localize('UTC')

    settings = dict(
        groupby=None,
        binning_by_group=False,
        quantiles=10,
        bins=None,
        periods=(1, 5, 10),
        filter_zscore=20,
        groupby_labels=None,
        max_loss=0.15,
        zero_aware=False,
        cumulative_returns=True,
    )
    return get_clean_factor_and_forward_returns(
        factor_frame, close_prices, **settings)
2: 'small_MC', 3: 'mid_MC', 4: 'big_MC', 5: 'very_big_MC' }
mc_group.tail()
# #### The analysis starts here. It consists of:
# ##### Quantiles Statistics, Returns Analysis, Information Analysis, Turnover Analysis
# #### Put the data into the required format:
get_ipython().magic('pinfo utils.get_clean_factor_and_forward_returns')
# In short: feed the data prepared above (factor values, prices, market-cap
# groups and the market-cap group labels, all in the expected format) into
# get_clean_factor_and_forward_returns to obtain a MultiIndex DataFrame holding
# the alpha (column `factor`), the forward returns per period (1, 5, 10), the
# factor quantile number (`factor_quantile`) and, optionally, a grouping by
# another variable (`group`, here market cap).
facs_data_analysis = utils.get_clean_factor_and_forward_returns( series_facs_datas, price, groupby=mc_group, groupby_labels=mc_label)
# The `factor` column has dtype object, so convert it to float for the steps below.
# NOTE(review): np.float128 is not available on every platform - confirm this
# runs where the notebook is used.
facs_data_analysis['factor'] = np.float128(facs_data_analysis['factor'])
# 1. Look at the summary
get_ipython().magic('pinfo tears.create_summary_tear_sheet')
# Produces a short summary (Quantiles Statistics, Returns Analysis, Information Analysis, Turnover Analysis)
tears.create_summary_tear_sheet(facs_data_analysis)
# 2. Returns Analysis
get_ipython().magic('pinfo tears.create_returns_tear_sheet')
# Returns analysis:
# Register the Alpha1 factor as pipeline column 'alpha1'.
pipe.add(Alpha1(), 'alpha1')
from zipline_extensions_cn.research import *
start_date = '2019-01-04'
end_date = '2020-06-01'
# Run the pipeline over the whole window.
results = run_pipeline(pipe, start_date, end_date)
pricing_data = get_pricing(
    tickers=results.index.levels[1],  # Finds all assets that appear at least once in "factor_data"
    # NOTE(review): the comment below asks for an end date after run_pipeline()'s
    # end date, but the same `end_date` is passed to both calls - confirm.
    start_date=start_date,
    end_date=end_date,  # must be after run_pipeline()'s end date. Explained more in lesson 4
    field='close'  # Generally, you should use open pricing. Explained more in lesson 4
)
from alphalens.utils import get_clean_factor_and_forward_returns
# Join factor values with 1/2/5-period forward returns, split into 3 quantiles;
# max_loss=0.6 is the dropped-row tolerance passed to alphalens.
merged_data = get_clean_factor_and_forward_returns(
    factor=results,
    prices=pricing_data,
    periods=(1, 2, 5),
    max_loss=0.6,
    quantiles=3,
)
from alphalens.tears import *
# Render the full alphalens tear sheet.
create_full_tear_sheet(merged_data)
# In[10]: with pd.HDFStore('../../data/assets.h5') as store: sp500 = store['sp500/prices'].close sp500 = sp500.resample('D').ffill().tz_localize('utc').filter( prices.index.get_level_values(0)) sp500.head() # We can create the alphalens input data in the required format using the `get_clean_factor_and_forward_returns` utility function that also returns the signal quartiles and the forward returns for the given holding periods: # In[11]: HOLDING_PERIODS = (5, 10, 21, 42) QUANTILES = 5 alphalens_data = get_clean_factor_and_forward_returns(factor=factor_data, prices=prices, periods=HOLDING_PERIODS, quantiles=QUANTILES) # The `alphalens_data` `DataFrame` contains the returns on an investment in the given asset on a given date for the indicated holding period, as well as the factor value, that is, the asset's `MeanReversion` ranking on that date, and the corresponding quantile value: # In[12]: alphalens_data.head() # The forward returns and the signal quantiles are the basis for evaluating the predictive power of the signal. Typically, a factor should deliver markedly different returns for distinct quantiles, such as negative returns for the bottom quintile of the factor values and positive returns for the top quantile. # ## Summary Tear Sheet # In[13]: create_summary_tear_sheet(alphalens_data)
6. 收益指标:回测收益,回测年化收益,基准收益,基准年化收益 风险指标:最大回撤越小越好(30%以内), 夏普比率越大越好(1以上) """
import pandas as pd
import numpy as np
import scipy.stats as st
from alphalens import tears, performance, plotting, utils
df = pd.DataFrame([[1, 2], [4, 5]], columns=["A", "B"])
# Spearman rank correlation (Rank IC); its value lies in [-1, 1]
print(st.spearmanr(df["A"], df["B"]))
# Section: factor analysis made simpler with alphalens
"""使用alphalens更简易的做因子分析"""
# Feed the factor table and the close-price table in to get forward returns,
# merged with the factor values into one combined factor data table.
# NOTE(review): the two string arguments look like placeholders for the real
# factor and price objects - confirm before running.
factor_data = utils.get_clean_factor_and_forward_returns("factor", "price")
# Compute the factor IC
IC = performance.factor_information_coefficient(factor_data)
# IC time series with moving average; shows whether the factor's sign is stable over time
plotting.plot_ic_ts(IC)
# IC histogram with its mean and standard deviation
plotting.plot_ic_hist(IC)
# Heat map
mean_monthly_ic = performance.mean_information_coefficient(factor_data, by_time="1m")
plotting.plot_monthly_ic_heatmap(mean_monthly_ic)
# Complete IC analysis tear sheet
tears.create_information_tear_sheet(factor_data)
# Returns analysis
tears.create_returns_tear_sheet(factor_data)
# Per-period factor returns
def alpha_factor_function():
    """Return the constant 10 (appears to be a placeholder for a real factor)."""
    return 10


def make_pipeline():
    """Build a Pipeline whose single column 'column_name' is the alpha factor."""
    return Pipeline(columns={'column_name': alpha_factor_function()})


my_pipe = make_pipeline()
# Run the pipeline over 2014-2015 and drop rows containing NaN.
pipeline_data = run_pipeline(my_pipe, start_date='2014-1-1', end_date='2016-1-1').dropna()
# Alphalens (second cell)
from alphalens.utils import get_clean_factor_and_forward_returns
from alphalens.tears import create_full_tear_sheet
pricing_data = get_pricing(
    symbols=pipeline_data.index.levels[1],  # Finds all assets that appear at least once in the pipeline
    start_date='2014-1-1',
    # NOTE(review): the comment says "1 trading day after", yet the value is a
    # month after the pipeline end date - confirm which is intended.
    end_date='2016-2-1',  #1 trading day after end date of pipeline
    fields='open_price')
# Join the factor column with forward returns using the alphalens defaults.
merged_data = get_clean_factor_and_forward_returns(
    factor=pipeline_data['column_name'], prices=pricing_data)
create_full_tear_sheet(merged_data)
return Pipeline(columns={ 'factor to analyze': factor_to_analyze, 'sector': sector }, screen=base_universe & sector.notnull() & factor_to_analyze.notnull())
# Factor values for 2012-2018; prices extend to mid-2020 so the long forward
# periods below have data after the last factor date.
factor_data = run_pipeline(make_pipeline(), '2012-1-1', '2019-1-1')
pricing_data = get_pricing(factor_data.index.levels[1], '2012-1-1', '2020-6-1', fields='open_price')
# Tuple assignment: bind the sector-name dict first, then add the -1 -> "Unknown" entry to it.
sector_labels, sector_labels[-1] = dict(Sector.SECTOR_NAMES), "Unknown"
# Quantiles are computed within each sector (binning_by_group=True).
merged_data = get_clean_factor_and_forward_returns( factor=factor_data['factor to analyze'], prices=pricing_data, quantiles=5, groupby=factor_data['sector'], groupby_labels=sector_labels, binning_by_group=True, periods=(198, 252))
# week = 5, month = 21, quarter = 63, year = 252
mean_information_coefficient(merged_data).plot(title="IC Decay")
create_information_tear_sheet(merged_data, by_group=True, group_neutral=True)
create_returns_tear_sheet(merged_data, by_group=True, group_neutral=True)
create_full_tear_sheet(merged_data)
def _get_clean_factor_and_fwd_return(self):
    """Join the stored factor with forward returns, grouped by industry.

    Uses ``self._factor``, ``self._price`` and ``self._industry`` and labels
    the groups with ``IndexComp.get_industry_name_dict()``.
    """
    call_args = {
        'factor': self._factor,
        'prices': self._price,
        'groupby': self._industry,
        'groupby_labels': IndexComp.get_industry_name_dict(),
    }
    return get_clean_factor_and_forward_returns(**call_args)
# stock_factor_1 = stock_factor_1.groupby('trade_date').apply(winsorize_series) # stock_factor_1 = stock_factor_1.groupby('trade_date').apply(standardize_series) # stock_factor_2 = stock_factor_2.groupby('trade_date').apply(winsorize_series) # stock_factor_2 = stock_factor_2.groupby('trade_date').apply(standardize_series) stock_factor = stock_factor_1 stock_factor = stock_factor.astype(float) # 申万行业分类 industry_dict = sw_industry.set_index('code')['index_code'].to_dict() industry_labels = sw_industry.set_index('index_code')['index_name'].to_dict() # 分析因子 factor_data = get_clean_factor_and_forward_returns(stock_factor, stock_close, groupby=industry_dict, quantiles=5, binning_by_group=False, periods=(1, 5, 10), filter_zscore=None) create_full_tear_sheet(factor_data, long_short=False, group_neutral=False, by_group=False) # create_summary_tear_sheet(factor_data, long_short=True, group_neutral=True) # create_event_returns_tear_sheet(factor_data, stock_close, avgretplot=(3, 11), # long_short=False, group_neutral=True, by_group=True) # create_event_study_tear_sheet(factor_data, stock_close) # returns, positions, benchmark_rets = create_pyfolio_input(factor_data, # period='1D', # capital=None,
import pandas as pd
import tushare as ts
from alphalens.utils import get_clean_factor_and_forward_returns
from alphalens.tears import create_full_tear_sheet
pro = ts.pro_api()
# Daily bars for two tickers.
df = pro.daily(ts_code='000001.SZ,600982.SH', start_date='20200101', end_date='20211122')
# Index the rows by parsed trade date (unnamed) and sort chronologically.
df.index = pd.to_datetime(df['trade_date'])
df.index.name = None
df.sort_index(inplace=True)
# Factor frame with a MultiIndex: level 0 is the date, level 1 is the stock code
assets = df.set_index([df.index, df['ts_code']], drop=True)
# Wide price table: columns are stock codes, index is the date, values are close prices
close = df.pivot_table(index='trade_date', columns='ts_code', values='close')
close.index = pd.to_datetime(close.index)
from alphalens.utils import get_clean_factor_and_forward_returns
from alphalens.tears import create_full_tear_sheet
# Use the 'pct_chg' column as the factor.
ret = get_clean_factor_and_forward_returns(assets[['pct_chg']], close)
create_full_tear_sheet(ret, long_short=False)
# https://github.com/Ckend/alphalens
def get_factors_ic_df(self, factors_dict, pool, start, end, periods=(1, 5, 10), quantiles=None, bins=None, price=None):
    """
    Build, for every holding period, a matrix of IC time series of several factors.

    :param factors_dict: dict of factors, e.g.
           {"factor_name_1": factor_1, "factor_name_2": factor_2}.
           Each factor is a MultiIndex Series indexed by date (level 0) and
           asset (level 1) holding the factor values.
    :param pool: stock universe, e.g. ["000001.XSHE", "600300.XSHG", ...].
           A list, or any object exposing ``tolist()``. Only used when
           ``price`` is None.
    :param start: start time (datetime)
    :param end: end time (datetime)
    :param periods: holding periods (tuple of int)
    :param quantiles: passed to alphalens as ``quantiles``
    :param bins: passed to alphalens as ``bins``
    :param price (optional): price time series (pd.DataFrame) of all stocks
           in pool; index is datetime, columns are the stock codes.
           When None, daily candles are fetched through DataAPI and the
           "close" field is used.

    :return: ic_df_dict, a dict keyed by period (int) whose values are IC
             matrices (ic_df), e.g. {1: ic_df_1, 5: ic_df_5, 10: ic_df_10}.

             ic_df is a pd.DataFrame indexed by datetime with one column per
             factor name of factors_dict, e.g.:

                           BP       CFP        EP  ILLIQUIDITY    REVS20      SRMI     VOL20
            date
            2016-06-24  0.165260  0.002198  0.085632   -0.078074  0.173832  0.214377  0.068445
            2016-06-27  0.165537  0.003583  0.063299   -0.048674  0.180890  0.202724  0.081748
            2016-06-28  0.135215  0.010403  0.059038   -0.034879  0.111691  0.122554  0.042489
            2016-06-29  0.068774  0.019848  0.058476   -0.049971  0.042805  0.053339  0.079592
            2016-06-30  0.039431  0.012271  0.037432   -0.027272  0.010902  0.077293 -0.050667
    """
    from fxdayu_data import DataAPI
    import datetime
    import numpy as np
    from alphalens import utils, performance

    def get_price_data(pool, start, end, max_window=10):
        # Pad the window by `max_window` calendar days on both sides.
        data = DataAPI.candle(tuple(pool), "D",
                              start=start - datetime.timedelta(days=max_window),
                              end=end + datetime.timedelta(days=max_window))
        # Replace zero values with NaN (np.nan: the np.NaN alias is gone in NumPy 2).
        data = data.replace(to_replace=0, value=np.nan)
        return data

    if price is None:
        # Accept both a plain list (as documented) and array-likes.
        codes = pool.tolist() if hasattr(pool, "tolist") else list(pool)
        price_data = get_price_data(codes, start, end, max_window=max(periods))
        price = price_data.minor_xs("close")

    ic_dict = {}
    for factor_name, factor_value in factors_dict.items():
        # Per-asset forward returns joined with the factor values.
        stocks_holding_return = utils.get_clean_factor_and_forward_returns(
            factor_value, price, quantiles=quantiles, bins=bins, periods=periods)
        ic_dict[factor_name] = performance.factor_information_coefficient(
            stocks_holding_return)

    # Union of all factor dates; used as the row index of every IC matrix.
    times = sorted(pd.concat([pd.Series(factor_value.index.levels[0])
                              for factor_value in factors_dict.values()]).unique())

    ic_df_dict = {}
    for period in periods:
        ic_table = []
        for factor_name, ic in ic_dict.items():
            ic_by_period = pd.DataFrame(ic[period])
            ic_by_period.columns = [factor_name, ]
            ic_table.append(ic_by_period)
        # Keep only dates where every factor has an IC, then align to `times`.
        ic_df_dict[period] = pd.concat(ic_table, axis=1).dropna()
        ic_df_dict[period] = ic_df_dict[period].reindex(times)
    return ic_df_dict
# Turn the linear-regression predictions (index levels swapped) into a factor.
lr_factor = get_factor(lr_predictions.predicted.swaplevel())
lr_factor.head()
# In[10]:
# Unique values of the 'symbol' index level of the factor.
tickers = lr_factor.index.get_level_values('symbol').unique()
# In[11]:
trade_prices = get_trade_prices(tickers, 2014, 2017)
trade_prices.info()
# In[12]:
# Join the factor with 1/5/10/21-period forward returns, split into 5 quantiles.
lr_factor_data = get_clean_factor_and_forward_returns(factor=lr_factor, prices=trade_prices, quantiles=5, periods=(1, 5, 10, 21))
lr_factor_data.info()
# In[13]:
create_summary_tear_sheet(lr_factor_data)
# ## Ridge Regression
# In[14]:
best_ridge_alpha = get_best_alpha(ridge_scores)
# Keep only the predictions made with the best alpha, then drop the 'alpha' column.
ridge_predictions = ridge_predictions[ridge_predictions.alpha == best_ridge_alpha].drop('alpha', axis=1)