def __init__(self, price=None, ret=None, high=None, low=None, group=None,
             n_quantiles=5, mask=None, can_enter=None, can_exit=None,
             period=5, benchmark_price=None, forward=True,
             commission=0.0008):
    """Validate the inputs and store them on the instance.

    Exactly one of ``price`` / ``ret`` must be given, and ``benchmark_price``
    is only accepted together with ``price``.

    Parameters
    ----------
    price, ret, high, low, group, benchmark_price
        Stored unchanged on the instance.
    n_quantiles : int
        Must be a positive ``int``.
    mask, can_enter, can_exit
        Optional flag frames. When given they are passed through
        ``jutil.fillinf``, missing cells are treated as 0 and the result is
        converted to ``bool``; when omitted the attribute stays ``None``.
    period, forward, commission
        Stored unchanged on the instance.

    Raises
    ------
    ValueError
        If the price/ret/benchmark combination is invalid or ``n_quantiles``
        is not a positive ``int``.
    """
    if price is None and ret is None:
        raise ValueError("One of price / ret must be provided.")
    if price is not None and ret is not None:
        raise ValueError("Only one of price / ret should be provided.")
    if ret is not None and benchmark_price is not None:
        raise ValueError(
            "You choose 'return' mode but benchmark_price is given.")
    # Check the type first: comparing e.g. a str with 0 would raise a
    # TypeError before the intended ValueError could be produced.
    if not (isinstance(n_quantiles, int) and n_quantiles > 0):
        raise ValueError(
            "n_quantiles must be a positive integer. Input is: {}".format(
                n_quantiles))

    def _as_bool_flags(flags):
        """Coerce an optional flag frame to bool; ``None`` passes through."""
        if flags is None:
            return None
        flags = jutil.fillinf(flags)
        # Fill missing cells BEFORE the integer cast: NaN cannot be
        # represented as an integer.  The int step is kept so that
        # fractional values are truncated exactly as before.
        return flags.fillna(0).astype(int).astype(bool)

    self.price = price
    self.ret = ret
    self.high = high
    self.low = low
    self.group = group
    self.n_quantiles = n_quantiles
    self.mask = _as_bool_flags(mask)
    self.can_enter = _as_bool_flags(can_enter)
    self.can_exit = _as_bool_flags(can_exit)
    self.period = period
    self.benchmark_price = benchmark_price
    self.forward = forward
    self.commission = commission
    # Filled in later by other methods of the class.
    self.signal_data = None
    self.signal_ret = None
def mad(factor_df, index_member=None):
    """Clip outliers of each cross-section with the median/MAD rule.

    Every row (one date) is clipped to
    ``[median - 5 * MAD, median + 5 * MAD]`` where ``MAD`` is the median of
    the absolute deviations from the row median.  Rows without any valid
    value are returned untouched.

    :param factor_df: factor values (pandas.DataFrame), index is datetime,
        columns are asset codes, e.g.

                        AAPL        BA         CMG
            date
            2016-06-24  0.165260    0.002198   0.085632
            2016-06-27  0.165537    0.003583   0.063299

    :param index_member: forwarded to ``_mask_non_index_member``.
    :return: clipped factor values, same layout as ``factor_df``.
    """

    def _clip_row(row):
        # Nothing to clip when the whole cross-section is missing.
        if row.dropna().size == 0:
            return row
        center = row.median()
        spread = (row - center).abs().median()
        return row.clip(center - 5 * spread, center + 5 * spread)

    cleaned = _mask_non_index_member(jutil.fillinf(factor_df), index_member)
    return cleaned.apply(_clip_row, axis=1)
def winsorize(factor_df, alpha=0.05, index_member=None):
    """Winsorize each cross-section at the ``alpha/2`` and ``1 - alpha/2`` quantiles.

    Values below the lower quantile of a row (one date) are raised to it and
    values above the upper quantile are lowered to it; missing values stay
    missing.

    :param factor_df: factor values (pandas.DataFrame), index is datetime,
        columns are asset codes, e.g.

                        AAPL        BA         CMG
            date
            2016-06-24  0.165260    0.002198   0.085632
            2016-06-27  0.165537    0.003583   0.063299

    :param alpha: total tail mass to clip, must lie in ``[0, 1]``.
    :param index_member: forwarded to ``_mask_non_index_member``.
    :return: winsorized factor values, same layout as ``factor_df``.
    :raises ValueError: if ``alpha`` is outside ``[0, 1]``.
    """
    # For alpha > 1 the lower bound would exceed the upper bound and the
    # result would be meaningless, so reject it up front.
    if not 0 <= alpha <= 1:
        raise ValueError(
            "alpha must be between 0 and 1. Input is: {}".format(alpha))

    def _winsorize_row(row):
        lower, upper = row.quantile([alpha / 2, 1 - alpha / 2])
        # An all-missing row has no quantiles; leave it as it is.
        if pd.isnull(lower) or pd.isnull(upper):
            return row
        # clip() returns a new Series: the function given to
        # DataFrame.apply must not mutate the object it receives.
        return row.clip(lower=lower, upper=upper)

    factor_df = jutil.fillinf(factor_df)
    factor_df = _mask_non_index_member(factor_df, index_member)
    return factor_df.apply(_winsorize_row, axis=1)
def standardize(factor_df, index_member=None):
    """Z-score each cross-section (row) of a factor.

    Every row has its own mean subtracted and is divided by its own
    standard deviation as returned by ``DataFrame.std`` (sample standard
    deviation, n-1 degrees of freedom per the original description).

    :param factor_df: factor values (pandas.DataFrame), index is datetime,
        columns are asset codes, e.g.

                        AAPL        BA         CMG
            date
            2016-06-24  0.165260    0.002198   0.085632
            2016-06-27  0.165537    0.003583   0.063299

    :param index_member: forwarded to ``_mask_non_index_member``.
    :return: z-scored factor values, same layout as ``factor_df``.
    """
    cleaned = jutil.fillinf(factor_df)
    cleaned = _mask_non_index_member(cleaned, index_member)

    row_mean = cleaned.mean(axis=1)
    row_std = cleaned.std(axis=1)

    centered = cleaned.sub(row_mean, axis=0)
    return centered.div(row_std, axis=0)
def test_pdutil():
    """Smoke-test ``jutil.fillinf``, ``jutil.to_quantile`` and ``jutil.group_df_to_dict``."""
    frame = pd.DataFrame(np.random.rand(4, 20))

    # Three explicit NaN cells.
    for row, col in ((1, 2), (3, 4), (1, 4)):
        frame.iloc[row, col] = np.nan
    assert frame.isnull().sum().sum() == 3

    # Infinite values are not reported by isnull() ...
    frame.iloc[2, 11] = np.inf
    frame.iloc[2, 12] = -np.inf
    assert frame.isnull().sum().sum() == 3

    # ... but fillinf is expected to turn both of them into NaN.
    without_inf = jutil.fillinf(frame)
    assert without_inf.isnull().sum().sum() == 5

    # Only checks that the call succeeds; the result is not inspected.
    jutil.to_quantile(frame, 5, axis=1)

    labelled = frame.copy()
    labelled['group'] = ['a', 'a', 'b', 'a']
    by_group = jutil.group_df_to_dict(labelled, by='group')
    assert set(by_group.keys()) == {'a', 'b'}
def standarize_factors(factors):
    """Clean and standardize one factor frame or a dict of factor frames.

    A single ``pd.DataFrame`` is wrapped as ``{"factor": factors}``; a dict
    is processed (and updated) in place.  The names ``index_member``,
    ``winsorization`` and ``standardize_type`` are not parameters: they are
    read from the enclosing scope, which is not visible here.

    :param factors: ``pd.DataFrame`` or dict of factor name -> ``pd.DataFrame``.
    :return: dict of factor name -> processed ``pd.DataFrame``.
    :raises ValueError: if ``standardize_type`` is not 'z_score', 'rank' or None.
    """
    factors_dict = {"factor": factors} if isinstance(factors, pd.DataFrame) else factors

    for name, frame in factors_dict.items():
        frame = jutil.fillinf(frame)
        frame = process._mask_non_index_member(frame, index_member=index_member)
        if winsorization:
            frame = process.winsorize(frame)
        # Store the cleaned frame before the type check so the dict is
        # already updated if the check below raises.
        factors_dict[name] = frame

        if standardize_type is None:
            continue
        if standardize_type == "z_score":
            factors_dict[name] = process.standardize(frame)
        elif standardize_type == "rank":
            factors_dict[name] = process.rank_standardize(frame)
        else:
            raise ValueError("standardize_type 只能为'z_score'/'rank'/None")

    return factors_dict
def rank_standardize(factor_df, index_member=None):
    """Replace factor values by their normalized cross-sectional rank.

    The heavy lifting is delegated to ``jutil.rank_with_mask(..., axis=1,
    normalize=True)``.  Per the original description the scores lie in
    0-1 and are ascending (a larger factor value gets a larger score).

    :param factor_df: factor values (pandas.DataFrame), index is datetime,
        columns are asset codes, e.g.

                        AAPL        BA         CMG
            date
            2016-06-24  0.165260    0.002198   0.085632
            2016-06-27  0.165537    0.003583   0.063299

    :param index_member: forwarded to ``_mask_non_index_member``.
    :return: rank-based factor values.
    """
    masked = _mask_non_index_member(jutil.fillinf(factor_df), index_member)
    return jutil.rank_with_mask(masked, axis=1, normalize=True)
def orthogonalize(factors_dict=None, standardize_type="z_score", winsorization=False, index_member=None):
    """Orthogonalize a set of factors date by date and standardize the result.

    For every date of the first factor, the cross-sections of all factors
    are joined on their common assets, rows with a missing value are
    dropped and the resulting matrix is replaced by an orthonormal basis
    computed with ``scipy.linalg.orth``.  Note that ``orth`` is SVD based,
    not a classical Gram-Schmidt sweep; the basis columns are labelled with
    the factor names in their original order.

    :param factors_dict: dict ``{"factor_name_1": factor_1, ...}`` with at
        least two entries; every value is a pd.DataFrame whose index is the
        date and whose columns are the assets.  The dict passed in is not
        modified.
    :param standardize_type: "z_score" for z-score standardization; any
        other value selects rank standardization.
    :param winsorization: winsorize every factor before orthogonalizing.
    :param index_member: forwarded to the masking/standardizing helpers.
    :return: dict factor name -> orthogonalized and standardized
        pd.DataFrame, aligned to the index and columns of the first factor.
        Dates that could not be orthogonalized are all-NaN.
    :raises ValueError: if fewer than two factors are given.
    """
    import warnings

    from scipy import linalg

    if not factors_dict or len(factors_dict) < 2:
        raise ValueError("你需要给定至少2个因子")

    # Clean into a private dict so the caller's mapping keeps its raw frames.
    cleaned = {}
    for factor_name, factor_value in factors_dict.items():
        factor_value = jutil.fillinf(factor_value)
        factor_value = process._mask_non_index_member(
            factor_value, index_member=index_member)
        if winsorization:
            factor_value = process.winsorize(factor_value)
        cleaned[factor_name] = factor_value

    factor_name_list = list(cleaned.keys())
    factor_value_list = list(cleaned.values())
    reference = factor_value_list[0]
    n_factors = len(factor_name_list)

    # Per factor: one single-row frame for every date that was processed.
    rows = {factor_name: [] for factor_name in factor_name_list}
    rank_deficient_dates = 0

    for date in reference.index:
        section = pd.concat(
            [factor_value.loc[date] for factor_value in factor_value_list],
            axis=1, join="inner")
        section = section.dropna()
        if len(section) == 0:
            continue

        basis = linalg.orth(section)
        # orth() returns as many columns as the effective rank.  With
        # collinear factors or fewer assets than factors there are not
        # enough columns to label one per factor, so skip the date.
        if basis.shape[1] != n_factors:
            rank_deficient_dates += 1
            continue

        basis = pd.DataFrame(basis, index=section.index,
                             columns=factor_name_list)
        for factor_name in factor_name_list:
            row = pd.DataFrame(basis[factor_name]).T
            row.index = [date]
            rows[factor_name].append(row)

    if rank_deficient_dates:
        warnings.warn(
            "orthogonalize: skipped {} rank-deficient cross-section(s); "
            "those dates are left as NaN.".format(rank_deficient_dates))

    new_factors_dict = {}
    for factor_name in factor_name_list:
        if rows[factor_name]:
            # Restore the dates and assets dropped during orthogonalization.
            factor_value = pd.concat(rows[factor_name]).reindex(
                index=reference.index, columns=reference.columns)
        else:
            # No usable date at all: pd.concat([]) would raise.
            factor_value = pd.DataFrame(index=reference.index,
                                        columns=reference.columns,
                                        dtype=float)
        if standardize_type == "z_score":
            new_factors_dict[factor_name] = process.standardize(
                factor_value, index_member)
        else:
            new_factors_dict[factor_name] = process.rank_standardize(
                factor_value, index_member)
    return new_factors_dict
def neutralize(factor_df, group, float_mv=None, index_member=None):
    """Neutralize a factor against industry and, optionally, market cap.

    For every date the factor is regressed (least squares, no intercept) on
    industry dummy variables and, when ``float_mv`` is given, on the
    MAD-clipped, z-scored log market cap.  The regression residuals are the
    neutralized factor.

    :param factor_df: factor values (pandas.DataFrame), index is datetime,
        columns are asset codes, e.g.

                        AAPL        BA         CMG
            date
            2016-06-24  0.165260    0.002198   0.085632
            2016-06-27  0.165537    0.003583   0.063299

    :param group: industry classification (pandas.DataFrame), index is
        datetime, columns are asset codes.  Cells equal to the string
        "nan" are ignored.
    :param float_mv: float market cap (pandas.DataFrame), same layout;
        ``None`` disables market-cap neutralization.
    :param index_member: forwarded to the masking/standardizing helpers.
    :return: neutralized factor values (pandas.DataFrame) with the index
        and columns of the input ``factor_df``; cells that could not be
        regressed are NaN.
    """

    def _drop_nan_labels(labels):
        return labels[labels != "nan"]

    def _ols_residual(x, y):
        coef = np.linalg.lstsq(x, y)[0]
        return y - (x @ coef)

    def _cross_sectional_residuals(panel):
        for _, section in panel.groupby(level=0):
            signal = section.pop("signal")
            dummies = pd.get_dummies(section.pop("industry"))
            design = pd.concat([section, dummies], axis=1)
            # Force a float matrix: bool dummies next to a float "style"
            # column would otherwise give an object array that lstsq
            # rejects.
            resid = _ols_residual(design.values.astype(float), signal)
            yield pd.Series(resid, index=signal.index, name=signal.name)

    # Remembered to restore the original layout at the end.
    origin_factor_columns = factor_df.columns
    origin_factor_index = factor_df.index

    factor_df = jutil.fillinf(factor_df)
    factor_df = _mask_non_index_member(factor_df, index_member)
    # Drop dates without any value, then switch to long format.
    factor_df = factor_df.dropna(how="all").stack().rename("signal")
    data = [factor_df]

    # Log float market cap, outlier-clipped and standardized.  Not needed
    # when the factor itself is a market-cap factor.
    if float_mv is not None:
        float_mv = standardize(
            mad(np.log(float_mv), index_member=index_member),
            index_member).stack().rename("style")
        data.append(float_mv)

    industry_standard = _drop_nan_labels(group.stack()).rename("industry")
    data.append(industry_standard)

    data = pd.concat(data, axis=1).dropna()
    residual_parts = list(_cross_sectional_residuals(data))
    if not residual_parts:
        # Nothing to regress: pd.concat([]) would raise, so return an
        # all-NaN frame in the original layout instead.
        return pd.DataFrame(index=origin_factor_index,
                            columns=origin_factor_columns, dtype=float)

    residuals = pd.concat(residual_parts).unstack()
    # Restore the rows and columns dropped during neutralization.
    return residuals.reindex(index=origin_factor_index,
                             columns=origin_factor_columns)
def get_signal_data(self, signal):
    """Build the long-format signal table.

    Returns
    -------
    res : pd.DataFrame
        Index is pd.MultiIndex ['trade_date', 'symbol'],
        columns = ['signal', 'return', 'upside_ret(N)','downside_ret(N)','quantile']

    """

    def _to_long(frame):
        # Keep NaN cells so every stacked frame shares the same index.
        stacked = pd.DataFrame(frame.stack(dropna=False))
        stacked.index.names = ['trade_date', 'symbol']
        stacked.sort_index(axis=0, level=['trade_date', 'symbol'], inplace=True)
        return stacked

    self._judge(signal)  # check that signal is consistent with the other key inputs
    self._cal_ret()  # compute the signal returns

    # Lag by one row to avoid forward-looking bias.
    lagged = jutil.fillinf(signal).shift(1)
    if not self.forward:
        lagged = lagged.shift(self.period)

    # Cells that are masked or have no signal value.
    invalid = np.logical_or(self.mask, lagged.isnull())

    usable = lagged.copy()[~invalid]
    if self.n_quantiles != 1:
        quantiles = jutil.to_quantile(usable, n_quantiles=self.n_quantiles)
    else:
        quantiles = usable.copy()
        quantiles.loc[:, :] = 1.0

    res = _to_long(lagged)
    res.columns = ['signal']
    for ret_type, ret_frame in self.signal_ret.items():
        if ret_frame is not None:
            res[ret_type] = _to_long(ret_frame).fillna(0)
    if self.group is not None:
        res["group"] = _to_long(self.group)
    res['quantile'] = _to_long(quantiles)

    invalid_long = _to_long(invalid)
    res = res.loc[~(invalid_long.iloc[:, 0]), :]

    if len(res) > 0:
        nan_count = res.isnull().sum(axis=0).sum()
        effective_pct = len(res) * 100. / lagged.size
        print("Nan Data Count (should be zero) : {:d}; "
              "Percentage of effective data: {:.0f}%".format(nan_count, effective_pct))
    else:
        print("No signal available.")

    res = res.astype({'signal': float, 'return': float, 'quantile': int})
    return res
def neutralize(factor_df, group, float_mv=None, index_member=None):
    """Neutralize a factor against industry and, optionally, market cap.

    For every date the factor is regressed (no intercept) on industry dummy
    variables and, when ``float_mv`` is given, on the winsorized, z-scored
    log market cap.  The regression residuals are the neutralized factor.

    :param factor_df: factor values (pandas.DataFrame), index is datetime,
        columns are asset codes, e.g.

                        AAPL        BA         CMG
            date
            2016-06-24  0.165260    0.002198   0.085632
            2016-06-27  0.165537    0.003583   0.063299

    :param group: industry classification (pandas.DataFrame) with the same
        index and columns as ``factor_df``.
    :param float_mv: float market cap (pandas.DataFrame) with the same
        index and columns; ``None`` disables market-cap neutralization.
    :param index_member: forwarded to the masking/standardizing helpers.
    :return: neutralized factor values (pandas.DataFrame) with the index
        and columns of the input ``factor_df``; dates or assets that could
        not be regressed are NaN.
    :raises AssertionError: if ``group`` or ``float_mv`` is not aligned
        with ``factor_df``.
    """
    assert np.all(factor_df.index == group.index)
    assert np.all(factor_df.columns == group.columns)

    # Log float market cap, winsorized and standardized.  Not needed when
    # the factor itself is a market-cap factor.
    if float_mv is not None:
        assert np.all(factor_df.index == float_mv.index)
        assert np.all(factor_df.columns == float_mv.columns)
        x1 = standardize(
            winsorize(np.log(float_mv), index_member=index_member),
            index_member)

    # Remember the input layout: the all-NaN dates dropped below must come
    # back in the result, so reindexing to the *filtered* frame is wrong.
    origin_index = factor_df.index
    origin_columns = factor_df.columns

    factor_df = jutil.fillinf(factor_df)
    factor_df = _mask_non_index_member(factor_df, index_member)
    factor_df = factor_df.dropna(how="all")  # drop dates without any value

    result = []
    # Regress cross-section by cross-section; keep the residuals.
    for i in factor_df.index:
        industry_dummies = pd.get_dummies(group.loc[i, :].dropna())
        if float_mv is not None:
            parts = [industry_dummies, x1.loc[i], factor_df.loc[i]]
        else:
            parts = [industry_dummies, factor_df.loc[i]]
        # Drop the assets with any missing value in this cross-section.
        section = pd.concat(parts, axis=1).dropna()
        if len(section) == 0:
            continue

        # Last column is the factor, everything before it is a regressor.
        nfactors = section.shape[1] - 1
        # Plain float ndarrays instead of the deprecated np.matrix.
        design = section.iloc[:, :nfactors].values.astype(float)
        target = section.iloc[:, nfactors].values.astype(float)

        regr = linear_model.LinearRegression(fit_intercept=False)
        regr.fit(design, target)
        resid = target - regr.predict(design)
        result.append(pd.Series(resid, index=section.index, name=i))

    if not result:
        # No usable cross-section: pd.concat([]) would raise.
        return pd.DataFrame(index=origin_index, columns=origin_columns,
                            dtype=float)

    # Merge the residuals and restore the rows and columns dropped above.
    result = pd.concat(result, axis=1).reindex(origin_columns).T
    return result.reindex(origin_index)
def process_signal_before_analysis(self, signal, price=None, daily_ret=None,
                                   benchmark_price=None,
                                   daily_benchmark_ret=None, high=None,
                                   low=None, group=None, period=5,
                                   n_quantiles=5, mask=None, can_enter=None,
                                   can_exit=None, forward=True,
                                   commission=0.0008):
    """
    Prepare for signal analysis.

    The result is stored in ``self.signal_data``; nothing is returned.
    ``self.n_quantiles``, ``self.period``, ``self.benchmark_ret`` and
    ``self.ret`` are set as well.

    Parameters
    ----------
    signal : pd.DataFrame
        Index is date, columns are stocks.
    price : pd.DataFrame
        Index is date, columns are stocks.
    high : pd.DataFrame
        Index is date, columns are stocks.
    low : pd.DataFrame
        Index is date, columns are stocks.
    daily_ret : pd.DataFrame
        Index is date, columns are stocks.
    daily_benchmark_ret : pd.DataFrame or pd.Series or None
        Daily ret of benchmark.
    group : pd.DataFrame
        Index is date, columns are stocks.
    benchmark_price : pd.DataFrame or pd.Series or None
        Price of benchmark.
    mask : pd.DataFrame
        Data cells that should NOT be used.
    can_enter: pd.DataFrame
        Date the security can be traded and BUY.
    can_exit:pd.DataFrame
        Date the security can be traded and SELL.
    n_quantiles : int
    period : int
        periods to compute forward returns on.
    forward :bool
        Return cal method. True by default.
    commission: float
        commission ratio per trade.

    Raises
    ------
    ValueError
        If not exactly one of price / daily_ret is given, if both
        benchmark inputs are given, or if n_quantiles is not a positive
        int.

    Notes
    -----
    ``self.signal_data`` has a pd.MultiIndex ['trade_date', 'symbol'] and
    the columns 'signal', 'return', 'upside_ret', 'downside_ret',
    'quantile' (plus 'group' when a group frame is given).

    Deal with suspensions:
        If the period of calculating return is d (from T to T+d), then
        we do not use signal values of those suspended on T,
        we do not calculate return for those suspended on T+d.
    """
    # ----------------------------------------------------------------------
    # parameter validation
    if price is None and daily_ret is None:
        raise ValueError("One of price / daily_ret must be provided.")
    if price is not None and daily_ret is not None:
        raise ValueError(
            "Only one of price / daily_ret should be provided.")
    if benchmark_price is not None and daily_benchmark_ret is not None:
        raise ValueError(
            "Only one of benchmark_price / daily_benchmark_ret should be provided."
        )
    # Type check first so a non-numeric value gives the ValueError below
    # instead of a TypeError from the comparison.
    if not (isinstance(n_quantiles, int) and n_quantiles > 0):
        raise ValueError(
            "n_quantiles must be a positive integer. Input is: {}".format(
                n_quantiles))
    if daily_ret is not None:
        warnings.warn(
            "Warning: 检查到使用daily_ret模式。未避免未来函数,请注意确保daily_ret格式为对应日期能实现的日收益."
        )

    def _align(frame, label):
        """Warn and reindex ``frame`` when its axes differ from ``signal``."""
        # Index.equals never raises (unlike ``==`` on different lengths)
        # and, unlike an assert, is not stripped by ``python -O``.
        same_axes = (signal.index.equals(frame.index)
                     and signal.columns.equals(frame.columns))
        if not same_axes:
            warnings.warn(
                "Warning: signal与%s的index/columns不一致,请检查输入参数!" % label)
            frame = frame.reindex_like(signal)
        return frame

    def _bool_flags(frame, label, default):
        """Aligned boolean flag frame, or a constant frame when omitted."""
        if frame is None:
            return pd.DataFrame(index=signal.index, columns=signal.columns,
                                data=default)
        frame = jutil.fillinf(_align(frame, label))
        # dtype could be float, so convert.  Fill missing cells BEFORE the
        # integer cast: NaN cannot be represented as an integer.
        return frame.fillna(0).astype(int).astype(bool)

    # ensure inputs are aligned
    mask = _bool_flags(mask, "mask", False)
    can_enter = _bool_flags(can_enter, "can_enter", True)
    can_exit = _bool_flags(can_exit, "can_exit", True)
    if group is not None:
        group = _align(group, "group").astype(str)

    # ----------------------------------------------------------------------
    # save data
    self.n_quantiles = n_quantiles
    self.period = period

    # ----------------------------------------------------------------------
    # Get dependent variables
    # benchmark return
    self.benchmark_ret = None
    if benchmark_price is not None:
        benchmark_price = benchmark_price.reindex(index=signal.index)
        self.benchmark_ret = pfm.price2ret(benchmark_price, self.period,
                                           axis=0, compound=True)
    elif daily_benchmark_ret is not None:
        daily_benchmark_ret = daily_benchmark_ret.reindex(
            index=signal.index)
        self.benchmark_ret = pfm.daily_ret_to_ret(daily_benchmark_ret,
                                                  self.period)

    # holding-period return
    is_real_price = daily_ret is None
    if daily_ret is not None:
        daily_ret = _align(daily_ret, "daily_ret")
        daily_ret = jutil.fillinf(daily_ret).fillna(0)
        price = pfm.daily_ret_to_cum(daily_ret)
    else:
        price = jutil.fillinf(_align(price, "price"))

    # A position cannot be opened where there is no price.  (Comparing
    # with NaN via ``!=`` is always True, so notnull() is required.)
    can_enter = np.logical_and(price.notnull(), can_enter)

    df_ret = pfm.price2ret(price, period=self.period, axis=0,
                           compound=True)
    # Where exiting is not allowed, use the next price at which it is.
    price_can_exit = price.copy()
    price_can_exit[~can_exit] = np.nan
    price_can_exit = price_can_exit.bfill()
    ret_can_exit = pfm.price2ret(price_can_exit, period=self.period,
                                 axis=0, compound=True)
    df_ret[~can_exit] = ret_can_exit[~can_exit]

    if self.benchmark_ret is not None:
        # holding-period return relative to the benchmark
        residual_ret = df_ret.sub(self.benchmark_ret.values.flatten(),
                                  axis=0)
    else:
        residual_ret = df_ret
    residual_ret = jutil.fillinf(residual_ret)
    residual_ret -= commission

    # potential upside and downside within the holding period
    if high is not None and is_real_price:
        high = jutil.fillinf(_align(high, "high"))
    else:
        high = price
    upside_ret = compute_upside_returns(price, high, can_exit, self.period,
                                        compound=True)
    upside_ret = jutil.fillinf(upside_ret)
    upside_ret -= commission

    if low is not None and is_real_price:
        low = jutil.fillinf(_align(low, "low"))
    else:
        low = price
    downside_ret = compute_downside_returns(price, low, can_exit,
                                            self.period, compound=True)
    downside_ret = jutil.fillinf(downside_ret)
    downside_ret -= commission

    # ----------------------------------------------------------------------
    # Get independent variable
    signal = jutil.fillinf(signal)
    signal = signal.shift(1)  # avoid forward-looking bias

    if forward:
        # point-in-time signal and forward return
        residual_ret = residual_ret.shift(-self.period)
        upside_ret = upside_ret.shift(-self.period)
        downside_ret = downside_ret.shift(-self.period)
    else:
        # past signal and point-in-time return
        signal = signal.shift(self.period)
        can_enter = can_enter.shift(self.period)
        mask = mask.shift(self.period)

    self.ret = dict()
    self.ret["return"] = residual_ret
    self.ret["upside_ret"] = upside_ret
    self.ret["downside_ret"] = downside_ret

    # ----------------------------------------------------------------------
    # get masks
    # shift() can leave object dtype behind; cast back to bool so that
    # ``~`` is a logical negation and not a bitwise one on Python bools.
    masked_input = mask.fillna(True).astype(bool)
    enterable = can_enter.fillna(False).astype(bool)
    mask = np.logical_or(masked_input,
                         np.logical_or(signal.isnull(), ~enterable))
    mask = np.logical_or(mask, self.ret["return"].isnull())

    # ----------------------------------------------------------------------
    # calculate quantile
    signal_masked = signal[~mask]
    if n_quantiles == 1:
        df_quantile = signal_masked.copy()
        df_quantile.loc[:, :] = 1.0
    elif group is None:
        df_quantile = jutil.to_quantile(signal_masked,
                                        n_quantiles=n_quantiles)
    else:
        from jaqs_fxdayu.data.py_expression_eval import Parser
        ps = Parser()
        ps.index_member = None
        df_quantile = ps.group_quantile(df=signal_masked, group=group,
                                        n_quantiles=n_quantiles)

    # ----------------------------------------------------------------------
    # stack
    def stack_td_symbol(df):
        df = pd.DataFrame(df.stack(dropna=False))  # do not dropna
        df.index.names = ['trade_date', 'symbol']
        df.sort_index(axis=0, level=['trade_date', 'symbol'], inplace=True)
        return df

    # ----------------------------------------------------------------------
    # concat signal value
    res = stack_td_symbol(signal)
    res.columns = ['signal']
    for ret_type in self.ret.keys():
        res[ret_type] = stack_td_symbol(self.ret[ret_type]).fillna(0)
    res['quantile'] = stack_td_symbol(df_quantile)
    if group is not None:
        res["group"] = stack_td_symbol(group)

    mask = stack_td_symbol(mask)
    res = res.loc[~(mask.iloc[:, 0]), :]

    if len(res) > 0:
        print("Nan Data Count (should be zero) : {:d}; "
              "Percentage of effective data: {:.0f}%".format(
                  res.isnull().sum(axis=0).sum(),
                  len(res) * 100. / signal.size))
    else:
        print("No signal available.")
    res = res.astype({'signal': float, 'return': float, 'quantile': int})
    self.signal_data = res
def process_signal(self, enter_signal, exit_signal=None, sig_type="long",
                   price=None, daily_ret=None, max_holding_period=None,
                   stoploss=None, stopprofit=None, mask=None,
                   can_enter=None, can_exit=None, group=None,
                   n_quantiles=1, commission=0.0008):
    """
    Prepare for signal analysis.

    Nothing is returned.  The method sets ``self.price``,
    ``self.final_exit_pos[sig_type]`` and ``self.ret[sig_type]`` and, when
    at least one usable row remains, ``self.signal_data[sig_type]``.  The
    frames passed in (including a list given as ``exit_signal``) are not
    modified.

    Parameters
    ----------
    enter_signal : pd.DataFrame
        Index is date, columns are stocks.value can only be -2/0/2
    exit_signal : pd.DataFrame/list of pd.DataFrame
        Index is date, columns are stocks.value can only be -1/0/1
    sig_type: str
        "long"/"short", which type of signal to process
    price : pd.DataFrame
        Index is date, columns are stocks.
    daily_ret : pd.DataFrame
        Index is date, columns are stocks.
    mask : pd.DataFrame
        Data cells that should NOT be used.
    can_enter: pd.DataFrame
        Date the security can open.
    can_exit:pd.DataFrame
        Date the security can close.
    max_holding_period : int
        Limit the max holding period
    stoploss:float
        stoploss ratio per trade
    stopprofit:float
        stopprofit ratio per trade
    n_quantiles: int
    group : pd.DataFrame
        Index is date, columns are stocks.
    commission: float
        commission ratio per trade.

    Raises
    ------
    ValueError
        On an invalid ``sig_type`` / price / ``n_quantiles`` combination,
        on signal values outside the allowed sets, or when no exit rule is
        available.
    """
    # parameter validation
    if sig_type not in ["long", "short"]:
        raise ValueError("信号类型(sig_type)只能为long/short.")
    if price is None and daily_ret is None:
        raise ValueError("One of price / daily_ret must be provided.")
    if price is not None and daily_ret is not None:
        raise ValueError(
            "Only one of price / daily_ret should be provided.")
    # Type check first so a non-numeric value gives the ValueError below
    # instead of a TypeError from the comparison.
    if not (isinstance(n_quantiles, int) and n_quantiles > 0):
        raise ValueError(
            "n_quantiles must be a positive integer. Input is: {}".format(
                n_quantiles))

    enter_signal = jutil.fillinf(enter_signal)
    if n_quantiles == 1:  # event-style entry signal
        # only -2 (open short), 0 (no action) and 2 (open long) are allowed
        enter_signal = enter_signal.fillna(0)
        if not enter_signal.isin([-2, 0, 2]).all().all():
            raise ValueError("检测到n_quantiles为1,该模式下测试的enter_signal为事件类因子."
                             "请确保enter_signal里的信号只能为-2(开空),0(不做操作),2(开多))."
                             "如需测试普通因子,请指定n_quantiles为大于1的整数.")
        # at least one exit rule is required
        if (exit_signal is None) and (max_holding_period is None) and \
                (stoploss is None) and (stopprofit is None):
            raise ValueError(
                "确保至少有一种出场信号(exit_signal/max_holding_period/stoploss/stopprofit)"
            )
    else:  # ordinary (factor-style) entry signal
        if max_holding_period is None:
            raise ValueError("检测到n_quantiles不为1,该模式下测试的enter_signal为普通因子."
                             "该模式下,max_holding_period参数不能为空.")
        # NOTE(review): the original nesting of this assignment could not
        # be recovered with certainty; it is kept inside the factor branch.
        self.period = max_holding_period

    # Build a private list of cleaned exit frames.  Writing back into the
    # caller's list would make a second call with the same list (e.g. long
    # then short) clean and shift the frames twice.
    exit_frames = []
    if exit_signal is not None:
        if not isinstance(exit_signal, list):
            exit_signal = [exit_signal]
        for frame in exit_signal:
            frame = frame.reindex_like(enter_signal)
            frame = jutil.fillinf(frame).fillna(0)
            # only -1 (close short), 0 (no action) and 1 (close long)
            if not frame.isin([-1, 0, 1]).all().all():
                raise ValueError(
                    "请确保所有exit_signal里的信号只能为-1(平空),0(不做操作),1(平多)")
            exit_frames.append(frame)

    if group is not None:
        group = group.reindex_like(enter_signal)

    sig_filter = {
        "mask": mask,
        "can_enter": can_enter,
        "can_exit": can_exit,
    }
    for _filter, frame in sig_filter.items():
        if frame is not None:
            frame = jutil.fillinf(frame.reindex_like(enter_signal))
            # Fill BEFORE the integer cast: reindexing can introduce NaN,
            # which cannot be represented as an integer.
            sig_filter[_filter] = frame.fillna(0).astype(int)
        else:
            sig_filter[_filter] = pd.DataFrame(
                index=enter_signal.index,
                columns=enter_signal.columns,
                data=0 if _filter == "mask" else 1)

    # process
    # =============================================================
    # Signals are observed at the close and executed on the next trading
    # day (at the open, close, vwap, ... depending on the price passed
    # in), hence the one-row lag to avoid look-ahead.
    enter_signal = enter_signal.shift(1)
    exit_frames = [frame.shift(1) for frame in exit_frames]

    # price data
    if daily_ret is not None:
        daily_ret = daily_ret.reindex_like(enter_signal)
        daily_ret = jutil.fillinf(daily_ret).fillna(0)
        price = pfm.daily_ret_to_cum(daily_ret)  # net value
    else:
        price = price.reindex_like(enter_signal)
        price = jutil.fillinf(price)  # raw price
    self.price = price

    # =====================
    # exit positions
    pos = []
    # time-based exit
    if max_holding_period is not None:
        pos.append(
            get_period_exit_pos(enter_signal, period=max_holding_period))
    # stop-loss exit
    if stoploss is not None:
        pos.append(
            get_stop_pos(price, stoploss, sig_type=sig_type,
                         stop_type="stop_loss"))
    # stop-profit exit
    if stopprofit is not None:
        pos.append(
            get_stop_pos(price, stopprofit, sig_type=sig_type,
                         stop_type="stop_profit"))
    # user-defined exit signals
    for frame in exit_frames:
        pos.append(get_exit_pos(frame, exit_type="close_%s" % (sig_type, )))

    # combine all exit rules: the first one triggered wins
    exit_pos = reduce(get_first_pos, pos).replace(LONGINT, np.nan)
    # nearest position at which exiting is permitted, per day
    exit_permited_pos = get_exit_pos(sig_filter["can_exit"], value=[1])
    self.final_exit_pos[sig_type] = get_exit_value(exit_permited_pos,
                                                   exit_pos)

    # =====================
    # signal return
    price_exit = get_exit_value(price, self.final_exit_pos[sig_type])
    ret_exit = jutil.fillinf((price_exit - price) / price)
    if sig_type == "short":
        ret_exit = -1 * ret_exit
    self.ret[sig_type] = ret_exit - commission

    # =====================
    # signal_data
    # ----------------------------------------------------------------------
    # mask signal
    if n_quantiles == 1:  # event signal
        value = 2 if sig_type == "long" else -2
        mask_signal = enter_signal != value
    else:  # ordinary factor
        mask_signal = enter_signal.isnull()

    mask_signal = np.logical_or(
        mask_signal,
        np.logical_or(sig_filter["mask"], sig_filter["can_enter"] != 1))
    mask_signal = np.logical_or(mask_signal, self.ret[sig_type].isnull())

    # drop entries whose exit falls on the entry day itself
    sig_pos = get_sig_pos(self.final_exit_pos[sig_type])
    mask_signal = np.logical_or(mask_signal,
                                sig_pos == self.final_exit_pos[sig_type])

    # calculate quantile
    if n_quantiles == 1:
        df_quantile = pd.DataFrame(1, index=enter_signal.index,
                                   columns=enter_signal.columns)
    else:
        signal_masked = enter_signal[~mask_signal]
        if group is None:
            df_quantile = jutil.to_quantile(signal_masked,
                                            n_quantiles=n_quantiles)
        else:
            from jaqs_fxdayu.data.py_expression_eval import Parser
            ps = Parser()
            ps.index_member = None
            df_quantile = ps.group_quantile(df=signal_masked, group=group,
                                            n_quantiles=n_quantiles)

    # ----------------------------------------------------------------------
    # concat signal value
    res = stack_td_symbol(enter_signal)
    res.columns = ['signal']
    res["return"] = stack_td_symbol(self.ret[sig_type])
    res["exit_time"] = stack_td_symbol(self.final_exit_pos[sig_type])
    res['quantile'] = stack_td_symbol(df_quantile)
    if group is not None:
        res["group"] = stack_td_symbol(group)
    res["sig_type"] = sig_type

    mask_signal = stack_td_symbol(mask_signal)
    res = res.loc[~(mask_signal.iloc[:, 0]), :]

    if len(res) > 0:
        print("Nan Data Count (should be zero) : {:d}; "
              "Percentage of effective data: {:.0f}%".format(
                  res.isnull().sum(axis=0).sum(),
                  len(res) * 100. / enter_signal.size))
        res = res.astype({
            'signal': float,
            'return': float,
            'quantile': int
        })
        self.signal_data[sig_type] = res
    else:
        print("sig_type %s:No signal available." % (sig_type, ))
def process_signal_before_analysis(self, signal, price=None, ret=None, benchmark_price=None,
                                   period=5, n_quantiles=5, mask=None, forward=False):
    """
    Prepare signal, return and quantile data for signal analysis.

    The result is stored in ``self.signal_data`` (nothing is returned): a
    DataFrame indexed by a MultiIndex ['trade_date', 'symbol'] with columns
    ['signal', 'return', 'quantile'].

    Parameters
    ----------
    signal : pd.DataFrame
        Index is date, columns are stocks.
    price : pd.DataFrame, optional
        Index is date, columns are stocks. Exactly one of price / ret
        must be given.
    ret : pd.DataFrame, optional
        Index is date, columns are stocks. Used as the return directly.
    benchmark_price : pd.DataFrame or pd.Series or None
        Price of benchmark. Only allowed together with `price`; when given,
        the benchmark return is subtracted from the stock returns.
    period : int
        Periods to compute returns on.
    n_quantiles : int
        Positive number of quantile buckets; 1 puts every cell in bucket 1.
    mask : pd.DataFrame, optional
        Data cells that should NOT be used. Must be aligned with `signal`.
    forward : bool
        If True, pair the point-in-time signal with the forward return;
        otherwise pair the past signal with the point-in-time return.

    Raises
    ------
    ValueError
        If the price / ret / benchmark_price / n_quantiles combination is
        invalid.
    AssertionError
        If signal, price (or ret) and mask are not aligned.

    Notes
    -----
    Deal with suspensions: if the period of calculating return is d
    (from T to T+d), then we do not use signal values of those suspended
    on T, and we do not calculate return for those suspended on T+d.
    """
    # ----------------------------------------------------------------------
    # parameter validation
    if price is None and ret is None:
        raise ValueError("One of price / ret must be provided.")
    if price is not None and ret is not None:
        raise ValueError("Only one of price / ret should be provided.")
    if ret is not None and benchmark_price is not None:
        raise ValueError("You choose 'return' mode but benchmark_price is given.")
    if not (n_quantiles > 0 and isinstance(n_quantiles, int)):
        raise ValueError("n_quantiles must be a positive integer. Input is: {}".format(n_quantiles))

    # ensure inputs are aligned
    data = price if price is not None else ret
    assert np.all(signal.index == data.index)
    assert np.all(signal.columns == data.columns)
    if mask is not None:
        assert np.all(signal.index == mask.index)
        assert np.all(signal.columns == mask.columns)
        mask = jutil.fillinf(mask)
        # dtype of mask could be float, so we need to convert. Missing cells
        # are filled BEFORE the integer cast: NaN cannot be cast to int, and
        # a fillna() after the cast would never have anything to fill.
        mask = mask.fillna(0).astype(int).astype(bool)
    else:
        mask = pd.DataFrame(index=signal.index, columns=signal.columns, data=False)
    signal = jutil.fillinf(signal)
    # NOTE(review): the cleaned `data` is not used below (returns are computed
    # from the raw `price` / `ret`); call kept as in the original.
    data = jutil.fillinf(data)

    # ----------------------------------------------------------------------
    # save data
    self.n_quantiles = n_quantiles
    self.period = period

    # ----------------------------------------------------------------------
    # Get dependent variables
    if price is not None:
        df_ret = pfm.price2ret(price, period=self.period, axis=0)
        if benchmark_price is not None:
            benchmark_price = benchmark_price.loc[signal.index]
            bench_ret = pfm.price2ret(benchmark_price, self.period, axis=0)
            self.benchmark_ret = bench_ret
            residual_ret = df_ret.sub(bench_ret.values.flatten(), axis=0)
        else:
            residual_ret = df_ret
    else:
        residual_ret = ret

    # Get independent variable
    signal = signal.shift(1)  # avoid forward-looking bias

    # forward or not
    if forward:
        # point-in-time signal and forward return
        residual_ret = residual_ret.shift(-self.period)
    else:
        # past signal and point-in-time return
        signal = signal.shift(self.period)

    # ----------------------------------------------------------------------
    # get masks
    mask_price_return = residual_ret.isnull()
    mask_signal = signal.isnull()
    # Combine the caller-supplied mask with the NaN masks. Previously the NaN
    # masks overwrote `mask`, so the argument was silently ignored.
    # NOTE(review): the user mask is applied on the output row labels, i.e. it
    # is not shifted together with `signal` - confirm this alignment is wanted.
    mask = np.logical_or(mask, np.logical_or(mask_signal, mask_price_return))

    # ----------------------------------------------------------------------
    # calculate quantile
    signal_masked = signal.copy()
    signal_masked = signal_masked[~mask]
    if n_quantiles == 1:
        df_quantile = signal_masked.copy()
        df_quantile.loc[:, :] = 1.0
    else:
        df_quantile = jutil.to_quantile(signal_masked, n_quantiles=n_quantiles)

    # ----------------------------------------------------------------------
    # stack
    def stack_td_symbol(df):
        df = pd.DataFrame(df.stack(dropna=False))  # do not dropna
        df.index.names = ['trade_date', 'symbol']
        df.sort_index(axis=0, level=['trade_date', 'symbol'], inplace=True)
        return df

    mask = stack_td_symbol(mask)
    df_quantile = stack_td_symbol(df_quantile)
    residual_ret = stack_td_symbol(residual_ret)

    # ----------------------------------------------------------------------
    # concat signal value
    res = stack_td_symbol(signal)
    res.columns = ['signal']
    res['return'] = residual_ret
    res['quantile'] = df_quantile
    res = res.loc[~(mask.iloc[:, 0]), :]

    print("Nan Data Count (should be zero) : {:d}; "
          "Percentage of effective data: {:.0f}%".format(res.isnull().sum(axis=0).sum(),
                                                         len(res) * 100. / signal.size))
    res = res.astype({'signal': float, 'return': float, 'quantile': int})
    self.signal_data = res
def get_factors_ret_df(factors_dict, price, high=None, low=None, group=None, benchmark_price=None,
                       period=5, quantiles=5, mask=None, can_enter=None, can_exit=None,
                       commission=0.0008, forward=True, ret_type="return", **kwargs):
    """
    Build the matrix of factor-return series for several factors.

    For every trade date (and every group, when `group` is given) the chosen
    return is regressed on the lagged factor values with ``sm.OLS``; the
    fitted parameters are reported as the factor returns.

    :param factors_dict: dict of factors, in the form
        {"factor_name_1": factor_1, "factor_name_2": factor_2}
    :param price: price time series of the stocks (pd.DataFrame); index is
        datetime, columns are stock codes.
    :param high: optional high prices, forwarded to SignalCreator.
    :param low: optional low prices, forwarded to SignalCreator.
    :param group: optional grouping frame; when given the regression is run
        per (trade_date, group).
    :param benchmark_price: benchmark; when not None relative returns are
        computed, otherwise absolute returns.
    :param period: holding period (int).
    :param quantiles: number of quantiles the stock pool is split into by
        factor value (int).
    :param mask: forwarded to SignalCreator.
    :param can_enter: forwarded to SignalCreator.
    :param can_exit: forwarded to SignalCreator.
    :param commission: forwarded to SignalCreator.
    :param forward: if False the factor is additionally shifted by `period`.
    :param ret_type: one of "return", "upside_ret", "downside_ret"
        (None means "return").
    :param kwargs: accepted and ignored.
    :return: ret_df, the factor-return matrix (pd.DataFrame); index is
        datetime (or (trade_date, group)), columns are the factor names of
        factors_dict. e.g.:

                        BP	      CFP	   EP	  ILLIQUIDITY	REVS20	   SRMI	   VOL20
        date
        2016-06-24	0.165260	0.002198	0.085632	-0.078074	0.173832	0.214377	0.068445
        2016-06-27	0.165537	0.003583	0.063299	-0.048674	0.180890	0.202724	0.081748
        2016-06-28	0.135215	0.010403	0.059038	-0.034879	0.111691	0.122554	0.042489
        2016-06-29	0.068774	0.019848	0.058476	-0.049971	0.042805	0.053339	0.079592
        2016-06-30	0.039431	0.012271	0.037432	-0.027272	0.010902	0.077293	-0.050667
    :raises ValueError: on an unsupported ret_type, an empty / non-DataFrame
        factor, or when the requested return cannot be computed.
    """

    def _to_long(frame):
        # Wide (date x symbol) -> long frame indexed by (trade_date, symbol).
        stacked = pd.DataFrame(frame.stack(dropna=False))  # do not dropna
        stacked.index.names = ['trade_date', 'symbol']
        stacked.sort_index(axis=0, level=['trade_date', 'symbol'], inplace=True)
        return stacked

    def _fit_cross_section(section):
        # Regress the cross-section's return on its factor columns.
        target = section.pop("return")
        if "group" in section.columns:
            section = section.drop("group", axis=1)
        return sm.OLS(target, section).fit().params

    if ret_type is None:
        ret_type = "return"
    if ret_type not in ["return", "upside_ret", "downside_ret"]:
        raise ValueError(
            "不支持对%s收益的ic计算!支持的收益类型有return, upside_ret, downside_ret."
            % (ret_type, ))

    creator = SignalCreator(price,
                            high=high,
                            low=low,
                            group=group,
                            benchmark_price=benchmark_price,
                            period=period,
                            n_quantiles=quantiles,
                            mask=mask,
                            can_enter=can_enter,
                            can_exit=can_exit,
                            forward=forward,
                            commission=commission)

    # Union of all factor dates; used as the index of the returned frame.
    date_series = [pd.Series(factor.index) for factor in factors_dict.values()]
    times = sorted(pd.concat(date_series).unique())

    table = None
    for factor_name, factor in factors_dict.items():
        if not isinstance(factor, pd.DataFrame) or factor.size == 0:
            raise ValueError("因子%s为空或不合法!请确保传入因子有值且数据类型为pandas.DataFrame." %
                             (factor_name, ))
        creator._judge(factor)
        creator._cal_ret()
        if ret_type not in creator.signal_ret.keys():
            raise ValueError("无法计算%s收益,请重新设置输入参数." % (ret_type, ))
        if table is None:
            # The return column is built once, from the first factor's pass.
            table = _to_long(creator.signal_ret[ret_type]).fillna(0)
            table.columns = ["return"]
        lagged = jutil.fillinf(factor).shift(1)  # avoid forward-looking bias
        if not forward:
            lagged = lagged.shift(period)
        table[factor_name] = _to_long(lagged)

    group_keys = ['trade_date']
    if group is not None:
        table["group"] = _to_long(group)
        group_keys.append('group')

    table = table.dropna()
    result = table.groupby(group_keys).apply(_fit_cross_section).dropna(how="all")
    if group is None:
        return result.reindex(times)
    full_index = pd.MultiIndex.from_product([times, result.index.levels[1]],
                                            names=["trade_date", "group"])
    return result.reindex(full_index)
def _cal_ret(self):
    """
    Compute, once, the holding-period returns used to evaluate signals.

    Does nothing when ``self.signal_ret`` is already populated. Otherwise it
    reads ``benchmark_price`` / ``daily_benchmark_ret``, ``daily_ret`` /
    ``price``, ``high``, ``low``, ``can_enter``, ``can_exit``, ``mask``,
    ``period``, ``commission`` and ``forward`` from ``self`` and writes:

    * ``self.benchmark_ret`` - benchmark return over ``period`` (when a
      benchmark is available);
    * ``self.price`` - cleaned price, or the cumulative series derived from
      ``daily_ret``;
    * ``self.signal_ret`` - dict with keys "return", "upside_ret" and
      "downside_ret", each net of ``commission``;
    * ``self.can_enter`` / ``self.mask`` - shifted by ``period`` when not in
      forward mode; ``mask`` finally also covers every cell that cannot be
      entered.

    Returns
    -------
    None
    """
    if self.signal_ret is not None:
        return

    # benchmark return
    if self.benchmark_price is not None:
        self.benchmark_ret = pfm.price2ret(self.benchmark_price,
                                           self.period,
                                           axis=0,
                                           compound=True)
    elif self.daily_benchmark_ret is not None:
        self.benchmark_ret = pfm.daily_ret_to_ret(self.daily_benchmark_ret,
                                                  self.period)

    # holding-period return
    is_real_price = False
    if self.daily_ret is not None:
        self.daily_ret = jutil.fillinf(self.daily_ret).fillna(0)
        self.price = pfm.daily_ret_to_cum(self.daily_ret)
    else:
        # real prices were supplied
        is_real_price = True
        self.price = jutil.fillinf(self.price)
        # A cell can only be entered when it has a price. The previous test
        # `price != np.NaN` was always True (NaN never compares equal), and
        # the `np.NaN` alias no longer exists in NumPy 2.0.
        self.can_enter = np.logical_and(self.price.notnull(), self.can_enter)

    df_ret = pfm.price2ret(self.price,
                           period=self.period,
                           axis=0,
                           compound=True)

    # Where exit is not allowed, use the next price at which exit is allowed.
    price_can_exit = self.price.copy()
    price_can_exit[~self.can_exit] = np.nan
    price_can_exit = price_can_exit.bfill()
    ret_can_exit = pfm.price2ret(price_can_exit,
                                 period=self.period,
                                 axis=0,
                                 compound=True)
    df_ret[~self.can_exit] = ret_can_exit[~self.can_exit]

    if self.benchmark_ret is not None:
        # holding-period return relative to the benchmark
        self.benchmark_ret = self.benchmark_ret.reindex(df_ret.index)
        residual_ret = df_ret.sub(self.benchmark_ret.values.flatten(), axis=0)
    else:
        residual_ret = df_ret
    residual_ret = jutil.fillinf(residual_ret)
    residual_ret -= self.commission

    # potential upside / downside; high and low are only meaningful with
    # real prices, otherwise the price series itself is used
    if self.high is not None and is_real_price:
        self.high = jutil.fillinf(self.high)
    else:
        self.high = self.price
    upside_ret = compute_upside_returns(self.price,
                                        self.high,
                                        self.can_exit,
                                        self.period,
                                        compound=True)
    upside_ret = jutil.fillinf(upside_ret)
    upside_ret -= self.commission

    if self.low is not None and is_real_price:
        self.low = jutil.fillinf(self.low)
    else:
        self.low = self.price
    downside_ret = compute_downside_returns(self.price,
                                            self.low,
                                            self.can_exit,
                                            self.period,
                                            compound=True)
    downside_ret = jutil.fillinf(downside_ret)
    downside_ret -= self.commission

    self.signal_ret = {
        "return": residual_ret,
        "upside_ret": upside_ret,
        "downside_ret": downside_ret
    }
    if self.forward:
        for ret_type in self.signal_ret.keys():
            if self.signal_ret[ret_type] is not None:
                # point-in-time signal and forward return
                self.signal_ret[ret_type] = self.signal_ret[ret_type].shift(-self.period)
    else:
        self.can_enter = self.can_enter.shift(self.period)
        self.mask = self.mask.shift(self.period)

    # merge masks: shift() introduces NaN rows, so fill them and cast back to
    # bool to make sure `~` is a logical (not bitwise-integer) negation
    can_enter_filled = self.can_enter.fillna(False).astype(bool)
    mask_filled = self.mask.fillna(True).astype(bool)
    self.mask = np.logical_or(mask_filled, ~can_enter_filled)
def process_signal_before_analysis(self, signal, price=None, ret=None, benchmark_price=None,
                                   period=5, n_quantiles=5, mask=None, forward=False):
    """
    Prepare signal, return and quantile data for signal analysis.

    The result is stored in ``self.signal_data`` (nothing is returned): a
    DataFrame indexed by a MultiIndex ['trade_date', 'symbol'] with columns
    ['signal', 'return', 'quantile'].

    Parameters
    ----------
    signal : pd.DataFrame
        Index is date, columns are stocks.
    price : pd.DataFrame, optional
        Index is date, columns are stocks. Exactly one of price / ret
        must be given.
    ret : pd.DataFrame, optional
        Index is date, columns are stocks. Used as the return directly.
    benchmark_price : pd.DataFrame or pd.Series or None
        Price of benchmark. Only allowed together with `price`; when given,
        the benchmark return is subtracted from the stock returns.
    period : int
        Periods to compute returns on.
    n_quantiles : int
        Positive number of quantile buckets; 1 puts every cell in bucket 1.
    mask : pd.DataFrame, optional
        Data cells that should NOT be used. Must be aligned with `signal`.
    forward : bool
        If True, pair the point-in-time signal with the forward return;
        otherwise pair the past signal with the point-in-time return.

    Raises
    ------
    ValueError
        If the price / ret / benchmark_price / n_quantiles combination is
        invalid.
    AssertionError
        If signal, price (or ret) and mask are not aligned.

    Notes
    -----
    Deal with suspensions: if the period of calculating return is d
    (from T to T+d), then we do not use signal values of those suspended
    on T, and we do not calculate return for those suspended on T+d.
    """
    # ----------------------------------------------------------------------
    # parameter validation
    if price is None and ret is None:
        raise ValueError("One of price / ret must be provided.")
    if price is not None and ret is not None:
        raise ValueError("Only one of price / ret should be provided.")
    if ret is not None and benchmark_price is not None:
        raise ValueError(
            "You choose 'return' mode but benchmark_price is given.")
    if not (n_quantiles > 0 and isinstance(n_quantiles, int)):
        raise ValueError(
            "n_quantiles must be a positive integer. Input is: {}".format(
                n_quantiles))

    # ensure inputs are aligned
    data = price if price is not None else ret
    assert np.all(signal.index == data.index)
    assert np.all(signal.columns == data.columns)
    if mask is not None:
        assert np.all(signal.index == mask.index)
        assert np.all(signal.columns == mask.columns)
        mask = jutil.fillinf(mask)
        # dtype of mask could be float, so we need to convert. Missing cells
        # are filled BEFORE the integer cast: NaN cannot be cast to int, and
        # a fillna() after the cast would never have anything to fill.
        mask = mask.fillna(0).astype(int).astype(bool)
    else:
        mask = pd.DataFrame(index=signal.index,
                            columns=signal.columns,
                            data=False)
    signal = jutil.fillinf(signal)
    # NOTE(review): the cleaned `data` is not used below (returns are computed
    # from the raw `price` / `ret`); call kept as in the original.
    data = jutil.fillinf(data)

    # ----------------------------------------------------------------------
    # save data
    self.n_quantiles = n_quantiles
    self.period = period

    # ----------------------------------------------------------------------
    # Get dependent variables
    if price is not None:
        df_ret = pfm.price2ret(price, period=self.period, axis=0)
        if benchmark_price is not None:
            benchmark_price = benchmark_price.loc[signal.index]
            bench_ret = pfm.price2ret(benchmark_price, self.period, axis=0)
            self.benchmark_ret = bench_ret
            residual_ret = df_ret.sub(bench_ret.values.flatten(), axis=0)
        else:
            residual_ret = df_ret
    else:
        residual_ret = ret

    # Get independent variable
    signal = signal.shift(1)  # avoid forward-looking bias

    # forward or not
    if forward:
        # point-in-time signal and forward return
        residual_ret = residual_ret.shift(-self.period)
    else:
        # past signal and point-in-time return
        signal = signal.shift(self.period)

    # ----------------------------------------------------------------------
    # get masks
    mask_price_return = residual_ret.isnull()
    mask_signal = signal.isnull()
    # Combine the caller-supplied mask with the NaN masks. Previously the NaN
    # masks overwrote `mask`, so the argument was silently ignored.
    # NOTE(review): the user mask is applied on the output row labels, i.e. it
    # is not shifted together with `signal` - confirm this alignment is wanted.
    mask = np.logical_or(mask, np.logical_or(mask_signal, mask_price_return))

    # ----------------------------------------------------------------------
    # calculate quantile
    signal_masked = signal.copy()
    signal_masked = signal_masked[~mask]
    if n_quantiles == 1:
        df_quantile = signal_masked.copy()
        df_quantile.loc[:, :] = 1.0
    else:
        df_quantile = jutil.to_quantile(signal_masked,
                                        n_quantiles=n_quantiles)

    # ----------------------------------------------------------------------
    # stack
    def stack_td_symbol(df):
        df = pd.DataFrame(df.stack(dropna=False))  # do not dropna
        df.index.names = ['trade_date', 'symbol']
        df.sort_index(axis=0, level=['trade_date', 'symbol'], inplace=True)
        return df

    mask = stack_td_symbol(mask)
    df_quantile = stack_td_symbol(df_quantile)
    residual_ret = stack_td_symbol(residual_ret)

    # ----------------------------------------------------------------------
    # concat signal value
    res = stack_td_symbol(signal)
    res.columns = ['signal']
    res['return'] = residual_ret
    res['quantile'] = df_quantile
    res = res.loc[~(mask.iloc[:, 0]), :]

    print("Nan Data Count (should be zero) : {:d}; "
          "Percentage of effective data: {:.0f}%".format(res.isnull().sum(axis=0).sum(),
                                                         len(res) * 100. / signal.size))
    res = res.astype({'signal': float, 'return': float, 'quantile': int})
    self.signal_data = res
def _process_filter(_filter):
    """
    Normalise an optional filter frame to boolean dtype.

    :param _filter: filter data supporting ``fillna`` / ``astype`` (e.g. a
        pd.DataFrame whose dtype may be bool, int or float), or None.
    :return: None when `_filter` is None; otherwise the filter converted to
        bool, where missing cells (including the ones produced by
        ``jutil.fillinf``) count as False.
    """
    if _filter is None:
        return None
    _filter = jutil.fillinf(_filter)
    # Fill missing cells BEFORE the integer cast: NaN cannot be cast to int,
    # and a fillna() placed after the cast never has anything left to fill.
    return _filter.fillna(0).astype(int).astype(bool)
def get_factors_ic_df(factors_dict, price, high=None, low=None, group=None, benchmark_price=None,
                      period=5, quantiles=5, mask=None, can_enter=None, can_exit=None,
                      commisson=0.0008, forward=True, ret_type="return", **kwargs):
    """
    Build the matrix of IC (information coefficient) series for several factors.

    :param factors_dict: dict of factors, in the form
        {"factor_name_1": factor_1, "factor_name_2": factor_2}.
        NOTE: every value is replaced in place by ``jutil.fillinf(value)``.
    :param price: price time series of the stocks (pd.DataFrame); index is
        datetime, columns are stock codes.
    :param high: optional high prices, forwarded to SignalCreator.
    :param low: optional low prices, forwarded to SignalCreator.
    :param group: optional grouping frame; when given the IC is computed per
        group and the result is indexed by (trade_date, group).
    :param benchmark_price: benchmark; when not None relative returns are
        computed, otherwise absolute returns.
    :param period: holding period (int).
    :param quantiles: number of quantiles the stock pool is split into by
        factor value (int).
    :param mask: forwarded to SignalCreator.
    :param can_enter: forwarded to SignalCreator.
    :param can_exit: forwarded to SignalCreator.
    :param commisson: commission forwarded to SignalCreator (historical
        misspelling, kept for backward compatibility).
    :param forward: forwarded to SignalCreator.
    :param ret_type: one of "return", "upside_ret", "downside_ret"
        (None means "return").
    :param kwargs: ``commission`` (correct spelling) overrides `commisson`
        when given; any other keyword is accepted and ignored.
    :return: ic_df, the IC matrix (pd.DataFrame); index is datetime (or
        (trade_date, group)), columns are the factor names of factors_dict.
        e.g.:

                        BP	      CFP	   EP	  ILLIQUIDITY	REVS20	   SRMI	   VOL20
        date
        2016-06-24	0.165260	0.002198	0.085632	-0.078074	0.173832	0.214377	0.068445
        2016-06-27	0.165537	0.003583	0.063299	-0.048674	0.180890	0.202724	0.081748
        2016-06-28	0.135215	0.010403	0.059038	-0.034879	0.111691	0.122554	0.042489
        2016-06-29	0.068774	0.019848	0.058476	-0.049971	0.042805	0.053339	0.079592
        2016-06-30	0.039431	0.012271	0.037432	-0.027272	0.010902	0.077293	-0.050667
    :raises ValueError: on an unsupported ret_type, or when the signal data
        does not contain the requested return column.
    """
    if ret_type is None:
        ret_type = "return"
    if ret_type not in ["return", "upside_ret", "downside_ret"]:
        raise ValueError("不支持对%s收益的ic计算!支持的收益类型有return, upside_ret, downside_ret." % (ret_type,))

    # A correctly spelled `commission=` used to fall into **kwargs and was
    # silently ignored; honour it, falling back to the misspelled parameter.
    commission = kwargs.get("commission", commisson)

    ic_table = []
    sc = SignalCreator(
        price,
        high=high,
        low=low,
        group=group,
        benchmark_price=benchmark_price,
        period=period,
        n_quantiles=quantiles,
        mask=mask,
        can_enter=can_enter,
        can_exit=can_exit,
        forward=forward,
        commission=commission
    )

    # Union of all factor dates; used as the index of the returned frame.
    times = sorted(
        pd.concat([pd.Series(factor.index) for factor in factors_dict.values()]).unique())

    for factor_name in factors_dict.keys():
        # In-place cleaning of the caller's dict is kept on purpose: code that
        # reuses `factors_dict` afterwards may rely on the cleaned values.
        factors_dict[factor_name] = jutil.fillinf(factors_dict[factor_name])
        factor_value = factors_dict[factor_name]
        signal_data = sc.get_signal_data(factor_value)
        if ret_type not in signal_data.columns:
            raise ValueError("signal_data中不包含%s收益,无法进行ic计算!" % (ret_type,))

        # Select the requested return and expose it under the name "return".
        origin_fields = ["signal", ret_type]
        new_fields = ["signal", "return"]
        if group is not None:
            origin_fields.append("group")
            new_fields.append("group")
        signal_data = signal_data[origin_fields]
        signal_data.columns = new_fields

        ic = pd.DataFrame(pfm.calc_signal_ic(signal_data, group is not None))
        ic.columns = [factor_name, ]
        ic_table.append(ic)

    ic_df = pd.concat(ic_table, axis=1).dropna(how="all")
    if group is None:
        return ic_df.reindex(times)
    full_index = pd.MultiIndex.from_product([times, ic_df.index.levels[1]],
                                            names=["trade_date", "group"])
    return ic_df.reindex(full_index)