Beispiel #1
0
    def __init__(self,
                 price=None,
                 ret=None,
                 high=None,
                 low=None,
                 group=None,
                 n_quantiles=5,
                 mask=None,
                 can_enter=None,
                 can_exit=None,
                 period=5,
                 benchmark_price=None,
                 forward=True,
                 commission=0.0008):
        """
        Validate the inputs and store the analysis configuration.

        Parameters
        ----------
        price, ret
            Exactly one of the two must be provided.
        high, low, group
            Stored unchanged.
        n_quantiles : int
            Must be a positive integer.
        mask, can_enter, can_exit
            Optional flag tables (presumably pd.DataFrame - TODO confirm).
            When given, infinities are replaced through ``jutil.fillinf``,
            missing cells become 0 and the table is converted to bool.
        period, benchmark_price, forward, commission
            Stored unchanged. ``benchmark_price`` may not be combined with
            ``ret``.

        Raises
        ------
        ValueError
            If the price / ret / benchmark_price combination is invalid or
            ``n_quantiles`` is not a positive integer.
        """

        def _as_bool_frame(frame):
            # Fill missing cells BEFORE the integer cast: an integer cast of
            # NaN fails, and a fillna placed after the cast can never fire.
            frame = jutil.fillinf(frame)
            return frame.fillna(0).astype(int).astype(bool)

        if price is None and ret is None:
            raise ValueError("One of price / ret must be provided.")
        if price is not None and ret is not None:
            raise ValueError("Only one of price / ret should be provided.")
        if ret is not None and benchmark_price is not None:
            raise ValueError(
                "You choose 'return' mode but benchmark_price is given.")
        # Check the type first so that non-numeric input (e.g. a string or
        # None) produces the ValueError below instead of a TypeError raised
        # by the ``>`` comparison.
        if not (isinstance(n_quantiles, int) and n_quantiles > 0):
            raise ValueError(
                "n_quantiles must be a positive integer. Input is: {}".format(
                    n_quantiles))

        self.price = price
        self.ret = ret
        self.high = high
        self.low = low
        self.group = group
        self.n_quantiles = n_quantiles

        self.mask = None if mask is None else _as_bool_frame(mask)
        self.can_enter = (None if can_enter is None
                          else _as_bool_frame(can_enter))
        self.can_exit = None if can_exit is None else _as_bool_frame(can_exit)

        self.period = period
        self.benchmark_price = benchmark_price
        self.forward = forward
        self.commission = commission

        # Result slots, initialised empty here.
        self.signal_data = None
        self.signal_ret = None
Beispiel #2
0
def mad(factor_df, index_member=None):
    """
    Remove extreme factor values row by row using the median absolute
    deviation (MAD).

    Each row is clipped to ``median +/- 5 * MAD`` where MAD is the median of
    the absolute deviations from the row median.

    :param factor_df: factor values (pandas.DataFrame); index is datetime,
                      columns are stock codes, e.g.

                                   AAPL      BA        CMG       DAL        LULU
                        date
                        2016-06-24 0.165260  0.002198  0.085632  -0.078074  0.173832
                        2016-06-27 0.165537  0.003583  0.063299  -0.048674  0.180890
    :param index_member: forwarded to ``_mask_non_index_member``.
    :return: factor values after clipping (pandas.DataFrame), same layout as
             the input.
    """

    def _clip_row(row):
        # A row without any valid value has no median; hand it back as is.
        if row.count() == 0:
            return row
        center = row.median()
        spread = (row - center).abs().median()
        lower = center - 5 * spread
        upper = center + 5 * spread
        return row.clip(lower, upper)

    cleaned = jutil.fillinf(factor_df)
    cleaned = _mask_non_index_member(cleaned, index_member)
    return cleaned.apply(_clip_row, axis=1)
Beispiel #3
0
def winsorize(factor_df, alpha=0.05, index_member=None):
    """
    Remove extreme factor values row by row by quantile winsorization.

    Each row is clipped to its ``alpha / 2`` and ``1 - alpha / 2`` quantiles.

    :param factor_df: factor values (pandas.DataFrame); index is datetime,
                      columns are stock codes, e.g.

                                   AAPL      BA        CMG       DAL        LULU
                        date
                        2016-06-24 0.165260  0.002198  0.085632  -0.078074  0.173832
                        2016-06-27 0.165537  0.003583  0.063299  -0.048674  0.180890
    :param alpha: total tail mass to clip; half of it on each side.
    :param index_member: forwarded to ``_mask_non_index_member``.
    :return: winsorized factor values (pandas.DataFrame), same layout as the
             input.
    """

    def winsorize_series(se):
        # Nothing to clip when the row holds no valid value (the quantiles
        # would be NaN).
        if se.count() == 0:
            return se
        q = se.quantile([alpha / 2, 1 - alpha / 2])
        # clip() returns a new Series; the row handed over by
        # DataFrame.apply must not be mutated in place.
        return se.clip(q.iloc[0], q.iloc[1])

    factor_df = jutil.fillinf(factor_df)
    factor_df = _mask_non_index_member(factor_df, index_member)
    return factor_df.apply(winsorize_series, axis=1)
Beispiel #4
0
def standardize(factor_df, index_member=None):
    """
    Z-score standardize factor values row by row.

    Every row has its mean subtracted and is divided by its standard
    deviation as computed by ``DataFrame.std`` with default arguments.

    :param factor_df: factor values (pandas.DataFrame); index is datetime,
                      columns are stock codes, e.g.

                                   AAPL      BA        CMG       DAL        LULU
                        date
                        2016-06-24 0.165260  0.002198  0.085632  -0.078074  0.173832
                        2016-06-27 0.165537  0.003583  0.063299  -0.048674  0.180890
    :param index_member: forwarded to ``_mask_non_index_member``.
    :return: z-score standardized factor values (pandas.DataFrame), same
             layout as the input.
    """
    cleaned = _mask_non_index_member(jutil.fillinf(factor_df), index_member)

    row_mean = cleaned.mean(axis=1)
    row_std = cleaned.std(axis=1)

    centered = cleaned.sub(row_mean, axis=0)
    return centered.div(row_std, axis=0)
Beispiel #5
0
def test_pdutil():
    """Smoke-test fillinf, to_quantile and group_df_to_dict from jutil."""
    df = pd.DataFrame(np.random.rand(4, 20))

    # Three NaN cells ...
    for row, col in ((1, 2), (3, 4), (1, 4)):
        df.iloc[row, col] = np.nan
    assert df.isnull().sum().sum() == 3

    # ... and two infinite cells, which isnull() does not count.
    df.iloc[2, 11] = np.inf
    df.iloc[2, 12] = -np.inf
    assert df.isnull().sum().sum() == 3

    # fillinf must turn both infinities into NaN.
    filled = jutil.fillinf(df)
    assert filled.isnull().sum().sum() == 5

    # Only checks that the call succeeds.
    jutil.to_quantile(df, 5, axis=1)

    grouped_input = df.copy()
    grouped_input['group'] = ['a', 'a', 'b', 'a']

    by_group = jutil.group_df_to_dict(grouped_input, by='group')
    assert set(by_group.keys()) == {'a', 'b'}
Beispiel #6
0
 def standarize_factors(factors):
     """
     Clean, optionally winsorize and standardize one or several factors.

     A single DataFrame is wrapped as ``{"factor": factors}``; anything else
     is used as the factor dict directly and updated in place. The settings
     ``index_member``, ``winsorization`` and ``standardize_type`` are read
     from the enclosing scope.

     :return: dict mapping factor name to the processed factor.
     :raises ValueError: if ``standardize_type`` is not 'z_score', 'rank'
                         or None.
     """
     factors_dict = ({"factor": factors}
                     if isinstance(factors, pd.DataFrame) else factors)

     for name in factors_dict.keys():
         # Replace infinities, then mask out non index members.
         factors_dict[name] = jutil.fillinf(factors_dict[name])
         factors_dict[name] = process._mask_non_index_member(
             factors_dict[name], index_member=index_member)

         if winsorization:
             factors_dict[name] = process.winsorize(factors_dict[name])

         if standardize_type is None:
             continue
         if standardize_type == "z_score":
             factors_dict[name] = process.standardize(factors_dict[name])
         elif standardize_type == "rank":
             factors_dict[name] = process.rank_standardize(factors_dict[name])
         else:
             raise ValueError("standardize_type 只能为'z_score'/'rank'/None")

     return factors_dict
Beispiel #7
0
def rank_standardize(factor_df, index_member=None):
    """
    Replace factor values by their normalized rank within each row.

    The ranking itself is delegated to ``jutil.rank_with_mask`` with
    ``axis=1`` and ``normalize=True``; per the original description the
    result lies between 0 and 1 and larger factor values get larger scores.

    :param factor_df: factor values (pandas.DataFrame); index is datetime,
                      columns are stock codes, e.g.

                                   AAPL      BA        CMG       DAL        LULU
                        date
                        2016-06-24 0.165260  0.002198  0.085632  -0.078074  0.173832
                        2016-06-27 0.165537  0.003583  0.063299  -0.048674  0.180890
    :param index_member: forwarded to ``_mask_non_index_member``.
    :return: rank based factor values.
    """
    cleaned = _mask_non_index_member(jutil.fillinf(factor_df), index_member)
    ranked = jutil.rank_with_mask(cleaned, axis=1, normalize=True)
    return ranked
Beispiel #8
0
def orthogonalize(factors_dict=None,
                  standardize_type="z_score",
                  winsorization=False,
                  index_member=None):
    """
    Orthogonalize a set of factors cross-section by cross-section.

    For every date the factor vectors are stacked as columns, stocks with a
    missing value in any factor are dropped, and the columns are replaced by
    an orthonormal basis computed with ``scipy.linalg.orth``. The result is
    standardized afterwards.

    NOTE(review): ``scipy.linalg.orth`` is SVD based, so the basis columns
    are not tied to the input factors in order; they are nevertheless
    labelled with the factor names in input order, as before.

    :param factors_dict: dict of at least two factors, e.g.
                         {"factor_name_1": factor_1, "factor_name_2": factor_2};
                         each value is a pd.DataFrame with dates as index and
                         assets as columns. The dict is not modified.
    :param standardize_type: "z_score" selects z-score standardization; any
                             other value selects rank standardization.
    :param winsorization: winsorize every factor before orthogonalizing.
    :param index_member: forwarded to the masking / standardizing helpers.
    :return: dict mapping each factor name to its orthogonalized, standardized
             DataFrame, shaped like the first input factor.
    :raises ValueError: if fewer than two factors are given.
    """

    import warnings

    from scipy import linalg

    if not factors_dict or len(factors_dict) < 2:
        raise ValueError("你需要给定至少2个因子")

    # Work on a private mapping so the caller's dict is left untouched.
    cleaned = {}
    for factor_name, factor_value in factors_dict.items():
        # Handle illegal values.
        factor_value = jutil.fillinf(factor_value)
        factor_value = process._mask_non_index_member(
            factor_value, index_member=index_member)
        if winsorization:
            factor_value = process.winsorize(factor_value)
        cleaned[factor_name] = factor_value

    factor_name_list = list(cleaned.keys())
    factor_value_list = list(cleaned.values())
    template = factor_value_list[0]

    # Per factor: list of one-row frames, one for each usable date.
    rows_by_factor = {factor_name: [] for factor_name in factor_name_list}

    for date in template.index:
        data = pd.concat([factor.loc[date] for factor in factor_value_list],
                         axis=1, join="inner")
        data = data.dropna()
        if len(data) == 0:
            continue
        basis = linalg.orth(data)
        # orth() returns fewer columns than factors when the cross-section is
        # rank deficient (collinear factors or too few stocks); such a date
        # cannot be mapped back onto the factor names.
        if basis.shape[1] != len(factor_name_list):
            warnings.warn(
                "orthogonalize: rank deficient cross-section skipped: "
                "{}".format(date))
            continue
        data = pd.DataFrame(basis, index=data.index)
        data.columns = factor_name_list
        for factor_name in factor_name_list:
            row = pd.DataFrame(data[factor_name]).T
            row.index = [date]
            rows_by_factor[factor_name].append(row)

    # Standardize the orthogonalized factors.
    new_factors_dict = {}
    for factor_name in factor_name_list:
        rows = rows_by_factor[factor_name]
        if rows:
            factor_value = pd.concat(rows)
        else:
            # No usable date at all: the result is an all-NaN frame.
            factor_value = pd.DataFrame(dtype=float)
        # Restore the rows and columns removed during orthogonalization.
        factor_value = factor_value.reindex(index=template.index,
                                            columns=template.columns)
        if standardize_type == "z_score":
            new_factors_dict[factor_name] = process.standardize(
                factor_value, index_member)
        else:
            new_factors_dict[factor_name] = process.rank_standardize(
                factor_value, index_member)

    return new_factors_dict
Beispiel #9
0
def neutralize(factor_df,
               group,
               float_mv=None,
               index_member=None):
    """
    Neutralize a factor against industry and, optionally, market cap.

    For every date the factor is regressed (least squares, no intercept) on
    industry dummy variables plus, when ``float_mv`` is given, the
    MAD-clipped and standardized log float market cap. The regression
    residuals are the neutralized factor.

    :param factor_df: factor values (pandas.DataFrame); index is datetime,
                      columns are stock codes, e.g.

                                   AAPL      BA        CMG       DAL        LULU
                        date
                        2016-06-24 0.165260  0.002198  0.085632  -0.078074  0.173832
                        2016-06-27 0.165537  0.003583  0.063299  -0.048674  0.180890
    :param group: industry classification (pandas.DataFrame), same layout.
                  Cells equal to the string "nan" are ignored.
    :param float_mv: float market cap (pandas.DataFrame), same layout. When
                     None no market cap neutralization is done.
    :param index_member: forwarded to ``_mask_non_index_member`` and to the
                         market cap preprocessing.
    :return: neutralized factor values (pandas.DataFrame) carrying the index
             and columns of the input ``factor_df``.
    """
    def drop_nan(s):
        return s[s != "nan"]

    def _ols_by_numpy(x, y):
        # rcond=None selects NumPy's current default cut-off and avoids the
        # FutureWarning emitted when the argument is omitted.
        m = np.linalg.lstsq(x, y, rcond=None)[0]
        return y - (x @ m)

    def _generate_cross_sectional_residual(data):
        for _, cross_section in data.groupby(level=0):
            signal = cross_section["signal"]
            exposures = pd.concat(
                [cross_section.drop(["signal", "industry"], axis=1),
                 pd.get_dummies(cross_section["industry"])],
                axis=1)
            # Dummy columns may be bool; force a float matrix so that lstsq
            # never receives an object array.
            resid = _ols_by_numpy(exposures.values.astype(float),
                                  signal.values.astype(float))
            yield pd.Series(resid, index=signal.index, name=signal.name)

    data = []

    # Kept to restore the original index and columns at the end.
    origin_factor_columns = factor_df.columns
    origin_factor_index = factor_df.index

    factor_df = jutil.fillinf(factor_df)  # replace illegal values
    factor_df = _mask_non_index_member(factor_df, index_member)  # drop non index members
    factor_df = factor_df.dropna(how="all").stack().rename("signal")  # drop all-NaN cross-sections
    data.append(factor_df)

    # Log float market cap, clipped and standardized. Market cap style
    # factors do not need this step.
    if float_mv is not None:
        float_mv = standardize(mad(np.log(float_mv), index_member=index_member), index_member).stack().rename("style")
        data.append(float_mv)

    # Industry
    industry_standard = drop_nan(group.stack()).rename("industry")
    data.append(industry_standard)

    data = pd.concat(data, axis=1).dropna()
    residuals = pd.concat(_generate_cross_sectional_residual(data)).unstack()

    # Restore the rows and columns removed during neutralization.
    return residuals.reindex(index=origin_factor_index, columns=origin_factor_columns)
Beispiel #10
0
    def get_signal_data(self, signal):
        """
        Build the stacked signal / return / quantile table for ``signal``.

        Returns
        -------
        res : pd.DataFrame
            Index is pd.MultiIndex ['trade_date', 'symbol']; columns are
            'signal', one column per non-None entry of ``self.signal_ret``,
            'group' (only when ``self.group`` is set) and 'quantile'.
        """
        # Per the original comments: check that signal matches the other key
        # inputs, then compute the signal returns.
        self._judge(signal)
        self._cal_ret()

        # Lag by one row to avoid forward-looking bias.
        lagged = jutil.fillinf(signal).shift(1)
        if not self.forward:
            lagged = lagged.shift(self.period)

        # A cell is masked when flagged by self.mask or when it has no signal.
        mask = np.logical_or(self.mask, lagged.isnull())

        # Quantiles are computed on the unmasked signal only.
        visible = lagged.copy()[~mask]
        if self.n_quantiles != 1:
            df_quantile = jutil.to_quantile(visible,
                                            n_quantiles=self.n_quantiles)
        else:
            df_quantile = visible.copy()
            df_quantile.loc[:, :] = 1.0

        def _to_long(frame):
            """Stack a date x symbol table into one sorted column."""
            long_frame = pd.DataFrame(frame.stack(dropna=False))  # keep NaN cells
            long_frame.index.names = ['trade_date', 'symbol']
            return long_frame.sort_index(axis=0,
                                         level=['trade_date', 'symbol'])

        # Assemble the result column by column.
        res = _to_long(lagged)
        res.columns = ['signal']

        for ret_type, ret_table in self.signal_ret.items():
            if ret_table is None:
                continue
            res[ret_type] = _to_long(ret_table).fillna(0)

        if self.group is not None:
            res["group"] = _to_long(self.group)

        res['quantile'] = _to_long(df_quantile)

        # Drop every masked cell.
        stacked_mask = _to_long(mask)
        res = res.loc[~(stacked_mask.iloc[:, 0]), :]

        if len(res) == 0:
            print("No signal available.")
        else:
            nan_count = res.isnull().sum(axis=0).sum()
            effective_pct = len(res) * 100. / lagged.size
            print("Nan Data Count (should be zero) : {:d};  "
                  "Percentage of effective data: {:.0f}%".format(
                      nan_count, effective_pct))

        column_types = {'signal': float, 'return': float, 'quantile': int}
        return res.astype(column_types)
Beispiel #11
0
def neutralize(factor_df, group, float_mv=None, index_member=None):
    """
    Neutralize a factor against industry and, optionally, market cap.

    For every date the factor is regressed (no intercept) on industry dummy
    variables plus, when ``float_mv`` is given, the winsorized and
    standardized log float market cap. The regression residuals are the
    neutralized factor.

    :param factor_df: factor values (pandas.DataFrame); index is datetime,
                      columns are stock codes, e.g.

                                   AAPL      BA        CMG       DAL        LULU
                        date
                        2016-06-24 0.165260  0.002198  0.085632  -0.078074  0.173832
                        2016-06-27 0.165537  0.003583  0.063299  -0.048674  0.180890
    :param group: industry classification (pandas.DataFrame) with the same
                  index and columns as ``factor_df``.
    :param float_mv: float market cap (pandas.DataFrame) with the same index
                     and columns. When None no market cap neutralization is
                     done.
    :param index_member: forwarded to ``_mask_non_index_member`` and to the
                         market cap preprocessing.
    :return: neutralized factor values (pandas.DataFrame) carrying the index
             and columns of the input ``factor_df``; dates or stocks without
             a regression result are NaN.
    """
    assert np.all(factor_df.index == group.index)
    assert np.all(factor_df.columns == group.columns)

    # Kept to restore the rows and columns removed during neutralization.
    origin_index = factor_df.index
    origin_columns = factor_df.columns

    # Log float market cap, winsorized and standardized. Market cap style
    # factors do not need this step.
    if float_mv is not None:
        assert np.all(factor_df.index == float_mv.index)
        assert np.all(factor_df.columns == float_mv.columns)
        x1 = standardize(
            winsorize(np.log(float_mv), index_member=index_member),
            index_member)

    factor_df = jutil.fillinf(factor_df)
    factor_df = _mask_non_index_member(factor_df, index_member)  # drop non index members
    factor_df = factor_df.dropna(how="all")  # drop all-NaN cross-sections

    result = []
    # Regress cross-section by cross-section; the residuals are the
    # neutralized factor values.
    for i in factor_df.index:
        # Industry dummies, then (optionally) market cap, then the factor
        # itself as the last column.
        parts = [pd.get_dummies(group.loc[i, :].dropna())]
        if float_mv is not None:
            parts.append(x1.loc[i])
        parts.append(factor_df.loc[i])

        # Drop stocks with any missing value in this cross-section.
        DataAll = pd.concat(parts, axis=1).dropna()
        if len(DataAll) == 0:
            continue

        nfactors = DataAll.shape[1] - 1
        # Plain float arrays: np.matrix is deprecated and no longer accepted
        # by current scikit-learn releases.
        exposures = DataAll.iloc[:, :nfactors].values.astype(float)
        target = DataAll.iloc[:, nfactors].values.astype(float)

        regr = linear_model.LinearRegression(fit_intercept=False)
        regr.fit(exposures, target)
        residuals = target - regr.predict(exposures)
        result.append(pd.Series(residuals, index=DataAll.index, name=i))

    if not result:
        # No cross-section could be regressed at all.
        return pd.DataFrame(np.nan, index=origin_index, columns=origin_columns)

    # Merge the per-date results and restore the original rows and columns,
    # including the dates dropped above as all-NaN.
    result = pd.concat(result, axis=1).reindex(origin_columns).T
    return result.reindex(origin_index)
Beispiel #12
0
    def process_signal_before_analysis(self,
                                       signal,
                                       price=None,
                                       daily_ret=None,
                                       benchmark_price=None,
                                       daily_benchmark_ret=None,
                                       high=None,
                                       low=None,
                                       group=None,
                                       period=5,
                                       n_quantiles=5,
                                       mask=None,
                                       can_enter=None,
                                       can_exit=None,
                                       forward=True,
                                       commission=0.0008):
        """
        Prepare for signal analysis.

        Parameters
        ----------
        signal : pd.DataFrame
            Index is date, columns are stocks.
        price : pd.DataFrame
            Index is date, columns are stocks.
        high : pd.DataFrame
            Index is date, columns are stocks.
        low : pd.DataFrame
            Index is date, columns are stocks.
        daily_ret : pd.DataFrame
            Index is date, columns are stocks.
        daily_benchmark_ret : pd.DataFrame or pd.Series or None
            Daily ret of benchmark.
        group : pd.DataFrame
            Index is date, columns are stocks.
        benchmark_price : pd.DataFrame or pd.Series or None
            Price of benchmark.
        mask : pd.DataFrame
            Data cells that should NOT be used.
        can_enter: pd.DataFrame
            Date the security can be traded and BUY.
        can_exit:pd.DataFrame
            Date the security can be traded and SELL.
        n_quantiles : int
        period : int
            periods to compute forward returns on.
        forward :bool
            Return cal method. True by default.
        commission: float
            commission ratio per trade.

        Returns
        -------
        None
            The result is stored on ``self.signal_data``: a pd.DataFrame
            whose index is pd.MultiIndex ['trade_date', 'symbol'] and whose
            columns are 'signal', 'return', 'upside_ret', 'downside_ret',
            'quantile' and, when ``group`` is given, 'group'.
            ``self.ret``, ``self.benchmark_ret``, ``self.period`` and
            ``self.n_quantiles`` are set as well.

        Raises
        ------
        ValueError
            If the price / daily_ret or benchmark arguments are inconsistent
            or ``n_quantiles`` is not a positive integer.

        Notes
        -----
        Deal with suspensions:
            If the period of calculating return is d (from T to T+d), then
            we do not use signal values of those suspended on T,
            we do not calculate return for those suspended on T+d.
        """

        def _align(frame, name):
            """Return ``frame`` laid out like ``signal``, warning on mismatch."""
            same_layout = (signal.index.equals(frame.index)
                           and signal.columns.equals(frame.columns))
            if not same_layout:
                warnings.warn(
                    "Warning: signal与{}的index/columns不一致,请检查输入参数!".format(
                        name))
                frame = frame.reindex_like(signal)
            return frame

        def _as_bool(frame):
            """Convert a flag table of any dtype (possibly float) to bool."""
            # Missing cells (including those created by the reindex in
            # _align) are filled BEFORE the integer cast, which cannot
            # represent NaN.
            return jutil.fillinf(frame).fillna(0).astype(int).astype(bool)

        def _constant_like_signal(value):
            return pd.DataFrame(index=signal.index,
                                columns=signal.columns,
                                data=value)

        # ----------------------------------------------------------------------
        # parameter validation
        if price is None and daily_ret is None:
            raise ValueError("One of price / daily_ret must be provided.")
        if price is not None and daily_ret is not None:
            raise ValueError(
                "Only one of price / daily_ret should be provided.")
        if benchmark_price is not None and daily_benchmark_ret is not None:
            raise ValueError(
                "Only one of benchmark_price / daily_benchmark_ret should be provided."
            )
        # Type check first so that non-numeric input yields this ValueError
        # rather than a TypeError from the comparison.
        if not (isinstance(n_quantiles, int) and n_quantiles > 0):
            raise ValueError(
                "n_quantiles must be a positive integer. Input is: {}".format(
                    n_quantiles))

        if daily_ret is not None:
            warnings.warn(
                "Warning: 检查到使用daily_ret模式。未避免未来函数,请注意确保daily_ret格式为对应日期能实现的日收益."
            )

        # ensure inputs are aligned
        if mask is not None:
            mask = _as_bool(_align(mask, "mask"))
        else:
            mask = _constant_like_signal(False)
        if can_enter is not None:
            can_enter = _as_bool(_align(can_enter, "can_enter"))
        else:
            can_enter = _constant_like_signal(True)
        if can_exit is not None:
            can_exit = _as_bool(_align(can_exit, "can_exit"))
        else:
            can_exit = _constant_like_signal(True)
        if group is not None:
            group = _align(group, "group").astype(str)

        # ----------------------------------------------------------------------
        # save data
        self.n_quantiles = n_quantiles
        self.period = period

        # ----------------------------------------------------------------------
        # Get dependent variables

        # benchmark return
        self.benchmark_ret = None
        if benchmark_price is not None:
            benchmark_price = benchmark_price.reindex(index=signal.index)
            self.benchmark_ret = pfm.price2ret(benchmark_price,
                                               self.period,
                                               axis=0,
                                               compound=True)
        elif daily_benchmark_ret is not None:
            daily_benchmark_ret = daily_benchmark_ret.reindex(
                index=signal.index)
            self.benchmark_ret = pfm.daily_ret_to_ret(daily_benchmark_ret,
                                                      self.period)

        # holding period return
        is_real_price = daily_ret is None
        if is_real_price:
            price = jutil.fillinf(_align(price, "price"))
        else:
            daily_ret = jutil.fillinf(_align(daily_ret,
                                             "daily_ret")).fillna(0)
            price = pfm.daily_ret_to_cum(daily_ret)

        # A position can only be entered where a price exists. (A comparison
        # ``price != nan`` is True everywhere, so notnull() is required.)
        can_enter = np.logical_and(price.notnull(), can_enter)
        df_ret = pfm.price2ret(price,
                               period=self.period,
                               axis=0,
                               compound=True)
        # Where exiting is not possible, use the next price at which it is.
        price_can_exit = price.copy()
        price_can_exit[~can_exit] = np.nan
        price_can_exit = price_can_exit.bfill()
        ret_can_exit = pfm.price2ret(price_can_exit,
                                     period=self.period,
                                     axis=0,
                                     compound=True)
        df_ret[~can_exit] = ret_can_exit[~can_exit]

        if self.benchmark_ret is not None:
            # holding period return relative to the benchmark
            residual_ret = df_ret.sub(self.benchmark_ret.values.flatten(),
                                      axis=0)
        else:
            residual_ret = df_ret
        residual_ret = jutil.fillinf(residual_ret)
        residual_ret -= commission

        # potential upside and downside; high / low are only used together
        # with real prices, otherwise the price itself is used
        if high is not None and is_real_price:
            high = jutil.fillinf(_align(high, "high"))
        else:
            high = price
        upside_ret = compute_upside_returns(price,
                                            high,
                                            can_exit,
                                            self.period,
                                            compound=True)
        upside_ret = jutil.fillinf(upside_ret)
        upside_ret -= commission

        if low is not None and is_real_price:
            low = jutil.fillinf(_align(low, "low"))
        else:
            low = price
        downside_ret = compute_downside_returns(price,
                                                low,
                                                can_exit,
                                                self.period,
                                                compound=True)
        downside_ret = jutil.fillinf(downside_ret)
        downside_ret -= commission

        # ----------------------------------------------------------------------
        # Get independent variable
        signal = jutil.fillinf(signal)
        signal = signal.shift(1)  # avoid forward-looking bias
        # forward or not
        if forward:
            # point-in-time signal and forward return
            residual_ret = residual_ret.shift(-self.period)
            upside_ret = upside_ret.shift(-self.period)
            downside_ret = downside_ret.shift(-self.period)
        else:
            # past signal and point-in-time return
            signal = signal.shift(self.period)
            can_enter = can_enter.shift(self.period)
            mask = mask.shift(self.period)

        self.ret = dict()
        self.ret["return"] = residual_ret
        self.ret["upside_ret"] = upside_ret
        self.ret["downside_ret"] = downside_ret

        # ----------------------------------------------------------------------
        # get masks: a cell is dropped when it is masked, has no signal,
        # cannot be entered or has no return. Cells made undefined by the
        # shift above count as masked / not enterable.
        mask_signal = signal.isnull()

        mask = np.logical_or(
            mask.fillna(True),
            np.logical_or(mask_signal, ~(can_enter.fillna(False))))
        mask = np.logical_or(mask, self.ret["return"].isnull())

        # ----------------------------------------------------------------------
        # calculate quantile
        signal_masked = signal.copy()
        signal_masked = signal_masked[~mask]
        if n_quantiles == 1:
            df_quantile = signal_masked.copy()
            df_quantile.loc[:, :] = 1.0
        else:
            if group is None:
                df_quantile = jutil.to_quantile(signal_masked,
                                                n_quantiles=n_quantiles)
            else:
                from jaqs_fxdayu.data.py_expression_eval import Parser
                ps = Parser()
                ps.index_member = None
                df_quantile = ps.group_quantile(df=signal_masked,
                                                group=group,
                                                n_quantiles=n_quantiles)

        # ----------------------------------------------------------------------
        # stack
        def stack_td_symbol(df):
            df = pd.DataFrame(df.stack(dropna=False))  # do not dropna
            df.index.names = ['trade_date', 'symbol']
            df.sort_index(axis=0, level=['trade_date', 'symbol'], inplace=True)
            return df

        # ----------------------------------------------------------------------
        # concat signal value
        res = stack_td_symbol(signal)
        res.columns = ['signal']
        for ret_type in self.ret.keys():
            res[ret_type] = stack_td_symbol(self.ret[ret_type]).fillna(0)
        res['quantile'] = stack_td_symbol(df_quantile)
        if group is not None:
            res["group"] = stack_td_symbol(group)
        mask = stack_td_symbol(mask)
        res = res.loc[~(mask.iloc[:, 0]), :]

        if len(res) > 0:
            print("Nan Data Count (should be zero) : {:d};  " \
                  "Percentage of effective data: {:.0f}%".format(res.isnull().sum(axis=0).sum(),
                                                                 len(res) * 100. / signal.size))
        else:
            print("No signal available.")
        res = res.astype({'signal': float, 'return': float, 'quantile': int})
        self.signal_data = res
Beispiel #13
0
    def process_signal(self,
                       enter_signal,
                       exit_signal=None,
                       sig_type="long",
                       price=None,
                       daily_ret=None,
                       max_holding_period=None,
                       stoploss=None,
                       stopprofit=None,
                       mask=None,
                       can_enter=None,
                       can_exit=None,
                       group=None,
                       n_quantiles=1,
                       commission=0.0008):
        """
        Prepare one side ("long" or "short") of an entry/exit signal for analysis.

        The inputs are validated, every optional frame is aligned to
        ``enter_signal``, all signals are lagged by one row, the exit position
        of every cell is resolved from the configured exit rules, and the
        per-trade return (net of ``commission``) is computed. The stacked
        result is stored on the instance rather than returned.

        Parameters
        ----------
        enter_signal : pd.DataFrame
            Index is date, columns are stocks. When ``n_quantiles == 1`` the
            values may only be -2 (open short) / 0 (no action) / 2 (open long).
        exit_signal : pd.DataFrame or list of pd.DataFrame, optional
            Index is date, columns are stocks. Values may only be
            -1 (close short) / 0 (no action) / 1 (close long).
            A list passed by the caller is left untouched.
        sig_type : str
            "long" or "short": which side of the signal to process.
        price : pd.DataFrame, optional
            Index is date, columns are stocks. Exactly one of
            ``price`` / ``daily_ret`` must be given.
        daily_ret : pd.DataFrame, optional
            Index is date, columns are stocks.
        max_holding_period : int, optional
            Limit on the holding period. Required when ``n_quantiles != 1``.
        stoploss : float, optional
            Stop-loss ratio per trade.
        stopprofit : float, optional
            Stop-profit ratio per trade.
        mask : pd.DataFrame, optional
            Data cells that should NOT be used.
        can_enter : pd.DataFrame, optional
            Dates on which the security can be opened.
        can_exit : pd.DataFrame, optional
            Dates on which the security can be closed.
        group : pd.DataFrame, optional
            Index is date, columns are stocks.
        n_quantiles : int
            1 selects the event-signal mode; larger values select the
            ordinary-factor mode, where cells are bucketed into quantiles.
        commission : float
            Commission ratio per trade, subtracted from the trade return.

        Returns
        -------
        None
            Results are written to ``self.price``,
            ``self.final_exit_pos[sig_type]``, ``self.ret[sig_type]`` and, when
            at least one row survives masking, ``self.signal_data[sig_type]``.
            ``self.period`` is set to ``max_holding_period`` when
            ``n_quantiles != 1``.

        Raises
        ------
        ValueError
            If ``sig_type`` is invalid, if not exactly one of
            ``price`` / ``daily_ret`` is given, if ``n_quantiles`` is not a
            positive int, if a signal contains values outside its allowed set,
            or if the required exit rule for the selected mode is missing.
        """
        # parameter validation
        if sig_type not in ["long", "short"]:
            raise ValueError("信号类型(sig_type)只能为long/short.")

        if price is None and daily_ret is None:
            raise ValueError("One of price / daily_ret must be provided.")
        if price is not None and daily_ret is not None:
            raise ValueError(
                "Only one of price / daily_ret should be provided.")
        if not (n_quantiles > 0 and isinstance(n_quantiles, int)):
            raise ValueError(
                "n_quantiles must be a positive integer. Input is: {}".format(
                    n_quantiles))

        enter_signal = jutil.fillinf(enter_signal)
        if n_quantiles == 1:  # event-type entry signal
            # entry values must be -2 (open short), 0 (no action) or 2 (open long)
            enter_signal = enter_signal.fillna(0)
            if not enter_signal.isin([-2, 0, 2]).all().all():
                raise ValueError("检测到n_quantiles为1,该模式下测试的enter_signal为事件类因子."
                                 "请确保enter_signal里的信号只能为-2(开空),0(不做操作),2(开多))."
                                 "如需测试普通因子,请指定n_quantiles为大于1的整数.")
            # at least one kind of exit rule must be supplied
            if (exit_signal is None) and (max_holding_period is None) and \
                    (stoploss is None) and (stopprofit is None):
                raise ValueError(
                    "确保至少有一种出场信号(exit_signal/max_holding_period/stoploss/stopprofit)"
                )
        else:  # ordinary factor used as entry signal
            if max_holding_period is None:
                raise ValueError("检测到n_quantiles不为1,该模式下测试的enter_signal为普通因子."
                                 "该模式下,max_holding_period参数不能为空.")
            self.period = max_holding_period

        if exit_signal is not None:
            # exit values must be -1 (close short), 0 (no action) or 1 (close long)
            if not isinstance(exit_signal, list):
                exit_signal = [exit_signal]
            # Build a fresh list instead of rebinding elements of the caller's
            # list: otherwise a second call with the same list (e.g. "long"
            # followed by "short") would see already aligned and shifted frames.
            cleaned_exits = []
            for raw_exit in exit_signal:
                aligned_exit = raw_exit.reindex_like(enter_signal)
                aligned_exit = jutil.fillinf(aligned_exit).fillna(0)
                if not aligned_exit.isin([-1, 0, 1]).all().all():
                    raise ValueError(
                        "请确保所有exit_signal里的信号只能为-1(平空),0(不做操作),1(平多)")
                cleaned_exits.append(aligned_exit)
            exit_signal = cleaned_exits
        else:
            exit_signal = []

        if group is not None:
            group = group.reindex_like(enter_signal)

        sig_filter = {
            "mask": mask,
            "can_enter": can_enter,
            "can_exit": can_exit,
        }

        for _filter in sig_filter.keys():
            if sig_filter[_filter] is not None:
                sig_filter[_filter] = sig_filter[_filter].reindex_like(
                    enter_signal)
                # Fill NaN before the integer cast: reindexing can introduce
                # NaN, and astype(int) raises on NaN.
                sig_filter[_filter] = jutil.fillinf(
                    sig_filter[_filter]).fillna(0).astype(int)
            else:
                # defaults: nothing masked, entry and exit always allowed
                sig_filter[_filter] = pd.DataFrame(
                    index=enter_signal.index,
                    columns=enter_signal.columns,
                    data=0 if _filter == "mask" else 1)

        # process
        #=============================================================
        # Signals are computed at the close of the current day and executed on
        # the next trading day (at the open if price=open, at the close if
        # price=close, or at another price such as vwap), so every signal is
        # lagged by one row to avoid look-ahead bias.
        enter_signal = enter_signal.shift(1)
        exit_signal = [es.shift(1) for es in exit_signal]

        # price data
        if daily_ret is not None:
            daily_ret = daily_ret.reindex_like(enter_signal)
            daily_ret = jutil.fillinf(daily_ret).fillna(0)
            price = pfm.daily_ret_to_cum(daily_ret)  # cumulative net value
        else:
            # a price frame was given
            price = price.reindex_like(enter_signal)
            price = jutil.fillinf(price)

        self.price = price

        #=====================
        # collect candidate exit positions
        pos = []
        # time-based exit
        if max_holding_period is not None:
            pos.append(
                get_period_exit_pos(enter_signal, period=max_holding_period))

        # stop-loss exit
        if stoploss is not None:
            pos.append(
                get_stop_pos(price,
                             stoploss,
                             sig_type=sig_type,
                             stop_type="stop_loss"))

        # stop-profit exit
        if stopprofit is not None:
            pos.append(
                get_stop_pos(price,
                             stopprofit,
                             sig_type=sig_type,
                             stop_type="stop_profit"))

        # user-defined exit signals
        for es in exit_signal:
            pos.append(get_exit_pos(es, exit_type="close_%s" % (sig_type, )))

        # Combine all exit conditions and exit on whichever triggers first.
        # The validation above guarantees that ``pos`` is not empty.
        exit_pos = reduce(get_first_pos, pos).replace(LONGINT, np.nan)
        # nearest exit position at which exiting is permitted, for every day
        exit_permited_pos = get_exit_pos(sig_filter["can_exit"], value=[1])
        self.final_exit_pos[sig_type] = get_exit_value(exit_permited_pos,
                                                       exit_pos)
        # =====================
        # signal return
        price_exit = get_exit_value(price, self.final_exit_pos[sig_type])
        ret_exit = jutil.fillinf((price_exit - price) / price)
        if sig_type == "short":
            ret_exit = -1 * ret_exit
        self.ret[sig_type] = ret_exit - commission

        # =====================
        # build signal_data
        # ----------------------------------------------------------------------
        # mask signal
        if n_quantiles == 1:  # event factor: keep only cells opening this side
            if sig_type == "long":
                value = 2
            else:
                value = -2
            mask_signal = enter_signal != value
        else:  # ordinary factor
            mask_signal = enter_signal.isnull()

        mask_signal = np.logical_or(
            mask_signal,
            np.logical_or(sig_filter["mask"], sig_filter["can_enter"] != 1))
        mask_signal = np.logical_or(mask_signal, self.ret[sig_type].isnull())

        # drop entries whose exit falls on the entry day itself
        sig_pos = get_sig_pos(self.final_exit_pos[sig_type])
        mask_signal = np.logical_or(mask_signal,
                                    sig_pos == self.final_exit_pos[sig_type])

        # calculate quantile
        if n_quantiles == 1:
            df_quantile = pd.DataFrame(1,
                                       index=enter_signal.index,
                                       columns=enter_signal.columns)
        else:
            signal_masked = enter_signal.copy()
            signal_masked = signal_masked[~mask_signal]
            if group is None:
                df_quantile = jutil.to_quantile(signal_masked,
                                                n_quantiles=n_quantiles)
            else:
                from jaqs_fxdayu.data.py_expression_eval import Parser
                ps = Parser()
                ps.index_member = None
                df_quantile = ps.group_quantile(df=signal_masked,
                                                group=group,
                                                n_quantiles=n_quantiles)

        # ----------------------------------------------------------------------
        # concat signal value
        res = stack_td_symbol(enter_signal)
        res.columns = ['signal']
        res["return"] = stack_td_symbol(self.ret[sig_type])
        res["exit_time"] = stack_td_symbol(self.final_exit_pos[sig_type])
        res['quantile'] = stack_td_symbol(df_quantile)
        if group is not None:
            res["group"] = stack_td_symbol(group)
        res["sig_type"] = sig_type
        mask_signal = stack_td_symbol(mask_signal)
        res = res.loc[~(mask_signal.iloc[:, 0]), :]

        if len(res) > 0:
            print("Nan Data Count (should be zero) : {:d};  " \
                  "Percentage of effective data: {:.0f}%".format(res.isnull().sum(axis=0).sum(),
                                                                 len(res) * 100. / enter_signal.size))
            res = res.astype({
                'signal': float,
                'return': float,
                'quantile': int
            })
            self.signal_data[sig_type] = res
        else:
            print("sig_type %s:No signal available." % (sig_type, ))
Beispiel #14
0
    def process_signal_before_analysis(self,
                                       signal, price=None, ret=None, benchmark_price=None,
                                       period=5, n_quantiles=5,
                                       mask=None,
                                       forward=False):
        """
        Prepare signal, return and quantile data for signal analysis.

        Parameters
        ----------
        signal : pd.DataFrame
            Index is date, columns are stocks.
        price : pd.DataFrame
            Index is date, columns are stocks.
        ret : pd.DataFrame
            Index is date, columns are stocks.
        benchmark_price : pd.DataFrame or pd.Series or None
            Price of benchmark. Only allowed together with `price`.
        mask : pd.DataFrame
            Data cells that should NOT be used.
        n_quantiles : int
        period : int
            periods to compute forward returns on.
        forward : bool
            If True, the (lagged) signal is paired with the return shifted back
            by `period` rows; otherwise the signal is lagged by a further
            `period` rows and paired with the unshifted return.

        Returns
        -------
        None
            The result is stored in ``self.signal_data``: a pd.DataFrame whose
            index is pd.MultiIndex ['trade_date', 'symbol'] and whose
            columns are ['signal', 'return', 'quantile'].
            ``self.n_quantiles``, ``self.period`` and (when a benchmark is
            given) ``self.benchmark_ret`` are set as well.

        Raises
        ------
        ValueError
            If the price / ret / benchmark_price combination is invalid or
            `n_quantiles` is not a positive int.
        AssertionError
            If `signal` is not aligned with the price/return data or `mask`.
        """
        # Deal with suspensions:
        #     If the period of calculating return is d (from T to T+d), then
        #     we do not use signal values of those suspended on T,
        #     we do not calculate return for those suspended on T+d.
        # ----------------------------------------------------------------------
        # parameter validation
        if price is None and ret is None:
            raise ValueError("One of price / ret must be provided.")
        if price is not None and ret is not None:
            raise ValueError("Only one of price / ret should be provided.")
        if ret is not None and benchmark_price is not None:
            raise ValueError("You choose 'return' mode but benchmark_price is given.")
        if not (n_quantiles > 0 and isinstance(n_quantiles, int)):
            raise ValueError("n_quantiles must be a positive integer. Input is: {}".format(n_quantiles))
        
        # ensure inputs are aligned
        data = price if price is not None else ret
        assert np.all(signal.index == data.index)
        assert np.all(signal.columns == data.columns)
        if mask is not None:
            assert np.all(signal.index == mask.index)
            assert np.all(signal.columns == mask.columns)
            mask = jutil.fillinf(mask)
            # dtype of mask could be float. So we need to convert.
            # NaN must be filled BEFORE the integer cast: astype(int) raises on NaN.
            mask = mask.fillna(0).astype(int).astype(bool)
        else:
            mask = pd.DataFrame(index=signal.index, columns=signal.columns, data=False)
        signal = jutil.fillinf(signal)
        # NOTE(review): the cleaned `data` is not used below; returns are
        # computed from the raw `price` / `ret` - confirm this is intended.
        data = jutil.fillinf(data)

        # ----------------------------------------------------------------------
        # save data
        self.n_quantiles = n_quantiles
        self.period = period

        # ----------------------------------------------------------------------
        # Get dependent variables
        if price is not None:
            df_ret = pfm.price2ret(price, period=self.period, axis=0)
            if benchmark_price is not None:
                benchmark_price = benchmark_price.loc[signal.index]
                bench_ret = pfm.price2ret(benchmark_price, self.period, axis=0)
                self.benchmark_ret = bench_ret
                # return relative to the benchmark
                residual_ret = df_ret.sub(bench_ret.values.flatten(), axis=0)
            else:
                residual_ret = df_ret
        else:
            residual_ret = ret
        
        # Get independent variable
        signal = signal.shift(1)  # avoid forward-looking bias

        # forward or not
        if forward:
            # point-in-time signal and forward return
            residual_ret = residual_ret.shift(-self.period)
        else:
            # past signal and point-in-time return
            signal = signal.shift(self.period)

        # ----------------------------------------------------------------------
        # get masks
        mask_price_return = residual_ret.isnull()
        mask_signal = signal.isnull()

        # Combine the caller's mask with the data-driven masks. Previously the
        # caller's mask was overwritten here and therefore silently ignored.
        mask = np.logical_or(mask, np.logical_or(mask_signal, mask_price_return))

        # ----------------------------------------------------------------------
        # calculate quantile
        signal_masked = signal.copy()
        signal_masked = signal_masked[~mask]
        if n_quantiles == 1:
            df_quantile = signal_masked.copy()
            df_quantile.loc[:, :] = 1.0
        else:
            df_quantile = jutil.to_quantile(signal_masked, n_quantiles=n_quantiles)

        # ----------------------------------------------------------------------
        # stack
        def stack_td_symbol(df):
            df = pd.DataFrame(df.stack(dropna=False))  # do not dropna
            df.index.names = ['trade_date', 'symbol']
            df.sort_index(axis=0, level=['trade_date', 'symbol'], inplace=True)
            return df
        
        mask = stack_td_symbol(mask)
        df_quantile = stack_td_symbol(df_quantile)
        residual_ret = stack_td_symbol(residual_ret)

        # ----------------------------------------------------------------------
        # concat signal value
        res = stack_td_symbol(signal)
        res.columns = ['signal']
        res['return'] = residual_ret
        res['quantile'] = df_quantile
        res = res.loc[~(mask.iloc[:, 0]), :]
        
        print("Nan Data Count (should be zero) : {:d};  " \
              "Percentage of effective data: {:.0f}%".format(res.isnull().sum(axis=0).sum(),
                                                             len(res) * 100. / signal.size))
        res = res.astype({'signal': float, 'return': float, 'quantile': int})
        self.signal_data = res
Beispiel #15
0
def get_factors_ret_df(factors_dict,
                       price,
                       high=None,
                       low=None,
                       group=None,
                       benchmark_price=None,
                       period=5,
                       quantiles=5,
                       mask=None,
                       can_enter=None,
                       can_exit=None,
                       commission=0.0008,
                       forward=True,
                       ret_type="return",
                       **kwargs):
    """
    Build the matrix of factor-return series for several factors.

    For every trade date (and every group, when `group` is given) the chosen
    return is regressed on all factor values with OLS; the fitted
    coefficients form one row of the result.

    :param factors_dict: dict of factors, of the form
                         {"factor_name_1": factor_1, "factor_name_2": factor_2};
                         each value must be a non-empty pd.DataFrame.
    :param price: price time series (pd.DataFrame), index is datetime,
                  columns are the stock codes.
    :param high: forwarded to SignalCreator.
    :param low: forwarded to SignalCreator.
    :param group: optional pd.DataFrame of group labels; when given, the
                  regression is run per (trade_date, group).
    :param benchmark_price: benchmark price; forwarded to SignalCreator.
    :param period: holding period (int).
    :param quantiles: number of quantiles (int), forwarded to SignalCreator
                      as n_quantiles.
    :param mask: forwarded to SignalCreator.
    :param can_enter: forwarded to SignalCreator.
    :param can_exit: forwarded to SignalCreator.
    :param commission: forwarded to SignalCreator.
    :param forward: if False, every factor is lagged by a further `period`
                    rows after the one-row lag.
    :param ret_type: one of "return", "upside_ret", "downside_ret";
                     None is treated as "return".
    :param kwargs: accepted but not used.
    :return: pd.DataFrame of regression coefficients, one column per factor
             name, re-indexed to the sorted union of all factor dates
             (crossed with the group labels when `group` is given).
    :raises ValueError: if `ret_type` is unsupported, a factor is empty or not
                        a DataFrame, or the requested return is unavailable.
    """
    def _to_long(frame):
        # wide (date x symbol) -> long frame indexed by (trade_date, symbol)
        long_frame = pd.DataFrame(frame.stack(dropna=False))  # keep NaN cells
        long_frame.index.names = ['trade_date', 'symbol']
        long_frame.sort_index(axis=0,
                              level=['trade_date', 'symbol'],
                              inplace=True)
        return long_frame

    def _ols_params(section):
        # regress the return on every remaining (factor) column
        target = section.pop("return")
        if "group" in section.columns:
            section = section.drop("group", axis=1)
        return sm.OLS(target, section).fit().params

    if ret_type is None:
        ret_type = "return"

    if ret_type not in ("return", "upside_ret", "downside_ret"):
        raise ValueError(
            "不支持对%s收益的ic计算!支持的收益类型有return, upside_ret, downside_ret." %
            (ret_type, ))

    sc = SignalCreator(price,
                       high=high,
                       low=low,
                       group=group,
                       benchmark_price=benchmark_price,
                       period=period,
                       n_quantiles=quantiles,
                       mask=mask,
                       can_enter=can_enter,
                       can_exit=can_exit,
                       forward=forward,
                       commission=commission)

    # sorted union of all factor dates; used as the index of the result
    date_series = [
        pd.Series(factors_dict[factor_name].index)
        for factor_name in factors_dict.keys()
    ]
    times = sorted(pd.concat(date_series).unique())

    res = None
    for factor_name, factor_frame in factors_dict.items():
        is_frame = isinstance(factor_frame, pd.DataFrame)
        if not is_frame or factor_frame.size == 0:
            raise ValueError("因子%s为空或不合法!请确保传入因子有值且数据类型为pandas.DataFrame." %
                             (factor_name, ))
        sc._judge(factor_frame)
        sc._cal_ret()
        if ret_type not in sc.signal_ret.keys():
            raise ValueError("无法计算%s收益,请重新设置输入参数." % (ret_type, ))
        if res is None:
            # the return column is created once, on the first factor
            res = _to_long(sc.signal_ret[ret_type]).fillna(0)
            res.columns = ["return"]
        lagged = jutil.fillinf(factor_frame).shift(1)  # avoid forward-looking bias
        if not forward:
            lagged = lagged.shift(period)
        res[factor_name] = _to_long(lagged)

    grouper = ['trade_date']
    if group is not None:
        res["group"] = _to_long(group)
        grouper.append('group')

    res = res.dropna()
    result = res.groupby(grouper).apply(_ols_params).dropna(how="all")

    if group is None:
        return result.reindex(times)

    full_index = pd.MultiIndex.from_product(
        [times, result.index.levels[1]], names=["trade_date", "group"])
    return result.reindex(full_index)
Beispiel #16
0
    def _cal_ret(self):
        if self.signal_ret is not None:
            return
        else:
            # 计算benchmark收益
            if self.benchmark_price is not None:
                self.benchmark_ret = pfm.price2ret(self.benchmark_price,
                                                   self.period,
                                                   axis=0,
                                                   compound=True)
            elif self.daily_benchmark_ret is not None:
                self.benchmark_ret = pfm.daily_ret_to_ret(
                    self.daily_benchmark_ret, self.period)

            # 计算区间持仓收益
            isRealPrice = False
            if self.daily_ret is not None:
                self.daily_ret = jutil.fillinf(self.daily_ret).fillna(0)
                self.price = pfm.daily_ret_to_cum(self.daily_ret)
            else:
                # 有price
                isRealPrice = True
                self.price = jutil.fillinf(self.price)

            self.can_enter = np.logical_and(self.price != np.NaN,
                                            self.can_enter)
            df_ret = pfm.price2ret(self.price,
                                   period=self.period,
                                   axis=0,
                                   compound=True)
            price_can_exit = self.price.copy()
            price_can_exit[~self.can_exit] = np.NaN
            price_can_exit = price_can_exit.fillna(method="bfill")
            ret_can_exit = pfm.price2ret(price_can_exit,
                                         period=self.period,
                                         axis=0,
                                         compound=True)
            df_ret[~self.can_exit] = ret_can_exit[~self.can_exit]

            if self.benchmark_ret is not None:
                # 计算持有期相对收益
                self.benchmark_ret = self.benchmark_ret.reindex(df_ret.index)
                residual_ret = df_ret.sub(self.benchmark_ret.values.flatten(),
                                          axis=0)
            else:
                residual_ret = df_ret
            residual_ret = jutil.fillinf(residual_ret)
            residual_ret -= self.commission

            # 计算潜在上涨空间和潜在下跌空间
            if self.high is not None and isRealPrice:
                self.high = jutil.fillinf(self.high)
            else:
                self.high = self.price
            upside_ret = compute_upside_returns(self.price,
                                                self.high,
                                                self.can_exit,
                                                self.period,
                                                compound=True)
            upside_ret = jutil.fillinf(upside_ret)
            upside_ret -= self.commission

            if self.low is not None and isRealPrice:
                self.low = jutil.fillinf(self.low)
            else:
                self.low = self.price
            downside_ret = compute_downside_returns(self.price,
                                                    self.low,
                                                    self.can_exit,
                                                    self.period,
                                                    compound=True)
            downside_ret = jutil.fillinf(downside_ret)
            downside_ret -= self.commission

            self.signal_ret = {
                "return": residual_ret,
                "upside_ret": upside_ret,
                "downside_ret": downside_ret
            }
            if self.forward:
                for ret_type in self.signal_ret.keys():
                    if self.signal_ret[ret_type] is not None:
                        # point-in-time signal and forward return
                        self.signal_ret[ret_type] = self.signal_ret[
                            ret_type].shift(-self.period)
            else:
                self.can_enter = self.can_enter.shift(self.period)
                self.mask = self.mask.shift(self.period)

            # 处理mask
            self.mask = np.logical_or(self.mask.fillna(True),
                                      ~(self.can_enter.fillna(False)))
Beispiel #17
0
    def process_signal_before_analysis(self,
                                       signal,
                                       price=None,
                                       ret=None,
                                       benchmark_price=None,
                                       period=5,
                                       n_quantiles=5,
                                       mask=None,
                                       forward=False):
        """
        Prepare signal, return and quantile data for signal analysis.

        Parameters
        ----------
        signal : pd.DataFrame
            Index is date, columns are stocks.
        price : pd.DataFrame
            Index is date, columns are stocks.
        ret : pd.DataFrame
            Index is date, columns are stocks.
        benchmark_price : pd.DataFrame or pd.Series or None
            Price of benchmark. Only allowed together with `price`.
        mask : pd.DataFrame
            Data cells that should NOT be used.
        n_quantiles : int
        period : int
            periods to compute forward returns on.
        forward : bool
            If True, the (lagged) signal is paired with the return shifted back
            by `period` rows; otherwise the signal is lagged by a further
            `period` rows and paired with the unshifted return.

        Returns
        -------
        None
            The result is stored in ``self.signal_data``: a pd.DataFrame whose
            index is pd.MultiIndex ['trade_date', 'symbol'] and whose
            columns are ['signal', 'return', 'quantile'].
            ``self.n_quantiles``, ``self.period`` and (when a benchmark is
            given) ``self.benchmark_ret`` are set as well.

        Raises
        ------
        ValueError
            If the price / ret / benchmark_price combination is invalid or
            `n_quantiles` is not a positive int.
        AssertionError
            If `signal` is not aligned with the price/return data or `mask`.
        """
        # Deal with suspensions:
        #     If the period of calculating return is d (from T to T+d), then
        #     we do not use signal values of those suspended on T,
        #     we do not calculate return for those suspended on T+d.
        # ----------------------------------------------------------------------
        # parameter validation
        if price is None and ret is None:
            raise ValueError("One of price / ret must be provided.")
        if price is not None and ret is not None:
            raise ValueError("Only one of price / ret should be provided.")
        if ret is not None and benchmark_price is not None:
            raise ValueError(
                "You choose 'return' mode but benchmark_price is given.")
        if not (n_quantiles > 0 and isinstance(n_quantiles, int)):
            raise ValueError(
                "n_quantiles must be a positive integer. Input is: {}".format(
                    n_quantiles))

        # ensure inputs are aligned
        data = price if price is not None else ret
        assert np.all(signal.index == data.index)
        assert np.all(signal.columns == data.columns)
        if mask is not None:
            assert np.all(signal.index == mask.index)
            assert np.all(signal.columns == mask.columns)
            mask = jutil.fillinf(mask)
            # dtype of mask could be float. So we need to convert.
            # NaN must be filled BEFORE the integer cast: astype(int) raises
            # on NaN.
            mask = mask.fillna(0).astype(int).astype(bool)
        else:
            mask = pd.DataFrame(index=signal.index,
                                columns=signal.columns,
                                data=False)
        signal = jutil.fillinf(signal)
        # NOTE(review): the cleaned `data` is not used below; returns are
        # computed from the raw `price` / `ret` - confirm this is intended.
        data = jutil.fillinf(data)

        # ----------------------------------------------------------------------
        # save data
        self.n_quantiles = n_quantiles
        self.period = period

        # ----------------------------------------------------------------------
        # Get dependent variables
        if price is not None:
            df_ret = pfm.price2ret(price, period=self.period, axis=0)
            if benchmark_price is not None:
                benchmark_price = benchmark_price.loc[signal.index]
                bench_ret = pfm.price2ret(benchmark_price, self.period, axis=0)
                self.benchmark_ret = bench_ret
                # return relative to the benchmark
                residual_ret = df_ret.sub(bench_ret.values.flatten(), axis=0)
            else:
                residual_ret = df_ret
        else:
            residual_ret = ret

        # Get independent variable
        signal = signal.shift(1)  # avoid forward-looking bias

        # forward or not
        if forward:
            # point-in-time signal and forward return
            residual_ret = residual_ret.shift(-self.period)
        else:
            # past signal and point-in-time return
            signal = signal.shift(self.period)

        # ----------------------------------------------------------------------
        # get masks
        mask_price_return = residual_ret.isnull()
        mask_signal = signal.isnull()

        # Combine the caller's mask with the data-driven masks. Previously the
        # caller's mask was overwritten here and therefore silently ignored.
        mask = np.logical_or(mask,
                             np.logical_or(mask_signal, mask_price_return))

        # ----------------------------------------------------------------------
        # calculate quantile
        signal_masked = signal.copy()
        signal_masked = signal_masked[~mask]
        if n_quantiles == 1:
            df_quantile = signal_masked.copy()
            df_quantile.loc[:, :] = 1.0
        else:
            df_quantile = jutil.to_quantile(signal_masked,
                                            n_quantiles=n_quantiles)

        # ----------------------------------------------------------------------
        # stack
        def stack_td_symbol(df):
            df = pd.DataFrame(df.stack(dropna=False))  # do not dropna
            df.index.names = ['trade_date', 'symbol']
            df.sort_index(axis=0, level=['trade_date', 'symbol'], inplace=True)
            return df

        mask = stack_td_symbol(mask)
        df_quantile = stack_td_symbol(df_quantile)
        residual_ret = stack_td_symbol(residual_ret)

        # ----------------------------------------------------------------------
        # concat signal value
        res = stack_td_symbol(signal)
        res.columns = ['signal']
        res['return'] = residual_ret
        res['quantile'] = df_quantile
        res = res.loc[~(mask.iloc[:, 0]), :]

        print("Nan Data Count (should be zero) : {:d};  " \
              "Percentage of effective data: {:.0f}%".format(res.isnull().sum(axis=0).sum(),
                                                             len(res) * 100. / signal.size))
        res = res.astype({'signal': float, 'return': float, 'quantile': int})
        self.signal_data = res
Beispiel #18
0
def _process_filter(_filter):
    if _filter is not None:
        _filter = jutil.fillinf(_filter)
        _filter = _filter.astype(int).fillna(0).astype(bool)
    return _filter
Beispiel #19
0
def get_factors_ic_df(factors_dict,
                      price,
                      high=None,
                      low=None,
                      group=None,
                      benchmark_price=None,
                      period=5,
                      quantiles=5,
                      mask=None,
                      can_enter=None,
                      can_exit=None,
                      commisson=0.0008,
                      forward=True,
                      ret_type="return",
                      **kwargs):
    """
    Build a table of IC series, one column per factor.

    :param factors_dict: dict mapping factor name -> factor values, e.g.
                         {"factor_name_1": factor_1, "factor_name_2": factor_2}.
                         NOTE: each value is replaced in place by the result of
                         ``jutil.fillinf`` on it, so the caller's dict is modified.
    :param price: price time series (pd.DataFrame) indexed by datetime with one
                  column per symbol; forwarded to ``SignalCreator``.
    :param high: forwarded to ``SignalCreator``.
    :param low: forwarded to ``SignalCreator``.
    :param group: forwarded to ``SignalCreator``; when not None the "group"
                  column of the signal data is kept and the IC is computed per group.
    :param benchmark_price: benchmark prices; when given, relative returns are
                            computed, otherwise absolute returns.
    :param period: holding period (int).
    :param quantiles: number of quantiles the universe is split into by factor
                      value (int); passed to ``SignalCreator`` as ``n_quantiles``.
    :param mask: forwarded to ``SignalCreator``.
    :param can_enter: forwarded to ``SignalCreator``.
    :param can_exit: forwarded to ``SignalCreator``.
    :param commisson: passed to ``SignalCreator`` as ``commission``
                      (the parameter name is spelled this way in the signature).
    :param forward: forwarded to ``SignalCreator``.
    :param ret_type: which return column to use: "return", "upside_ret" or
                     "downside_ret". None is treated as "return".
    :param kwargs: accepted but not used.
    :return: ic_df, a pd.DataFrame whose columns are the factor names from
             factors_dict. Without ``group`` it is reindexed to the sorted union
             of the factors' index values; with ``group`` the index is the
             (trade_date, group) product of those values and the second index
             level of the concatenated IC table.
             Example:

                       BP	   CFP	   EP	  ILLIQUIDITY	REVS20	   SRMI	   VOL20
            date
            2016-06-24	0.165260	0.002198	0.085632	-0.078074	0.173832	0.214377	0.068445
            2016-06-27	0.165537	0.003583	0.063299	-0.048674	0.180890	0.202724	0.081748
            2016-06-28	0.135215	0.010403	0.059038	-0.034879	0.111691	0.122554	0.042489
            2016-06-29	0.068774	0.019848	0.058476	-0.049971	0.042805	0.053339	0.079592
            2016-06-30	0.039431	0.012271	0.037432	-0.027272	0.010902	0.077293	-0.050667
    :raises ValueError: if ret_type is unsupported, or if the signal data has
                        no column named ret_type.
    """
    if ret_type is None:
        ret_type = "return"

    supported_ret_types = ("return", "upside_ret", "downside_ret")
    if ret_type not in supported_ret_types:
        raise ValueError("不支持对%s收益的ic计算!支持的收益类型有return, upside_ret, downside_ret." % (ret_type,))

    by_group = group is not None

    creator = SignalCreator(
        price,
        high=high,
        low=low,
        group=group,
        benchmark_price=benchmark_price,
        period=period,
        n_quantiles=quantiles,
        mask=mask,
        can_enter=can_enter,
        can_exit=can_exit,
        forward=forward,
        commission=commisson
    )

    # Sorted union of every factor's index values; used as the time axis of
    # the resulting table.
    all_index_values = pd.concat(
        [pd.Series(factor.index) for factor in factors_dict.values()])
    times = sorted(all_index_values.unique())

    # Columns pulled from the signal data, and the names they are given
    # before being handed to the IC calculation.
    source_cols = ["signal", ret_type]
    target_cols = ["signal", "return"]
    if by_group:
        source_cols.append("group")
        target_cols.append("group")

    ic_frames = []
    for name in factors_dict:
        # The cleaned factor is written back into the caller's dict.
        cleaned_factor = jutil.fillinf(factors_dict[name])
        factors_dict[name] = cleaned_factor

        data = creator.get_signal_data(cleaned_factor)
        if ret_type not in data.columns:
            raise ValueError("signal_data中不包含%s收益,无法进行ic计算!" % (ret_type,))

        data = data[source_cols]
        data.columns = target_cols

        factor_ic = pd.DataFrame(pfm.calc_signal_ic(data, by_group))
        factor_ic.columns = [name]
        ic_frames.append(factor_ic)

    ic_df = pd.concat(ic_frames, axis=1).dropna(how="all")
    if not by_group:
        return ic_df.reindex(times)

    full_index = pd.MultiIndex.from_product(
        [times, ic_df.index.levels[1]],
        names=["trade_date", "group"])
    return ic_df.reindex(full_index)