Example #1
    def __init__(self):

        self.db = database_manager
        self.Q = SQL()

        self.Factor = FactorPool()  # factor pool
        self.Label = LabelPool()  # label pool
        self.Stock = StockPool()  # stock pool

        self.factor_process = FactorProcess()  # factor pre-processing

        self.ind = Indicator()  # evaluation metrics calculation

        self.factor_dict = {}  # raw factor storage
        self.factor_dict_clean = {}  # cleaned factor storage

        self.data_input = {}  # input data

        self.Finally_data = {}

        self.fact_test_result = collections.defaultdict(dict)  # factor test results
        self.fact_inter_result = {}

        self.factor_mapping = self._factor_mapping()

        self.neu = 'non-neu'  # neutralization flag
Example #2
    def __init__(self):
        self.db = database_manager
        self.Q = SQL()

        self.fp = FactorProcess()  # factor pre-processing
        self.Multi = Multicollinearity()  # multi-factor handling

        self.factors_raw = None  # raw factor set

        self.factor_D = {}  # factor sign map
        self.factor_direction()
Example #3
    def __init__(self,
                 fac_exp: pd.DataFrame,
                 stock_ret: pd.Series,
                 ind_exp: pd.Series,
                 mv: pd.Series,
                 stock_weight: pd.Series = None,
                 ind_mv: pd.Series = None,
                 ind_weight: pd.Series = None,
                 fact_weight: pd.Series = None,
                 hp: int = 1):

        self.RET = ReturnModel()
        self.RISK = RiskModel()
        self.FP = FactorProcess()
        self.LP = LabelPool()
        self.Q = SQL()

        self.fac_exp = fac_exp  # factor exposure
        self.stock_ret = stock_ret  # stock return label
        self.ind_exp = ind_exp  # industry label
        self.mv = mv  # free-float market value
        self.hp = hp  # holding period

        self.stock_weight = stock_weight  # index stock-weight constraint
        self.ind_weight = ind_weight  # index industry weight
        self.ind_mv = ind_mv  # index industry market value
        self.fact_weight = fact_weight  # factor exposure constraint

        self.stock_weight_limit = (0.8, 1.2)  # stock-weight bounds relative to the index
        self.ind_weight_limit = (0.8, 1.2)  # industry-weight bounds relative to the index
        self.ind_mv_weight_limit = (0.7, 1.3)  # industry market-value bounds relative to the index
        self.fact_weight_limit = (0.9, 1.1)  # factor exposure bounds

        self.limit = []  # constraints
        self.bonds = []  # weight bound constraints
        self.const = []  # constraint sub-conditions

        self.fact_name = self.fac_exp.columns  # factor names

        self.holding_ret = self._holding_return(stock_ret, hp)  # holding-period return
        self.df_input = {}

        self.OPT_params = collections.defaultdict(dict)

        self.switch_format()
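
# --- Illustrative sketch (not from the original source) ---
# The (0.8, 1.2)-style tuples above are bounds on portfolio weights relative
# to an index. A minimal, hedged example of how such relative bounds can be
# passed to scipy.optimize.minimize; exp_ret and bench_w are made-up inputs
# and the plain expected-return objective stands in for the real models.
import numpy as np
from scipy.optimize import minimize

exp_ret = np.array([0.02, 0.01, 0.015])  # assumed expected returns
bench_w = np.array([0.5, 0.3, 0.2])      # assumed index weights
lo, hi = 0.8, 1.2                        # relative weight limits

bounds = [(lo * w, hi * w) for w in bench_w]           # per-stock bounds
cons = [{'type': 'eq', 'fun': lambda w: w.sum() - 1}]  # fully invested

opt = minimize(lambda w: -exp_ret @ w, x0=bench_w,
               bounds=bounds, constraints=cons)
print(opt.x)  # optimal weights stay within 80%-120% of the index weights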
Example #4
 def __init__(self):
     self.Q = SQL()
     self.list_date = SQL().list_date_csv()
Example #5
class FactorBase(object):
    def __init__(self):
        self.Q = SQL()
        self.list_date = SQL().list_date_csv()

    # financial data frequency conversion; look-ahead data must be considered
    def _switch_freq(self,
                     data_: pd.DataFrame,
                     name: str,
                     limit: int = 120,
                     date_sta: str = '20130101',
                     date_end: str = '20200401',
                     exchange: str = EN.SSE.value) -> pd.Series:
        """

        :param data_:
        :param name: 需要转换的财务指标
        :param limit: 最大填充时期,默认二个季度
        :param date_sta:
        :param date_end:
        :param exchange:
        :return:
        """
        def _reindex(data: pd.DataFrame, name_: str):
            """填充有风险哦"""
            # data_re = data.reindex(trade_date[KN.TRADE_DATE.value])
            data_re = pd.merge(data,
                               trade_date,
                               on=KN.TRADE_DATE.value,
                               how='outer')
            data_re.loc[:, data_re.columns !=
                        name_] = data_re.loc[:,
                                             data_re.columns != name_].fillna(
                                                 method='ffill')

            return data_re

        sql_trade_date = self.Q.trade_date_SQL(date_sta=date_sta,
                                               date_end=date_end,
                                               exchange=exchange)
        trade_date = self.Q.query(sql_trade_date)

        # keep the latest record per trade date
        data_sub = data_.groupby(
            KN.STOCK_ID.value,
            group_keys=False).apply(lambda x: x.sort_values(by=[
                KN.TRADE_DATE.value, SN.REPORT_DATE.value
            ]).drop_duplicates(subset=[KN.TRADE_DATE.value], keep='last'))
        data_sub.reset_index(inplace=True)

        # fill onto trade dates
        data_trade_date = data_sub.groupby(KN.STOCK_ID.value,
                                           group_keys=False).apply(
                                               _reindex, name)
        res = data_trade_date.set_index(
            [KN.TRADE_DATE.value, KN.STOCK_ID.value]).sort_index()

        # forward-fill factor values with a limited history window
        res[name] = res[name].groupby(
            KN.STOCK_ID.value,
            group_keys=False).apply(lambda x: x.ffill(limit=limit))

        res.dropna(subset=[name], inplace=True)
        if 'index' in res.columns:
            res.drop(columns='index', inplace=True)
        return res

    # read common data needed for factor calculation
    def _csv_data(self,
                  data_name: list,
                  file_path: str = FPN.factor_inputData.value,
                  file_name: str = "FactorPool1",
                  date: str = KN.TRADE_DATE.value,
                  stock_id: str = KN.STOCK_ID.value):
        res = pd.read_csv(os.path.join(file_path, file_name + '.csv'),
                          usecols=[date, stock_id] + data_name)
        return res

    # read index data
    def csv_index(
        self,
        data_name: list,
        file_path: str = FPN.factor_inputData.value,
        file_name: str = 'IndexInfo',
        index_name: str = '',
        date: str = KN.TRADE_DATE.value,
    ):
        index_data = pd.read_csv(os.path.join(file_path, file_name + '.csv'),
                                 usecols=[date, 'index_name'] + data_name)
        res = index_data[index_data['index_name'] == index_name]
        return res

    # read minute data (spread across folders) and return the callback results
    def csv_HFD_data(self,
                     data_name: list,
                     func: Callable = None,
                     fun_kwargs: dict = None,
                     file_path: str = FPN.HFD_Stock_M.value,
                     sub_file: str = '') -> Dict[str, Any]:
        if sub_file == '':
            Path = file_path
        elif sub_file == '1minute':
            Path = FPN.HFD_Stock_M.value
        else:
            Path = os.path.join(file_path, sub_file)
        if fun_kwargs is None:  # avoid the mutable default argument pitfall
            fun_kwargs = {}
        data_dict = {}
        file_names = os.listdir(Path)

        for file_name in file_names:
            if file_name[-3:] == 'csv':
                try:
                    data_df = pd.read_csv(os.path.join(Path, file_name),
                                          usecols=['code', 'time'] + data_name)
                except Exception as e:
                    continue
                data_df['date'] = file_name[:-4]
                data_df.rename(columns={'code': 'stock_id'}, inplace=True)
                res = func(data_df, **fun_kwargs)
                data_dict[file_name[:-4]] = res

        return data_dict

    def _switch_ttm(self, data_: pd.DataFrame, name: str):
        """
        Compute TTM values; sort within each group after groupby
        """
        def _pros_ttm(data_sub: pd.DataFrame, name_: str):
            data_sub[name_ + '_TTM'] = data_sub[name_].diff(1)
            res_ = data_sub[data_sub['M'] == '03'][name_].append(
                data_sub[data_sub['M'] != '03'][name_ + '_TTM'])
            res_ = res_.droplevel(
                level=KN.STOCK_ID.value).sort_index().rolling(4).sum()
            return res_

        data_copy = copy.deepcopy(data_)
        data_copy['M'] = data_copy[SN.REPORT_DATE.value].apply(
            lambda x: x[5:7])
        data_copy.set_index([SN.REPORT_DATE.value, KN.STOCK_ID.value],
                            inplace=True)
        data_copy.sort_index(inplace=True)

        res = data_copy[[name, 'M'
                         ]].groupby(KN.STOCK_ID.value).apply(_pros_ttm, name)

        res.index = res.index.swaplevel(0, 1)
        res.name = name
        return res
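
# --- Illustrative sketch (not from the original source) ---
# Toy demonstration of the TTM logic in _switch_ttm above: cumulative
# year-to-date financials are differenced into single quarters (Q1 is kept
# as reported) and a rolling 4-quarter sum gives the trailing-twelve-month
# value. All numbers are made up.
import pandas as pd

ytd = pd.Series([10., 25., 45., 70., 12., 28.],
                index=['2019-03', '2019-06', '2019-09', '2019-12',
                       '2020-03', '2020-06'])
single_q = ytd.diff()
single_q[ytd.index.str.endswith('03')] = ytd  # Q1 is already one quarter
ttm = single_q.rolling(4).sum()
print(ttm)  # 2020-03: 15+20+25+12 = 72, 2020-06: 20+25+12+16 = 73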
Example #6
class FactorValidityCheck(object):
    """
    对于单因子的有效性检验,我们从以下几个维度进行考量:
    1.单因子与下期收益率回归:
        1)因子T值序列绝对值平均值;
        2)因子T值序列绝对值大于2的占比;
        3)因子T值序列均值绝对值除以T值序列的标准差;
        4)因子收益率序列平均值;
        5)因子收益率序列平均值零假设T检验(判断因子收益率序列方向一致性和显著不为零)
    2.因子IC值:
        1)因子IC值序列的均值大小--因子显著性;
        2)因子IC值序列的标准差--因子稳定性;
        3)因子IR比率--因子有效性;
        4)因子IC值累积曲线--随时间变化效果是否稳定
        5)因子IC值序列大于零的占比--因子作用方向是否稳定
    3.分层回测检验单调性-打分法:
        行业内分层后再进行行业各层加权(沪深300行业权重)
        每层净值曲线
        每层相对基准净值曲线
        分年份收益
        1)年化收益率;
        2)年化波动率;
        3)夏普比率;
        4)最大回撤;
        5)胜率
    """
    columns = ['code', 'open', 'low', 'close', 'high']

    industry_list = [
        "CI005001.WI", "CI005002.WI", "CI005003.WI", "CI005004.WI",
        "CI005005.WI", "CI005006.WI", "CI005007.WI", "CI005008.WI",
        "CI005009.WI", "CI005010.WI", "CI005011.WI", "CI005012.WI",
        "CI005013.WI", "CI005014.WI", "CI005015.WI", "CI005016.WI",
        "CI005017.WI", "CI005018.WI", "CI005019.WI", "CI005020.WI",
        "CI005021.WI", "CI005022.WI", "CI005023.WI", "CI005024.WI",
        "CI005025.WI", "CI005026.WI", "CI005027.WI", "CI005028.WI",
        "CI005029.WI", "CI005030.WI"
    ]

    fact_name = None

    data_save_path = 'Data'

    parent_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))

    def __init__(self):

        self.db = database_manager
        self.Q = SQL()

        self.Factor = FactorPool()  # 因子池
        self.Label = LabelPool()  # 标签池
        self.Stock = StockPool()  # 股票池

        self.factor_process = FactorProcess()  # 因子预处理

        self.ind = Indicator()  # 评价指标的计算

        self.factor_dict = {}  # 原始因子存储
        self.factor_dict_clean = {}  # 清洗后的因子存储

        self.data_input = {}  # 输入数据

        self.Finally_data = {}

        self.fact_test_result = collections.defaultdict(dict)  # 因子检验结果
        self.fact_inter_result = {}

        self.factor_mapping = self._factor_mapping()

        self.neu = 'non-neu'  # 中性化

    # factor Chinese-English mapping
    def _factor_mapping(self, file_name: str = 'factor_name.json'):
        try:
            file_path = os.path.join(self.parent_path, file_name)
            infile = open(file_path, 'r', encoding='utf-8')
            res = json.load(infile)
        except Exception as e:
            print(f"read json file failed, error:{e}")
            res = {}
        return res

    # load stock pool and label pool
    @timer
    def load_pool_data(self,
                       stock_pool_name: str = 'StockPool1',
                       label_pool_name: str = 'LabelPool1'):
        """
        :param stock_pool_name: stock pool name
        :param label_pool_name: label pool name
        :return:
        """
        # Load stock pool
        if stock_pool_name == '':
            print(
                f"{dt.datetime.now().strftime('%X')}: Can not load stock pool!"
            )
        else:
            try:
                stock_pool_method = self.Stock.__getattribute__(
                    stock_pool_name)
                effect_stock = stock_pool_method()
                # print(f"{dt.datetime.now().strftime('%X')}: Successfully generated stock pool")
            except Exception as e:
                print(e)
                print(
                    f"{dt.datetime.now().strftime('%X')}: Unable to load stock pool"
                )
            else:
                self.data_input['StockPool'] = effect_stock

        # Load label pool
        if label_pool_name == '':
            print(
                f"{dt.datetime.now().strftime('%X')}: Can not load label pool!"
            )
        else:
            try:
                label_pool_method = self.Label.__getattribute__(
                    label_pool_name)
                stock_label = label_pool_method()
                # print(f"{dt.datetime.now().strftime('%X')}: Successfully generated label pool")
            except Exception as e:
                print(e)
                print(
                    f"{dt.datetime.now().strftime('%X')}: Unable to load label pool"
                )
            else:
                self.data_input['LabelPool'] = stock_label

    # load factor
    @timer
    def load_factor(self, fact_name: str, **kwargs):
        """
        Prefer directly supplied data; otherwise query the database; finally compute in real time
        :param fact_name:
        :param kwargs:
        :return:
        """
        if kwargs.get('factor_value', None) is None:
            # self.db.query_factor_data("EP_ttm", "Fin")
            if kwargs['cal']:
                try:
                    fact_raw_data = self.Factor.factor[
                        fact_name +
                        '_data_raw'](**kwargs['factor_params'])  # TODO
                    self.data_input["factor_raw_data"] = fact_raw_data
                except Exception as e:
                    print(e)
                    print(
                        f"{dt.datetime.now().strftime('%X')}: Unable to load raw data that to calculate factor!"
                    )
                    return
                else:
                    factor_class = self.Factor.factor[fact_name](
                        data=self.data_input["factor_raw_data"].copy(
                            deep=True),
                        **kwargs['factor_params'])
            else:
                factor_data_ = self.db.query_factor_data(
                    factor_name=fact_name, db_name=kwargs['db_name'])

                print(
                    f"{dt.datetime.now().strftime('%X')}: Get factor data from MySQL!"
                )
                factor_data_.set_index(
                    [KN.TRADE_DATE.value, KN.STOCK_ID.value], inplace=True)

                factor_class = FactorInfo()
                factor_class.data = factor_data_[fact_name]
                factor_class.factor_name = fact_name
        else:
            print(
                f"{dt.datetime.now().strftime('%X')}: Get factor data from input!"
            )
            kwargs['factor_value'].set_index(
                [KN.TRADE_DATE.value, KN.STOCK_ID.value], inplace=True)

            factor_class = FactorInfo()
            factor_class.data = kwargs['factor_value'][fact_name]
            factor_class.factor_name = fact_name

        self.fact_name = factor_class.factor_name
        self.factor_dict[self.fact_name] = factor_class

    def process_factor(self, data: pd.Series, outliers: str,
                       neutralization: str, standardization: str):
        """
        :param data:
        :param outliers: outlier handling method
        :param neutralization: neutralization method
        :param standardization: standardization method

        :return:
        """
        if data is None:
            factor_raw = self.factor_dict[self.fact_name].data.copy(
                deep=True)  # fetch factor data
        else:
            factor_raw = data.copy(deep=True)

        if factor_raw is None:
            print("factor data is None!")
            return

        factor_raw = factor_raw[self.fact_name] if isinstance(
            factor_raw, pd.DataFrame) else factor_raw

        # pre-processing factors
        if outliers + neutralization + standardization == '':
            self.factor_dict_clean[self.fact_name] = factor_raw
            self.neu = 'non-neu'
        else:
            try:
                self.factor_dict_clean[
                    self.fact_name] = self.factor_process.main(
                        factor=factor_raw,
                        outliers=outliers,
                        neutralization=neutralization,
                        standardization=standardization)
                self.neu = 'neu'
            except Exception as e:
                print(e)
                print(
                    f"{dt.datetime.now().strftime('%X')}: pre-processing factors error!"
                )
                return

    # Data Integration
    @timer
    def integration(
        self,
        outliers: str,
        neu: str,
        stand: str,
        switch_freq: bool = False,
        limit: int = 120,
    ):
        """
        :param outliers: outlier handling method
        :param neu: neutralization method
        :param stand: standardization method
        :param switch_freq: whether to convert the data frequency
        :param limit: maximum fill length
        :return:
        """
        # Integration
        SP, LP = self.data_input.get("StockPool", None), self.data_input.get(
            'LabelPool', None)

        FP = self.factor_dict[self.fact_name].data.copy(deep=True)
        # convert data frequency
        if switch_freq:
            FP = FactorBase()._switch_freq(data_=FP,
                                           name=self.fact_name,
                                           limit=limit)

        #  Label Pool and Factor Pool intersection with Stock Pool, respectively
        self.Finally_data["Strategy"] = pd.concat([FP.reindex(SP), LP], axis=1)

        # process factor
        self.process_factor(data=self.Finally_data["Strategy"][self.fact_name],
                            outliers=outliers,
                            neutralization=neu,
                            standardization=stand)

        self.Finally_data["Strategy"][self.fact_name] = self.factor_dict_clean[
            self.fact_name]

        self.Finally_data["Strategy"].dropna(how='all', inplace=True)

        # get benchmark
        # if bm == 'all':
        #     self.Finally_data["BenchMark"] = LP[KN.STOCK_RETURN.value +
        #                                         '_' +
        #                                         PVN.OPEN.value].groupby(KN.TRADE_DATE.value).mean().shift(1).sort_index()
        # else:
        #     self.Finally_data['BenchMark'] = self.Label.BenchMark(bm_index=bm)

    # Factor validity test
    @timer
    def effectiveness(self,
                      hp: int = 1,
                      ret_name: str = PVN.OPEN.value,
                      pool_type: str = 'all',
                      group_num: int = 5,
                      save: bool = True):
        """
        The factor calculation period and the rebalancing period must be reflected here
        """
        data_clean = self.Finally_data["Strategy"].copy(deep=True)

        fact_exposure = copy.deepcopy(data_clean[self.fact_name])
        stock_return = copy.deepcopy(data_clean[KN.STOCK_RETURN.value + '_' +
                                                ret_name])
        stock_return.name = KN.STOCK_RETURN.value
        industry_exposure = copy.deepcopy(data_clean[SN.INDUSTRY_FLAG.value])
        index_weight = copy.deepcopy(
            data_clean[SN.CSI_500_INDUSTRY_WEIGHT.value])
        # benchmark = self.Finally_data['BenchMark'].copy(deep=True)
        liq_mv = data_clean[PVN.LIQ_MV.value].copy(deep=True)

        # run the validity tests
        try:
            eff1 = self.factor_return(fact_exposure=fact_exposure,
                                      stock_return=stock_return,
                                      industry_exposure=industry_exposure,
                                      hp=hp,
                                      mv=liq_mv,
                                      save=save)

            eff2 = self.IC_IR(fact_exposure=fact_exposure,
                              stock_return=stock_return,
                              hp=hp,
                              save=save)

            eff3 = self.monotonicity(
                fact_exposure=fact_exposure,
                stock_return=stock_return,
                # benchmark=benchmark,
                industry_exposure=industry_exposure,
                index_weight=index_weight,
                hp=hp,
                group_num=group_num,
                save=save)
        except Exception as e:
            print(e)
        else:
            if eff1 is not None and eff2 is not None and save:
                eff1.name = eff1.name + f'_{hp}days'
                eff2.name = eff2.name + f'_{hp}days'
                eff3.name = eff3.name + f'_{hp}days'

                if self.neu == 'neu':
                    self.to_csv(FPN.factor_test_res.value, 'Correlation_neu',
                                eff1.append(eff2))
                    self.to_csv(FPN.factor_test_res.value, 'Group_neu', eff3)
                else:
                    self.to_csv(FPN.factor_test_res.value, 'Correlation',
                                eff1.append(eff2))
                    self.to_csv(FPN.factor_test_res.value, 'Group', eff3)

    # regression of a single factor on next-period returns
    def factor_return(self,
                      fact_exposure: pd.Series,
                      stock_return: pd.Series,
                      industry_exposure: pd.DataFrame,
                      mv: pd.Series,
                      hp: int = 1,
                      **kwargs) -> [pd.Series, None]:
        """

        :param fact_exposure:
        :param stock_return:
        :param industry_exposure:
        :param mv:
        :param hp:
        :param kwargs:
        :return:
        """

        # Calculate stock returns for different holding periods and generate return label
        return_label = self._holding_return(stock_return, hp)

        df_data = pd.concat(
            [return_label, industry_exposure, fact_exposure, mv],
            axis=1,
            join='inner').dropna().sort_index()

        # Analytic regression result:T Value and Factor Return
        res_reg = df_data.groupby(KN.TRADE_DATE.value).apply(
            self._reg_fact_return, 150)
        res_reg.dropna(how='all', inplace=True)
        if res_reg.empty:
            print(f"{self.fact_name}因子每期有效样本量不足1500,无法检验!")
            return None

        # get Trade date
        td = self.Q.trade_date_csv()
        res_reg = res_reg.reindex(
            td[(td['date'] >= res_reg.index[0])
               & (td['date'] <= res_reg.index[-1])]['date'])

        # Calculate Indicators
        T_mean = res_reg['T'].mean()
        T_abs_mean = abs(res_reg['T']).mean()
        T_abs_up_2 = res_reg['T'][
            abs(res_reg['T']) > 2].count() / res_reg.dropna().shape[0]
        T_stable = abs(res_reg['T'].mean()) / res_reg['T'].std()

        fact_ret_mean = res_reg['factor_return'].mean()
        ret_ttest = stats.ttest_1samp(res_reg['factor_return'].dropna(), 0)

        test_indicators = pd.Series([
            T_abs_mean, T_abs_up_2, T_mean, T_stable, fact_ret_mean,
            ret_ttest[0]
        ],
                                    index=[
                                        'T_abs_mean', 'T_abs_up_2', 'T_mean',
                                        'T_stable', 'fact_ret', 'fact_ret_t'
                                    ],
                                    name=self.fact_name)

        # handle path dependence of factor returns
        fact_ret_path = self.cor_mean(res_reg['factor_return'], hp=hp)
        # test_reg = np.arange(hp - 1, res_reg['factor_return'].shape[0], hp)
        # plot
        self.plot_return(fact_ret=fact_ret_path, hp=hp, save=kwargs['save'])

        # save data to dict
        self.fact_test_result[self.fact_name]['reg'] = {
            "res": res_reg,
            "ind": test_indicators
        }
        # save result to local
        # if kwargs['save']:
        #     self.factor_return_to_sql(fact_ret=res_reg, ret_type='Pearson', hp=hp)

        return test_indicators

    # factor IC
    def IC_IR(self,
              fact_exposure: pd.Series,
              stock_return: pd.Series,
              hp: int = 1,
              **kwargs):

        # Calculate stock returns for different holding periods and generate return label
        return_label = self._holding_return(stock_return, hp)

        df_data = pd.concat([return_label, fact_exposure],
                            axis=1,
                            join='inner').sort_index()

        IC = df_data.groupby(KN.TRADE_DATE.value).apply(
            lambda x: x.corr(method='spearman').iloc[0, 1])
        IC.dropna(inplace=True)

        # get Trade date
        td = self.Q.trade_date_csv()
        IC = IC.reindex(td[(td['date'] >= IC.index[0])
                           & (td['date'] <= IC.index[-1])]['date'])
        IC_mean, IC_std = IC.mean(), IC.std()
        IR = IC_mean / IC_std
        IC_up_0 = len(IC[IC > 0]) / IC.dropna().shape[0]
        IC_cum = IC.fillna(0).cumsum()

        test_indicators = pd.Series(
            [IC_mean, IC_std, IR, IC_up_0],
            index=['IC_mean', 'IC_std', 'IR', 'IC_up_0'],
            name=self.fact_name)
        # save data to dict
        self.fact_test_result[self.fact_name]['IC'] = {
            "res": IC,
            "ind": test_indicators
        }

        IC_path = self.cor_mean(IC, hp=hp)
        # plot
        self.plot_IC(IC=IC_path,
                     IC_cum=IC_path.fillna(0).cumsum(),
                     hp=hp,
                     save=kwargs['save'])

        # save result to local
        # if kwargs['save']:
        #     self.factor_return_to_sql(fact_ret=IC.to_frame('factor_return'), ret_type='Spearman', hp=hp)

        return test_indicators
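
    # Metric recap (descriptive note, not in the original): IC above is the
    # daily cross-sectional Spearman rank correlation between factor exposure
    # and the forward holding-period return; IR = IC_mean / IC_std; IC_up_0
    # is the share of days with IC > 0.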

    # layered backtest  TODO: NAV does not start at 1
    def monotonicity(
            self,
            fact_exposure: pd.Series,
            stock_return: pd.Series,
            # benchmark: pd.Series,
            industry_exposure: pd.DataFrame,
            index_weight: pd.Series,
            hp: int = 1,
            group_num: int = 5,
            **kwargs):
        """
        :param benchmark:
        :param fact_exposure:
        :param stock_return:
        :param industry_exposure:
        :param index_weight:
        :param hp:
        :param group_num: number of groups
        :return:
        """

        # Grouping
        df_data = pd.concat(
            [stock_return, fact_exposure, industry_exposure, index_weight],
            axis=1,
            join='inner').dropna(how='any').sort_index()

        df_data['group'] = df_data.groupby(
            SN.INDUSTRY_FLAG.value,
            group_keys=False).apply(lambda x: self.grouping(
                x[self.fact_name].unstack(), group_num).stack())

        # benchmark return
        # bm_ret = benchmark.sort_index()
        # bm_ret = bm_ret.loc[df_data.index[0][0]:]
        # bm_nav = (bm_ret.fillna(0) + 1).cumprod()
        # bm_nav.index = pd.DatetimeIndex(bm_nav.index)
        # bm_nav.name = 'ALL'

        # compute average group returns
        df_group_ret = self.group_return(df_data,
                                         hp=hp,
                                         index_weight_name=index_weight.name)
        ################################################################################################################
        # build NAV curves
        nav = df_group_ret.add(1).cumprod(axis=0)
        # nav = nav.merge(bm_nav, on=KN.TRADE_DATE.value, how='left')
        ex_nav = nav.div(nav['ALL'], axis=0).drop(columns='ALL')

        # compute indicators
        ind_year = nav.apply(
            lambda x: x.groupby(x.index.year).apply(self.ind_cal, freq="D"))
        ind_nav = nav.apply(self.ind_cal, freq="D")
        ind_nav = ind_nav.stack()
        ind_nav.name = self.fact_name
        ################################################################################################################
        # save data to dict
        self.fact_test_result[self.fact_name]['Group'] = {
            "res": nav,
            "ind": ind_nav
        }

        # plot
        self.plot_monotonicity(nav=nav.copy(deep=True),
                               ex_nav=ex_nav.copy(deep=True),
                               ind_year=ind_year.copy(deep=True),
                               hp=hp,
                               save=kwargs['save'])

        # save data to MySQL
        # if kwargs['save']:
        #     self.monotonicity_to_sql(df_group_ret=df_group_ret, df_data=df_data, hp=hp)

        return ind_nav

    """因子数据保存"""

    # save factor returns to the database (Pearson and Spearman correlations)
    @timer
    def factor_return_to_sql(self, **kwargs):
        factor_ret, ret_type, hp = kwargs['fact_ret'], kwargs[
            'ret_type'], kwargs['hp']

        df = factor_ret.dropna(axis=0, how='all').copy()

        def encapsulation(df_: pd.DataFrame) -> Iterable:
            df_sub = df_.where(df_.notnull(), None)
            for index_, row_ in df_sub.iterrows():
                R = FactorRetData()
                R.date = dt.datetime.strptime(index_, "%Y-%m-%d")
                R.factor_T = row_['T'] if ret_type == 'Pearson' else None
                R.holding_period = hp
                R.factor_return = row_['factor_return']
                R.factor_name = self.fact_name
                R.factor_name_chinese = self.factor_mapping[self.fact_name]
                R.ret_type = ret_type
                yield R

        ret_generator = encapsulation(df)

        if self.db.check_fact_ret_data(self.fact_name):
            print(
                f"This field {self.fact_name} exists in MySQL database dbfactorretdata and will be overwritten"
            )

        self.db.save_fact_ret_data(ret_generator)

    # save factor layering data to the database
    @timer
    def monotonicity_to_sql(self, **kwargs):
        def encapsulation(df: pd.DataFrame) -> Iterable:
            df_sub = df.where(df.notnull(), None)
            i = 1
            for index_, row_ in df_sub.iterrows():
                i += 1
                if i > 2300:
                    break
                G = GroupData()
                G.stock_id = row_[KN.STOCK_ID.value]
                G.date = index_[0].to_pydatetime()
                G.stock_return = row_[KN.STOCK_RETURN.value]
                G.factor_value = row_[self.fact_name]
                G.factor_name = self.fact_name
                G.holding_period = hp
                G.factor_name_chinese = self.factor_mapping[self.fact_name]
                G.group = index_[1]
                G.industry = row_[SN.INDUSTRY_FLAG.value]
                G.factor_type = self.factor_dict[self.fact_name].factor_type
                yield G

        # wrap the data and return an iterator
        df_group_ret, df_data, hp = kwargs['df_group_ret'], kwargs[
            'df_data'], kwargs['hp']
        df_1, df_2 = copy.deepcopy(df_group_ret), copy.deepcopy(df_data)
        df_1.columns = [col_.split("_")[-1] for col_ in df_1.columns]
        df_1 = df_1.stack()
        df_1.index.names = [KN.TRADE_DATE.value, 'group']

        df_2 = df_2.dropna()
        df_2['group'] = df_2['group'].astype(int).astype(str)
        df_2 = df_2.reset_index(KN.STOCK_ID.value)
        df_2.index = pd.DatetimeIndex(df_2.index)
        df_2 = df_2.set_index(['group'], append=True)
        df_2['group_return'] = df_1

        group_generator = encapsulation(df_2)
        # TODO
        if self.db.check_group_data(self.fact_name):
            print(
                f"This field {self.fact_name} exists in MySQL database dbgroupdata and will be overwritten"
            )

        self.db.save_group_data(group_generator)

    # save factor values to the database
    @timer
    def factor_to_sql(self,
                      db_name: str,
                      folder_name: str = '',
                      save_type: str = 'raw'):
        def encapsulation(fac: FactorInfo) -> Iterable:
            data_sub = fac.data.where(fac.data.notnull(), None)
            for index_, row_ in data_sub.iterrows():
                F = FactorData()

                F.stock_id = row_[KN.STOCK_ID.value]
                F.date = row_[KN.TRADE_DATE.value]
                F.date_report = row_[SN.REPORT_DATE.value]  # TODO report date

                F.factor_name = self.fact_name
                F.factor_value = row_[self.fact_name]
                F.factor_name_chinese = self.factor_mapping[self.fact_name]

                F.factor_category = fac.factor_category
                F.factor_type = fac.factor_type
                yield F

        factor = self.factor_dict[self.fact_name]

        # check

        # if factor.data_raw.shape[0] >= 1e6:
        #     print("数据量太大,请从本地导入,数据将以CSV形式存储!")
        if save_type == 'raw':
            df_ = copy.deepcopy(factor.data_raw)
            path_ = FPN.FactorRawData.value
        elif save_type == 'switch':
            df_ = copy.deepcopy(factor.data)
            path_ = FPN.FactorSwitchFreqData.value
        else:
            print("factor_type error!}")
            return
        # df_['factor_category'] = factor.factor_category
        # df_['factor_name'] = factor.factor_name
        # df_['factor_type'] = factor.factor_type
        # df_['factor_name_chinese'] = self.factor_mapping[self.fact_name]
        # df_.rename(columns={self.fact_name: 'factor_value',
        #                     SN.REPORT_DATE.value: 'date_report'},
        #            inplace=True)
        file_path = os.path.join(path_, folder_name)
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        df_.to_csv(os.path.join(file_path, f'{self.fact_name}.csv'))

        return

        factor_generator = encapsulation(factor)

        # if self.db.check_factor_data(self.fact_name, db_name):
        #     print(f"This field '{self.fact_name}' exists in MySQL database 'dbfactordata' and will be overwritten")
        # else:
        print(
            f"Factor: '{self.fact_name}' is going to be written to MySQL database 'dbfactordata'"
        )
        self.db.save_factor_data(factor_generator, db_name)

    @timer
    def factor_to_csv(self):

        factor = self.factor_dict[self.fact_name]
        file_path = os.path.join(FPN.FactorRawData.value,
                                 factor.factor_category)

        if not os.path.exists(file_path):
            os.makedirs(file_path)

        data_path = os.path.join(file_path, factor.factor_name + '.csv')
        factor.data.to_csv(data_path, header=True)

    """画图"""

    # cumulative factor return curve
    def plot_return(self, **kwargs):
        fact_ret, hp = kwargs['fact_ret'], kwargs['hp']
        cum_return = fact_ret.fillna(0).cumsum()

        f, ax = plt.subplots(figsize=(12, 8))
        sns.set(font_scale=1.4)

        fact_ret.plot(kind='bar', label="fact_return", legend=True, grid=False)
        ax.xaxis.set_major_locator(plt.MultipleLocator(100))

        cum_return.plot(
            label="cum return",
            color='red',
            title=f'Factor: {self.fact_name}-{hp}days-{self.neu} Fact_Return',
            secondary_y=True,
            legend=True,
            grid=False,
            rot=60)

        if kwargs['save']:
            # print(f"{dt.datetime.now().strftime('%X')}: Save Cum Return result figure")
            plt.savefig(os.path.join(
                FPN.factor_test_res.value,
                f"{self.fact_name}_cum_return-{hp}days-{self.neu}.png"),
                        dpi=200,
                        bbox_inches='tight')

        plt.show()

    # cumulative rank correlation (IC) between factor and stock returns
    def plot_IC(self, **kwargs):
        IC, IC_cum, hp = kwargs['IC'], kwargs['IC_cum'], kwargs['hp']

        sns.set(font_scale=1.4)
        f, ax = plt.subplots(figsize=(12, 8))

        IC.plot(kind='bar',
                color='blue',
                label="IC",
                title=f'Factor: {self.fact_name}-{hp}days-{self.neu} IC_Value',
                legend=True,
                grid=False)

        IC_cum.plot(color='red',
                    label="IC_Mean",
                    legend=True,
                    grid=False,
                    secondary_y=True,
                    rot=60)
        ax.xaxis.set_major_locator(plt.MultipleLocator(100))

        # save IC result figure
        if kwargs['save']:
            # print(f"{dt.datetime.now().strftime('%X')}: Save IC result figure")
            plt.savefig(os.path.join(
                FPN.factor_test_res.value,
                f"{self.fact_name}_IC_Value-{hp}days-{self.neu}.png"),
                        dpi=200,
                        bbox_inches='tight')

        plt.show()

    # layering results
    def plot_monotonicity(self, **kwargs):
        nav, ex_nav, ind_year, hp = kwargs['nav'], kwargs['ex_nav'], kwargs[
            'ind_year'], kwargs['hp']

        nav.index = nav.index.map(lambda x: x.strftime('%Y-%m-%d'))
        ex_nav.index = ex_nav.index.map(lambda x: x.strftime('%Y-%m-%d'))

        sns.set(font_scale=1)

        fig = plt.figure(figsize=(12, 10))
        ax1 = fig.add_subplot(3, 2, 1)
        nav.plot(rot=30,
                 ax=ax1,
                 label='nav',
                 title=f'{self.fact_name}: nav-{hp}days-{self.neu}',
                 legend=True)

        ax2 = fig.add_subplot(3, 2, 2)
        ex_nav.plot(rot=30,
                    ax=ax2,
                    label='nav',
                    title=f'{self.fact_name}: nav_ex_bm-{hp}days-{self.neu}',
                    legend=True)

        ax3 = fig.add_subplot(3, 2, 3)
        ind_year.xs('ret_a', level=1).plot.bar(
            rot=0,
            ax=ax3,
            label='return',
            title=f'{self.fact_name}: group return-{self.neu}',
            legend=False)
        ax4 = fig.add_subplot(3, 2, 4)
        ind_year.xs('std_a', level=1).plot.bar(
            rot=0,
            ax=ax4,
            label='std',
            title=f'{self.fact_name}: group return std-{self.neu}',
            legend=False)
        ax5 = fig.add_subplot(3, 2, 5)
        ind_year.xs('shape_a', level=1).plot.bar(
            rot=0,
            ax=ax5,
            label='shape_a',
            title=f'{self.fact_name}: group shape ratio-{self.neu}',
            legend=False)
        ax6 = fig.add_subplot(3, 2, 6)
        ind_year.xs('max_retreat', level=1).plot.bar(
            rot=0,
            ax=ax6,
            label='max_retreat',
            title=f'{self.fact_name}: group max retreat-{self.neu}',
            legend=False)

        # save nav result figure
        if kwargs['save']:
            # print(f"{dt.datetime.now().strftime('%X')}: Save nav result figure")
            plt.savefig(os.path.join(
                FPN.factor_test_res.value,
                f"{self.fact_name}_nav-{hp}days-{self.neu}.png"),
                        dpi=300,
                        bbox_inches='tight')
        plt.show()

    # cal ind
    def ind_cal(self, nav: pd.Series, freq: str = "D"):

        ret_a = self.ind.return_a(nav, freq=freq)
        std_a = self.ind.std_a(nav, freq=freq)
        shape_a = self.ind.shape_a(nav, freq=freq)
        max_retreat = self.ind.max_retreat(nav)

        test_indicators = pd.Series(
            [ret_a, std_a, shape_a, max_retreat],
            index=['ret_a', 'std_a', 'shape_a', 'max_retreat'],
            name=self.fact_name)
        return test_indicators

    # Series additional written
    def to_csv(self, path: str, file_name: str, data_: pd.Series):
        data_path_ = os.path.join(path, file_name + '.csv')
        data_df = data_.to_frame().T

        header = False if os.path.exists(data_path_) else True

        data_df.to_csv(data_path_, mode='a', header=header)

    def _reg_fact_return(self,
                         data_: pd.DataFrame,
                         num: int = 150) -> pd.Series:  # TODO: handle regression failure
        """
        Individual stock returns can swing widely, so returns are winsorized:
        extreme values distort least squares, and removing them makes the
        regression coefficients more stable.
        Returns the regression results.
        """
        data_sub = data_.sort_index().dropna(how='any')
        # print(f"有效样本量{data_sub.shape[0]}")
        if data_sub.shape[0] < num:
            res = pd.Series(index=['T', 'factor_return'])
        else:
            # if data_sub.index[0][0] in ['2015-03-23']:
            #     print('s')
            # data_sub = data_sub[data_sub[KN.STOCK_RETURN.value] <= 0.09]
            # data_sub_ = data_sub[KN.STOCK_RETURN.value]
            data_sub[KN.STOCK_RETURN.value] = self.factor_process.mad(
                data_sub[KN.STOCK_RETURN.value])
            # data_sub['return'] = self.factor_process.z_score(data_sub['return'])
            data_sub = data_sub.dropna()

            mv = data_sub[PVN.LIQ_MV.value]
            d_ = data_sub.loc[:, data_sub.columns != PVN.LIQ_MV.value]
            X = pd.get_dummies(d_.loc[:, d_.columns != KN.STOCK_RETURN.value],
                               columns=[SN.INDUSTRY_FLAG.value])
            # Y = np.sign(d_[KN.STOCK_RETURN.value]) * np.log(abs(d_[KN.STOCK_RETURN.value]))
            # Y.fillna(0, inplace=True)
            Y = d_[KN.STOCK_RETURN.value]
            reg = sm.WLS(Y, X,
                         weights=pow(mv, 0.5)).fit(cov_type='HC1')  # weighted by sqrt of free-float market value
            # reg = sm.OLS(Y, X).fit(cov_type='HC2')

            if np.isnan(reg.rsquared_adj):
                res = pd.Series(index=['T', 'factor_return'])
            else:
                res = pd.Series(
                    [reg.tvalues[self.fact_name], reg.params[self.fact_name]],
                    index=['T', 'factor_return'])
        return res
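
    # Note on the regression above (descriptive, not in the original):
    # sm.WLS with weights = sqrt(liq_mv) is the usual square-root market-cap
    # weighted cross-sectional regression; 'factor_return' is the fitted
    # coefficient on the factor exposure and 'T' its HC1 robust t-value.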

    @staticmethod
    def _holding_return(ret: pd.Series, holding_period: int = 1) -> pd.Series:
        """
        Compute stock returns over different holding periods
        :param ret: stock return series
        :param holding_period: holding period
        :return:
        """

        ret_sub = copy.deepcopy(ret)

        # Holding period return
        ret_sub = ret_sub.add(1)

        ret_label = 1
        for shift_ in range(holding_period):
            ret_label *= ret_sub.groupby(KN.STOCK_ID.value).shift(-shift_)

        ret_label = ret_label.sub(1)

        # Remove invalid value
        # ret_label = ret_comp.groupby(KN.STOCK_ID.value, group_keys=False).apply(lambda x: x[holding_period - 1:])

        # The tag
        # ret_label = ret_comp.groupby(KN.STOCK_ID.value).apply(lambda x: x.shift(- holding_period))

        return ret_label
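
    # Worked illustration (hypothetical numbers): with holding_period=2 the
    # label at t compounds the next two bars, (1 + r_t) * (1 + r_{t+1}) - 1;
    # the last hp-1 observations of each stock become NaN because
    # shift(-shift_) runs past the end of the series.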

    # grouping
    @staticmethod
    def grouping(data: pd.DataFrame, n):
        """
        1. With M samples and N groups, the first N-1 groups each hold
           int(M/N) valid samples and the last group holds M-(N-1)*int(M/N);
        2. invalid samples do not take part in the calculation;
        3. equal ranks are assigned to the same group;
        4. ranks are dense: the element after a tie does not skip a level;
        5. ascending order
        :param data:
        :param n: number of groups
        :return:
        """
        rank_data = data.rank(axis=1, ascending=True, method='dense')
        effect_data = rank_data.max(axis=1)
        amount_each_group = effect_data // n
        data_group = rank_data.floordiv(amount_each_group, axis=0) + np.sign(
            rank_data.mod(amount_each_group, axis=0))
        data_group[data_group > n] = n
        return data_group
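
    # Worked illustration (hypothetical numbers): with n=5 and 11 valid
    # samples in a row, dense ranks 1..11 give amount_each_group = 11 // 5
    # = 2, so ranks 1-2 -> group 1, 3-4 -> group 2, ..., 9-10 -> group 5,
    # and rank 11 (floordiv 5 + sign 1 = 6) is capped back to group 5 by
    # the final assignment.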

    """多路径取平均"""

    # average over multiple paths to handle path dependence
    def group_return(
        self,
        data: pd.DataFrame,
        hp: int = 1,
        index_weight_name: str = SN.CSI_300_INDUSTRY_WEIGHT.value
    ) -> pd.DataFrame:
        """
        :param data:
        :param hp:
        :param index_weight_name:
        :return:
        """
        group_ = data[SN.GROUP.value].unstack().sort_index()
        # guard against missing trade dates
        td = self.Q.trade_date_csv()
        group_ = group_.reindex(td[(td['date'] >= group_.index[0])
                                   & (td['date'] <= group_.index[-1])]['date'])
        # td = self.Q.query(self.Q.trade_date_SQL(date_sta=group_.index[0].replace('-', ''),
        #                                         date_end=group_.index[-1].replace('-', '')))
        # group_ = group_.reindex(td[KN.TRADE_DATE.value])
        # The average in the group and weighting of out-of-group CSI 300 industry weight, consider return period
        res_cont_ = []
        for i in range(0, hp):
            # group_0 = pd.DataFrame(index=group_.index, columns=group_.columns, data=0)
            group_copy = copy.deepcopy(group_)
            data_ = copy.deepcopy(data)

            array1 = np.arange(0, group_copy.shape[0], 1)
            array2 = np.arange(i, group_copy.shape[0], hp)
            row_ = list(set(array1).difference(array2))

            # set non-rebalancing dates to NaN
            group_copy.iloc[row_] = group_copy.iloc[row_].replace(
                range(int(max(data_[SN.GROUP.value].dropna())) + 1), np.nan)

            if hp != 1:  # TODO: optimize
                group_copy.fillna(method='ffill', inplace=True, limit=hp - 1)
            # rep = group_.replace(range(int(max(data_[SN.GROUP.value])) + 1), 0)

            # keep originally missing values as NaN
            # group_sub = group_copy.sub(rep)

            # replace the original groups and compute returns
            data_[SN.GROUP.value] = group_copy.stack()

            ind_weight = data_.groupby(
                [KN.TRADE_DATE.value, SN.INDUSTRY_FLAG.value,
                 SN.GROUP.value]).mean()

            ind_weight['return_weight'] = ind_weight[KN.STOCK_RETURN.value] * \
                                          ind_weight[index_weight_name]

            group_return = ind_weight.groupby(
                [KN.TRADE_DATE.value, SN.GROUP.value]).sum()

            res_cont_.append(group_return['return_weight'])  # weighted group returns
        # average across paths
        res_ = reduce(lambda x, y: x + y,
                      res_cont_).div(hp).unstack().fillna(0)

        res_.columns = [f'G_{int(col_)}' for col_ in res_.columns]  # rename
        res_['ALL'] = res_.mean(axis=1)
        res_.index = pd.DatetimeIndex(res_.index)

        return res_

    # average over multiple paths to handle path dependence
    def cor_mean(self, data: pd.DataFrame, hp: int = 1) -> pd.DataFrame:

        data_copy = data.copy(deep=True)
        data_index = data_copy.index

        res_cont_ = []
        for i in range(0, hp):
            array1 = np.arange(i, data_copy.shape[0], hp)

            # set non-rebalancing dates to NaN
            data_copy_ = data_copy.iloc[list(array1)].reindex(data_index)

            if hp != 1:
                data_copy_.fillna(method='ffill', inplace=True, limit=hp - 1)

            res_cont_.append(data_copy_)

        res_ = reduce(lambda x, y: x + y, res_cont_).div(hp).fillna(0)

        return res_
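
# --- Illustrative sketch (not from the original source) ---
# Multi-path averaging as in cor_mean above: with hp=2 there are two
# rebalancing paths (starting on day 0 or day 1); each path keeps every
# hp-th observation, forward-fills between rebalances, and the paths are
# averaged to remove the dependence on the arbitrary start date.
import pandas as pd
from functools import reduce

ic = pd.Series([0.1, 0.3, -0.2, 0.4], index=list('abcd'))
hp = 2
paths = []
for i in range(hp):
    p = ic.iloc[i::hp].reindex(ic.index)  # keep one path's dates
    p = p.ffill(limit=hp - 1)             # hold between rebalances
    paths.append(p)
avg = reduce(lambda x, y: x + y, paths).div(hp).fillna(0)
print(avg)  # a: 0.00 (undefined on one path), b: 0.20, c: 0.05, d: 0.10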
Example #7
class FactorCollinearity(object):
    """
    目前只考虑线性相关性

    多因子模型中,按照因子的属性类别,将因子划分为大类内因子和大类间因子,
    一般认为大类内因子具有相同的属性,对个股收益有着相似的解释,
    大类间因子具有不同的因子属性和意义,对个股收益解释不同,
    所以基于此:
    1.大类内因子考虑采用合成的方式对相关性强的因子进行复合
        复合方式:等权法,
                 历史收益率加权法(等权,半衰加权),
                 历史信息比率加权法(等权,半衰加权),
                 最大化复合IC/IC_IR加权,主成分分析等
    2.大类间相关性强的因子考虑采用取舍,剔除相关性强的因子
    注:
    1.对于符号相反的因子采用复合方式合成新因子时,需要调整因子的符号使因子的符号保持一致
    2.采用历史收益率或IC对因子进行加权时,默认情况下认为所有交易日都存在,可考虑对因子和权重进行日期重塑,避免数据错位
    3.在进行因子标准化处理时默认采用多个因子取交集的方式,剔除掉因子值缺失的部分
    """

    parent_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))

    def __init__(self):
        self.db = database_manager
        self.Q = SQL()

        self.fp = FactorProcess()  # factor pre-processing
        self.Multi = Multicollinearity()  # multi-factor handling

        self.factors_raw = None  # raw factor set

        self.factor_D = {}  # factor sign map
        self.factor_direction()

    # factor direction mapping  TODO: switch to a time series later to support compositing in different periods
    def factor_direction(self, file_name: str = 'factor_direction.json'):
        try:
            file_path = os.path.join(self.parent_path, file_name)
            infile = open(file_path, 'r', encoding='utf-8')
            self.factor_D = json.load(infile)
        except Exception as e:
            print(f"read json file failed, error:{e}")
            self.factor_D = {}

    # fetch factor data
    def get_data(self,
                 folder_name: str = '',
                 factor_names: dict = None,
                 factors_df: pd.DataFrame = None):
        """
        Data sources:
        1. external input;
        2. CSV files read from the folder path
        :param factor_names:
        :param folder_name:
        :param factors_df:
        :return:
        """
        if factors_df is None:
            try:
                factors_path = os.path.join(FPN.FactorSwitchFreqData.value,
                                            folder_name)
                if factor_names:
                    factor_name_list = list(
                        map(lambda x: x + '.csv', factor_names))
                else:
                    factor_name_list = os.listdir(factors_path)
            except FileNotFoundError:
                print(
                    f"Path error, no folder name {folder_name} in {FPN.factor_ef.value}!"
                )
            else:
                factor_container = []
                # only the CSV file format is handled for now
                for factor_name in factor_name_list:
                    if factor_name[-3:] != 'csv':
                        continue
                    data_path = os.path.join(factors_path, factor_name)
                    print(f"Read factor data:{factor_name[:-4]}")
                    factor_data = pd.read_csv(
                        data_path,
                        index_col=[KN.TRADE_DATE.value, KN.STOCK_ID.value])
                    factor_container.append(factor_data[factor_name[:-4]])

                if not factor_container:
                    print(f"No factor data in folder {folder_name}!")
                else:
                    self.factors_raw = pd.concat(factor_container, axis=1)
        else:
            self.factors_raw = factors_df.set_index(
                [KN.TRADE_DATE.value, KN.STOCK_ID.value])

    # correlation test

    def correctionTest(self):
        COR = self.Multi.correlation(self.factors_raw)

        print('S')
        pass

    # factor composition
    def factor_synthetic(self,
                         method: str = 'Equal',
                         factor_D: dict = None,
                         stand_method: str = 'z_score',
                         ret_type: str = 'Pearson',
                         **kwargs):
        """
        Factor composition must account for factors missing on some cross
        sections: when a factor is missing on a date, the composite is built
        from the valid factors instead of dropping the observation
        :param method:
        :param factor_D:
        :param stand_method:
        :param ret_type:
        :param kwargs:
        :return:
        """
        # update factor signs
        if factor_D is not None:
            self.factor_D.update(factor_D)

        # full-sample processing; rolling processing to be added later
        if method != 'Equal':
            if kwargs.get('fact_ret', None) is None:
                factor_name_tuple = tuple(self.factors_raw.columns)
                fact_ret = self.factor_ret_from_sql(factor_name_tuple,
                                                    hp=kwargs['hp'],
                                                    ret_type=ret_type)
            else:
                factor_name_tuple = tuple(kwargs['fact_ret'].columns)
                fact_ret = kwargs['fact_ret']

            if len(fact_ret['factor_name'].drop_duplicates()) < len(
                    factor_name_tuple):
                print(f"因子{ret_type}收益数据缺失,无法进行计算")
                return

            kwargs['fact_ret'] = fact_ret.pivot_table(
                values='factor_return',
                index=KN.TRADE_DATE.value,
                columns='factor_name')
            # trade-date correction
            td = self.Q.query(
                self.Q.trade_date_SQL(
                    date_sta=kwargs['fact_ret'].index[0].replace('-', ''),
                    date_end=kwargs['fact_ret'].index[-1].replace('-', '')))
            kwargs['fact_ret'] = kwargs['fact_ret'].reindex(td['date'])

        factor_copy = self.factors_raw.copy(deep=True)
        # adjust factor signs
        for fact_ in factor_copy.columns:
            if self.factor_D[fact_] == '-':
                factor_copy[fact_] = -factor_copy[fact_]
            elif self.factor_D[fact_] == '+':
                pass
            else:
                print(f"{fact_}因子符号有误!")
                return

        # standardize the factors

        factor_copy = factor_copy.apply(self.fp.standardization,
                                        args=(stand_method, ))

        comp_factor = self.Multi.composite(factor=factor_copy,
                                           method=method,
                                           **kwargs)

        return comp_factor

    def factor_ret_from_sql(self,
                            factor_name: tuple,
                            sta_date: str = '2013-01-01',
                            end_date: str = '2020-04-01',
                            ret_type: str = 'Pearson',
                            hp: int = 1):

        fact_ret_sql = self.db.query_factor_ret_data(factor_name=factor_name,
                                                     sta_date=sta_date,
                                                     end_date=end_date,
                                                     ret_type=ret_type,
                                                     hp=hp)
        return fact_ret_sql
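
# --- Illustrative sketch (not from the original source) ---
# Equal-weight composition with sign alignment, the simplest case handled by
# factor_synthetic above: flip factors whose direction is '-', standardize,
# then average the available (non-missing) factors per stock. All inputs are
# made up.
import pandas as pd

factors = pd.DataFrame({'EP': [0.8, -0.2, 0.5], 'MOM': [-1.0, 0.3, None]})
direction = {'EP': '+', 'MOM': '-'}

for name, sign in direction.items():
    if sign == '-':
        factors[name] = -factors[name]  # align factor direction

z = (factors - factors.mean()) / factors.std()  # z-score per factor
composite = z.mean(axis=1, skipna=True)         # equal weight, NaN-aware
print(composite)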
Example #8
 def __init__(self):
     self.Q = SQL()
Example #9
class LabelPool(object):
    PATH = {
        "price":
        os.path.join(FPN.label_pool_path.value, 'StockPrice.csv'),
        "industry":
        os.path.join(FPN.label_pool_path.value, 'IndustryLabel.csv'),
        "composition":
        os.path.join(FPN.label_pool_path.value, 'IndexStockWeight.csv'),
        "index_weight":
        os.path.join(FPN.label_pool_path.value, 'IndexStockWeight.csv'),
        "mv":
        os.path.join(FPN.label_pool_path.value, 'MV.csv'),
    }

    def __init__(self):
        self.Q = SQL()

    def stock_return(self,
                     stock_price: pd.DataFrame,
                     return_type: str = PVN.OPEN.value,
                     label: bool = True) -> pd.Series:
        """
        As a prediction label the return is shifted back one day. It is
        assumed that at least one stock has a price on every trading day,
        otherwise returns would be computed across date gaps.
        :param stock_price: stock price table
        :param return_type: price field used to compute the return
        :param label: whether the return is used as a label
        :return:
        """
        stock_price.sort_index(inplace=True)
        if label:
            if return_type == PVN.OPEN.value:
                result = stock_price[return_type].groupby(
                    as_index=True, level=KN.STOCK_ID.value).apply(
                        lambda x: x.shift(-2) / x.shift(-1) - 1)
            else:
                result = stock_price[return_type].groupby(
                    as_index=True, level=KN.STOCK_ID.value).apply(
                        lambda x: x.shift(-1) / x - 1)
        else:
            if return_type == PVN.OPEN.value:
                result = stock_price[return_type].groupby(
                    as_index=True, level=KN.STOCK_ID.value).apply(
                        lambda x: x.shift(-1) / x - 1)
            else:
                result = stock_price[return_type].groupby(
                    as_index=True, level=KN.STOCK_ID.value).apply(
                        lambda x: x / x.shift(1) - 1)

        result = round(result, 6)
        result.name = KN.STOCK_RETURN.value + '_' + return_type
        return result
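
    # Timing note (descriptive, not in the original): with label=True and
    # open prices, the label on day t is open(t+2) / open(t+1) - 1 -- the
    # return from buying at the next open and selling at the open after
    # that, so a signal observed on day t is actually tradable.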

    def industry_weight(
            self,
            index_weight: pd.Series,
            industry_exposure: pd.Series,
            index_name: str = SN.CSI_500_INDUSTRY_WEIGHT.value) -> pd.Series:
        """
        Generate industry weights.
        Industries with zero weight are dropped.
        """
        data_ = pd.concat([index_weight[index_name], industry_exposure],
                          axis=1,
                          join='inner')
        # industry weight
        ind_weight = data_.groupby(
            [KN.TRADE_DATE.value, SN.INDUSTRY_FLAG.value]).sum()
        index_ = industry_exposure.index.get_level_values(
            KN.TRADE_DATE.value).drop_duplicates()
        ind_weight_new = ind_weight.unstack().reindex(index_).fillna(
            method='ffill').stack(dropna=False)

        # fill weight and industry
        res_ = pd.merge(ind_weight_new.reset_index(),
                        industry_exposure.reset_index(),
                        on=[KN.TRADE_DATE.value, SN.INDUSTRY_FLAG.value],
                        how='right')
        res_.set_index(['date', 'stock_id'], inplace=True)

        return res_[index_name]

    def industry_mv(self,
                    index_weight: pd.Series,
                    industry_exposure: pd.Series,
                    mv: pd.Series,
                    index_name: str = SN.CSI_300_INDUSTRY_WEIGHT.value,
                    mv_name: str = PVN.LIQ_MV.value) -> pd.Series:

        weight_mv_name = index_name.replace('weight', 'mv')

        data_ = pd.concat(
            [index_weight[index_name], mv[mv_name], industry_exposure],
            axis=1,
            join='inner')
        data_[weight_mv_name] = data_[mv_name] * data_[index_name]

        # industry weight
        ind_mv = data_[[weight_mv_name, SN.INDUSTRY_FLAG.value]].groupby(
            [KN.TRADE_DATE.value, SN.INDUSTRY_FLAG.value]).sum()
        index_ = industry_exposure.index.get_level_values(
            KN.TRADE_DATE.value).drop_duplicates()
        ind_weight_new = ind_mv.unstack().reindex(index_).fillna(
            method='ffill').stack(dropna=False)

        # fill weight and industry
        res_ = pd.merge(ind_weight_new.reset_index(),
                        industry_exposure.reset_index(),
                        on=[KN.TRADE_DATE.value, SN.INDUSTRY_FLAG.value],
                        how='right')
        res_.set_index(['date', 'stock_id'], inplace=True)
        # drop invalid market values
        res_ = res_[res_[weight_mv_name] != 0]

        return res_[weight_mv_name]

    def merge_labels(self, **kwargs) -> pd.DataFrame:
        """
        :param kwargs: stock label data
        :return:
        """

        res = pd.concat(kwargs.values(), axis=1)

        return res

    def LabelPool1(self):

        result_path = os.path.join(
            FPN.label_pool_path.value,
            sys._getframe().f_code.co_name + '_result.csv')
        if os.path.exists(result_path):
            category_label = pd.read_csv(
                result_path,
                index_col=[KN.TRADE_DATE.value, KN.STOCK_ID.value])
        else:
            # read data
            print(
                f"{dt.datetime.now().strftime('%X')}: Read the data of label")

            price_data = pd.read_csv(self.PATH["price"])
            industry_data = pd.read_csv(self.PATH["industry"])
            composition_data = pd.read_csv(self.PATH["composition"])
            industry_weight_data = pd.read_csv(self.PATH["index_weight"])
            stock_mv_data = pd.read_csv(self.PATH["mv"])

            # set MultiIndex
            price_data.set_index([KN.TRADE_DATE.value, KN.STOCK_ID.value],
                                 inplace=True)
            industry_data.set_index([KN.TRADE_DATE.value, KN.STOCK_ID.value],
                                    inplace=True)
            composition_data.set_index(
                [KN.TRADE_DATE.value, KN.STOCK_ID.value], inplace=True)
            industry_weight_data.set_index(
                [KN.TRADE_DATE.value, KN.STOCK_ID.value], inplace=True)
            stock_mv_data.set_index([KN.TRADE_DATE.value, KN.STOCK_ID.value],
                                    inplace=True)

            # adj price
            price_data[[PVN.OPEN.value, PVN.CLOSE.value]] = price_data[[
                PVN.OPEN.value, PVN.CLOSE.value
            ]].mul(price_data[PVN.ADJ_FACTOR.value], axis=0)

            # switch name
            composition_data.rename(columns={
                SN.CSI_50_INDUSTRY_WEIGHT.value:
                SN.CSI_50.value,
                SN.CSI_300_INDUSTRY_WEIGHT.value:
                SN.CSI_300.value,
                SN.CSI_500_INDUSTRY_WEIGHT.value:
                SN.CSI_500.value
            },
                                    inplace=True)

            print(
                f"{dt.datetime.now().strftime('%X')}: calculate stock daily return label"
            )
            stock_return_close = self.stock_return(price_data,
                                                   return_type=PVN.CLOSE.value)
            stock_return_open = self.stock_return(price_data,
                                                  return_type=PVN.OPEN.value)

            print(
                f"{dt.datetime.now().strftime('%X')}: Generate the {SN.CSI_500_INDUSTRY_WEIGHT.value}"
            )
            industry_weight = self.industry_weight(
                industry_weight_data,
                industry_data,
                index_name=SN.CSI_500_INDUSTRY_WEIGHT.value)
            ############################################################################################################
            # merge labels
            print(f"{dt.datetime.now().strftime('%X')}: Merge labels")
            category_label = self.merge_labels(
                data_ret_close=stock_return_close,
                data_ret_open=stock_return_open,
                composition=composition_data,
                industry_exposure=industry_data,
                index_weight=industry_weight,
                mv=stock_mv_data[PVN.LIQ_MV.value])

            # sort
            category_label.sort_index(inplace=True)

            category_label.to_csv(result_path)
        return category_label

    def BenchMark(self,
                  bm_index: str = '000300.SH',
                  sta: str = '20130101',
                  end: str = '20200401',
                  price: str = 'open'):
        """
        Return the benchmark's same-day return
        :param bm_index:
        :param sta:
        :param end:
        :param price:
        :return:
        """
        sql_ = self.Q.stock_index_SQL(bm_index=bm_index,
                                      date_sta=sta,
                                      date_end=end)
        index_ = self.Q.query(sql_)
        index_.set_index(KN.TRADE_DATE.value, inplace=True)
        result = index_[price].shift(-1) / index_[price] - 1
        return result
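
    # Note (descriptive, not in the original): shift(-1) / price - 1 places
    # the next day's benchmark return on today's row, i.e. a one-day forward
    # shift consistent with the shifted stock return labels above.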