Exemple #1
0
def calc_future_ret(date, ndays):
    """
    Compute per-stock interval returns over a trading-day window and save them as CSV.

    The window is ``Utils.get_trading_days(end=date, ndays=ndays+1)``. For every
    stock returned by ``Utils.get_stock_basics(window[0])`` and every k in
    1..ndays, the return of the interval from ``window[1]`` to ``window[k]`` is
    stored in column ``day<k>``, rounded to 6 decimals. Stocks with any missing
    value are dropped. The table is written to
    ``<SETTINGS.FACTOR_DB_PATH>/<[future_ret] ret_path>/<window[0]>.csv``, where
    the file name is produced by ``Utils.datetimelike_to_str(..., dash=False)``.

    Parameters:
    --------
    :param date: datetime-like, str
        End date of the trading-day window, e.g: YYYY-MM-DD, YYYYMMDD
    :param ndays: int
        Number of ``day<k>`` columns to compute
    :return: None
    """
    # Trading-day window of ndays+1 days ending at `date`.
    trading_days_series = Utils.get_trading_days(end=date, ndays=ndays+1)
    # Basic info of the stocks as of the first day of the window.
    stock_basics = Utils.get_stock_basics(trading_days_series[0])

    day_labels = ['day'+str(k) for k in range(1, ndays+1)]
    headers = ['code'] + day_labels

    # Collect one plain dict per stock and build the frame once at the end:
    # DataFrame.append inside the loop copied the whole frame on every stock
    # and no longer exists in pandas >= 2.0.
    rows = []
    for _, stock_info in stock_basics.iterrows():
        row = {'code': stock_info.symbol}
        for k, label in enumerate(day_labels, start=1):
            interval_ret = Utils.calc_interval_ret(stock_info.symbol, start=trading_days_series[1], end=trading_days_series[k])
            # Test for None before the value is stored in any pandas container,
            # which may silently coerce None to NaN.
            row[label] = np.nan if interval_ret is None else round(interval_ret, 6)
        rows.append(row)

    df_future_ret = pd.DataFrame(rows, columns=headers)
    # Keep only stocks that have a return for every interval.
    df_future_ret.dropna(axis=0, how='any', inplace=True)

    # Persist the result.
    cfg = ConfigParser()
    cfg.read('config.ini')
    future_ret_path = os.path.join(SETTINGS.FACTOR_DB_PATH, cfg.get('future_ret', 'ret_path'), '{}.csv'.format(Utils.datetimelike_to_str(trading_days_series[0], dash=False)))
    df_future_ret.to_csv(future_ret_path, index=False, encoding='utf-8')
Exemple #2
0
def load_ipo_info():
    """Download per-stock IPO data from NetEase Finance and store it as CSV.

    Reads ``config.ini`` for the URL template (``[ipo_info] ipo_info_url``) and
    the output directory (``[factor_db] db_path`` joined with
    ``[ipo_info] db_path``). For every stock returned by
    ``Utils.get_stock_basics(all=True)`` that has no CSV yet, the page is
    fetched, the table following the ``<h2>`` titled 'IPO资料' is parsed into
    name/value pairs, and the result is written to ``<secu_code>.csv``. All
    newly downloaded records are also appended (without header) to
    ``ipo_info.csv`` in the same directory.

    :return: None
    """
    cfg = ConfigParser()
    cfg.read('config.ini')
    ipo_info_url = cfg.get('ipo_info', 'ipo_info_url')
    db_path = Path(cfg.get('factor_db', 'db_path'),
                   cfg.get('ipo_info', 'db_path'))

    # Basic info of all stocks.
    df_stock_basics = Utils.get_stock_basics(all=True)
    # Walk the stocks and download the IPO data of each one.
    df_ipo_info = DataFrame()
    for _, stock_info in df_stock_basics.iterrows():
        # Skip stocks whose IPO data already exists. The file is written under
        # the name returned by Utils.code_to_symbol (see below), so that name
        # is checked in addition to the raw symbol the original check used.
        if db_path.joinpath('%s.csv' % stock_info.symbol).exists():
            continue
        secu_code = Utils.code_to_symbol(stock_info.symbol)
        if db_path.joinpath('%s.csv' % secu_code).exists():
            continue

        print('下载%s的IPO数据.' % stock_info.symbol)
        ipo_info_header = []
        ipo_info_data = []

        # The URL template takes the symbol without its first two characters
        # (presumably an exchange prefix -- verify against Utils).
        url = ipo_info_url % stock_info.symbol[2:]
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'html.parser')
        for tag in soup.find_all(name='h2'):
            if tag.get_text().strip() != 'IPO资料':
                continue
            ipo_table = tag.find_next(name='table')
            if ipo_table is None:
                # Heading present but no table after it: nothing to parse.
                break
            for tr in ipo_table.find_all(name='tr'):
                tds = tr.find_all(name='td')
                if len(tds) < 2:
                    # Rows without a name/value pair of <td> cells (e.g.
                    # header rows) carry no data.
                    continue
                name = tds[0].get_text().replace(' ', '').replace(
                    '\n', '').replace('\r', '')
                value = tds[1].get_text().replace(' ', '').replace(
                    ',', '').replace('\n', '').replace('\r', '')
                ipo_info_header.append(name)
                ipo_info_data.append(value)
            ipo_info = Series(ipo_info_data, index=ipo_info_header)
            ipo_info['代码'] = secu_code
            ipo_info.to_csv(db_path.joinpath('%s.csv' % secu_code))
            # NOTE(review): DataFrame.append was removed in pandas 2.0. It is
            # kept here because the column order it produces must match the
            # header-less rows already present in ipo_info.csv; migrate with
            # care.
            df_ipo_info = df_ipo_info.append(ipo_info, ignore_index=True)
            break
    if not df_ipo_info.empty:
        df_ipo_info.to_csv(db_path.joinpath('ipo_info.csv'),
                           index=False,
                           mode='a',
                           header=False)
Exemple #3
0
def load_fin_data_cwbbzy():
    """Download the financial-statement summary of every stock, one CSV each.

    Reads ``config.ini`` for the URL template (``[fin_data] cwbbzy_url``) and
    the output directory (``[factor_db] db_path`` joined with
    ``[fin_data] cwbbzy_path``). For every stock returned by
    ``Utils.get_stock_basics(all=True)`` the comma-separated response is
    fetched. Each response line becomes one column of the output (the first
    field is the column name), rows are sorted by the first column, and the
    result is saved as ``<Utils.code_to_symbol(symbol)>.csv``.

    Stocks whose request fails or whose response contains '暂无数据' are
    skipped.

    :return: None
    """
    cfg = ConfigParser()
    cfg.read('config.ini')
    cwbbzy_url = cfg.get('fin_data', 'cwbbzy_url')
    cwbbzy_path = os.path.join(cfg.get('factor_db', 'db_path'),
                               cfg.get('fin_data', 'cwbbzy_path'))

    df_stock_basics = Utils.get_stock_basics(all=True)
    # Walk the stocks and download the financial summary of each one.
    for _, stock_info in df_stock_basics.iterrows():
        # The URL template takes the last six characters of the symbol.
        url = cwbbzy_url % stock_info.symbol[-6:]
        resp = requests.get(url)
        if resp.status_code != requests.codes.ok:
            print('%s的财务报表摘要数据下载失败!' % stock_info.symbol)
            continue
        print('下载%s的财务报表摘要数据.' % stock_info.symbol)
        fin_data = resp.text
        if '暂无数据' in fin_data:
            continue

        # Cut off whatever follows the last comma. Slicing removes only that
        # trailing fragment; str.replace would also delete every other
        # occurrence of the same text inside the payload.
        tail = fin_data.split(',')[-1]
        fin_data = fin_data[:len(fin_data) - len(tail)]

        fin_datas = []
        for line in fin_data.split('\r\n'):
            # Every line ends with a comma, so the last split element is
            # dropped.
            fields = line.split(',')[:-1]
            if fields:  # ignore blank lines, which have no column name
                fin_datas.append(fields)
        if not fin_datas:
            continue

        # Truncate all columns to the shortest line so they have equal length.
        n = min(len(data) for data in fin_datas)
        dict_fin_data = {data[0]: data[1:n] for data in fin_datas}
        fin_header = [data[0] for data in fin_datas]
        df_fin_data = DataFrame(dict_fin_data, columns=fin_header)
        df_fin_data = df_fin_data.sort_values(by=fin_header[0])
        df_fin_data.to_csv(os.path.join(
            cwbbzy_path, '%s.csv' % Utils.code_to_symbol(stock_info.symbol)),
                           index=False)
def _check_dlisted_indclassify():
    """Report delisted stocks missing from the delisted industry-classification file.

    Loads ``delisted_classify_sw.csv`` from
    ``<[factor_db] db_path>/<[industry_classify] classify_data_path>`` (both
    read from ``config.ini``). Takes the stocks whose ``status`` equals 3 from
    ``Utils.get_stock_basics(all=True)`` and prints a highlighted warning
    listing every such symbol that does not appear in the file's ``id`` column.
    Nothing is printed when no symbol is missing.

    :return: None
    """
    # Load the industry classification of delisted stocks.
    cfg = ConfigParser()
    cfg.read('config.ini')
    delisted_data_path = os.path.join(
        cfg.get('factor_db', 'db_path'),
        cfg.get('industry_classify', 'classify_data_path'),
        'delisted_classify_sw.csv')
    df_delisted_indclassify = pd.read_csv(delisted_data_path, header=0)
    # Basic info of the stocks with status == 3 (delisted, per the original
    # comments).
    df_stock_basics = Utils.get_stock_basics(all=True)
    df_delisted_basics = df_stock_basics[df_stock_basics['status'] == 3]
    # Keep only the delisted stocks that the classification file lacks.
    known_ids = df_delisted_indclassify['id'].tolist()
    df_delisted_basics = df_delisted_basics[
        ~df_delisted_basics['symbol'].isin(known_ids)]
    # `not`, not `~`: DataFrame.empty is a plain bool, and ~True == -2 /
    # ~False == -1 are both truthy, which made the warning unconditional.
    if not df_delisted_basics.empty:
        print('\033[1;31;40m个股{}已退市, 需加入退市股票行业分类数据中.\033[0m'.format(
            str(df_delisted_basics['symbol'].tolist())))
Exemple #5
0
def calc_suspension_info(date):
    """
    Find the stocks that are suspended on `date` and save their basic info as CSV.

    Each stock from ``Utils.get_stock_basics(date)`` is tagged with
    ``Utils.trading_status(symbol, date)``; rows whose status equals
    ``SecuTradingStatus.Suspend`` are kept and written (without the helper
    column) to ``<SETTINGS.FACTOR_DB_PATH>/<[suspension_info] info_path>/<date>.csv``.

    Parameters:
    --------
    :param date: datetime-like, str
        Calculation date, e.g: YYYY-MM-DD, YYYYMMDD
    :return: None
    """
    # TODO: could be switched to fetch the suspension info from the tushare.pro API

    date = Utils.to_date(date)
    basics = Utils.get_stock_basics(date)

    def _status_of(row):
        # Trading status of a single stock on the calculation date.
        return Utils.trading_status(row['symbol'], date)

    basics['trading_status'] = basics.apply(_status_of, axis=1)
    is_suspended = basics['trading_status'] == SecuTradingStatus.Suspend
    suspended = basics[is_suspended]
    suspended.drop(columns='trading_status', inplace=True)

    cfg = ConfigParser()
    cfg.read('config.ini')
    out_dir = os.path.join(SETTINGS.FACTOR_DB_PATH, cfg.get('suspension_info', 'info_path'))
    file_name = '{}.csv'.format(Utils.datetimelike_to_str(date, dash=False))
    suspension_info_path = os.path.join(out_dir, file_name)
    suspended.to_csv(suspension_info_path, index=False, encoding='utf-8')
Exemple #6
0
    def calc_factor_loading(cls,
                            start_date,
                            end_date=None,
                            month_end=True,
                            save=False,
                            **kwargs):
        """
        Compute the SmartMoney factor loadings of the sample stocks and optionally persist them.

        Parameters
        --------
        :param start_date: datetime-like, str
            Start date
        :param end_date: datetime-like, str, default None
            End date; if None only the trading day obtained for start_date is processed
        :param month_end: bool, default True
            If True, dates for which Utils.is_month_end() is False are skipped
        :param save: bool, default False
            Whether to save the raw and standardized loadings to the factor database
        :param kwargs:
            'multi_proc': bool, True=compute with a process pool, False=single process, default False
        :return: dict or None
        --------
            Raw factor loadings of the last date that was actually computed:
            'date': list holding the date label (per the original documentation,
                    the trading day following the calculation date)
            'id': list of security codes
            'factorvalue': list of factor loadings
            None if no date was computed.
        """
        # 0. Build the trading-day series to iterate over.
        start_date = Utils.to_date(start_date)
        if end_date is not None:
            end_date = Utils.to_date(end_date)
            trading_days_series = Utils.get_trading_days(start=start_date,
                                                         end=end_date)
        else:
            trading_days_series = Utils.get_trading_days(end=start_date,
                                                         ndays=1)
        # The result is only replaced once a date is really computed, so a
        # skipped (non month-end) trailing date no longer wipes the last result.
        dict_factor = None
        for calc_date in trading_days_series:
            if month_end and (not Utils.is_month_end(calc_date)):
                continue
            # Sample stocks: basic info queried for the date 90 calendar days
            # before calc_date (presumably to exclude recent listings -- verify).
            s = (calc_date - datetime.timedelta(days=90)).strftime('%Y%m%d')
            stock_basics = Utils.get_stock_basics(s)
            dict_factor = {'date': None, 'id': [], 'factorvalue': []}

            if 'multi_proc' not in kwargs:
                kwargs['multi_proc'] = False
            if not kwargs['multi_proc']:
                # Single-process computation.
                for _, stock_info in stock_basics.iterrows():
                    factor_loading = cls._calc_factor_loading(
                        stock_info.symbol, calc_date)
                    # -1.0 is only a display placeholder for a missing loading.
                    print(
                        "[%s]Calculating %s's SmartMoney factor loading = %.4f."
                        % (calc_date.strftime('%Y-%m-%d'), stock_info.symbol,
                           -1.0 if factor_loading is None else factor_loading))
                    if factor_loading is not None:
                        dict_factor['id'].append(
                            Utils.code_to_symbol(stock_info.symbol))
                        dict_factor['factorvalue'].append(factor_loading)
            else:
                # Multi-process computation; workers report through the queue.
                q = Manager().Queue()
                p = Pool(4)  # at most 4 worker processes
                for _, stock_info in stock_basics.iterrows():
                    p.apply_async(cls._calc_factor_loading_proc,
                                  args=(
                                      stock_info.symbol,
                                      calc_date,
                                      q,
                                  ))
                p.close()
                p.join()
                # All workers have finished, so draining until empty is safe.
                while not q.empty():
                    smart_q = q.get(True)
                    dict_factor['id'].append(smart_q[0])
                    dict_factor['factorvalue'].append(smart_q[1])

            date_label = Utils.get_trading_days(calc_date, ndays=2)[1]
            dict_factor['date'] = [date_label] * len(dict_factor['id'])
            # Standardized loadings with outlier treatment.
            df_std_factor = Utils.normalize_data(pd.DataFrame(dict_factor),
                                                 columns='factorvalue',
                                                 treat_outlier=True,
                                                 weight='eq')
            # Persist both the raw and the standardized loadings.
            if save:
                date_str = Utils.datetimelike_to_str(calc_date, dash=False)
                cls._save_factor_loading(cls._db_file,
                                         date_str,
                                         dict_factor,
                                         'SmartMoney',
                                         factor_type='raw',
                                         columns=['date', 'id', 'factorvalue'])
                cls._save_factor_loading(cls._db_file,
                                         date_str,
                                         df_std_factor,
                                         'SmartMoney',
                                         factor_type='standardized',
                                         columns=['date', 'id', 'factorvalue'])
            # Pause for 360 seconds before the next date.
            logging.info('Suspending for 360s.')
            time.sleep(360)
        return dict_factor
Exemple #7
0
    def calc_factor_loading(cls,
                            start_date,
                            end_date=None,
                            month_end=True,
                            save=False,
                            **kwargs):
        """
        Compute the BTOP factor loadings of the sample stocks and optionally persist them.

        Parameters:
        --------
        :param start_date: datetime-like, str
            Start date, format: YYYY-MM-DD or YYYYMMDD
        :param end_date: datetime-like, str
            End date; if None only the trading day obtained for start_date is processed
        :param month_end: bool, default True
            If True, dates for which Utils.is_month_end() is False are skipped
        :param save: bool, default False
            Whether to save the loadings to the factor database
        :param kwargs:
            'multi_proc': bool, True=use a process pool, False=single process, default False
        :return: dict or None
            Loadings of the last computed date with keys 'date', 'id' and
            'factorvalue'; None if no date was computed
        """
        # Resolve the trading days to process.
        start_date = Utils.to_date(start_date)
        if end_date is None:
            trading_days_series = Utils.get_trading_days(end=start_date,
                                                         ndays=1)
        else:
            end_date = Utils.to_date(end_date)
            trading_days_series = Utils.get_trading_days(start=start_date,
                                                         end=end_date)

        use_multi_proc = kwargs.get('multi_proc', False)
        latest_loading = None
        for calc_date in trading_days_series:
            if month_end and not Utils.is_month_end(calc_date):
                continue
            logging.info('[%s] Calc BTOP factor loading.' %
                         Utils.datetimelike_to_str(calc_date))
            # Stock universe is queried as of calc_date minus the configured
            # number of listed days.
            cutoff = calc_date - datetime.timedelta(
                days=risk_ct.BTOP_CT.listed_days)
            stock_basics = Utils.get_stock_basics(cutoff, False)
            symbol_ids = []   # security ids
            btop_values = []  # BTOP factor values

            if use_multi_proc:
                # Process pool; workers push their results into the queue.
                queue = Manager().Queue()
                pool = Pool(SETTINGS.CONCURRENCY_KERNEL_NUM)
                for _, stock_info in stock_basics.iterrows():
                    pool.apply_async(cls._calc_factor_loading_proc,
                                     args=(stock_info.symbol, calc_date, queue))
                pool.close()
                pool.join()
                while not queue.empty():
                    btop_data = queue.get(True)
                    symbol_ids.append(btop_data['code'])
                    btop_values.append(btop_data['btop'])
            else:
                # Single process: stocks without a value are kept with NaN.
                for _, stock_info in stock_basics.iterrows():
                    logging.debug(
                        "[%s] Calc %s's BTOP factor loading." %
                        (Utils.datetimelike_to_str(calc_date, dash=True),
                         stock_info.symbol))
                    btop_data = cls._calc_factor_loading(
                        stock_info.symbol, calc_date)
                    if btop_data is not None:
                        symbol_ids.append(btop_data['code'])
                        btop_values.append(btop_data['btop'])
                    else:
                        symbol_ids.append(
                            Utils.code_to_symbol(stock_info.symbol))
                        btop_values.append(np.nan)

            date_label = Utils.get_trading_days(start=calc_date, ndays=2)[1]
            latest_loading = {
                'date': [date_label] * len(symbol_ids),
                'id': symbol_ids,
                'factorvalue': btop_values
            }
            if save:
                Utils.factor_loading_persistent(
                    cls._db_file,
                    Utils.datetimelike_to_str(calc_date, dash=False),
                    latest_loading, ['date', 'id', 'factorvalue'])
        return latest_loading
Exemple #8
0
    def calc_factor_loading(cls,
                            start_date,
                            end_date=None,
                            month_end=True,
                            save=False,
                            **kwargs):
        """
        Compute the APM factor loadings of the sample stocks and optionally persist them.

        For each processed date the per-stock APM statistic is regressed
        (OLS, no intercept) on the 20-day return; the regression residuals,
        rounded to 6 decimals, are the factor loadings.

        Parameters
        --------
        :param start_date: datetime-like, str
            Start date
        :param end_date: datetime-like, str, default None
            End date; if None only the trading day obtained for start_date is processed
        :param month_end: bool, default True
            If True, dates for which Utils.is_month_end() is False are skipped
        :param save: bool, default False
            Whether to save the raw and standardized loadings to the factor database
        :param kwargs:
            'multi_proc': bool, True=compute with a process pool, False=single process, default False
        :return: dict or None
        --------
            Raw factor loadings of the last date that was actually computed:
            'date': list holding the date label
            'id': list of security ids
            'factorvalue': list of factor loadings
            None if no date was computed.
        """
        # 1. Build the trading-day series to iterate over.
        start_date = Utils.to_date(start_date)
        if end_date is not None:
            end_date = Utils.to_date(end_date)
            trading_days_series = Utils.get_trading_days(start=start_date,
                                                         end=end_date)
        else:
            trading_days_series = Utils.get_trading_days(end=start_date,
                                                         ndays=1)
        # 2. Walk the trading days and compute the APM loadings of each one.
        # The result is only replaced once a date is really computed, so a
        # skipped (non month-end) trailing date no longer wipes the last result.
        dict_apm = None
        for calc_date in trading_days_series:
            if month_end and (not Utils.is_month_end(calc_date)):
                continue
            # 2.1 Collect, per stock, the APM statistic and the 20-day return.
            # Basic info is queried for the date 90 calendar days before
            # calc_date (presumably to exclude recent listings -- verify).
            s = (calc_date - datetime.timedelta(days=90)).strftime('%Y%m%d')
            stock_basics = Utils.get_stock_basics(s)
            stat_lst = []
            ret20_lst = []
            symbol_lst = []

            if 'multi_proc' not in kwargs:
                kwargs['multi_proc'] = False
            if not kwargs['multi_proc']:
                # Single-process computation.
                for _, stock_info in stock_basics.iterrows():
                    stat_i = cls._calc_factor_loading(stock_info.symbol,
                                                      calc_date)
                    ret20_i = Utils.calc_interval_ret(stock_info.symbol,
                                                      end=calc_date,
                                                      ndays=20)
                    if stat_i is not None and ret20_i is not None:
                        stat_lst.append(stat_i)
                        ret20_lst.append(ret20_i)
                        symbol_lst.append(
                            Utils.code_to_symbol(stock_info.symbol))
                        logging.info('APM of %s = %f' %
                                     (stock_info.symbol, stat_i))
            else:
                # Multi-process computation; workers report through the queue.
                q = Manager().Queue()
                p = Pool(4)  # at most 4 worker processes
                for _, stock_info in stock_basics.iterrows():
                    p.apply_async(cls._calc_factor_loading_proc,
                                  args=(
                                      stock_info.symbol,
                                      calc_date,
                                      q,
                                  ))
                p.close()
                p.join()
                while not q.empty():
                    apm_value = q.get(True)
                    symbol_lst.append(apm_value[0])
                    stat_lst.append(apm_value[1])
                    ret20_lst.append(apm_value[2])

            assert len(stat_lst) == len(ret20_lst)
            assert len(stat_lst) == len(symbol_lst)

            if not stat_lst:
                # A regression on zero observations is meaningless; skip the date.
                logging.warning('[%s] No APM sample data, skipped.' %
                                Utils.datetimelike_to_str(calc_date))
                continue

            # 2.2 Build the APM factor.
            # 2.2.1 Cross-sectional regression stat_j = beta * Ret20_j + eps_j;
            #       the residual vector holds the APM factor of each stock.
            # Both series get outlier treatment and standardization first.
            stat_arr = np.array(stat_lst).reshape((len(stat_lst), 1))
            ret20_arr = np.array(ret20_lst).reshape((len(ret20_lst), 1))
            stat_arr = Utils.clean_extreme_value(stat_arr)
            stat_arr = Utils.normalize_data(stat_arr)
            ret20_arr = Utils.clean_extreme_value(ret20_arr)
            ret20_arr = Utils.normalize_data(ret20_arr)
            # No constant term is added to the regressor.
            apm_model = sm.OLS(stat_arr, ret20_arr)
            apm_result = apm_model.fit()
            apm_lst = list(np.around(apm_result.resid, 6))  # 6-decimal precision
            assert len(apm_lst) == len(symbol_lst)
            # 2.2.2 Assemble the factor dict and persist it.
            date_label = Utils.get_trading_days(calc_date, ndays=2)[1]
            dict_apm = {
                'date': [date_label] * len(symbol_lst),
                'id': symbol_lst,
                'factorvalue': apm_lst
            }
            df_std_apm = Utils.normalize_data(pd.DataFrame(dict_apm),
                                              columns='factorvalue',
                                              treat_outlier=True,
                                              weight='eq')
            if save:
                date_str = Utils.datetimelike_to_str(calc_date, dash=False)
                cls._save_factor_loading(cls._db_file,
                                         date_str,
                                         dict_apm,
                                         'APM',
                                         factor_type='raw',
                                         columns=['date', 'id', 'factorvalue'])
                cls._save_factor_loading(cls._db_file,
                                         date_str,
                                         df_std_apm,
                                         'APM',
                                         factor_type='standardized',
                                         columns=['date', 'id', 'factorvalue'])

            # NOTE: a disabled "2.3 PureAPM" step used to be sketched here as
            # commented-out code: join the cleaned stat series with
            # cls.get_dependent_factors(calc_date), regress the stat on those
            # factors with OLS, persist the residuals as the pure APM factor,
            # then pause 360s. It was never active in this version.
        return dict_apm
Exemple #9
0
    def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
        """
        Compute the intraday (per-period) momentum factor loadings of the sample stocks and optionally persist them.

        For each processed date the per-period momentum values are computed,
        the period-momentum IC is updated, and, depending on
        alphafactor_ct.INTRADAYMOMENTUM_CT, the optimal weights are computed
        and a synthesized factor is built from the period-momentum loadings
        read back from the factor database.

        Parameters
        --------
        :param start_date: datetime-like, str
            Start date, format: YYYY-MM-DD or YYYYMMDD
        :param end_date: datetime-like, str
            End date; if None only the trading day obtained for start_date is processed
        :param month_end: bool, default True
            If True, dates for which Utils.is_month_end() is False are skipped
        :param save: bool, default False
            Whether to save the loadings to the factor database
        :param kwargs:
            'multi_proc': bool, True=compute with a process pool, False=single process, default False
        :return: dict or None
        --------
            Result of the last processed date. Without synthesis:
            0. date: date label
            1. id: security symbol
            2. m0: overnight momentum
            3. m1: first-hour momentum
            4. m2: second-hour momentum
            5. m3: third-hour momentum
            6. m4: fourth-hour momentum
            7. m_normal: conventional momentum
            With INTRADAYMOMENTUM_CT['synthesized'] set, the dict instead has
            the keys 'date', 'id' and 'factorvalue' (synthesized factor).
            None if no date was processed, or as soon as the synthesis step
            finds no stored period-momentum loadings or no factor weights
            (the remaining dates are then not processed).
        """
        # Build the trading-day series to iterate over.
        start_date = Utils.to_date(start_date)
        if end_date is not None:
            end_date = Utils.to_date(end_date)
            trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
        else:
            trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
        # Walk the trading days and compute the intraday momentum factor values.
        dict_intraday_momentum = None
        for calc_date in trading_days_series:
            if month_end and (not Utils.is_month_end(calc_date)):
                continue

            # Per-period momentum values of this date.
            dict_intraday_momentum = {'date': [], 'id': [], 'm0': [], 'm1': [],
                                      'm2': [], 'm3': [], 'm4': [], 'm_normal': []}
            # Basic info is queried for the date 90 calendar days before
            # calc_date (presumably to exclude recent listings -- verify).
            s = (calc_date - datetime.timedelta(days=90)).strftime('%Y%m%d')
            stock_basics = Utils.get_stock_basics(s)

            if 'multi_proc' not in kwargs:
                kwargs['multi_proc'] = False
            if not kwargs['multi_proc']:
                # Single-process computation.
                for _, stock_info in stock_basics.iterrows():
                    momentum_data = cls._calc_factor_loading(stock_info.symbol, calc_date)
                    if momentum_data is not None:
                        logging.info("[%s] %s's intraday momentum = (%0.4f,%0.4f,%0.4f,%0.4f,%0.4f,%0.4f)" % (calc_date.strftime('%Y-%m-%d'),stock_info.symbol, momentum_data.m0, momentum_data.m1, momentum_data.m2, momentum_data.m3, momentum_data.m4, momentum_data.m_normal))
                        dict_intraday_momentum['id'].append(Utils.code_to_symbol(stock_info.symbol))
                        dict_intraday_momentum['m0'].append(round(momentum_data.m0, 6))
                        dict_intraday_momentum['m1'].append(round(momentum_data.m1, 6))
                        dict_intraday_momentum['m2'].append(round(momentum_data.m2, 6))
                        dict_intraday_momentum['m3'].append(round(momentum_data.m3, 6))
                        dict_intraday_momentum['m4'].append(round(momentum_data.m4, 6))
                        dict_intraday_momentum['m_normal'].append(round(momentum_data.m_normal, 6))
            else:
                # Multi-process computation; workers report through the queue.
                q = Manager().Queue()
                p = Pool(4)             # at most 4 worker processes
                for _, stock_info in stock_basics.iterrows():
                    p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
                p.close()
                p.join()
                while not q.empty():
                    momentum_data = q.get(True)
                    dict_intraday_momentum['id'].append(momentum_data[0])
                    dict_intraday_momentum['m0'].append(round(momentum_data[1], 6))
                    dict_intraday_momentum['m1'].append(round(momentum_data[2], 6))
                    dict_intraday_momentum['m2'].append(round(momentum_data[3], 6))
                    dict_intraday_momentum['m3'].append(round(momentum_data[4], 6))
                    dict_intraday_momentum['m4'].append(round(momentum_data[5], 6))
                    dict_intraday_momentum['m_normal'].append(round(momentum_data[6], 6))

            date_label = Utils.get_trading_days(calc_date, ndays=2)[1]
            dict_intraday_momentum['date'] = [date_label] * len(dict_intraday_momentum['id'])

            # Persist the per-period loadings.
            if save:
                cls._save_factor_loading(cls._db_file, Utils.datetimelike_to_str(calc_date, dash=False), dict_intraday_momentum, 'periodmomentum', factor_type='raw')

            # Compute and store the Rank IC vector of the per-period momentum factors.
            cls._calc_periodmomentum_ic(calc_date, 'month')

            # Compute the optimized weights.
            if alphafactor_ct.INTRADAYMOMENTUM_CT['optimized']:
                cls._optimize_periodmomentum_weight(calc_date)

            # Build the synthesized intraday momentum factor.
            if alphafactor_ct.INTRADAYMOMENTUM_CT['synthesized']:
                logging.info('[%s] calc synthetic intraday momentum factor loading.' % Utils.datetimelike_to_str(calc_date))
                dict_intraday_momentum = {'date': [], 'id': [], 'factorvalue': []}
                # The per-period loadings are read back from the factor
                # database, so they must have been saved beforehand.
                df_factor_loading = cls._get_factor_loading(cls._db_file, Utils.datetimelike_to_str(calc_date, dash=False), factor_name='periodmomentum', factor_type='raw', drop_na=False)
                if df_factor_loading.shape[0] <= 0:
                    logging.info("[%s] It doesn't exist intraday momentum factor loading." % Utils.datetimelike_to_str(calc_date))
                    return
                df_factor_loading.fillna(0, inplace=True)
                # Load the optimal factor weights.
                factor_weight = cls._get_factor_weight(calc_date)
                if factor_weight is None:
                    # The date argument was missing here, so the literal
                    # "[%s]" used to be logged.
                    logging.info("[%s] It doesn't exist factor weight." % Utils.datetimelike_to_str(calc_date))
                    return
                # Outlier-treat and standardize the per-period factors, then
                # combine them as a weighted sum (5 periods x 5 weights).
                arr_factor_loading = np.array(df_factor_loading[['m0', 'm1', 'm2', 'm3', 'm4']])
                arr_factor_loading = Utils.normalize_data(arr_factor_loading, treat_outlier=True)
                arr_factor_weight = np.array(factor_weight.drop('date')).reshape((5, 1))
                arr_synthetic_factor = np.dot(arr_factor_loading, arr_factor_weight)
                dict_intraday_momentum['date'] = list(df_factor_loading['date'])
                dict_intraday_momentum['id'] = list(df_factor_loading['id'])
                dict_intraday_momentum['factorvalue'] = list(arr_synthetic_factor.astype(float).round(6).reshape((arr_synthetic_factor.shape[0],)))
                # Standardize the synthesized factor.
                df_std_intradaymonmentum = Utils.normalize_data(pd.DataFrame(dict_intraday_momentum), columns='factorvalue', treat_outlier=True, weight='eq')
                # Persist the synthesized factor (raw and standardized).
                if save:
                    cls._save_factor_loading(cls._db_file, Utils.datetimelike_to_str(calc_date, dash=False), dict_intraday_momentum, 'IntradayMomentum', factor_type='raw', columns=['date', 'id', 'factorvalue'])
                    cls._save_factor_loading(cls._db_file, Utils.datetimelike_to_str(calc_date, dash=False), df_std_intradaymonmentum, 'IntradayMomentum', factor_type='standardized', columns=['date', 'id', 'factorvalue'])

            # Pause for 360 seconds before the next date.
            logging.info('Suspending for 360s.')
            time.sleep(360)
        return dict_intraday_momentum