def _calc_factor_loading_proc(cls, code, calc_date, q):
    """Worker used for parallel BLEV factor-loading calculation.

    Parameters:
    --------
    :param code: str
        Security code, e.g. SH600000, 600000
    :param calc_date: datetime-like, str
        Calculation date, format: YYYY-MM-DD
    :param q: queue used for inter-process communication
    :return: puts the factor loading onto the queue
    """
    logging.debug('[{}] Calc BLEV factor of {}.'.format(
        Utils.datetimelike_to_str(calc_date), code))
    try:
        result = cls._calc_factor_loading(code, calc_date)
    except Exception as e:
        print(e)
        result = None
    if result is None:
        # Fall back to a NaN loading so the consumer still sees this security.
        result = pd.Series([Utils.code_to_symbol(code), np.nan],
                           index=['code', 'blev'])
    q.put(result)
def _calc_factor_loading(cls, code, calc_date):
    """Calculate the EPFWD factor loading of one security on a given date.

    Parameters:
    --------
    :param code: str
        Security code, e.g. SH600000, 600000
    :param calc_date: datetime-like, str
        Calculation date, format: YYYY-MM-DD
    :return: pd.Series
    --------
        EPFWD factor loading of the security
        0. code
        1. epfwd
        Returns None if the calculation fails.
    """
    code = Utils.code_to_symbol(code)
    # Read the consensus predicted-earnings data of the security
    predictedearnings_data = Utils.get_consensus_data(calc_date, code, ConsensusType.PredictedEarings)
    if predictedearnings_data is None:
        # No consensus data available: substitute the TTM net profit
        ttm_fin_data = Utils.get_ttm_fin_basic_data(code, calc_date)
        if ttm_fin_data is None:
            return None
        predictedearnings_data = pd.Series([code, ttm_fin_data['NetProfit']],
                                           index=['code', 'predicted_earnings'])
    fpredictedearnings = predictedearnings_data['predicted_earnings']
    if np.isnan(fpredictedearnings):
        return None
    # Read the security's market cap; the LNCAP factor file stores ln(cap)
    size_path = os.path.join(factor_ct.FACTOR_DB.db_path, risk_ct.LNCAP_CT.db_file)
    size_factor_loading = Utils.read_factor_loading(size_path,
                                                    Utils.datetimelike_to_str(calc_date, dash=False),
                                                    code)
    if size_factor_loading.empty:
        return None
    # epfwd = predicted earnings / market cap
    # NOTE(review): the *10000.0 presumably converts the financial-data unit
    # (10k yuan) into yuan to match the market cap -- TODO confirm
    epfwd = fpredictedearnings * 10000.0 / np.exp(size_factor_loading['factorvalue'])
    return pd.Series([code, epfwd], index=['code', 'epfwd'])
def _calc_factor_loading(cls, code, calc_date):
    """Calculate the BLEV (book leverage) factor loading of one security.

    Parameters:
    --------
    :param code: str
        Security code, e.g. SH600000, 600000
    :param calc_date: datetime-like, str
        Calculation date, format: YYYY-MM-DD
    :return: pd.Series
    --------
        BLEV factor loading of the security
        0. code
        1. blev
        Returns None if the calculation fails.
    """
    code = Utils.code_to_symbol(code)
    report_date = Utils.get_fin_report_date(calc_date)
    # Read the latest financial-statement summary data of the security
    fin_summary_data = Utils.get_fin_summary_data(code, report_date)
    if fin_summary_data is None:
        return None
    # be = book equity
    be = fin_summary_data['TotalShareholderEquity']
    if np.isnan(be):
        return None
    # Guard against division by (near-)zero book equity below
    if abs(be) < utils_con.TINY_ABS_VALUE:
        return None
    # ld = long-term debt; fall back to total liabilities when missing
    ld = fin_summary_data['TotalNonCurrentLiabilities']
    if np.isnan(ld):
        ld = fin_summary_data['TotalLiabilities']
        if np.isnan(ld):
            return None
    # pe = preferred equity; not available in this data set, treated as 0
    pe = 0
    # blev = (be + pe + ld) / be
    blev = (be + pe + ld) / be
    return pd.Series([code, blev], index=['code', 'blev'])
def _calc_factor_loading(cls, code, calc_date): """ 计算指定日期、指定个股DASTD因子载荷 Parameters: -------- :param code: str 个股代码, 如SH600000, 600000 :param calc_date: datetime-like, str 计算日期, 格式: YYYY-MM-DD :return: pd.Series -------- 个股的DASTD因子载荷 0. code 1. dastr 如果计算失败, 返回None """ # 取得个股复权行情数据 df_secu_quote = Utils.get_secu_daily_mkt(code, end=calc_date, ndays=risk_ct.DASTD_CT.trailing+1, fq=True) if df_secu_quote is None: return None df_secu_quote.reset_index(drop=True, inplace=True) # 计算个股的日对数收益率序列及收益率均值 arr_secu_close = np.array(df_secu_quote.iloc[1:]['close']) arr_secu_preclose = np.array(df_secu_quote.shift(1).iloc[1:]['close']) arr_secu_daily_ret = np.log(arr_secu_close / arr_secu_preclose) avg_daily_ret = np.mean(arr_secu_daily_ret) # 计算权重(指数移动加权平均) T = len(arr_secu_daily_ret) time_spans = sorted(range(T), reverse=True) alpha = 1 - np.exp(np.log(0.5)/risk_ct.DASTD_CT.half_life) x = [1-alpha] * T y = [alpha] * (T-1) y.insert(0, 1) weights = np.float_power(x, time_spans) * y # 计算个股DASTD因子值 dastd = np.sqrt(np.sum((arr_secu_daily_ret - avg_daily_ret) ** 2 * weights)) return pd.Series([Utils.code_to_symbol(code), dastd], index=['code', 'dastd'])
def _calc_factor_loading(cls, code, calc_date):
    """Calculate the ETOP factor loading (TTM net profit / market cap).

    Parameters:
    --------
    :param code: str
        Security code, e.g. SH600000, 600000
    :param calc_date: datetime-like, str
        Calculation date, format: YYYY-MM-DD
    :return: pd.Series
    --------
        ETOP factor loading of the security
        0. code
        1. etop
        Returns None if the calculation fails.
    """
    symbol = Utils.code_to_symbol(code)
    # TTM net profit of the security
    ttm_fin_data = Utils.get_ttm_fin_basic_data(symbol, calc_date)
    if ttm_fin_data is None:
        return None
    ttm_netprofit = ttm_fin_data['NetProfit']
    if np.isnan(ttm_netprofit):
        return None
    # Market cap is stored as ln(cap) in the LNCAP factor database
    lncap_path = os.path.join(factor_ct.FACTOR_DB.db_path,
                              risk_ct.LNCAP_CT.db_file)
    str_date = Utils.datetimelike_to_str(calc_date, dash=False)
    lncap_data = Utils.read_factor_loading(lncap_path, str_date, symbol)
    if lncap_data.empty:
        return None
    secu_cap = np.exp(lncap_data['factorvalue'])
    # etop = TTM net profit / market cap
    etop = ttm_netprofit * 10000 / secu_cap
    return pd.Series([symbol, etop], index=['code', 'etop'])
def _calc_factor_loading(cls, code, calc_date):
    """Calculate the value factors of one security: ep_ttm, bp_lr, ocf_ttm.

    Parameters:
    --------
    :param code: str
        Security code, e.g. 600000 or SH600000
    :param calc_date: datetime-like or str
        Calculation date, format YYYY-MM-DD, YYYYMMDD
    :return: pd.Series
    --------
        Value factor loadings
        0. ep_ttm: TTM net profit / total market cap
        1. bp_lr: book equity (latest report) / total market cap
        2. ocf_ttm: TTM operating cash flow / total market cap
        Returns None if the calculation fails.
    """
    code = Utils.code_to_symbol(code)
    calc_date = Utils.to_date(calc_date)
    # Read TTM financial data
    ttm_fin_data = Utils.get_ttm_fin_basic_data(code, calc_date)
    if ttm_fin_data is None:
        return None
    # Read the latest financial-report data
    report_date = Utils.get_fin_report_date(calc_date)
    fin_basic_data = Utils.get_fin_basic_data(code, report_date)
    if fin_basic_data is None:
        return None
    # Compute total market cap from the unadjusted close and capital structure
    mkt_daily = Utils.get_secu_daily_mkt(code, calc_date, fq=False, range_lookup=True)
    if mkt_daily.shape[0] == 0:
        return None
    cap_struct = Utils.get_cap_struct(code, calc_date)
    if cap_struct is None:
        return None
    # Total shares exclude B-shares and H-shares
    total_cap = cap_struct.total - cap_struct.liquid_b - cap_struct.liquid_h
    total_mkt_cap = total_cap * mkt_daily.close
    # Value factors; FIN_DATA_AMOUNT_UNIT scales financial amounts to yuan
    ep_ttm = ttm_fin_data[
        'NetProfit'] * util_ct.FIN_DATA_AMOUNT_UNIT / total_mkt_cap
    ocf_ttm = ttm_fin_data[
        'NetOperateCashFlow'] * util_ct.FIN_DATA_AMOUNT_UNIT / total_mkt_cap
    bp_lr = fin_basic_data[
        'ShareHolderEquity'] * util_ct.FIN_DATA_AMOUNT_UNIT / total_mkt_cap
    return Series([round(ep_ttm, 6), round(bp_lr, 6), round(ocf_ttm, 6)],
                  index=['ep_ttm', 'bp_lr', 'ocf_ttm'])
def _calc_factor_loading(cls, code, calc_date):
    """Calculate the size factor loadings of one security.

    Parameters:
    --------
    :param code: str
        Security code, e.g. 600000 or SH600000
    :param calc_date: datetime-like, str
        Calculation date, format YYYY-MM-DD or YYYYMMDD
    :return: pd.Series
    --------
        Size factor loadings:
        0. LnTotalMktCap: log of total market cap
        1. LnLiquidMktCap: log of liquid market cap
        Returns None if the calculation fails.
    """
    code = Utils.code_to_symbol(code)
    calc_date = Utils.to_date(calc_date)
    # Latest unadjusted quote on or before calc_date
    mkt_daily = Utils.get_secu_daily_mkt(code, calc_date, fq=False,
                                         range_lookup=True)
    if mkt_daily.shape[0] == 0:
        return None
    # Latest capital-structure data on or before calc_date
    cap_struct = Utils.get_cap_struct(code, calc_date)
    if cap_struct is None:
        return None
    # Total shares exclude B-shares and H-shares
    total_shares = cap_struct.total - cap_struct.liquid_b - cap_struct.liquid_h
    close = mkt_daily.close
    scale = Series()
    scale['LnTotalMktCap'] = math.log(total_shares * close)
    scale['LnLiquidMktCap'] = math.log(cap_struct.liquid_a * close)
    return scale
def _calc_factor_loading(cls, code, calc_date):
    """Calculate the momentum factors (short-term and long-term) of one security.

    Parameters:
    --------
    :param code: str
        Security code, e.g. SH600000 or 600000
    :param calc_date: datetime-like or str
        Calculation date, format YYYY-MM-DD, YYYYMMDD
    :return: pd.Series
    --------
        Traditional momentum factor values, short-term and long-term,
        labelled 'short_term_<days>' / 'long_term_<days>'
        Returns None if the calculation fails.
    """
    short_terms = [
        int(x) for x in factor_ct.MOMENTUM_CT.short_term_days.split('|')
    ]  # trading-day spans of the short-term momentum factors
    long_terms = [
        int(x) for x in factor_ct.MOMENTUM_CT.long_term_days.split('|')
    ]  # trading-day spans of the long-term momentum factors
    momentum_terms = short_terms + long_terms
    # Build the momentum labels
    momentum_label = []
    for days in short_terms:
        momentum_label.append('short_term_%d' % days)
    for days in long_terms:
        momentum_label.append('long_term_%d' % days)
    # Compute the interval return for each span; if a span fails, reuse the
    # previous span's value, except that failure of the very first (shortest)
    # span aborts the whole calculation
    momentum_value = []
    for days in momentum_terms:
        ret = Utils.calc_interval_ret(code, end=calc_date, ndays=days)
        if ret is None:
            if len(momentum_value) == 0:
                return None  # shortest-span momentum failed => give up
            else:
                ret = momentum_value[-1]
        momentum_value.append(round(ret, 6))
    momentum = Series(momentum_value, index=momentum_label)
    return momentum
def _check_dlisted_indclassify():
    """Check the industry-classification data of delisted stocks.

    Compares the delisted-stock industry-classification file against the
    stock-basics table and prints (in red) the symbols of delisted stocks
    that are missing from the classification data.
    """
    # Load the delisted-stock industry classification data
    cfg = ConfigParser()
    cfg.read('config.ini')
    delisted_data_path = os.path.join(
        cfg.get('factor_db', 'db_path'),
        cfg.get('industry_classify', 'classify_data_path'),
        'delisted_classify_sw.csv')
    df_delisted_indclassify = pd.read_csv(delisted_data_path, header=0)
    # Load basic info of delisted stocks (status == 3 presumably means
    # delisted -- TODO confirm against the stock-basics schema)
    df_stock_basics = Utils.get_stock_basics(all=True)
    df_delisted_basics = df_stock_basics[df_stock_basics['status'] == 3]
    # Keep only delisted stocks absent from the classification data
    df_delisted_basics = df_delisted_basics[~df_delisted_basics['symbol'].isin(
        df_delisted_indclassify['id'].tolist())]
    # BUG FIX: the original `if ~df_delisted_basics.empty:` applied bitwise
    # `~` to a Python bool (~True == -2, ~False == -1, both truthy), so the
    # warning was printed unconditionally. Use boolean `not` instead.
    if not df_delisted_basics.empty:
        print('\033[1;31;40m个股{}已退市, 需加入退市股票行业分类数据中.\033[0m'.format(
            str(df_delisted_basics['symbol'].tolist())))
def _calc_factor_loading(cls, code, calc_date): """ 计算指定日期、指定个股的BTOP因子载荷 Paramters: -------- :param code: str 个股代码, 如SH600000, 600000 :param calc_date: datetime-like, str 计算日期, 格式: YYYY-MM-DD :return: pd.Series -------- 个股的BTOP因子载荷 0. code 1. btop 如果计算失败, 返回None """ # 读取个股的财务数据 fin_report_date = Utils.get_fin_report_date(calc_date) fin_basic_data = Utils.get_fin_basic_data(code, fin_report_date) if fin_basic_data is None: return None # 读取个股的市值因子(LNCAP) df_lncap = cls._LNCAP_Cache.get( Utils.datetimelike_to_str(calc_date, dash=False)) if df_lncap is None: lncap_path = os.path.join(factor_ct.FACTOR_DB.db_path, risk_ct.LNCAP_CT.db_file) df_lncap = Utils.read_factor_loading( lncap_path, Utils.datetimelike_to_str(calc_date, dash=False)) cls._LNCAP_Cache.set( Utils.datetimelike_to_str(calc_date, dash=False), df_lncap) secu_lncap = df_lncap[df_lncap['id'] == Utils.code_to_symbol(code)] if secu_lncap.empty: return None flncap = secu_lncap.iloc[0]['factorvalue'] # 账面市值比=净资产/市值 btop = (fin_basic_data['TotalAsset'] - fin_basic_data['TotalLiability']) * 10000 / np.exp(flncap) return pd.Series([Utils.code_to_symbol(code), btop], index=['code', 'btop'])
def _calc_factor_loading_proc(cls, code, calc_date, q):
    """Worker used for parallel BTOP factor-loading calculation.

    Parameters:
    --------
    :param code: str
        Security code, e.g. SH600000, 600000
    :param calc_date: datetime-like, str
        Calculation date, format: YYYY-MM-DD
    :param q: queue used for inter-process communication
    :return: puts the factor loading onto the queue
    """
    logging.info('[{}] Calc BTOP factor of {}.'.format(Utils.datetimelike_to_str(calc_date), code))
    try:
        result = cls._calc_factor_loading(code, calc_date)
    except Exception as e:
        print(e)
        result = None
    # Only successfully computed loadings are queued.
    if result is not None:
        q.put(result)
def _calc_factor_loading_proc(cls, code, calc_date, q):
    """Worker used for parallel Growth factor-loading calculation.

    Parameters:
    --------
    :param code: str
        Security code, e.g. 600000 or SH600000
    :param calc_date: datetime-like, str
        Calculation date, format: YYYY-MM-DD or YYYYMMDD
    :param q: queue used for inter-process communication
    :return: puts the factor loading onto the queue
    """
    logging.info('[%s] Calc Growth factor of %s.' % (Utils.datetimelike_to_str(calc_date), code))
    try:
        factor_loading = cls._calc_factor_loading(code, calc_date)
    except Exception as e:
        print(e)
        factor_loading = None
    # Only successfully computed loadings are queued.
    if factor_loading is not None:
        q.put(factor_loading)
def _get_factor_weight(cls, date=None):
    """Fetch the weights of the intraday period-momentum factors.

    --------
    :param date: datetime-like or str
        Date, default None.
        If date is None, return all weight data.
    :return: pd.Series, pd.DataFrame
        Weight data for each intraday period
    --------
        0. date: date
        1. w0: weight of the 1st period momentum factor
        2. w1: weight of the 2nd period momentum factor
        3. w2: weight of the 3rd period momentum factor
        4. w3: weight of the 4th period momentum factor
        5. w4: weight of the 5th period momentum factor
        Returns None if no data can be read.
    """
    weight_file_path = os.path.join(SETTINGS.FACTOR_DB_PATH,
                                    alphafactor_ct.INTRADAYMOMENTUM_CT.optimal_weight_file)
    if not os.path.isfile(weight_file_path):
        return None
    df_optimal_weight = pd.read_csv(weight_file_path, parse_dates=[0], header=0)
    df_optimal_weight.sort_values(by='date', inplace=True)
    if date is None:
        # No date given: return the whole weight table (or None if empty)
        if df_optimal_weight.empty:
            return None
        else:
            return df_optimal_weight
    else:
        date = Utils.to_date(date)
        # Prefer the latest row on or before the given date ...
        df_weight = df_optimal_weight[df_optimal_weight.date <= date]
        if df_weight.shape[0] > 0:
            return df_weight.iloc[-1]
        else:
            # ... otherwise fall back to the earliest row after it
            df_weight = df_optimal_weight[df_optimal_weight.date >= date]
            if df_weight.shape[0] > 0:
                return df_weight.iloc[0]
            else:
                return None
def _calc_factor_loading_proc(cls, code, calc_date, q):
    """Worker used for parallel SmartQ factor-loading calculation.

    Parameters
    --------
    :param code: str
        Security code, e.g. 600000 or SH600000
    :param calc_date: datetime-like or str
        Calculation date
    :param q: queue used for inter-process communication
    :return: puts the factor loading onto queue q
    """
    logging.info('[%s] Calc SmartQ of %s.' % (calc_date.strftime('%Y-%m-%d'), code))
    try:
        smartq_value = cls._calc_factor_loading(code, calc_date)
    except Exception as e:
        print(e)
        smartq_value = None
    # Queue a (symbol, value) pair only when the calculation succeeded.
    if smartq_value is not None:
        q.put((Utils.code_to_symbol(code), smartq_value))
def _calc_factor_loading_proc(cls, code, calc_date, q):
    """Worker used for parallel RSTR factor-loading calculation.

    Parameters:
    --------
    :param code: str
        Security code, e.g. SH600000, 600000
    :param calc_date: datetime-like, str
        Calculation date, format: YYYY-MM-DD
    :param q: queue used for inter-process communication
    :return: puts the factor loading onto the queue
    """
    logging.info('[%s] Calc RSTR factor of %s.' % (Utils.datetimelike_to_str(calc_date), code))
    try:
        result = cls._calc_factor_loading(code, calc_date)
    except Exception as e:
        print(e)
        result = None
    # Only successfully computed loadings are queued.
    if result is not None:
        q.put(result)
def load_st_info():
    """Import the ST-flag start/end date info of each security.

    Parses the raw st_info.csv, pairs each ST start event with its matching
    end event (an open ST period gets the sentinel end date '20301231'),
    and writes the result to the factor database.
    """
    cfg = ConfigParser()
    cfg.read('config.ini')
    factor_db_path = cfg.get('factor_db', 'db_path')
    raw_data_path = cfg.get('st_info', 'raw_data_path')
    st_info_path = cfg.get('st_info', 'st_info_path')
    # Event-type codes that open / close an ST period
    st_start_types = cfg.get('st_info', 'st_start_types').split(',')
    st_end_types = cfg.get('st_info', 'st_end_types').split(',')
    if not os.path.isfile(os.path.join(raw_data_path, 'st_info.csv')):
        print('\033[1;31;40mst_info.csv原始文件不存在.\033[0m')
        return
    df_st_rawinfo = pd.read_csv(os.path.join(raw_data_path, 'st_info.csv'), header=0)
    # Drop rows with no ST info ('0' or NaN)
    df_st_rawinfo = df_st_rawinfo[(df_st_rawinfo['st_info'] != '0') &
                                  (~df_st_rawinfo['st_info'].isna())]
    df_st_info = pd.DataFrame(columns=['code', 'st_start', 'st_end'])
    for _, st_data in df_st_rawinfo.iterrows():
        st_start_date = None
        st_end_date = None
        code = Utils.code_to_symbol(st_data['code'])
        # st_info is a comma-separated list of "type:date" events; reverse it
        # so events are processed in chronological order -- presumably the raw
        # field is newest-first (TODO confirm against the raw file)
        st_info_list = st_data['st_info'].split(',')
        st_info_list = st_info_list[::-1]
        for st_info in st_info_list:
            if ':' in st_info:
                st_type = st_info.split(':')[0]
                st_date = st_info.split(':')[1]
                if not (st_type in st_start_types or st_type in st_end_types):
                    print('st type: {} is not counted.'.format(st_type))
                    continue
                if st_type in st_start_types and st_start_date is None:
                    # Open a new ST period (ignore nested start events)
                    st_start_date = st_date
                elif st_type in st_end_types and st_start_date is not None:
                    # Close the open ST period and record it
                    st_end_date = st_date
                    df_st_info = df_st_info.append(pd.Series([code, st_start_date, st_end_date],
                                                             index=['code', 'st_start', 'st_end']),
                                                   ignore_index=True)
                    st_start_date = None
                    st_end_date = None
        if st_start_date is not None and st_end_date is None:
            # Still ST at the end of the event list: use a far-future end date
            df_st_info = df_st_info.append(pd.Series([code, st_start_date, '20301231'],
                                                     index=['code', 'st_start', 'st_end']),
                                           ignore_index=True)
    df_st_info.to_csv(os.path.join(factor_db_path, st_info_path, 'st_info.csv'), index=False)
def _calc_periodmomentum_ic(cls, calc_date, date_interval_type='month'):
    """Compute the Rank-IC vector of the intraday period-momentum factors.

    Parameters:
    --------
    :param calc_date: datetime-like, str
        Calculation date, e.g. YYYY-MM-DD, YYYYMMDD
    :param date_interval_type: str
        Interval over which the forward return is computed,
        'month' = next-month return, 'day' = next-day return
    :return: pd.Series
    --------
        IC vector
        0. date, the calculation date
        1. IC0, IC of the overnight-period momentum factor
        2. IC1, IC of the 1st-hour momentum factor
        3. IC2, IC of the 2nd-hour momentum factor
        4. IC3, IC of the 3rd-hour momentum factor
        5. IC4, IC of the 4th-hour momentum factor
        Returns None if no factor-loading data is available.
    :raise ValueError: if date_interval_type is neither 'month' nor 'day'
    """
    # Load the raw intraday period-momentum factor loadings
    df_period_mom = cls._get_factor_loading(cls._db_file,
                                            Utils.datetimelike_to_str(calc_date, dash=False),
                                            factor_name='periodmomentum',
                                            factor_type='raw',
                                            drop_na=True)
    if df_period_mom.empty:
        return None
    if date_interval_type == 'month':
        # Forward return over the next calendar month
        ret_start, ret_end = Utils.next_month(calc_date)
    elif date_interval_type == 'day':
        # Forward return over the next trading day
        ret_start = ret_end = Utils.get_trading_days(start=calc_date, ndays=2)[1]
    else:
        # BUG FIX: previously an unknown interval type left ret_start/ret_end
        # undefined and crashed below with a confusing NameError; fail fast
        # with a clear message instead.
        raise ValueError("date_interval_type must be 'month' or 'day', got %r"
                         % date_interval_type)
    # Attach each security's forward return
    df_period_mom['ret'] = np.nan
    for idx, factorloading_data in df_period_mom.iterrows():
        fret = Utils.calc_interval_ret(factorloading_data['id'],
                                       start=ret_start, end=ret_end)
        if fret is not None:
            df_period_mom.loc[idx, 'ret'] = fret
    df_period_mom.dropna(inplace=True)
    # Rank IC = Spearman correlation between each period factor and the return
    df_period_mom.drop(columns=['date', 'id', 'm_normal'], inplace=True)
    df_spearman_corr = df_period_mom.corr(method='spearman')
    rank_IC = df_spearman_corr.loc['ret', ['m0', 'm1', 'm2', 'm3', 'm4']]
    rank_IC['date'] = calc_date
    # Persist the Rank-IC series (append mode)
    ic_filepath = os.path.join(SETTINGS.FACTOR_DB_PATH,
                               alphafactor_ct.INTRADAYMOMENTUM_CT['factor_ic_file'])
    Utils.save_timeseries_data(rank_IC, ic_filepath, save_type='a',
                               columns=['date', 'm0', 'm1', 'm2', 'm3', 'm4'])
    return rank_IC
def _get_factor_loading(cls, db_file, str_key, factor_name=None, factor_type=None, **kwargs):
    """Read factor-loading data.

    Parameters:
    --------
    :param db_file: str
        Path of the factor-loading data file (absolute path)
    :param str_key: str
        Key, usually a date, e.g. YYYY-MM-DD, YYYYMMDD
    :param factor_name: str, default None
        Factor name
    :param factor_type: str, default None
        Factor type, e.g. 'raw', 'standardized', 'orthogonalized'
    :param kwargs:
        kwargs['code']: str, default None; security code, e.g. SH600000, 600000
        kwargs['na_value']: object, default None; if not None, missing values
            are replaced by na_value (note: the key is 'na_value', not
            'nan_value' as an earlier docstring claimed)
        kwargs['drop_na']: bool, default False; whether rows containing NaN
            are dropped
    :return: pd.DataFrame or pd.Series, factor loadings
    --------
        pd.DataFrame (code==None) or pd.Series (code!=None)
        0. date
        1. id
        2. factorvalue
    """
    # When a factor type is given, the data lives in a per-type subdirectory
    if factor_type is not None:
        db_file = os.path.join(db_file, factor_type, factor_name)
    # Fill in defaults for the optional kwargs
    if 'code' not in kwargs:
        kwargs['code'] = None
    if 'na_value' not in kwargs:
        kwargs['na_value'] = None
    if 'drop_na' not in kwargs:
        kwargs['drop_na'] = False
    return Utils.read_factor_loading(db_file, str_key, kwargs['code'],
                                     kwargs['na_value'], kwargs['drop_na'])
def load_cap_struct(date):
    """Import the latest capital-structure data of all securities.

    Reads the raw per-date CSV, prefixes each code with its market, then
    writes one combined file plus one file per security into the database
    directory.

    :param date: datetime-like, str
        Date of the raw capital-structure file to import
    """
    date = Utils.datetimelike_to_str(date, dash=False)
    cfg = ConfigParser()
    cfg.read('config.ini')
    raw_data_path = cfg.get('cap_struct', 'raw_data_path')
    db_path = cfg.get('cap_struct', 'db_path')
    if not os.path.isfile(os.path.join(raw_data_path, '{}.csv'.format(date))):
        print('\033[1;31;40mCap struct file of %s does not exits.\033[0m' % date)
        return
    df_cap_struct = pd.read_csv(os.path.join(raw_data_path, '{}.csv'.format(date)),
                                names=[
                                    'mkt', 'code', 'date', 'reason', 'total',
                                    'liquid_a', 'liquid_b', 'liquid_h'
                                ],
                                header=0, encoding='GB18030', dtype={'code': str})
    # Prepend the market prefix to the bare code, then drop the mkt column
    df_cap_struct.code = df_cap_struct.apply(lambda x: x.mkt + x.code, axis=1)
    del df_cap_struct['mkt']
    # First save all capital-structure data as one combined file
    df_cap_struct.to_csv(
        os.path.join(db_path, 'cap_struct.csv'), index=False,
        header=['代码', '变更日期', '变更原因', '总股本', '流通A股', '流通B股', '流通H股'])
    # Then save one capital-structure file per security
    codes = df_cap_struct.code.unique()
    for code in codes:
        # print('processing capital structure data of %s.' % code)
        df_single_cap_struct = df_cap_struct[df_cap_struct.code == code]
        df_single_cap_struct.to_csv(
            os.path.join(db_path, code + '.csv'), index=False,
            header=['代码', '变更日期', '变更原因', '总股本', '流通A股', '流通B股', '流通H股'])
def smartq_backtest(start, end):
    """Historical backtest of the SmartQ factor.

    Parameters:
    --------
    :param start: datetime-like, str
        Backtest start date, format YYYY-MM-DD; should be a month start
    :param end: datetime-like, str
        Backtest end date, format YYYY-MM-DD
    :return:
    """
    # Trading-day series between start and end
    trading_days = Utils.get_trading_days(start, end)
    # Load the latest portfolio backtest data saved before the start date
    prev_trading_day = Utils.get_prev_n_day(trading_days.iloc[0], 1)
    backtest_path = os.path.join(SETTINGS.FACTOR_DB_PATH,
                                 alphafactor_ct.SMARTMONEY_CT.backtest_path)
    factor_data, port_nav = Utils.get_backtest_data(backtest_path,
                                                    trading_days.iloc[0])
    # factor_data = None
    # factor_data holds the SmartQ info of the stocks picked at the last
    # rebalance: pd.DataFrame<date, factorvalue, id, buyprice>
    if port_nav is None:
        port_nav = DataFrame({
            'date': [prev_trading_day.strftime('%Y-%m-%d')],
            'nav': [1.0]
        })
    # Iterate over trading days: at a month start, reload the SmartQ loadings
    # and rebalance; otherwise just revalue the portfolio
    t = 0  # number of rebalances so far (used to pace the data source)
    for trading_day in trading_days:
        # Start from the NAV recorded at the last rebalance (or at the day
        # before the backtest if no position is held yet)
        if factor_data is None:
            nav = port_nav[port_nav.date == prev_trading_day.strftime(
                '%Y-%m-%d')].iloc[0].nav
        else:
            nav = port_nav[port_nav.date == factor_data.iloc[0].date].iloc[0].nav
        interval_ret = 0.0
        # Month start: rebalance
        if Utils.is_month_start(trading_day):
            logging.info('[%s] 月初调仓.'
                         % Utils.datetimelike_to_str(trading_day, True))
            # Before rebalancing, value the old holdings as if sold at the
            # day's VWAP (fall back to close when the stock did not trade)
            if factor_data is not None:
                for ind, factor_info in factor_data.iterrows():
                    daily_mkt = Utils.get_secu_daily_mkt(factor_info.id,
                                                         trading_day,
                                                         fq=True,
                                                         range_lookup=True)
                    if daily_mkt.date == trading_day.strftime('%Y-%m-%d'):
                        vwap_price = daily_mkt.amount / daily_mkt.vol * daily_mkt.factor
                    else:
                        vwap_price = daily_mkt.close
                    interval_ret += vwap_price / factor_info.buyprice - 1.0
                interval_ret /= float(len(factor_data))
                nav *= (1.0 + interval_ret)
            # Reload factor_data as of the previous trading day
            factor_data = Utils.read_factor_loading(
                SmartMoney.get_db_file(),
                Utils.datetimelike_to_str(prev_trading_day, False))
            # For each stock compute the 20-day return, and drop stocks that
            # are suspended or limit-up on the rebalance day
            ind_to_be_deleted = []
            factor_data['ret20'] = np.zeros(len(factor_data))
            for ind, factor_info in factor_data.iterrows():
                trading_status = Utils.trading_status(factor_info.id, trading_day)
                if trading_status == SecuTradingStatus.Suspend or trading_status == SecuTradingStatus.LimitUp:
                    ind_to_be_deleted.append(ind)
                fret20 = Utils.calc_interval_ret(factor_info.id,
                                                 end=prev_trading_day,
                                                 ndays=20)
                if fret20 is None:
                    if ind not in ind_to_be_deleted:
                        ind_to_be_deleted.append(ind)
                else:
                    factor_data.loc[ind, 'ret20'] = fret20
            factor_data = factor_data.drop(ind_to_be_deleted, axis=0)
            # Sort by 20-day return descending and drop the top 20% gainers
            k = int(factor_data.shape[0] * 0.2)
            factor_data = factor_data.sort_values(by='ret20',
                                                  ascending=False).iloc[k:]
            del factor_data['ret20']  # drop the ret20 column
            # Sort ascending by factor value and keep the first 10%
            factor_data = factor_data.sort_values(by='factorvalue',
                                                  ascending=True)
            k = int(factor_data.shape[0] * 0.1)
            factor_data = factor_data.iloc[:k]
            # Record the buy price (VWAP) and value the new portfolio at close
            factor_data['buyprice'] = 0.0
            interval_ret = 0.0
            for ind, factor_info in factor_data.iterrows():
                daily_mkt = Utils.get_secu_daily_mkt(factor_info.id,
                                                     trading_day,
                                                     fq=True,
                                                     range_lookup=False)
                assert len(daily_mkt) > 0
                factor_data.loc[
                    ind, 'buyprice'] = daily_mkt.amount / daily_mkt.vol * daily_mkt.factor
                interval_ret += daily_mkt.close / factor_data.loc[
                    ind, 'buyprice'] - 1.0
            interval_ret /= float(factor_data.shape[0])
            nav *= (1.0 + interval_ret)
            # Persist the rebalanced portfolio
            port_data_path = os.path.join(
                SETTINGS.FACTOR_DB_PATH,
                alphafactor_ct.SMARTMONEY_CT.backtest_path,
                'port_data_%s.csv' % Utils.datetimelike_to_str(trading_day, False))
            factor_data.to_csv(port_data_path, index=False)
            t += 1
            # Pause every 6 rebalances (presumably data-source rate limiting)
            if t % 6 == 0:
                logging.info('Suspended for 300s.')
                time.sleep(300)
        else:
            # Non-rebalance day: revalue the portfolio at the close
            logging.info('[%s] 月中估值.'
                         % Utils.datetimelike_to_str(trading_day, True))
            if factor_data is not None:
                for ind, factor_info in factor_data.iterrows():
                    daily_mkt = Utils.get_secu_daily_mkt(factor_info.id,
                                                         trading_day,
                                                         fq=True,
                                                         range_lookup=True)
                    interval_ret += daily_mkt.close / factor_info.buyprice - 1.0
                interval_ret /= float(factor_data.shape[0])
                nav *= (1.0 + interval_ret)
        # Append today's NAV
        port_nav = port_nav.append(Series({
            'date': Utils.datetimelike_to_str(trading_day, True),
            'nav': nav
        }), ignore_index=True)
        # Advance prev_trading_day
        prev_trading_day = trading_day
    # Persist the NAV series
    port_nav_path = os.path.join(SETTINGS.FACTOR_DB_PATH,
                                 alphafactor_ct.SMARTMONEY_CT.backtest_path,
                                 'port_nav.csv')
    port_nav.to_csv(port_nav_path, index=False)
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """Calculate the SmartMoney factor loadings of the sample stocks and optionally persist them.

    Parameters
    --------
    :param start_date: datetime-like, str
        Start date
    :param end_date: datetime-like, str, default None
        End date; if None, only the loadings for start_date are computed
    :param month_end: bool, default True
        Only compute loadings at month ends
    :param save: whether to persist to the factor database, default False
    :param kwargs:
        'multi_proc': bool, True = parallel multi-process computation,
        False = single process, default False
    :return: factor loadings, dict
    --------
        dict with keys:
        0. date, the next trading day after the calculation date
        1. id, security code
        2. factorvalue, factor loading
        If end_date is None, returns the loadings for start_date;
        otherwise the loadings of the last processed day;
        None if nothing was computed.
    """
    # 0. Build the trading-day series
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    # Fetch sample-stock info
    # all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate over trading days and compute the SmartQ loadings
    dict_factor = None
    for calc_date in trading_days_series:
        dict_factor = {'id': [], 'factorvalue': []}
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # 1. Trading-day list used for reading minute quotes (past 30 days, descending)
        # trading_days = _get_trading_days(calc_date, 30)
        # trading_days = Utils.get_trading_days(end=calc_date, ndays=30, ascending=False)
        # 2. Sample stocks: listed for at least 90 calendar days before calc_date
        # stock_basics = ts.get_stock_basics()
        s = (calc_date - datetime.timedelta(days=90)).strftime('%Y%m%d')
        stock_basics = Utils.get_stock_basics(s)
        # 3. Compute the SmartQ loading of each sample stock
        # (re-initialized here with the 'date' key; shadows the dict above)
        dict_factor = {'date': None, 'id': [], 'factorvalue': []}
        if 'multi_proc' not in kwargs:
            kwargs['multi_proc'] = False
        if not kwargs['multi_proc']:
            # Single-process computation
            for _, stock_info in stock_basics.iterrows():
                # code = '%s%s' % ('SH' if code[:2] == '60' else 'SZ', code)
                factor_loading = cls._calc_factor_loading(
                    stock_info.symbol, calc_date)
                print(
                    "[%s]Calculating %s's SmartMoney factor loading = %.4f."
                    % (calc_date.strftime('%Y-%m-%d'), stock_info.symbol,
                       -1.0 if factor_loading is None else factor_loading))
                if factor_loading is not None:
                    # df_factor.ix[code, 'factorvalue'] = factor_loading
                    dict_factor['id'].append(
                        Utils.code_to_symbol(stock_info.symbol))
                    dict_factor['factorvalue'].append(factor_loading)
        else:
            # Parallel multi-process computation of the SmartQ loadings
            q = Manager().Queue()  # queue collecting each worker's result
            p = Pool(4)  # process pool, at most 4 concurrent workers
            for _, stock_info in stock_basics.iterrows():
                p.apply_async(cls._calc_factor_loading_proc,
                              args=(stock_info.symbol, calc_date, q,))
            p.close()
            p.join()
            while not q.empty():
                smart_q = q.get(True)
                dict_factor['id'].append(smart_q[0])
                dict_factor['factorvalue'].append(smart_q[1])
        # The factor is dated the next trading day after calc_date
        date_label = Utils.get_trading_days(calc_date, ndays=2)[1]
        dict_factor['date'] = [date_label] * len(dict_factor['id'])
        # 4. Winsorized + standardized loadings
        df_std_factor = Utils.normalize_data(pd.DataFrame(dict_factor),
                                             columns='factorvalue',
                                             treat_outlier=True,
                                             weight='eq')
        # 5. Persist raw and standardized loadings to the factor database
        if save:
            # Utils.factor_loading_persistent(cls._db_file, calc_date.strftime('%Y%m%d'), dict_factor)
            cls._save_factor_loading(cls._db_file,
                                     Utils.datetimelike_to_str(calc_date, dash=False),
                                     dict_factor, 'SmartMoney',
                                     factor_type='raw',
                                     columns=['date', 'id', 'factorvalue'])
            cls._save_factor_loading(cls._db_file,
                                     Utils.datetimelike_to_str(calc_date, dash=False),
                                     df_std_factor, 'SmartMoney',
                                     factor_type='standardized',
                                     columns=['date', 'id', 'factorvalue'])
        # Sleep 360s between days (presumably data-source rate limiting)
        logging.info('Suspending for 360s.')
        time.sleep(360)
    return dict_factor
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """Calculate the DASTD factor loadings of the sample stocks and optionally persist them.

    Parameters:
    --------
    :param start_date: datetime-like, str
        Start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like, str
        End date; if None, only the loadings for start_date are computed,
        format: YYYY-MM-DD or YYYYMMDD
    :param month_end: bool, default True
        If True, only compute loadings at month ends
    :param save: bool, default False
        Whether to persist to the factor database
    :param kwargs:
        'multi_proc': bool, True = multi-process, False = single process,
        default False
    :return: dict
        Factor loadings
    """
    # Build the trading-day series and the stock-basics table
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate over trading days and compute the DASTD loadings
    dict_dastd = None
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        logging.info('[%s] Calc DASTD factor loading.' % Utils.datetimelike_to_str(calc_date))
        # Sample stocks: listed at least DASTD_CT.listed_days days before calc_date
        s = (calc_date - datetime.timedelta(days=risk_ct.DASTD_CT.listed_days)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        ids = []     # list of security codes
        dastds = []  # list of DASTD factor values
        if 'multi_proc' not in kwargs:
            kwargs['multi_proc'] = False
        if not kwargs['multi_proc']:
            # Single-process computation of DASTD values
            for _, stock_info in stock_basics.iterrows():
                logging.info("[%s] Calc %s's DASTD factor loading."
                             % (calc_date.strftime('%Y-%m-%d'), stock_info.symbol))
                dastd_data = cls._calc_factor_loading(stock_info.symbol, calc_date)
                if dastd_data is None:
                    # Failed securities are kept with a NaN value
                    ids.append(Utils.code_to_symbol(stock_info.symbol))
                    dastds.append(np.nan)
                else:
                    ids.append(dastd_data['code'])
                    dastds.append(dastd_data['dastd'])
        else:
            # Parallel multi-process computation of DASTD values
            q = Manager().Queue()  # queue collecting each worker's result
            p = Pool(4)            # process pool, at most 4 concurrent workers
            for _, stock_info in stock_basics.iterrows():
                p.apply_async(cls._calc_factor_loading_proc,
                              args=(stock_info.symbol, calc_date, q,))
            p.close()
            p.join()
            while not q.empty():
                dastd_data = q.get(True)
                ids.append(dastd_data['code'])
                dastds.append(dastd_data['dastd'])
        # The factor is dated the next trading day after calc_date
        date_label = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        dict_dastd = {'date': [date_label]*len(ids), 'id': ids, 'factorvalue': dastds}
        if save:
            Utils.factor_loading_persistent(cls._db_file,
                                            Utils.datetimelike_to_str(calc_date, dash=False),
                                            dict_dastd,
                                            ['date', 'id', 'factorvalue'])
        # Pause between days (sleep currently disabled)
        logging.info('Suspending for 180s.')
        # time.sleep(180)
    return dict_dastd
def calc_factor_loading_(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """Calculate the ResVolatility factor loadings of the sample stocks and optionally persist them.

    Computes each component factor first, then combines their normalized
    loadings into the composite ResVolatility loading using the configured
    weights.

    Parameters:
    --------
    :param start_date: datetime-like, str
        Start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like, str
        End date; if None, only the loadings for start_date are computed,
        format: YYYY-MM-DD or YYYYMMDD
    :param month_end: bool, default True
        If True, only compute loadings at month ends
    :param save: bool, default False
        Whether to persist to the factor database
    :param kwargs:
        'multi_proc': bool, True = multi-process, False = single process,
        default False
    :return: dict
        Factor-loading data
    """
    # Build the trading-day series
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    # Iterate over trading days, computing each ResVolatility component factor
    if 'multi_proc' not in kwargs:
        kwargs['multi_proc'] = False
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # Compute the loadings of each component factor
        for com_factor in risk_ct.RESVOLATILITY_CT.component:
            # NOTE(review): eval() instantiates the component class by its
            # configured name -- the config is trusted input here
            factor = eval(com_factor + '()')
            factor.calc_factor_loading(start_date=calc_date, end_date=None,
                                       month_end=month_end, save=save,
                                       multi_proc=kwargs['multi_proc'])
        # Combine the component loadings into the ResVolatility loading
        resvol_factor = pd.DataFrame()
        for com_factor in risk_ct.RESVOLATILITY_CT.component:
            factor_path = os.path.join(factor_ct.FACTOR_DB.db_path,
                                       eval('risk_ct.' + com_factor + '_CT')['db_file'])
            factor_loading = Utils.read_factor_loading(factor_path,
                                                       Utils.datetimelike_to_str(calc_date, dash=False))
            factor_loading.drop(columns='date', inplace=True)
            # Winsorize and normalize the component loading
            factor_loading[com_factor] = Utils.normalize_data(
                Utils.clean_extreme_value(
                    np.array(factor_loading['factorvalue']).reshape((len(factor_loading), 1))))
            factor_loading.drop(columns='factorvalue', inplace=True)
            # Inner-join the components on security id
            if resvol_factor.empty:
                resvol_factor = factor_loading
            else:
                resvol_factor = pd.merge(left=resvol_factor, right=factor_loading,
                                         how='inner', on='id')
        resvol_factor.set_index('id', inplace=True)
        # Weighted sum of the component loadings
        weight = pd.Series(risk_ct.RESVOLATILITY_CT.weight)
        resvol_factor = (resvol_factor * weight).sum(axis=1)
        resvol_factor.name = 'factorvalue'
        resvol_factor.index.name = 'id'
        resvol_factor = pd.DataFrame(resvol_factor)
        resvol_factor.reset_index(inplace=True)
        # The factor is dated the next trading day after calc_date
        resvol_factor['date'] = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        # Persist the ResVolatility loadings
        if save:
            Utils.factor_loading_persistent(cls._db_file,
                                            Utils.datetimelike_to_str(calc_date, dash=False),
                                            resvol_factor.to_dict('list'),
                                            ['date', 'id', 'factorvalue'])
def _calc_factor_loading(cls, code, calc_date):
    """Compute the CMRA (cumulative range) factor loading for one stock.

    CMRA = log(max(z)) - log(min(z)), where z collects the end-of-month
    price relatives close(t)/close(0) over the trailing window defined by
    risk_ct.CMRA_CT (trailing months x days_scale trading days per month).

    Parameters:
    --------
    :param code: str
        Stock code, e.g. SH600000, 600000
    :param calc_date: datetime-like, str
        Calculation date, format: YYYY-MM-DD
    :return: pd.Series
    --------
        The stock's CMRA factor loading
        0. code
        1. cmra
        Returns None if the loading cannot be computed.
    """
    # Trading-day calendar covering the trailing window (plus the base day).
    trading_days = Utils.get_trading_days(end=calc_date, ndays=risk_ct.CMRA_CT.trailing * risk_ct.CMRA_CT.days_scale + 1)
    trading_days = [day.strftime('%Y-%m-%d') for day in trading_days]
    # Adjusted (fq) daily quotes for the stock up to calc_date.
    df_secu_quote = Utils.get_secu_daily_mkt(code, end=calc_date, fq=True)
    # Fix: the previous version dereferenced the result without a None check
    # and crashed with AttributeError when no market data exists for the code.
    if df_secu_quote is None:
        return None
    # Keep only quotes that fall on calendar trading days in the window.
    df_secu_quote = df_secu_quote[df_secu_quote['date'].isin(trading_days)]
    df_secu_quote.reset_index(drop=True, inplace=True)
    # Require at least half of the window (e.g. 126 trading days) of history.
    if len(df_secu_quote) < int(risk_ct.CMRA_CT.trailing * risk_ct.CMRA_CT.days_scale / 2):
        return None
    # Collect the price relative at the end of each trailing month.
    z = []
    prev_trading_day = df_secu_quote.iloc[0]['date']
    for t in range(1, risk_ct.CMRA_CT.trailing + 1):
        k = t * risk_ct.CMRA_CT.days_scale
        trading_day = trading_days[k]
        # Skip month-ends that predate the stock's first available quote.
        if trading_day < df_secu_quote.iloc[0]['date']:
            continue
        # Use the last quote on or before the calendar month-end (the stock
        # may have been suspended exactly on that day).
        secu_trading_day = df_secu_quote[df_secu_quote['date'] <= trading_day].iloc[-1]['date']
        # Avoid counting the same quote twice when a long suspension spans
        # more than one month-end.
        if secu_trading_day <= prev_trading_day:
            continue
        ret = df_secu_quote[df_secu_quote['date'] == secu_trading_day].iloc[0]['close'] / df_secu_quote.iloc[0]['close']
        z.append(ret)
        prev_trading_day = secu_trading_day
    # Fix: if every month-end was skipped, z is empty and max()/min() would
    # raise ValueError; treat that as "cannot compute".
    if not z:
        return None
    cmra = math.log(max(z)) - math.log(min(z))
    return pd.Series([Utils.code_to_symbol(code), cmra], index=['code', 'cmra'])
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """Compute the LIQUIDITY factor loading for sample stocks and persist it.

    For each selected trading day, computes per-stock STOM/STOQ/STOA and the
    raw liquidity value (single- or multi-process), orthogonalizes raw
    liquidity against the Size factor via OLS, and optionally saves all five
    series to the factor database.

    Parameters:
    --------
    :param start_date: datetime-like, str
        Start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like, str
        End date; if None, only start_date is computed. Format: YYYY-MM-DD or YYYYMMDD
    :param month_end: bool, default True
        If True, loadings are only computed at month-end trading days.
    :param save: bool, default False
        Whether to persist the loadings to the factor database.
    :param kwargs:
        'multi_proc': bool, True=use multiprocessing, False=single process (default False)
    :return: dict
        Raw liquidity loading data for the last processed date.
    """
    # Build the trading-day series and load the stock basics table.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate over trading days and compute the LIQUIDITY loading.
    dict_raw_liquidity = None
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        dict_stom = None
        dict_stoq = None
        dict_stoa = None
        dict_raw_liquidity = None
        logging.info('[%s] Calc LIQUIDITY factor loading.' % Utils.datetimelike_to_str(calc_date))
        # Restrict the universe to stocks listed long enough (per LIQUID_CT).
        s = (calc_date - datetime.timedelta(days=risk_ct.LIQUID_CT.listed_days)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        ids = []
        stoms = []
        stoqs = []
        stoas = []
        raw_liquidities = []
        if 'multi_proc' not in kwargs:
            kwargs['multi_proc'] = False
        if not kwargs['multi_proc']:
            # Single-process path: compute each stock's liquidity components in turn.
            for _, stock_info in stock_basics.iterrows():
                logging.info("[%s] Calc %s's LIQUIDITY factor loading." % (Utils.datetimelike_to_str(calc_date, dash=True), stock_info.symbol))
                liquidity_data = cls._calc_factor_loading(stock_info.symbol, calc_date)
                if liquidity_data is not None:
                    ids.append(liquidity_data['code'])
                    stoms.append(liquidity_data['stom'])
                    stoqs.append(liquidity_data['stoq'])
                    stoas.append(liquidity_data['stoa'])
                    raw_liquidities.append(liquidity_data['liquidity'])
        else:
            # Multi-process path: fan out per-stock work to a pool of 4 workers;
            # results come back through a manager queue.
            q = Manager().Queue()
            p = Pool(4)
            for _, stock_info in stock_basics.iterrows():
                p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
            p.close()
            p.join()
            while not q.empty():
                liquidity_data = q.get(True)
                ids.append(liquidity_data['code'])
                stoms.append(liquidity_data['stom'])
                stoqs.append(liquidity_data['stoq'])
                stoas.append(liquidity_data['stoa'])
                raw_liquidities.append(liquidity_data['liquidity'])
        # Label with the NEXT trading day (the date the loading takes effect).
        date_label = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        dict_stom = dict({'date': [date_label] * len(ids), 'id': ids, 'factorvalue': stoms})
        dict_stoq = dict({'date': [date_label] * len(ids), 'id': ids, 'factorvalue': stoqs})
        dict_stoa = dict({'date': [date_label] * len(ids), 'id': ids, 'factorvalue': stoas})
        dict_raw_liquidity = dict({'date': [date_label] * len(ids), 'id': ids, 'factorvalue': raw_liquidities})
        # Read the Size factor and orthogonalize liquidity against it:
        # regress normalized liquidity on normalized size and keep the residual.
        size_factor_path = os.path.join(factor_ct.FACTOR_DB.db_path, risk_ct.SIZE_CT.db_file)
        df_size = Utils.read_factor_loading(size_factor_path, Utils.datetimelike_to_str(calc_date, dash=False))
        df_size.drop(columns='date', inplace=True)
        df_size.rename(columns={'factorvalue': 'size'}, inplace=True)
        df_liquidity = pd.DataFrame(dict({'id': ids, 'liquidity': raw_liquidities}))
        df_liquidity = pd.merge(left=df_liquidity, right=df_size, how='inner', on='id')
        arr_liquidity = Utils.normalize_data(Utils.clean_extreme_value(np.array(df_liquidity['liquidity']).reshape((len(df_liquidity), 1))))
        arr_size = Utils.normalize_data(Utils.clean_extreme_value(np.array(df_liquidity['size']).reshape((len(df_liquidity), 1))))
        # NOTE(review): sm.OLS without add_constant fits a no-intercept model;
        # both series are standardized, so this appears intentional — confirm.
        model = sm.OLS(arr_liquidity, arr_size)
        results = model.fit()
        df_liquidity['liquidity'] = results.resid
        df_liquidity.drop(columns='size', inplace=True)
        df_liquidity.rename(columns={'liquidity': 'factorvalue'}, inplace=True)
        df_liquidity['date'] = date_label
        # Persist: components (stom/stoq/stoa), raw liquidity, and the
        # size-orthogonalized liquidity (under the bare date key).
        if save:
            str_date = Utils.datetimelike_to_str(calc_date, dash=False)
            factor_header = ['date', 'id', 'factorvalue']
            Utils.factor_loading_persistent(cls._db_file, 'stom_{}'.format(str_date), dict_stom, factor_header)
            Utils.factor_loading_persistent(cls._db_file, 'stoq_{}'.format(str_date), dict_stoq, factor_header)
            Utils.factor_loading_persistent(cls._db_file, 'stoa_{}'.format(str_date), dict_stoa, factor_header)
            Utils.factor_loading_persistent(cls._db_file, 'rawliquidity_{}'.format(str_date), dict_raw_liquidity, factor_header)
            Utils.factor_loading_persistent(cls._db_file, str_date, df_liquidity.to_dict('list'), factor_header)
        # Throttle between dates (presumably to respect data-source rate limits).
        logging.info('Suspending for 180s.')
        time.sleep(180)
    return dict_raw_liquidity
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """Compute the size (market-cap) factor loading for sample stocks.

    Parameters:
    --------
    :param start_date: datetime-like or str
        Start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like or str
        End date, format: YYYY-MM-DD or YYYYMMDD
    :param month_end: bool, default True
        If True, only month-end loadings are computed; otherwise every trading day.
    :param save: bool, default False
        Whether to persist to the factor database.
    :return: dict
    --------
        Factor loading data with keys:
        0. date: date label
        1. id: security symbol
        2. LnTotalMktCap: log of total market cap
        3. LnLiquidMktCap: log of free-float market cap
    """
    # Build the trading-day series and load the stock basics table.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate over trading days and compute the per-stock size values.
    dict_scale = None
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        dict_scale = {'date': [], 'id': [], 'LnTotalMktCap': [], 'LnLiquidMktCap': []}
        # Exclude stocks listed within the past 90 days.
        s = (calc_date - datetime.timedelta(days=90)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        # Compute the size factor in parallel: queue for inter-process results,
        # pool capped at 4 concurrent worker processes.
        q = Manager().Queue()
        p = Pool(4)
        for _, stock_info in stock_basics.iterrows():
            p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
        p.close()
        p.join()
        # Drain the queue; each item is positional: (symbol, ln_total_cap, ln_liquid_cap).
        while not q.empty():
            scale_data = q.get(True)
            dict_scale['id'].append(scale_data[0])
            dict_scale['LnTotalMktCap'].append(round(scale_data[1], 4))
            dict_scale['LnLiquidMktCap'].append(round(scale_data[2], 4))
        # Label with the NEXT trading day (the date the loading takes effect).
        date_label = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        dict_scale['date'] = [date_label] * len(dict_scale['id'])
        # Persist the size loadings to the factor database.
        if save:
            Utils.factor_loading_persistent(cls._db_file, calc_date.strftime('%Y%m%d'), dict_scale)
        # Throttle between dates (presumably to respect data-source rate limits).
        logging.info('Suspending for 60s.')
        time.sleep(60)
    return dict_scale
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """Compute the SmartQ (smart money) factor loading for sample stocks.

    Parameters
    --------
    :param start_date: datetime-like, str
        Start date
    :param end_date: datetime-like, str, default None
        End date; if None, only start_date's loading is computed.
    :param month_end: bool, default True
        Only compute loadings at month-end trading days.
    :param save: Whether to persist to the factor database, default False
    :return: dict
    --------
        Factor loading data:
        0: id, security id
        1: factorvalue, factor loading
        If end_date is None, returns the loading for start_date;
        otherwise returns the loading for the last processed date.
        Returns None if nothing was computed.
    """
    # 0. Build the trading-day series.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    # Load the stock basics table once for the whole run.
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate over trading days and compute the SmartQ loading.
    dict_factor = None
    for calc_date in trading_days_series:
        dict_factor = {'id': [], 'factorvalue': []}
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # Exclude stocks listed within the past 90 days.
        s = (calc_date - datetime.timedelta(days=90)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        # NOTE(review): dict_factor is re-initialized here, making the
        # assignment above the month_end check redundant (it only matters for
        # the value returned when every date is skipped).
        dict_factor = {'id': [], 'factorvalue': []}
        # Compute SmartQ in parallel: queue for inter-process results,
        # pool capped at 4 concurrent worker processes.
        q = Manager().Queue()
        p = Pool(4)
        for _, stock_info in stock_basics.iterrows():
            p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
        p.close()
        p.join()
        # Drain the queue; each item is positional: (symbol, factor_value).
        while not q.empty():
            smart_q = q.get(True)
            dict_factor['id'].append(smart_q[0])
            dict_factor['factorvalue'].append(smart_q[1])
        # Label with the NEXT trading day.
        # NOTE(review): calc_date is passed positionally here while sibling
        # methods use start=calc_date — presumably the first positional
        # parameter of get_trading_days is `start`; confirm.
        date_label = Utils.get_trading_days(calc_date, ndays=2)[1]
        dict_factor['date'] = [date_label] * len(dict_factor['id'])
        # 4. Persist the loading to the factor database.
        if save:
            Utils.factor_loading_persistent(cls._db_file, calc_date.strftime('%Y%m%d'), dict_factor)
        # Throttle between dates (presumably to respect data-source rate limits).
        logging.info('Suspending for 360s.')
        time.sleep(360)
    return dict_factor
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """Compute the momentum factor loadings for sample stocks.

    Parameters:
    --------
    :param start_date: datetime-like, str
        Start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like, str
        End date, format: YYYY-MM-DD or YYYYMMDD.
        If None, only start_date's loading is computed.
    :param month_end: bool, default True
        If True, only month-end loadings are computed.
    :param save: bool, default False
        Whether to persist to the factor database.
    :return: dict
    --------
        Factor loading data with keys:
        0. date: date label
        1. id: security symbol
        2..n: one key per momentum label returned by cls.momentum_label()
        (e.g. short_term_0, short_term_1, long_term_0, long_term_1)
    """
    # Build the trading-day series and load the stock basics table.
    # NOTE(review): unlike sibling methods, start_date is NOT normalized via
    # Utils.to_date here (the call is commented out in the original) —
    # presumably get_trading_days accepts the raw value; confirm.
    trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate over trading days and compute the momentum factors.
    dict_momentum = None
    momentum_label = cls.momentum_label()
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # One result list per momentum label, plus date and id columns.
        dict_momentum = {'date': [], 'id': []}
        for label in momentum_label:
            dict_momentum[label] = []
        # Exclude stocks listed within the past 90 days.
        s = (calc_date - datetime.timedelta(days=90)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        # Compute momentum in parallel: queue for inter-process results,
        # pool capped at 4 concurrent worker processes.
        q = Manager().Queue()
        p = Pool(4)
        for _, stock_info in stock_basics.iterrows():
            p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
        p.close()
        p.join()
        # Drain the queue; each item is keyed by 'id' plus the momentum labels.
        while not q.empty():
            momentum_data = q.get(True)
            dict_momentum['id'].append(momentum_data['id'])
            for label in momentum_label:
                dict_momentum[label].append(momentum_data[label])
        # Label with the NEXT trading day (the date the loading takes effect).
        date_label = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        dict_momentum['date'] = [date_label] * len(dict_momentum['id'])
        # Persist the loadings to the factor database.
        if save:
            Utils.factor_loading_persistent(cls._db_file, calc_date.strftime('%Y%m%d'), dict_momentum)
        # Throttle between dates (presumably to respect data-source rate limits).
        logging.info('Suspending for 60s.')
        time.sleep(60)
    return dict_momentum
def _get_prevN_years_finbasicdata(date, code, years):
    """Read the past N years of key financial indicators for one stock.

    Per-share items are adjusted by the fq (adjustment) factor prevailing at
    each report date so they are comparable across time. Depending on the
    calendar month of `date`, the latest annual report may not be published
    yet, so a TTM (trailing-twelve-month) record is appended instead.

    :param date: datetime-like
        Reference date
    :param code: str
        Stock code, format: SH600000
    :param years: int
        Number of report years to return
    :return: list of dict (one per report period), or None if any period's
        data is unavailable
    """
    year = date.year
    month = date.month
    # Choose the annual report dates based on disclosure timing:
    # Jan-Apr: last year's annual report may not be out yet -> use years-1
    #          annual reports plus a TTM record.
    if month in (1, 2, 3, 4):
        report_dates = [datetime.datetime(year - n, 12, 31) for n in range(years, 1, -1)]
        is_ttm = True
    # May-Aug: all annual reports through last year are available -> use
    #          `years` annual reports, no TTM needed.
    elif month in (5, 6, 7, 8):
        report_dates = [datetime.datetime(year - n, 12, 31) for n in range(years, 0, -1)]
        is_ttm = False
    # Sep-Dec: use years-1 annual reports plus a TTM record for the
    #          current, still-open fiscal year.
    else:
        report_dates = [datetime.datetime(year - n, 12, 31) for n in range(years - 1, 0, -1)]
        is_ttm = True
    # Adjusted (fq) daily quotes, used to pick the fq factor at each report date.
    df_mkt_data = Utils.get_secu_daily_mkt(code, end=date, fq=True)
    prevN_years_finbasicdata = []
    for report_date in report_dates:
        fin_basic_data = Utils.get_fin_basic_data(code, report_date, date_type='report_date')
        if fin_basic_data is None:
            # Any missing period invalidates the whole history.
            return None
        fin_basic_data = fin_basic_data.to_dict()
        # fq factor = factor of the last trading day on or before the report date.
        df_extract_mkt = df_mkt_data[df_mkt_data.date <= report_date.strftime('%Y-%m-%d')]
        if not df_extract_mkt.empty:
            fq_factor = df_extract_mkt.iloc[-1]['factor']
            # Adjust per-share items by the fq factor.
            fin_basic_data['BasicEPS_adj'] = fin_basic_data['BasicEPS'] * fq_factor
            fin_basic_data['UnitNetAsset_adj'] = fin_basic_data['UnitNetAsset'] * fq_factor
            fin_basic_data['UnitNetOperateCashFlow_adj'] = fin_basic_data['UnitNetOperateCashFlow'] * fq_factor
            # Adjusted main operating revenue.
            # NOTE(review): revenue is DIVIDED by the fq factor while per-share
            # items are multiplied — presumably to express revenue on an
            # adjusted-share basis; confirm against the model spec.
            fin_basic_data['MainOperateRevenue_adj'] = fin_basic_data['MainOperateRevenue'] / fq_factor
        else:
            # No market data on/before the report date: use unadjusted values.
            fin_basic_data['BasicEPS_adj'] = fin_basic_data['BasicEPS']
            fin_basic_data['UnitNetAsset_adj'] = fin_basic_data['UnitNetAsset']
            fin_basic_data['UnitNetOperateCashFlow_adj'] = fin_basic_data['UnitNetOperateCashFlow']
            fin_basic_data['MainOperateRevenue_adj'] = fin_basic_data['MainOperateRevenue']
        prevN_years_finbasicdata.append(fin_basic_data)
    if is_ttm:
        # Append the TTM record, adjusted the same way at its own report date.
        ttm_fin_basic_data = Utils.get_ttm_fin_basic_data(code, date)
        if ttm_fin_basic_data is None:
            return None
        ttm_fin_basic_data = ttm_fin_basic_data.to_dict()
        df_extract_mkt = df_mkt_data[df_mkt_data.date <= ttm_fin_basic_data['ReportDate'].strftime('%Y-%m-%d')]
        if not df_extract_mkt.empty:
            fq_factor = df_extract_mkt.iloc[-1]['factor']
            # Adjust per-share EPS by the fq factor.
            ttm_fin_basic_data['BasicEPS_adj'] = ttm_fin_basic_data['BasicEPS'] * fq_factor
            # Adjusted main operating revenue (same divide-by-factor convention).
            ttm_fin_basic_data['MainOperateRevenue_adj'] = ttm_fin_basic_data['MainOperateRevenue'] / fq_factor
        else:
            ttm_fin_basic_data['BasicEPS_adj'] = ttm_fin_basic_data['BasicEPS']
            ttm_fin_basic_data['MainOperateRevenue_adj'] = ttm_fin_basic_data['MainOperateRevenue']
        prevN_years_finbasicdata.append(ttm_fin_basic_data)
    return prevN_years_finbasicdata
def calc_factor_loading_(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """Compute and synthesize the Growth factor loading for sample stocks.

    For each selected trading day, recomputes every component factor of
    Growth, standardizes each component (with cap weighting), imputes missing
    component values with the industry mean, combines the components with the
    configured weights, and optionally persists the composite loading.

    Parameters:
    --------
    :param start_date: datetime-like, str
        Start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like, str
        End date; if None, only start_date is computed. Format: YYYY-MM-DD or YYYYMMDD
    :param month_end: bool, default True
        If True, loadings are only computed at month-end trading days.
    :param save: bool, default False
        Whether to persist the loadings to the factor database.
    :param kwargs:
        'multi_proc': bool, True=use multiprocessing, False=single process (default False)
    :return: dict
        Factor loading data (as passed to persistence) for the last processed date.
    """
    # Build the trading-day series to iterate over.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    # Iterate over trading days and compute each component factor of Growth.
    if 'multi_proc' not in kwargs:
        kwargs['multi_proc'] = False
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # Recompute the loading of every component factor for this date.
        # NOTE(review): components are instantiated via eval() of class names
        # listed in risk_ct.GROWTH_CT.component — the classes must be
        # importable in this module's namespace.
        for com_factor in risk_ct.GROWTH_CT.component:
            factor = eval(com_factor + '()')
            factor.calc_factor_loading(start_date=calc_date, end_date=None, month_end=month_end, save=save, multi_proc=kwargs['multi_proc'])
        # Synthesize the Growth loading from the component loadings.
        growth_factor = pd.DataFrame()
        df_industry_classify = Utils.get_industry_classify()  # per-stock industry classification
        for com_factor in risk_ct.GROWTH_CT.component:
            factor_path = os.path.join(factor_ct.FACTOR_DB.db_path, eval('risk_ct.' + com_factor + '_CT')['db_file'])
            factor_loading = Utils.read_factor_loading(factor_path, Utils.datetimelike_to_str(calc_date, dash=False))
            factor_loading.drop(columns='date', inplace=True)
            factor_loading.rename(columns={'factorvalue': com_factor}, inplace=True)
            # Attach each stock's industry code (inner join on id).
            factor_loading = pd.merge(left=factor_loading, right=df_industry_classify[['id', 'ind_code']], how='inner', on='id')
            # Split off the rows whose component value is missing.
            missingdata_factor = factor_loading[factor_loading[com_factor].isna()]
            # Drop the missing rows from the main frame.
            factor_loading.dropna(axis='index', how='any', inplace=True)
            # Winsorize and standardize the component (cap-weighted).
            factor_loading = Utils.normalize_data(factor_loading, id='id', columns=com_factor, treat_outlier=True, weight='cap', calc_date=calc_date)
            # Replace each missing value with its industry's mean of the
            # standardized component.
            # NOTE(review): if an industry has no non-missing rows, .mean()
            # yields NaN (re-introducing missing values); and writing through
            # missingdata_factor.loc on a slice of factor_loading risks
            # pandas' chained-assignment warning — confirm intent.
            ind_codes = set(missingdata_factor['ind_code'])
            ind_mean_factor = {}
            for ind_code in ind_codes:
                ind_mean_factor[ind_code] = factor_loading[factor_loading['ind_code'] == ind_code][com_factor].mean()
            for idx, missingdata in missingdata_factor.iterrows():
                missingdata_factor.loc[idx, com_factor] = ind_mean_factor[missingdata['ind_code']]
            # Re-combine the imputed rows with the standardized frame.
            factor_loading = pd.concat([factor_loading, missingdata_factor])
            # The industry code is only needed for imputation.
            factor_loading.drop(columns='ind_code', inplace=True)
            if growth_factor.empty:
                growth_factor = factor_loading
            else:
                growth_factor = pd.merge(left=growth_factor, right=factor_loading, how='inner', on='id')
        # Weighted sum of the standardized components gives the composite value.
        growth_factor.set_index('id', inplace=True)
        weight = pd.Series(risk_ct.GROWTH_CT.weight)
        growth_factor = (growth_factor * weight).sum(axis=1)
        growth_factor.name = 'factorvalue'
        growth_factor.index.name = 'id'
        growth_factor = pd.DataFrame(growth_factor)
        growth_factor.reset_index(inplace=True)
        # Label the loading with the NEXT trading day (the date it takes effect).
        growth_factor['date'] = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        # Persist the composite Growth loading.
        if save:
            Utils.factor_loading_persistent(cls._db_file, Utils.datetimelike_to_str(calc_date, dash=False), growth_factor.to_dict('list'), ['date', 'id', 'factorvalue'])