def calc_factor_loading_(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """
    Calculate the ResVolatility factor loadings of sample stocks for the
    given date(s), and optionally persist them to the factor database.

    Parameters:
    --------
    :param start_date: datetime-like, str
        start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like, str
        end date; if None, only the loading of start_date is calculated,
        format: YYYY-MM-DD or YYYYMMDD
    :param month_end: bool, default True
        if True, only month-end loadings are calculated
    :param save: bool, default False
        whether to persist the loadings to the factor database
    :param kwargs:
        'multi_proc': bool, True = multi-process, False = single process, default False
    :return: dict
        factor loading data
    """
    # Build the trading-day series to iterate over.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    # For each trading day, compute the loading of every component factor
    # of ResVolatility, then combine them into the composite loading.
    if 'multi_proc' not in kwargs:
        kwargs['multi_proc'] = False
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # Compute each component factor's loading.
        for com_factor in risk_ct.RESVOLATILITY_CT.component:
            # BUGFIX: resolve the component class by name via globals()
            # instead of eval() — identical behavior for the plain class
            # names listed in RESVOLATILITY_CT.component, but no
            # arbitrary-code-execution surface.
            factor = globals()[com_factor]()
            factor.calc_factor_loading(start_date=calc_date, end_date=None, month_end=month_end, save=save, multi_proc=kwargs['multi_proc'])
        # Combine the component loadings into the ResVolatility loading.
        resvol_factor = pd.DataFrame()
        for com_factor in risk_ct.RESVOLATILITY_CT.component:
            # getattr() replaces eval('risk_ct.' + com_factor + '_CT').
            factor_path = os.path.join(factor_ct.FACTOR_DB.db_path, getattr(risk_ct, com_factor + '_CT')['db_file'])
            factor_loading = Utils.read_factor_loading(factor_path, Utils.datetimelike_to_str(calc_date, dash=False))
            factor_loading.drop(columns='date', inplace=True)
            # Winsorize and standardize each component before weighting.
            factor_loading[com_factor] = Utils.normalize_data(Utils.clean_extreme_value(np.array(factor_loading['factorvalue']).reshape((len(factor_loading), 1))))
            factor_loading.drop(columns='factorvalue', inplace=True)
            if resvol_factor.empty:
                resvol_factor = factor_loading
            else:
                # Inner-join on stock id so only stocks present in every
                # component survive.
                resvol_factor = pd.merge(left=resvol_factor, right=factor_loading, how='inner', on='id')
        resvol_factor.set_index('id', inplace=True)
        # Weighted sum of the components yields the composite loading.
        weight = pd.Series(risk_ct.RESVOLATILITY_CT.weight)
        resvol_factor = (resvol_factor * weight).sum(axis=1)
        resvol_factor.name = 'factorvalue'
        resvol_factor.index.name = 'id'
        resvol_factor = pd.DataFrame(resvol_factor)
        resvol_factor.reset_index(inplace=True)
        # Label the loading with the next trading day after calc_date.
        resvol_factor['date'] = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        # Persist the ResVolatility loading.
        if save:
            Utils.factor_loading_persistent(cls._db_file, Utils.datetimelike_to_str(calc_date, dash=False), resvol_factor.to_dict('list'), ['date', 'id', 'factorvalue'])
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """
    Calculate the LIQUIDITY factor loadings (stom/stoq/stoa, raw liquidity,
    and size-orthogonalized liquidity) of sample stocks for the given
    date(s), and optionally persist them to the factor database.

    Parameters:
    --------
    :param start_date: datetime-like, str
        start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like, str
        end date; if None, only the loading of start_date is calculated,
        format: YYYY-MM-DD or YYYYMMDD
    :param month_end: bool, default True
        if True, only month-end loadings are calculated
    :param save: bool, default False
        whether to persist the loadings to the factor database
    :param kwargs:
        'multi_proc': bool, True = multi-process, False = single process, default False
    :return: dict
        raw-liquidity factor loading data of the last calculated date
    """
    # Build the trading-day series and fetch the stock-basics table once.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate over the trading days, computing the LIQUIDITY loading per day.
    dict_raw_liquidity = None
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        dict_stom = None
        dict_stoq = None
        dict_stoa = None
        dict_raw_liquidity = None
        logging.info('[%s] Calc LIQUIDITY factor loading.' % Utils.datetimelike_to_str(calc_date))
        # Restrict the universe to stocks listed at least
        # LIQUID_CT.listed_days days before calc_date.
        s = (calc_date - datetime.timedelta(days=risk_ct.LIQUID_CT.listed_days)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        ids = []
        stoms = []
        stoqs = []
        stoas = []
        raw_liquidities = []
        if 'multi_proc' not in kwargs:
            kwargs['multi_proc'] = False
        if not kwargs['multi_proc']:
            # Single-process path: compute each stock's LIQUIDITY value in turn.
            for _, stock_info in stock_basics.iterrows():
                logging.info("[%s] Calc %s's LIQUIDITY factor loading." % (Utils.datetimelike_to_str(calc_date, dash=True), stock_info.symbol))
                liquidity_data = cls._calc_factor_loading(stock_info.symbol, calc_date)
                if liquidity_data is not None:
                    ids.append(liquidity_data['code'])
                    stoms.append(liquidity_data['stom'])
                    stoqs.append(liquidity_data['stoq'])
                    stoas.append(liquidity_data['stoa'])
                    raw_liquidities.append(liquidity_data['liquidity'])
        else:
            # Multi-process path: workers push per-stock results onto a
            # managed queue, drained after the pool has finished.
            q = Manager().Queue()
            p = Pool(4)
            for _, stock_info in stock_basics.iterrows():
                p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
            p.close()
            p.join()
            while not q.empty():
                liquidity_data = q.get(True)
                ids.append(liquidity_data['code'])
                stoms.append(liquidity_data['stom'])
                stoqs.append(liquidity_data['stoq'])
                stoas.append(liquidity_data['stoa'])
                raw_liquidities.append(liquidity_data['liquidity'])
        # Loadings are labeled with the next trading day after calc_date.
        date_label = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        dict_stom = dict({'date': [date_label] * len(ids), 'id': ids, 'factorvalue': stoms})
        dict_stoq = dict({'date': [date_label] * len(ids), 'id': ids, 'factorvalue': stoqs})
        dict_stoa = dict({'date': [date_label] * len(ids), 'id': ids, 'factorvalue': stoas})
        dict_raw_liquidity = dict({'date': [date_label] * len(ids), 'id': ids, 'factorvalue': raw_liquidities})
        # Read the Size loadings and orthogonalize liquidity against size.
        size_factor_path = os.path.join(factor_ct.FACTOR_DB.db_path, risk_ct.SIZE_CT.db_file)
        df_size = Utils.read_factor_loading(size_factor_path, Utils.datetimelike_to_str(calc_date, dash=False))
        df_size.drop(columns='date', inplace=True)
        df_size.rename(columns={'factorvalue': 'size'}, inplace=True)
        df_liquidity = pd.DataFrame(dict({'id': ids, 'liquidity': raw_liquidities}))
        df_liquidity = pd.merge(left=df_liquidity, right=df_size, how='inner', on='id')
        # Winsorize and standardize both series before the regression.
        arr_liquidity = Utils.normalize_data(Utils.clean_extreme_value(np.array(df_liquidity['liquidity']).reshape((len(df_liquidity), 1))))
        arr_size = Utils.normalize_data(Utils.clean_extreme_value(np.array(df_liquidity['size']).reshape((len(df_liquidity), 1))))
        model = sm.OLS(arr_liquidity, arr_size)
        results = model.fit()
        # OLS residuals are the size-orthogonalized liquidity loading.
        df_liquidity['liquidity'] = results.resid
        df_liquidity.drop(columns='size', inplace=True)
        df_liquidity.rename(columns={'liquidity': 'factorvalue'}, inplace=True)
        df_liquidity['date'] = date_label
        # Persist all five loading sets.
        if save:
            str_date = Utils.datetimelike_to_str(calc_date, dash=False)
            factor_header = ['date', 'id', 'factorvalue']
            Utils.factor_loading_persistent(cls._db_file, 'stom_{}'.format(str_date), dict_stom, factor_header)
            Utils.factor_loading_persistent(cls._db_file, 'stoq_{}'.format(str_date), dict_stoq, factor_header)
            Utils.factor_loading_persistent(cls._db_file, 'stoa_{}'.format(str_date), dict_stoa, factor_header)
            Utils.factor_loading_persistent(cls._db_file, 'rawliquidity_{}'.format(str_date), dict_raw_liquidity, factor_header)
            Utils.factor_loading_persistent(cls._db_file, str_date, df_liquidity.to_dict('list'), factor_header)
        # Throttle: pause 180 s between dates — presumably to respect a
        # data-source rate limit (TODO confirm).
        logging.info('Suspending for 180s.')
        time.sleep(180)
    return dict_raw_liquidity
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """
    Calculate the NLSIZE (non-linear size) factor loadings of sample
    stocks for the given date(s), and optionally persist them to the
    factor database.

    Parameters:
    --------
    :param start_date: datetime-like, str
        start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like, str
        end date; if None, only the loading of start_date is calculated,
        format: YYYY-MM-DD or YYYYMMDD
    :param month_end: bool, default True
        if True, only month-end loadings are calculated
    :param save: bool, default False
        whether to persist the loadings to the factor database
    :param kwargs:
    :return: dict
        factor loading data of the last calculated date, or None if no
        date was calculated
    """
    # Build the trading-day series to iterate over.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    # Iterate over the trading days, computing the NLSIZE loading per day.
    dict_nlsize = None
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        logging.info('[%s] Calc NLSIZE factor loading.' % Utils.datetimelike_to_str(calc_date))
        # Read the Size (LNCAP) loading file for this date; skip the date
        # if the file is missing.
        lncap_data_path = os.path.join(factor_ct.FACTOR_DB.db_path, '{}_{}.csv'.format(risk_ct.LNCAP_CT.db_file, Utils.datetimelike_to_str(calc_date, dash=False)))
        if not os.path.exists(lncap_data_path):
            logging.info('[%s] 的Size因子载荷数据不存在.' % Utils.datetimelike_to_str(calc_date))
            continue
        df_lncap = pd.read_csv(lncap_data_path, header=0)
        # Size factor array and its cube.
        arr_size = np.array(df_lncap['factorvalue'])
        arr_size_cube = arr_size**3
        # Orthogonalize size^3 against size; the residuals capture the
        # non-linear size signal.
        model = sm.OLS(arr_size_cube, arr_size)
        result = model.fit()
        # Winsorize and standardize the residuals.
        n = len(result.resid)
        arr_resid = result.resid.reshape(n, 1)
        arr_resid_winsorized = Utils.clean_extreme_value(arr_resid)
        arr_resid_standardized = Utils.normalize_data(arr_resid_winsorized)
        # Assemble the NLSIZE loading dict and optionally persist it.
        dict_nlsize = dict({'date': df_lncap['date'].values, 'id': df_lncap['id'].values, 'factorvalue': arr_resid_standardized.reshape(n, )})
        if save:
            Utils.factor_loading_persistent(cls._db_file, Utils.datetimelike_to_str(calc_date, dash=False), dict_nlsize, ['date', 'id', 'factorvalue'])
    # BUGFIX: the docstring promises a dict return, and the sibling
    # calc_factor_loading implementations return their loading dicts,
    # but this function previously fell off the end returning None.
    return dict_nlsize
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """
    Calculate the APM and pure-APM factor loadings of sample stocks for
    the given date(s), and optionally persist them to the factor database.

    Parameters
    --------
    :param start_date: datetime-like, str
        start date
    :param end_date: datetime-like, str, default None
        end date; if None, only the loading of start_date is calculated
    :param month_end: bool, default True
        only month-end loadings are calculated; this parameter is only
        effective when end_date is not None
    :param save: bool, default False
        whether to persist the loadings to the factor database
    :return: DataFrame-like dict of factor loadings
    --------
        columns: 0: id, security ID; 1: factorvalue, factor loading
        If end_date is None, returns the loading data for start_date.
        If end_date is not None, returns the loading data of the last day.
        Returns None if nothing was calculated.
    """
    # 1. Build the trading-day series and fetch the stock-basics table once.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # 2. Iterate over the trading days, computing the APM loading per day.
    dict_apm = None
    for calc_date in trading_days_series:
        dict_apm = {'date': [], 'id': [], 'factorvalue': []}
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # 2.1. For each stock, compute the APM stat and the trailing
        # 20-day return, collected into stat_lst / ret20_lst.
        # Universe: stocks listed more than 90 days before calc_date.
        s = (calc_date - datetime.timedelta(days=90)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        stat_lst = []
        ret20_lst = []
        symbol_lst = []
        # Single-process variant (kept for reference):
        # for _, stock_info in stock_basics.iterrows():
        #     stat_i = cls._calc_factor_loading(stock_info.symbol, calc_date)
        #     ret20_i = Utils.calc_interval_ret(stock_info.symbol, end=calc_date, ndays=20)
        #     if stat_i is not None and ret20_i is not None:
        #         stat_lst.append(stat_i)
        #         ret20_lst.append(ret20_i)
        #         symbol_lst.append(Utils.code_to_symbol(stock_info.symbol))
        #         logging.info('APM of %s = %f' % (stock_info.symbol, stat_i))
        # Multi-process variant: workers push (symbol, stat, ret20) tuples
        # onto a managed queue, drained after the pool finishes.
        q = Manager().Queue()
        p = Pool(4)  # at most 4 concurrent worker processes
        for _, stock_info in stock_basics.iterrows():
            p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
        p.close()
        p.join()
        while not q.empty():
            apm_value = q.get(True)
            symbol_lst.append(apm_value[0])
            stat_lst.append(apm_value[1])
            ret20_lst.append(apm_value[2])
        assert len(stat_lst) == len(ret20_lst)
        assert len(stat_lst) == len(symbol_lst)
        # 2.2. Build the APM factor.
        # 2.2.1. Cross-sectional regression of the stat on the momentum
        # factor: stat_j = beta * Ret20_j + epsilon_j; the residual vector
        # is the per-stock APM factor. Both series are winsorized and
        # standardized before the regression.
        stat_arr = np.array(stat_lst).reshape((len(stat_lst), 1))
        ret20_arr = np.array(ret20_lst).reshape((len(ret20_lst), 1))
        stat_arr = Utils.clean_extreme_value(stat_arr)
        stat_arr = Utils.normalize_data(stat_arr)
        ret20_arr = Utils.clean_extreme_value(ret20_arr)
        ret20_arr = Utils.normalize_data(ret20_arr)
        # Regression (no intercept; see the commented add_constant line).
        # ret20_arr = sm.add_constant(ret20_arr)
        apm_model = sm.OLS(stat_arr, ret20_arr)
        apm_result = apm_model.fit()
        apm_lst = list(np.around(apm_result.resid, 6))  # round APM loadings to 6 decimals
        assert len(apm_lst) == len(symbol_lst)
        # 2.2.2. Assemble the APM dict, labeled with the next trading day,
        # and optionally persist it.
        date_label = Utils.get_trading_days(calc_date, ndays=2)[1]
        dict_apm = {'date': [date_label] * len(symbol_lst), 'id': symbol_lst, 'factorvalue': apm_lst}
        if save:
            Utils.factor_loading_persistent(cls._db_file, calc_date.strftime('%Y%m%d'), dict_apm)
        # 2.3. Build the pure-APM factor.
        # Wrap stat_arr (already winsorized/standardized) in a DataFrame.
        df_stat = DataFrame(stat_arr, index=symbol_lst, columns=['stat'])
        # Fetch the explanatory factors used for purification.
        df_dependent_factor = cls.get_dependent_factors(calc_date)
        # Inner-join the stat with the explanatory factors on stock id.
        df_data = pd.concat([df_stat, df_dependent_factor], axis=1, join='inner')
        # OLS of stat on the explanatory factors; residuals are pure APM.
        arr_data = np.array(df_data)
        pure_apm_model = sm.OLS(arr_data[:, 0], arr_data[:, 1:])
        pure_apm_result = pure_apm_model.fit()
        pure_apm_lst = list(np.around(pure_apm_result.resid, 6))
        pure_symbol_lst = list(df_data.index)
        assert len(pure_apm_lst) == len(pure_symbol_lst)
        # Assemble the pure-APM dict and optionally persist it.
        dict_pure_apm = {'date': [date_label] * len(pure_symbol_lst), 'id': pure_symbol_lst, 'factorvalue': pure_apm_lst}
        pure_apm_db_file = os.path.join(factor_ct.FACTOR_DB.db_path, factor_ct.APM_CT.pure_apm_db_file)
        if save:
            Utils.factor_loading_persistent(pure_apm_db_file, calc_date.strftime('%Y%m%d'), dict_pure_apm)
        # Throttle: pause 360 s between dates — presumably to respect a
        # data-source rate limit (TODO confirm).
        logging.info('Suspended for 360s.')
        time.sleep(360)
    return dict_apm
def get_dependent_factors(cls, date):
    """
    Assemble the explanatory factors used for factor purification:
    28 SW level-1 industry dummies plus scale, value, growth,
    short-term momentum and long-term momentum.

    Parameters:
    --------
    :param date: datetime-like or str
        date
    :return: pd.DataFrame
        indexed by stock code; columns = industry dummies +
        ['scale', 'value', 'growth', 'short_mom', 'long_mom']
    """
    str_date = Utils.to_date(date).strftime('%Y%m%d')

    def _load_raw(ct):
        # Read one raw loading file for str_date, NaNs filled with 0.
        path = os.path.join(factor_ct.FACTOR_DB.db_path, ct.db_file)
        return Utils.read_factor_loading(path, str_date, nan_value=0)

    def _reduce(df_raw, cols):
        # Winsorize, standardize, then average the given columns into a
        # single per-stock series indexed by stock id.
        cleaned = Utils.clean_extreme_value(np.array(df_raw[cols]))
        standardized = Utils.normalize_data(cleaned)
        return Series(np.mean(standardized, axis=1), index=df_raw['id'])

    # 1. Industry factor: dummy variables from the industry classification.
    df_classify = Utils.get_industry_classify().set_index('id')
    industry_dummies = pd.get_dummies(df_classify['ind_code'])
    # 2. Scale factor: reduced from liquid/total log market cap.
    scale_factor = _reduce(_load_raw(factor_ct.SCALE_CT), ['LnLiquidMktCap', 'LnTotalMktCap'])
    # 3. Value factor: reduced from EP, BP and OCF ratios.
    value_factor = _reduce(_load_raw(factor_ct.VALUE_CT), ['ep_ttm', 'bp_lr', 'ocf_ttm'])
    # 4. Growth factor: reduced from net-profit and operating-profit growth.
    growth_factor = _reduce(_load_raw(factor_ct.GROWTH_CT), ['npg_ttm', 'opg_ttm'])
    # 5. Momentum factors: short- and long-horizon columns reduced separately.
    df_momentum = _load_raw(factor_ct.MOMENTUM_CT)
    short_cols = ['short_term_' + d for d in factor_ct.MOMENTUM_CT.short_term_days.split('|')]
    long_cols = ['long_term_' + d for d in factor_ct.MOMENTUM_CT.long_term_days.split('|')]
    short_mom = _reduce(df_momentum, short_cols)
    long_mom = _reduce(df_momentum, long_cols)
    # Join the style factors, then prepend the industry dummies; inner
    # joins keep only stocks present in every factor set.
    df_style_factor = pd.concat([scale_factor, value_factor, growth_factor, short_mom, long_mom], axis=1, keys=['scale', 'value', 'growth', 'short_mom', 'long_mom'], join='inner')
    return pd.concat([industry_dummies, df_style_factor], axis=1, join='inner')