def calc_factor_loading_(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """Compute and synthesize the ResVolatility risk-factor loading for sample stocks.

    For each qualifying trading day: first recompute every component factor
    listed in risk_ct.RESVOLATILITY_CT.component, then read each component's
    persisted loading file, winsorize + standardize it, inner-join the
    components on stock id, and collapse them into a single weighted
    'factorvalue' column.

    Parameters:
    --------
    :param start_date: datetime-like, str
        Start date, format YYYY-MM-DD or YYYYMMDD.
    :param end_date: datetime-like, str
        End date; if None only start_date is processed. Format YYYY-MM-DD or YYYYMMDD.
    :param month_end: bool, default True
        If True, only month-end trading days are processed.
    :param save: bool, default False
        Whether to persist the synthesized loading to the factor database.
    :param kwargs:
        'multi_proc': bool, True = compute components with multiple processes,
        False = single process. Default False.
    :return: dict
        Factor-loading data (implicitly None; this function persists results
        rather than returning them).
    """
    # Build the trading-day series to iterate over.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    # Default to single-process component computation.
    if 'multi_proc' not in kwargs:
        kwargs['multi_proc'] = False
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # Step 1: (re)compute each component factor's loading for calc_date.
        # NOTE(review): eval() constructs the component class by name; assumes
        # each name in RESVOLATILITY_CT.component is a class imported into this
        # module's namespace — confirm, and consider a registry dict instead.
        for com_factor in risk_ct.RESVOLATILITY_CT.component:
            factor = eval(com_factor + '()')
            factor.calc_factor_loading(start_date=calc_date, end_date=None, month_end=month_end, save=save, multi_proc=kwargs['multi_proc'])
        # Step 2: read back each component's persisted loading and merge.
        resvol_factor = pd.DataFrame()
        for com_factor in risk_ct.RESVOLATILITY_CT.component:
            factor_path = os.path.join(factor_ct.FACTOR_DB.db_path, eval('risk_ct.' + com_factor + '_CT')['db_file'])
            factor_loading = Utils.read_factor_loading(factor_path, Utils.datetimelike_to_str(calc_date, dash=False))
            factor_loading.drop(columns='date', inplace=True)
            # Winsorize then standardize the raw values; store under the
            # component's own column name.
            factor_loading[com_factor] = Utils.normalize_data(Utils.clean_extreme_value(np.array(factor_loading['factorvalue']).reshape((len(factor_loading), 1))))
            factor_loading.drop(columns='factorvalue', inplace=True)
            # Inner-join components on 'id' so only stocks present in every
            # component survive.
            if resvol_factor.empty:
                resvol_factor = factor_loading
            else:
                resvol_factor = pd.merge(left=resvol_factor, right=factor_loading, how='inner', on='id')
        # Step 3: weighted sum of components -> single factorvalue per stock.
        resvol_factor.set_index('id', inplace=True)
        weight = pd.Series(risk_ct.RESVOLATILITY_CT.weight)
        # (resvol_factor * weight) aligns on column names, so weight keys must
        # match the component column names.
        resvol_factor = (resvol_factor * weight).sum(axis=1)
        resvol_factor.name = 'factorvalue'
        resvol_factor.index.name = 'id'
        resvol_factor = pd.DataFrame(resvol_factor)
        resvol_factor.reset_index(inplace=True)
        # Stamp the loading with the NEXT trading day after calc_date.
        resvol_factor['date'] = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        # Step 4: persist the synthesized ResVolatility loading.
        if save:
            Utils.factor_loading_persistent(cls._db_file, Utils.datetimelike_to_str(calc_date, dash=False), resvol_factor.to_dict('list'), ['date', 'id', 'factorvalue'])
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs): """ 计算指定日期的样本个股的因子载荷, 并保存至因子数据库 Parameters: -------- :param start_date: datetime-like, str 开始日期, 格式:YYYY-MM-DD or YYYYMMDD :param end_date: datetime-like, str 结束日期, 如果为None, 则只计算start_date日期的因子载荷, 格式: YYYY-MM-DD or YYYYMMDD :param month_end: bool, 默认为True 如果为True, 则只计算月末时点的因子载荷 :param save: bool, 默认为True 是否保存至因子数据库 :param kwargs: :return: dict 因子载荷数据 """ # 取得交易日序列 start_date = Utils.to_date(start_date) if end_date is not None: end_date = Utils.to_date(end_date) trading_days_series = Utils.get_trading_days(start=start_date, end=end_date) else: trading_days_series = Utils.get_trading_days(end=start_date, ndays=1) # 遍历交易日序列, 计算NLSIZE因子载荷 dict_nlsize = None for calc_date in trading_days_series: if month_end and (not Utils.is_month_end(calc_date)): continue logging.info('[%s] Calc NLSIZE factor loading.' % Utils.datetimelike_to_str(calc_date)) # 读取Size因子载荷数据 lncap_data_path = os.path.join(factor_ct.FACTOR_DB.db_path, '{}_{}.csv'.format(risk_ct.SIZE_CT.db_file, Utils.datetimelike_to_str(calc_date, dash=False))) if not os.path.exists(lncap_data_path): logging.info('[%s] 的Size因子载荷数据不存在.' % Utils.datetimelike_to_str(calc_date)) continue df_lncap = pd.read_csv(lncap_data_path, header=0) # Size因子数组 arr_size = np.array(df_lncap['factorvalue']) # Size因子三次方数组 arr_size_cube = arr_size ** 3 # 相对Size因子正交化 model = sm.OLS(arr_size_cube, arr_size) result = model.fit() # 对残差值进行缩尾处理和标准化 n = len(result.resid) arr_resid = result.resid # arr_resid = result.resid.reshape(n, 1) # arr_resid_winsorized = Utils.clean_extreme_value(arr_resid) # arr_resid_standardized = Utils.normalize_data(arr_resid_winsorized) # 保存NLSIZE因子载荷数据 dict_nlsize = dict({'date': df_lncap['date'].values, 'id': df_lncap['id'].values, 'factorvalue': arr_resid}) if save: Utils.factor_loading_persistent(cls._db_file, Utils.datetimelike_to_str(calc_date, dash=False), dict_nlsize, ['date', 'id', 'factorvalue']) return dict_nlsize
def _calc_alphafactor_loading(start_date, end_date=None, factor_name=None, multi_proc=False, test=False):
    """Compute alpha-factor loadings (raw and winsorized/standardized values).

    Parameters:
    --------
    :param start_date: datetime-like, str
        Start date, e.g. YYYY-MM-DD or YYYYMMDD.
    :param end_date: datetime-like, str, default None
        End date, e.g. YYYY-MM-DD or YYYYMMDD; if None only start_date is processed.
    :param factor_name: str, default None
        Alpha factor name, e.g. SmartMoney. None means compute every factor
        registered in alphafactor_ct.ALPHA_FACTORS; otherwise only the named one.
    :param multi_proc: bool, default False
        Whether the per-factor computation runs in parallel.
    :param test: bool, default False
        True when running a factor test (skips the registry membership check).
    :return: None — loadings are persisted by each factor's calc_factor_loading.
    """
    start_date = Utils.to_date(start_date)
    if end_date is None:
        calc_dates = Utils.get_trading_days(end=start_date, ndays=1)
    else:
        end_date = Utils.to_date(end_date)
        calc_dates = Utils.get_trading_days(start=start_date, end=end_date)
    for calc_date in calc_dates:
        # Resolve which factor classes to run on this date.
        if factor_name is None:
            target_names = list(alphafactor_ct.ALPHA_FACTORS)
        else:
            if (not test) and (factor_name not in alphafactor_ct.ALPHA_FACTORS):
                raise ValueError("alpha因子类: %s, 不存在." % factor_name)
            target_names = [factor_name]
        # Instantiate each factor class by name and compute its loading.
        for target_name in target_names:
            factor_obj = eval(target_name + '()')
            factor_obj.calc_factor_loading(calc_date, month_end=True, save=True, multi_proc=multi_proc)
def calc_future_ret(date, ndays):
    """Compute forward 1..ndays interval returns for each stock, anchored
    ndays trading days before *date*, and write them to a csv file.

    Fix: the original grew the DataFrame with DataFrame.append inside the
    loop — deprecated since pandas 1.4 and removed in pandas 2.0, and
    quadratic in the number of stocks. Rows are now collected in a list and
    the frame is built once. Behavior (columns, rounding, NaN-dropping,
    output path) is unchanged.

    Parameters:
    --------
    :param date: datetime-like, str
        Reference date, e.g. YYYY-MM-DD or YYYYMMDD.
    :param ndays: int
        Number of forward trading days to compute returns for.
    :return: None — results are written to '<anchor date>.csv'.
    """
    # The last ndays+1 trading days ending at `date`; index 0 is the anchor day.
    trading_days_series = Utils.get_trading_days(end=date, ndays=ndays + 1)
    # Stock universe as of the anchor day.
    stock_basics = Utils.get_stock_basics(trading_days_series[0])
    headers = ['code'] + ['day' + str(k) for k in range(1, ndays + 1)]
    rows = []
    for _, stock_info in stock_basics.iterrows():
        row = {'code': stock_info.symbol}
        for k in range(1, ndays + 1):
            # Interval return from the first forward day through day k.
            ret = Utils.calc_interval_ret(stock_info.symbol,
                                          start=trading_days_series[1],
                                          end=trading_days_series[k])
            row['day' + str(k)] = np.nan if ret is None else round(ret, 6)
        rows.append(row)
    df_future_ret = pd.DataFrame(rows, columns=headers)
    # Drop stocks missing any of the forward returns.
    df_future_ret.dropna(axis=0, how='any', inplace=True)
    # Resolve the output path from config and persist.
    cfg = ConfigParser()
    cfg.read('config.ini')
    future_ret_path = os.path.join(
        SETTINGS.FACTOR_DB_PATH,
        cfg.get('future_ret', 'ret_path'),
        '{}.csv'.format(Utils.datetimelike_to_str(trading_days_series[0], dash=False)))
    df_future_ret.to_csv(future_ret_path, index=False, encoding='utf-8')
def _calc_factor_loading(cls, code, calc_date):
    """Compute the SmartMoney (SmartQ) factor loading for one stock on one date.

    :param code: stock code, e.g. SH600000 or 600000
    :param calc_date: trading date used to select minute bars
    :return: float — the SmartQ loading, or None when there is not enough
        minute data to compute it
    """
    # Last 30 trading days, newest first, used to pull minute bars.
    recent_days = Utils.get_trading_days(end=calc_date, ndays=30, ascending=False)
    # Minute bars over the configured window (cls.__days trading days).
    be_enough, df_min_mkt = Utils.get_min_mkts_fq(code, recent_days, cls.__days)
    if not be_enough:
        return None
    # 1. Smart-money indicator per bar: S_t = |R_t| / sqrt(V_t),
    #    R_t = minute return, V_t = minute volume (lots -> shares via *100).
    df_min_mkt['ind_s'] = df_min_mkt.apply(
        lambda bar: abs(bar.ret) * 10000 / math.sqrt(bar.volume * 100.0) if bar.volume > 0 else 0,
        axis=1)
    # 2. Rank bars by the indicator, descending.
    df_min_mkt = df_min_mkt.sort_values(by='ind_s', ascending=False)
    # 3. Running totals of volume and amount down the ranked bars.
    df_min_mkt['accum_volume'] = df_min_mkt['volume'].cumsum()
    df_min_mkt['accum_amount'] = df_min_mkt['amount'].cumsum()
    # 4. The top-20%-of-volume bars are "smart money"; the factor is
    #    Q = VWAP_smart / VWAP_all.
    totals = df_min_mkt.iloc[-1]
    all_volume = totals.accum_volume * 100
    all_amount = totals.accum_amount
    smart_cutoff = int(totals.accum_volume * 0.2)
    vwap_all = all_amount / all_volume
    smart_row = df_min_mkt[df_min_mkt.accum_volume > smart_cutoff].iloc[0]
    vwap_smart = smart_row.accum_amount / (smart_row.accum_volume * 100.0)
    return round(vwap_smart / vwap_all, 6)
def _calc_periodmomentum_ic(cls, calc_date, date_interval_type='month'): """ 计算日内各时段动量因子的Rank IC值向量 Parameters: -------- :param calc_date: datetime-like, str 计算日期, e.g: YYYY-MM-DD, YYYYMMDD :param date_interval_type: str 个股收益率计算的时间长度, 'month'=月度收益, 'day'=日收益 :return: pd.Series -------- IC值向量 0. date, 日期 1. IC0, 隔夜时段动量因子IC 2. IC1, 第1小时动量因子IC 3. IC2, 第2小时动量因子IC 4. IC3, 第3小时动量因子IC 5. IC4, 第4小时动量因子IC """ # 读取日内各时段动量因子载荷数据 df_period_mom = cls._get_factor_loading(cls._db_file, Utils.datetimelike_to_str(calc_date, dash=False), factor_name='periodmomentum', factor_type='raw', drop_na=True) if df_period_mom.empty: return None if date_interval_type == 'month': # 读取个股下个月的月度收益率数据 ret_start, ret_end = Utils.next_month(calc_date) elif date_interval_type == 'day': ret_start = ret_end = Utils.get_trading_days(start=calc_date, ndays=2)[1] df_period_mom['ret'] = np.nan for idx, factorloading_data in df_period_mom.iterrows(): fret = Utils.calc_interval_ret(factorloading_data['id'], start=ret_start, end=ret_end) if fret is not None: df_period_mom.loc[idx, 'ret'] = fret df_period_mom.dropna(inplace=True) # 计算Rank IC值 df_period_mom.drop(columns=['date', 'id', 'm_normal'], inplace=True) df_spearman_corr = df_period_mom.corr(method='spearman') rank_IC = df_spearman_corr.loc['ret', ['m0', 'm1', 'm2', 'm3', 'm4']] rank_IC['date'] = calc_date # 保存Rank IC值 ic_filepath = os.path.join(SETTINGS.FACTOR_DB_PATH, alphafactor_ct.INTRADAYMOMENTUM_CT['factor_ic_file']) Utils.save_timeseries_data(rank_IC, ic_filepath, save_type='a', columns=['date', 'm0', 'm1', 'm2', 'm3', 'm4']) return rank_IC
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """Compute the SmartQ factor loading for sample stocks and persist it.

    Parameters
    --------
    :param start_date: datetime-like, str
        Start date.
    :param end_date: datetime-like, str, default None
        End date; if None only start_date is processed.
    :param month_end: bool, default True
        Only compute loadings on month-end trading days.
    :param save: whether to persist to the factor database, default False.
    :return: dict or None
    --------
    Loading for the LAST processed date as {'id': [...], 'factorvalue': [...],
    'date': [...]}; None if the trading-day series was empty.
    """
    # 0. Build the trading-day series.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    # Full universe of sample stocks.
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate trading days and compute SmartQ loadings.
    dict_factor = None
    for calc_date in trading_days_series:
        # NOTE(review): this init is redundant — dict_factor is re-initialized
        # below at step 3; kept as-is to preserve behavior.
        dict_factor = {'id': [], 'factorvalue': []}
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # 1. (historical variants for fetching the minute-data day list)
        # trading_days = _get_trading_days(calc_date, 30)
        # trading_days = Utils.get_trading_days(end=calc_date, ndays=30, ascending=False)
        # 2. Restrict to stocks listed at least ~90 calendar days before calc_date.
        # stock_basics = ts.get_stock_basics()
        s = (calc_date - datetime.timedelta(days=90)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        # 3. Compute per-stock SmartQ loadings.
        dict_factor = {'id': [], 'factorvalue': []}
        # Single-process variant (kept for reference):
        # for _, stock_info in stock_basics.iterrows():
        #     # code = '%s%s' % ('SH' if code[:2] == '60' else 'SZ', code)
        #     factor_loading = cls._calc_factor_loading(stock_info.symbol, calc_date)
        #     print("[%s]Calculating %s's SmartMoney factor loading = %.4f." % (calc_date.strftime('%Y-%m-%d'), stock_info.symbol, -1.0 if factor_loading is None else factor_loading))
        #     if factor_loading is not None:
        #         # df_factor.ix[code, 'factorvalue'] = factor_loading
        #         dict_factor['id'].append(Utils.code_to_symbol(stock_info.symbol))
        #         dict_factor['factorvalue'].append(factor_loading)
        # Multiprocessing variant: workers push (id, value) tuples onto q.
        q = Manager().Queue()  # inter-process queue collecting each worker's loading
        p = Pool(4)            # at most 4 concurrent worker processes
        for _, stock_info in stock_basics.iterrows():
            p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
        p.close()
        p.join()
        # Drain the queue into the result dict.
        while not q.empty():
            smart_q = q.get(True)
            dict_factor['id'].append(smart_q[0])
            dict_factor['factorvalue'].append(smart_q[1])
        # Stamp results with the NEXT trading day after calc_date.
        date_label = Utils.get_trading_days(calc_date, ndays=2)[1]
        dict_factor['date'] = [date_label] * len(dict_factor['id'])
        # 4. Persist to the factor database.
        if save:
            # Old shelve-based persistence (kept for reference):
            # db = shelve.open(cls._db_file, flag='c', protocol=None, writeback=False)
            # try:
            #     db[calc_date.strftime('%Y%m%d')] = df_factor
            # finally:
            #     db.close()
            Utils.factor_loading_persistent(cls._db_file, calc_date.strftime('%Y%m%d'), dict_factor)
        # Throttle between dates (original comment said 300s; code sleeps 360s).
        logging.info('Suspending for 360s.')
        time.sleep(360)
    return dict_factor
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """Compute the momentum factor loadings for sample stocks and persist them.

    Parameters:
    --------
    :param start_date: datetime-like, str
        Start date, format YYYY-MM-DD or YYYYMMDD.
    :param end_date: datetime-like, str
        End date, format YYYY-MM-DD or YYYYMMDD; if None only start_date is processed.
    :param month_end: bool, default True
        If True, only month-end trading days are processed.
    :param save: bool, default False
        Whether to persist to the factor database.
    :return: dict or None
    --------
    Loading dict for the LAST processed date:
    0. date: date label (next trading day after calc_date)
    1. id: stock symbol
    2. short_term_0: first short-term momentum factor
    3. short_term_1: second short-term momentum factor
    4. long_term_0: first long-term momentum factor
    5. long_term_1: second long-term momentum factor
    (actual keys come from cls.momentum_label())
    """
    # Trading-day series and stock universe.
    # start_date = Utils.to_date(start_date)
    trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate trading days, computing the momentum factors.
    dict_momentum = None
    momentum_label = cls.momentum_label()  # names of the momentum columns
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # Result dict: one list per momentum label plus date/id.
        dict_momentum = {'date': [], 'id': []}
        for label in momentum_label:
            dict_momentum[label] = []
        # Restrict to stocks listed at least ~90 calendar days before calc_date.
        s = (calc_date - datetime.timedelta(days=90)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        # Single-process variant (kept for reference):
        # for _, stock_info in stock_basics.iterrows():
        #     momentum_data = cls._calc_factor_loading(stock_info.symbol, calc_date)
        #     if momentum_data is not None:
        #         logging.info("[%s] calc %s's momentum factor loading." % (calc_date.strftime('%Y-%m-%d'), stock_info.symbol))
        #         dict_momentum['id'].append(Utils.code_to_symbol(stock_info.symbol))
        #         for label in momentum_label:
        #             dict_momentum[label].append(momentum_data[label])
        # Multiprocessing variant: workers push per-stock dicts onto q.
        q = Manager().Queue()  # inter-process queue collecting each worker's loading
        p = Pool(4)            # at most 4 concurrent worker processes
        for _, stock_info in stock_basics.iterrows():
            p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
        p.close()
        p.join()
        # Drain the queue into the result dict.
        while not q.empty():
            momentum_data = q.get(True)
            dict_momentum['id'].append(momentum_data['id'])
            for label in momentum_label:
                dict_momentum[label].append(momentum_data[label])
        # Stamp results with the NEXT trading day after calc_date.
        date_label = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        dict_momentum['date'] = [date_label] * len(dict_momentum['id'])
        # Persist to the factor database.
        if save:
            Utils.factor_loading_persistent(cls._db_file, calc_date.strftime('%Y%m%d'), dict_momentum)
        # Throttle between dates.
        logging.info('Suspending for 60s.')
        time.sleep(60)
    return dict_momentum
def calc_factor_loading_(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """Compute and synthesize the Growth risk-factor loading for sample stocks.

    For each qualifying trading day: recompute every component factor in
    risk_ct.GROWTH_CT.component; read each component's persisted loading;
    impute each component's missing values with the component's industry
    mean (after winsorizing/standardizing the non-missing values); inner-join
    components on stock id; then collapse them into one weighted
    'factorvalue' column.

    Parameters:
    --------
    :param start_date: datetime-like, str
        Start date, format YYYY-MM-DD or YYYYMMDD.
    :param end_date: datetime-like, str
        End date; if None only start_date is processed. Format YYYY-MM-DD or YYYYMMDD.
    :param month_end: bool, default True
        If True, only month-end trading days are processed.
    :param save: bool, default False
        Whether to persist the synthesized loading to the factor database.
    :param kwargs:
        'multi_proc': bool, True = compute components with multiple processes,
        False = single process. Default False.
    :return: dict
        Factor-loading data (implicitly None; this function persists results
        rather than returning them).
    """
    # Build the trading-day series to iterate over.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    # Default to single-process component computation.
    if 'multi_proc' not in kwargs:
        kwargs['multi_proc'] = False
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # Step 1: (re)compute each component factor's loading for calc_date.
        # NOTE(review): eval() constructs the component class by name; assumes
        # each name in GROWTH_CT.component is a class in this module's
        # namespace — confirm.
        for com_factor in risk_ct.GROWTH_CT.component:
            factor = eval(com_factor + '()')
            factor.calc_factor_loading(start_date=calc_date, end_date=None, month_end=month_end, save=save, multi_proc=kwargs['multi_proc'])
        # Step 2: read back each component, impute missing values, and merge.
        growth_factor = pd.DataFrame()
        df_industry_classify = Utils.get_industry_classify()  # per-stock industry classification
        for com_factor in risk_ct.GROWTH_CT.component:
            factor_path = os.path.join(factor_ct.FACTOR_DB.db_path, eval('risk_ct.' + com_factor + '_CT')['db_file'])
            factor_loading = Utils.read_factor_loading(factor_path, Utils.datetimelike_to_str(calc_date, dash=False))
            factor_loading.drop(columns='date', inplace=True)
            # factor_loading[com_factor] = Utils.normalize_data(Utils.clean_extreme_value(np.array(factor_loading['factorvalue']).reshape((len(factor_loading), 1))))
            # factor_loading.drop(columns='factorvalue', inplace=True)
            factor_loading.rename(columns={'factorvalue': com_factor}, inplace=True)
            # Attach each stock's industry code.
            factor_loading = pd.merge(left=factor_loading, right=df_industry_classify[['id', 'ind_code']], how='inner', on='id')
            # Split off rows whose component value is missing.
            missingdata_factor = factor_loading[factor_loading[com_factor].isna()]
            # Drop the missing rows from the main frame.
            factor_loading.dropna(axis='index', how='any', inplace=True)
            # Winsorize + standardize the non-missing values (cap-weighted).
            factor_loading = Utils.normalize_data(factor_loading, id='id', columns=com_factor, treat_outlier=True, weight='cap', calc_date=calc_date)
            # Replace each missing value with its industry's mean of the
            # standardized values.
            # NOTE(review): if an industry has only missing rows, its mean is
            # NaN and the imputed value stays NaN — confirm downstream handling.
            ind_codes = set(missingdata_factor['ind_code'])
            ind_mean_factor = {}
            for ind_code in ind_codes:
                ind_mean_factor[ind_code] = factor_loading[factor_loading['ind_code'] == ind_code][com_factor].mean()
            for idx, missingdata in missingdata_factor.iterrows():
                missingdata_factor.loc[idx, com_factor] = ind_mean_factor[missingdata['ind_code']]
            # Re-combine imputed rows with the standardized frame.
            factor_loading = pd.concat([factor_loading, missingdata_factor])
            # Industry code no longer needed.
            factor_loading.drop(columns='ind_code', inplace=True)
            # Inner-join components on 'id'.
            if growth_factor.empty:
                growth_factor = factor_loading
            else:
                growth_factor = pd.merge(left=growth_factor, right=factor_loading, how='inner', on='id')
        # (A commented-out variant that normalized/imputed the merged frame as
        # a whole, rather than per component, was removed for readability.)
        # Step 3: weighted sum of components -> single factorvalue per stock.
        growth_factor.set_index('id', inplace=True)
        weight = pd.Series(risk_ct.GROWTH_CT.weight)
        # (growth_factor * weight) aligns on column names, so weight keys must
        # match the component column names.
        growth_factor = (growth_factor * weight).sum(axis=1)
        growth_factor.name = 'factorvalue'
        growth_factor.index.name = 'id'
        growth_factor = pd.DataFrame(growth_factor)
        growth_factor.reset_index(inplace=True)
        # Stamp the loading with the NEXT trading day after calc_date.
        growth_factor['date'] = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        # Step 4: persist the synthesized Growth loading.
        if save:
            Utils.factor_loading_persistent(cls._db_file, Utils.datetimelike_to_str(calc_date, dash=False), growth_factor.to_dict('list'), ['date', 'id', 'factorvalue'])
def smartq_backtest(start, end):
    """Run the historical backtest of the SmartQ factor.

    Monthly rebalance strategy: at each month start, sell the previous holdings
    at VWAP, reload the SmartQ loadings of the previous trading day, filter out
    suspended/limit-up stocks and the top-20% recent gainers, buy the bottom
    10% by factor value at VWAP, and track portfolio NAV daily.

    Parameters:
    --------
    :param start: datetime-like, str
        Backtest start date, format YYYY-MM-DD; should be a month start.
    :param end: datetime-like, str
        Backtest end date, format YYYY-MM-DD.
    :return: None — per-rebalance holdings and the NAV series are written to csv.
    """
    # Trading days covered by the backtest.
    trading_days = Utils.get_trading_days(start, end)
    # Resume state: latest holdings/NAV persisted before the start date.
    prev_trading_day = Utils.get_prev_n_day(trading_days.iloc[0], 1)
    backtest_path = os.path.join(SETTINGS.FACTOR_DB_PATH, alphafactor_ct.SMARTMONEY_CT.backtest_path)
    factor_data, port_nav = Utils.get_backtest_data(backtest_path, trading_days.iloc[0])
    # factor_data holds the latest selection: pd.DataFrame<date, factorvalue, id, buyprice>
    if port_nav is None:
        # Fresh backtest: NAV starts at 1.0 the day before the first trading day.
        port_nav = DataFrame({'date': [prev_trading_day.strftime('%Y-%m-%d')], 'nav': [1.0]})
    # Iterate trading days: rebalance at month start, revalue otherwise.
    t = 0  # rebalance counter (used to throttle data requests)
    for trading_day in trading_days:
        # NAV as of the last recorded valuation point.
        if factor_data is None:
            nav = port_nav[port_nav.date == prev_trading_day.strftime('%Y-%m-%d')].iloc[0].nav
        else:
            nav = port_nav[port_nav.date == factor_data.iloc[0].date].iloc[0].nav
        interval_ret = 0.0
        if Utils.is_month_start(trading_day):
            logging.info('[%s] 月初调仓.' % Utils.datetimelike_to_str(trading_day, True))
            # Value the outgoing holdings at today's VWAP (or close if the
            # stock did not trade today).
            if factor_data is not None:
                for ind, factor_info in factor_data.iterrows():
                    daily_mkt = Utils.get_secu_daily_mkt(factor_info.id, trading_day, fq=True, range_lookup=True)
                    if daily_mkt.date == trading_day.strftime('%Y-%m-%d'):
                        vwap_price = daily_mkt.amount / daily_mkt.vol * daily_mkt.factor
                    else:
                        vwap_price = daily_mkt.close
                    interval_ret += vwap_price / factor_info.buyprice - 1.0
                # Equal-weighted portfolio return since purchase.
                interval_ret /= float(len(factor_data))
                nav *= (1.0 + interval_ret)
            # Load the SmartQ loadings computed as of the previous trading day.
            factor_data = Utils.read_factor_loading(SmartMoney.get_db_file(), Utils.datetimelike_to_str(prev_trading_day, False))
            # Compute each stock's trailing-20-day return; drop stocks that are
            # suspended or limit-up today (untradable) or lack return data.
            ind_to_be_deleted = []
            factor_data['ret20'] = np.zeros(len(factor_data))
            for ind, factor_info in factor_data.iterrows():
                trading_status = Utils.trading_status(factor_info.id, trading_day)
                if trading_status == SecuTradingStatus.Suspend or trading_status == SecuTradingStatus.LimitUp:
                    ind_to_be_deleted.append(ind)
                fret20 = Utils.calc_interval_ret(factor_info.id, end=prev_trading_day, ndays=20)
                if fret20 is None:
                    if ind not in ind_to_be_deleted:
                        ind_to_be_deleted.append(ind)
                else:
                    factor_data.loc[ind, 'ret20'] = fret20
            factor_data = factor_data.drop(ind_to_be_deleted, axis=0)
            # Remove the top 20% by trailing 20-day return (momentum filter).
            k = int(factor_data.shape[0] * 0.2)
            factor_data = factor_data.sort_values(by='ret20', ascending=False).iloc[k:]
            del factor_data['ret20']  # drop the helper column
            # Keep the bottom 10% by SmartQ factor value (low Q preferred).
            factor_data = factor_data.sort_values(by='factorvalue', ascending=True)
            k = int(factor_data.shape[0] * 0.1)
            factor_data = factor_data.iloc[:k]
            # Buy at today's VWAP and value the new portfolio at today's close.
            # NOTE(review): divides by factor_data.shape[0] — if the selection
            # is empty this raises ZeroDivisionError; confirm upstream
            # guarantees a non-empty universe.
            factor_data['buyprice'] = 0.0
            interval_ret = 0.0
            for ind, factor_info in factor_data.iterrows():
                daily_mkt = Utils.get_secu_daily_mkt(factor_info.id, trading_day, fq=True, range_lookup=False)
                assert len(daily_mkt) > 0
                factor_data.loc[ind, 'buyprice'] = daily_mkt.amount / daily_mkt.vol * daily_mkt.factor
                interval_ret += daily_mkt.close / factor_data.loc[ind, 'buyprice'] - 1.0
            interval_ret /= float(factor_data.shape[0])
            nav *= (1.0 + interval_ret)
            # Persist this rebalance's holdings.
            port_data_path = os.path.join(SETTINGS.FACTOR_DB_PATH, alphafactor_ct.SMARTMONEY_CT.backtest_path, 'port_data_%s.csv' % Utils.datetimelike_to_str(trading_day, False))
            factor_data.to_csv(port_data_path, index=False)
            # Throttle the data source every 6 rebalances.
            t += 1
            if t % 6 == 0:
                logging.info('Suspended for 300s.')
                time.sleep(300)
        else:
            # Mid-month: revalue current holdings at today's close.
            logging.info('[%s] 月中估值.' % Utils.datetimelike_to_str(trading_day, True))
            if factor_data is not None:
                for ind, factor_info in factor_data.iterrows():
                    daily_mkt = Utils.get_secu_daily_mkt(factor_info.id, trading_day, fq=True, range_lookup=True)
                    interval_ret += daily_mkt.close / factor_info.buyprice - 1.0
                interval_ret /= float(factor_data.shape[0])
                nav *= (1.0 + interval_ret)
        # Record today's NAV.
        # NOTE(review): DataFrame.append/Series here were deprecated in pandas
        # 1.4 and removed in 2.0 — this function requires pandas < 2.0 as
        # written; migrate to pd.concat when upgrading.
        port_nav = port_nav.append(Series({'date': Utils.datetimelike_to_str(trading_day, True), 'nav': nav}), ignore_index=True)
        prev_trading_day = trading_day
    # Persist the full NAV series.
    port_nav_path = os.path.join(SETTINGS.FACTOR_DB_PATH, alphafactor_ct.SMARTMONEY_CT.backtest_path, 'port_nav.csv')
    port_nav.to_csv(port_nav_path, index=False)
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """Compute the intraday-momentum factor loadings for sample stocks.

    Two modes, selected by kwargs['synthetic_factor']:
    - False (default): compute the raw per-period momentum factors
      (m0..m4, m_normal) per stock via multiprocessing.
    - True: read the already-persisted per-period loadings and combine them
      into a single synthetic factor using the optimal weights.

    Parameters
    --------
    :param start_date: datetime-like, str
        Start date, format YYYY-MM-DD or YYYYMMDD.
    :param end_date: datetime-like, str
        End date; if None only start_date is processed. Format YYYY-MM-DD or YYYYMMDD.
    :param month_end: bool, default True
        If True, only month-end trading days are processed.
    :param save: bool, default False
        Whether to persist to the factor database.
    :param kwargs['synthetic_factor']: bool, default False
        Whether to compute the synthetic (weighted) factor.
    :return: dict or None
    --------
    Loading dict for the LAST processed date:
    0. date: date label
    1. id: stock symbol
    2. m0: overnight-period momentum
    3. m1: 1st-hour momentum
    4. m2: 2nd-hour momentum
    5. m3: 3rd-hour momentum
    6. m4: 4th-hour momentum
    7. m_normal: conventional momentum
    (in synthetic mode the keys are date/id/factorvalue instead)
    """
    # Trading-day series and stock universe.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate trading days.
    dict_intraday_momentum = None
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        logging.info('[%s] calc synthetic intraday momentum factor loading.' % Utils.datetimelike_to_str(calc_date))
        if 'synthetic_factor' in kwargs and kwargs['synthetic_factor']:  # synthetic-factor mode
            dict_intraday_momentum = {'date': [], 'id': [], 'factorvalue': []}
            # Read the per-period momentum loadings for calc_date.
            df_factor_loading = Utils.read_factor_loading(cls._db_file, Utils.datetimelike_to_str(calc_date, False))
            if df_factor_loading.shape[0] <= 0:
                logging.info("[%s] It doesn't exist intraday momentum factor loading." % Utils.datetimelike_to_str(calc_date))
                # NOTE(review): bare return aborts the WHOLE loop, skipping any
                # remaining dates, and returns None — confirm intended.
                return
            df_factor_loading.fillna(0, inplace=True)
            # Optimal per-period weights for this date.
            factor_weight = cls.get_factor_weight(calc_date)
            if factor_weight is None:
                # NOTE(review): format string has %s but no argument — the date
                # is never interpolated into this log message; also aborts the
                # whole loop like the return above.
                logging.info("[%s] It doesn't exist factor weight.")
                return
            # Synthetic factor = loadings (n x 5) dot weights (5 x 1).
            arr_factor_loading = np.array(df_factor_loading[['m0', 'm1', 'm2', 'm3', 'm4']])
            arr_factor_weight = np.array(factor_weight.drop('date')).reshape((5, 1))
            arr_synthetic_factor = np.dot(arr_factor_loading, arr_factor_weight)
            # arr_synthetic_factor = np.around(arr_synthetic_factor, 6)
            dict_intraday_momentum['date'] = list(df_factor_loading['date'])
            dict_intraday_momentum['id'] = list(df_factor_loading['id'])
            dict_intraday_momentum['factorvalue'] = list(arr_synthetic_factor.astype(float).round(6).reshape((arr_synthetic_factor.shape[0],)))
            # Persist the synthetic factor to its own database file.
            if save:
                synthetic_db_file = os.path.join(factor_ct.FACTOR_DB.db_path, factor_ct.INTRADAYMOMENTUM_CT.synthetic_db_file)
                Utils.factor_loading_persistent(synthetic_db_file, Utils.datetimelike_to_str(calc_date, False), dict_intraday_momentum)
        else:  # raw per-period momentum mode
            dict_intraday_momentum = {'date': [], 'id': [], 'm0': [], 'm1': [], 'm2': [], 'm3': [], 'm4': [], 'm_normal': []}
            # Restrict to stocks listed at least ~90 calendar days before calc_date.
            s = (calc_date - datetime.timedelta(days=90)).strftime('%Y%m%d')
            stock_basics = all_stock_basics[all_stock_basics.list_date < s]
            # (A single-process variant was removed for readability; the
            # multiprocessing path below is the live code.)
            q = Manager().Queue()  # inter-process queue collecting each worker's loading
            p = Pool(4)            # at most 4 concurrent worker processes
            for _, stock_info in stock_basics.iterrows():
                p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
            p.close()
            p.join()
            # Drain the queue: each item is (id, m0, m1, m2, m3, m4, m_normal).
            while not q.empty():
                momentum_data = q.get(True)
                dict_intraday_momentum['id'].append(momentum_data[0])
                dict_intraday_momentum['m0'].append(round(momentum_data[1], 6))
                dict_intraday_momentum['m1'].append(round(momentum_data[2], 6))
                dict_intraday_momentum['m2'].append(round(momentum_data[3], 6))
                dict_intraday_momentum['m3'].append(round(momentum_data[4], 6))
                dict_intraday_momentum['m4'].append(round(momentum_data[5], 6))
                dict_intraday_momentum['m_normal'].append(round(momentum_data[6], 6))
            # Stamp results with the NEXT trading day after calc_date.
            date_label = Utils.get_trading_days(calc_date, ndays=2)[1]
            dict_intraday_momentum['date'] = [date_label] * len(dict_intraday_momentum['id'])
            # Persist to the factor database.
            if save:
                Utils.factor_loading_persistent(cls._db_file, calc_date.strftime('%Y%m%d'), dict_intraday_momentum)
            # Throttle between dates.
            logging.info('Suspending for 360s.')
            time.sleep(360)
    return dict_intraday_momentum
def _calc_factor_loading(cls, code, calc_date):
    """Compute the CMRA (cumulative range) factor loading for one stock.

    CMRA = log(max(z)) - log(min(z)), where z holds the stock's cumulative
    price ratios sampled at month-scale intervals over the trailing window.

    Parameter:
    --------
    :param code: str
        Stock code, e.g. SH600000 or 600000.
    :param calc_date: datetime-like, str
        Calculation date, format YYYY-MM-DD.
    :return: pd.Series or None
    --------
    The stock's CMRA loading:
    0. code
    1. cmra
    Returns None when the computation is not possible (insufficient history).
    """
    # (Earlier commented-out variants based on log-return accumulation were
    # removed for readability; the live algorithm samples price ratios on the
    # exchange calendar below.)
    # Trailing trading-day calendar: trailing * days_scale + 1 days ending at
    # calc_date, as YYYY-MM-DD strings.
    trading_days = Utils.get_trading_days(end=calc_date, ndays=risk_ct.CMRA_CT.trailing*risk_ct.CMRA_CT.days_scale+1)
    trading_days = [day.strftime('%Y-%m-%d') for day in trading_days]
    # Adjusted (fq) daily quotes up to calc_date, restricted to that calendar.
    df_secu_quote = Utils.get_secu_daily_mkt(code, end=calc_date, fq=True)
    df_secu_quote = df_secu_quote[df_secu_quote['date'].isin(trading_days)]
    df_secu_quote.reset_index(drop=True, inplace=True)
    # Cumulative price ratios sampled once per "month" (days_scale days).
    z = []
    if len(df_secu_quote) < int(risk_ct.CMRA_CT.trailing*risk_ct.CMRA_CT.days_scale/2):
        # Not enough history (less than half the required window, e.g. 126
        # trading days) — cannot compute CMRA.
        return None
    else:
        prev_trading_day = df_secu_quote.iloc[0]['date']
        for t in range(1, risk_ct.CMRA_CT.trailing+1):
            # Calendar day at the end of the t-th interval.
            k = t * risk_ct.CMRA_CT.days_scale
            trading_day = trading_days[k]
            # Skip intervals that end before the stock's first available quote.
            # NOTE: date strings compare lexicographically; correct only
            # because the format is zero-padded YYYY-MM-DD.
            if trading_day < df_secu_quote.iloc[0]['date']:
                continue
            # Last day on which the stock actually traded, at or before the
            # interval end.
            secu_trading_day = df_secu_quote[df_secu_quote['date'] <= trading_day].iloc[-1]['date']
            if secu_trading_day <= prev_trading_day:
                # No new quote since the previous sample — skip.
                continue
            else:
                # Price ratio relative to the first available close.
                ret = df_secu_quote[df_secu_quote['date']==secu_trading_day].iloc[0]['close']/df_secu_quote.iloc[0]['close']
                z.append(ret)
                prev_trading_day = secu_trading_day
        # NOTE(review): if every interval was skipped, z is empty and
        # max(z)/min(z) raises ValueError — the length guard above presumably
        # prevents this, but confirm for long suspensions.
        cmra = math.log(max(z)) - math.log(min(z))
        return pd.Series([Utils.code_to_symbol(code), cmra], index=['code', 'cmra'])
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """
    Compute DASTD factor loadings for the sample universe and optionally
    persist them to the factor database.

    Parameters:
    --------
    :param start_date: datetime-like, str
        Start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like, str
        End date; if None only start_date is computed. Format: YYYY-MM-DD or YYYYMMDD
    :param month_end: bool, default True
        If True, only month-end dates are computed
    :param save: bool, default False
        Whether to persist to the factor database
    :param kwargs:
        'multi_proc': bool, True = multi-process, False = single-process, default False
    :return: dict
        Factor loadings of the last computed date:
        {'date': [...], 'id': [...], 'factorvalue': [...]}
    """
    # Build the trading-day series and load the security master table.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate over trading days, computing DASTD loadings per date.
    dict_dastd = None
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        logging.info('[%s] Calc DASTD factor loading.' % Utils.datetimelike_to_str(calc_date))
        # Restrict the universe to stocks listed at least DASTD_CT.listed_days
        # before calc_date (list_date is compared as a YYYYMMDD string).
        s = (calc_date - datetime.timedelta(days=risk_ct.DASTD_CT.listed_days)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        ids = []     # security symbols
        dastds = []  # DASTD factor values, aligned with ids
        if 'multi_proc' not in kwargs:
            kwargs['multi_proc'] = False
        if not kwargs['multi_proc']:
            # Single-process path: compute each stock in turn; failures are
            # recorded as NaN so the id list stays complete.
            for _, stock_info in stock_basics.iterrows():
                logging.info("[%s] Calc %s's DASTD factor loading." % (calc_date.strftime('%Y-%m-%d'), stock_info.symbol))
                dastd_data = cls._calc_factor_loading(stock_info.symbol, calc_date)
                if dastd_data is None:
                    ids.append(Utils.code_to_symbol(stock_info.symbol))
                    dastds.append(np.nan)
                else:
                    ids.append(dastd_data['code'])
                    dastds.append(dastd_data['dastd'])
        else:
            # Multi-process path: workers push results onto a shared queue.
            # Note: failed stocks are simply absent here (no NaN placeholder).
            q = Manager().Queue()  # inter-process queue holding per-stock results
            p = Pool(4)            # at most 4 worker processes
            for _, stock_info in stock_basics.iterrows():
                p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
            p.close()
            p.join()
            while not q.empty():
                dastd_data = q.get(True)
                ids.append(dastd_data['code'])
                dastds.append(dastd_data['dastd'])
        # Loadings are stamped with the NEXT trading day after calc_date.
        date_label = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        dict_dastd = {'date': [date_label]*len(ids), 'id': ids, 'factorvalue': dastds}
        if save:
            Utils.factor_loading_persistent(cls._db_file, Utils.datetimelike_to_str(calc_date, dash=False), dict_dastd, ['date', 'id', 'factorvalue'])
        # Throttling between dates (sleep currently disabled).
        logging.info('Suspending for 180s.')
        # time.sleep(180)
    return dict_dastd
def apm_backtest(start, end, pure_factor=False):
    """
    Historical backtest of the APM factor.

    Strategy: on each month-start trading day, sell the previous holdings at
    VWAP, reload the latest factor loadings, drop suspended/limit-up names,
    buy the top-10% stocks by factor value at VWAP; on other days mark the
    portfolio to market at close. NAV history is appended to port_nav.csv.

    Parameters:
    --------
    :param start: datetime-like, str
        Backtest start date, format YYYY-MM-DD; should be the last trading
        day of the month preceding the first rebalancing month.
    :param end: datetime-like, str
        Backtest end date, format YYYY-MM-DD
    :param pure_factor: bool, default False
        Whether to backtest the purified (orthogonalized) factor
    :return: None (results written to CSV files under the backtest path)
    """
    # Trading days inside the backtest window.
    trading_days = Utils.get_trading_days(start, end)
    # Resume state: load the latest saved holdings/NAV strictly before start.
    prev_trading_day = Utils.get_prev_n_day(trading_days.iloc[0], 1)
    if pure_factor:
        backtest_path = os.path.join(factor_ct.FACTOR_DB.db_path, factor_ct.APM_CT.pure_backtest_path)
    else:
        backtest_path = os.path.join(factor_ct.FACTOR_DB.db_path, factor_ct.APM_CT.backtest_path)
    factor_data, port_nav = Utils.get_backtest_data(backtest_path, trading_days.iloc[0])
    # factor_data holds the current holdings: DataFrame<date,factorvalue,id,buyprice>
    if port_nav is None:
        # Fresh backtest: NAV starts at 1.0 on the day before the window.
        port_nav = DataFrame({
            'date': [prev_trading_day.strftime('%Y-%m-%d')],
            'nav': [1.0]
        })
    # Walk the trading days: rebalance at month start, mark to market otherwise.
    for trading_day in trading_days:
        # Base NAV = NAV recorded on the holdings' date (or the resume date
        # when no holdings exist yet).
        if factor_data is None:
            nav = port_nav[port_nav.date == prev_trading_day.strftime(
                '%Y-%m-%d')].iloc[0].nav
        else:
            nav = port_nav[port_nav.date == factor_data.iloc[0].date].iloc[0].nav
        interval_ret = 0.0
        # Month-start: rebalance.
        if Utils.is_month_start(trading_day):
            logging.info('[%s] 月初调仓.' % Utils.datetimelike_to_str(trading_day, True))
            # 1. Value the outgoing portfolio, selling at today's VWAP
            #    (fall back to close when the stock did not trade today).
            if factor_data is not None:
                for ind, factor_info in factor_data.iterrows():
                    daily_mkt = Utils.get_secu_daily_mkt(factor_info.id, trading_day, fq=True, range_lookup=True)
                    if daily_mkt.date == trading_day.strftime('%Y-%m-%d'):
                        # adjusted VWAP = amount / volume * adjustment factor
                        vwap_price = daily_mkt.amount / daily_mkt.vol * daily_mkt.factor
                    else:
                        vwap_price = daily_mkt.close
                    interval_ret += vwap_price / factor_info.buyprice - 1.0
                interval_ret /= float(len(factor_data))  # equal-weighted
                nav *= (1.0 + interval_ret)
            # 2. Load factor loadings computed as of the previous trading day.
            if pure_factor:
                factor_data_path = os.path.join(
                    factor_ct.FACTOR_DB.db_path, factor_ct.APM_CT.pure_apm_db_file)
            else:
                factor_data_path = os.path.join(factor_ct.FACTOR_DB.db_path,
                                                factor_ct.APM_CT.apm_db_file)
            factor_data = Utils.read_factor_loading(
                factor_data_path,
                Utils.datetimelike_to_str(prev_trading_day, False))
            # 3. Drop names that cannot be bought today: suspended or limit-up.
            ind_to_be_delted = []
            for ind, factor_info in factor_data.iterrows():
                trading_status = Utils.trading_status(factor_info.id, trading_day)
                if trading_status == SecuTradingStatus.Suspend or trading_status == SecuTradingStatus.LimitUp:
                    ind_to_be_delted.append(ind)
            factor_data = factor_data.drop(ind_to_be_delted, axis=0)
            # 4. Keep the top 10% by factor value (descending).
            factor_data = factor_data.sort_values(by='factorvalue', ascending=False)
            factor_data = factor_data.iloc[:int(len(factor_data) * 0.1)]
            # 5. Buy at today's VWAP and value the new portfolio at close.
            factor_data['buyprice'] = 0.0
            interval_ret = 0.0
            for ind, factor_info in factor_data.iterrows():
                daily_mkt = Utils.get_secu_daily_mkt(factor_info.id, trading_day, fq=True, range_lookup=False)
                assert len(daily_mkt) > 0
                factor_data.loc[
                    ind, 'buyprice'] = daily_mkt.amount / daily_mkt.vol * daily_mkt.factor
                interval_ret += daily_mkt.close / factor_data.loc[
                    ind, 'buyprice'] - 1.0
            interval_ret /= float(len(factor_data))
            nav *= (1.0 + interval_ret)
            # 6. Persist today's holdings snapshot.
            if pure_factor:
                port_data_path = os.path.join(
                    factor_ct.FACTOR_DB.db_path,
                    factor_ct.APM_CT.pure_backtest_path,
                    'port_data_%s.csv' % Utils.datetimelike_to_str(trading_day, False))
            else:
                port_data_path = os.path.join(
                    factor_ct.FACTOR_DB.db_path,
                    factor_ct.APM_CT.backtest_path,
                    'port_data_%s.csv' % Utils.datetimelike_to_str(trading_day, False))
            factor_data.to_csv(port_data_path, index=False)
        else:
            # Non-rebalancing day: mark the current holdings to market at close.
            logging.info('[%s] 月中估值.' % Utils.datetimelike_to_str(trading_day, True))
            if factor_data is not None:
                for ind, factor_info in factor_data.iterrows():
                    daily_mkt = Utils.get_secu_daily_mkt(factor_info.id, trading_day, fq=True, range_lookup=True)
                    interval_ret += daily_mkt.close / factor_info.buyprice - 1.0
                interval_ret /= float(len(factor_data))
                nav *= (1.0 + interval_ret)
        # Append today's NAV point.
        port_nav = port_nav.append(Series({
            'date': trading_day.strftime('%Y-%m-%d'),
            'nav': nav
        }), ignore_index=True)
        prev_trading_day = trading_day
    # Persist the full NAV series.
    if pure_factor:
        port_nav_path = os.path.join(factor_ct.FACTOR_DB.db_path,
                                     factor_ct.APM_CT.pure_backtest_path,
                                     'port_nav.csv')
    else:
        port_nav_path = os.path.join(factor_ct.FACTOR_DB.db_path,
                                     factor_ct.APM_CT.backtest_path,
                                     'port_nav.csv')
    port_nav.to_csv(port_nav_path, index=False)
def _calc_factor_loading(cls, code, calc_date):
    """
    Compute the APM 'stat' statistic of one security on a given date.

    The morning and afternoon returns of the stock are regressed on those of
    the index; the per-day difference between morning and afternoon residuals
    is summarized as stat = mean(delta) / (std(delta) * sqrt(N)), N = __days.
    stat > 0 (resp. < 0) means the stock systematically does better (worse)
    in the morning than in the afternoon, net of market moves.

    --------
    :param code: string
        Security code, e.g. 600000
    :param calc_date: datetime-like, str
        Calculation date, format YYYY-MM-DD
    :return: float
    --------
        The stat statistic (an intermediate quantity for the APM loading),
        or None when fewer than __days observations are available or the
        residual spread is degenerate.
    """
    # 1. Take the last 40 trading days, newest first, as the candidate window
    #    (only the first __days days with minute data are actually used).
    calc_date = Utils.to_date(calc_date)
    trading_days = Utils.get_trading_days(end=calc_date, ndays=40, ascending=False)
    # NOTE(review): an earlier DataFrame-based implementation of steps 2-3 was
    # removed here; it was commented out in the original source.
    # 2./3. For each trading day read three 1-minute bars — open at 09:31,
    # mid-day close at 11:30, daily close at 15:00 — and fill the morning
    # (open -> 11:30) and afternoon (11:30 -> close) return arrays for the
    # stock (r_*) and the index (R_*).
    r_am_array = np.zeros((cls.__days, 1))
    r_pm_array = np.zeros((cls.__days, 1))
    R_am_array = np.zeros((cls.__days, 1))
    R_pm_array = np.zeros((cls.__days, 1))
    k = 0
    for trading_day in trading_days:
        df_1min_data = Utils.get_min_mkt(Utils.code_to_symbol(code), trading_day, fq=True)
        if df_1min_data is not None:
            str_date = Utils.datetimelike_to_str(trading_day)
            fopen = df_1min_data[df_1min_data.datetime == '%s 09:31:00' % str_date].iloc[0].open
            fmid_close = df_1min_data[df_1min_data.datetime == '%s 11:30:00' % str_date].iloc[0].close
            fclose = df_1min_data[df_1min_data.datetime == '%s 15:00:00' % str_date].iloc[0].close
            r_am_array[k, 0] = fmid_close / fopen - 1.0
            r_pm_array[k, 0] = fclose / fmid_close - 1.0
            # Same three time points for the benchmark index.
            df_1min_data = Utils.get_min_mkt(factor_ct.APM_CT.index_code, trading_day, index=True, fq=True)
            fopen = df_1min_data[df_1min_data.datetime == '%s 09:31:00' % str_date].iloc[0].open
            fmid_close = df_1min_data[df_1min_data.datetime == '%s 11:30:00' % str_date].iloc[0].close
            fclose = df_1min_data[df_1min_data.datetime == '%s 15:00:00' % str_date].iloc[0].close
            R_am_array[k, 0] = fmid_close / fopen - 1.0
            R_pm_array[k, 0] = fclose / fmid_close - 1.0
            k += 1
            if k == cls.__days:
                break
    if k < cls.__days:
        # Not enough days with minute data in the window.
        return None
    # Stack morning returns on top of afternoon returns (length 2*__days).
    r_apm_array = np.concatenate((r_am_array, r_pm_array), axis=0)
    R_apm_array = np.concatenate((R_am_array, R_pm_array), axis=0)
    # 4. OLS of stock returns on index returns (with intercept):
    #    r_i = alpha + beta * R_i + epsilon_i
    R_apm_array = sm.add_constant(R_apm_array)
    stat_model = sm.OLS(r_apm_array, R_apm_array)
    stat_result = stat_model.fit()
    resid_array = stat_result.resid.reshape((cls.__days * 2, 1))  # residuals
    # 5. Per-day morning-minus-afternoon residual difference and its t-like
    #    statistic: stat = mean(delta) / (std(delta) * sqrt(N)).
    delta_array = resid_array[:cls.__days] - resid_array[
        cls.__days:]
    delta_avg = np.mean(delta_array)  # mean of the differences
    delta_std = np.std(delta_array)   # std of the differences
    # Degenerate spread (all differences equal): statistic undefined.
    if np.fabs(delta_std) < 0.0001:
        return None
    stat = delta_avg / delta_std / np.sqrt(cls.__days)
    # logging.info('%s, stat = %.6f' % (code, stat))
    return stat
def calc_factorloading(self, start_date, end_date=None):
    """
    Compute the loadings of all risk-model style factors.

    Parameters:
    --------
    :param start_date: datetime-like, str
        Start date, format: YYYY-MM-DD
    :param end_date: datetime-like, str
        End date, format: YYYY-MM-DD; if None only start_date is computed
    :return: None
        Each factor class persists its own loadings (save=True).
    """
    # Build the trading-day series.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(start=start_date, ndays=1)
    # Iterate over trading days, computing every risk factor's loadings.
    # BUG FIX: the original passed start_date instead of the loop variable
    # calc_date, so every iteration recomputed the same (first) date.
    for calc_date in trading_days_series:
        # Order preserved from the original implementation.
        for factor_class in (Size, Beta, Momentum, ResVolatility,
                             NonlinearSize, Value, Liquidity,
                             EarningsYield, Growth, Leverage):
            factor_class.calc_factor_loading(start_date=calc_date,
                                             end_date=None,
                                             month_end=False,
                                             save=True,
                                             multi_proc=True)
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """
    Compute APM (and purified APM) factor loadings for the sample universe
    and optionally persist them to the factor database.

    Parameters
    --------
    :param start_date: datetime-like, str
        Start date
    :param end_date: datetime-like, str, default None
        End date; if None only start_date is computed
    :param month_end: bool, default True
        Only compute month-end dates (effective when end_date is not None)
    :param save: bool, default False
        Whether to persist to the factor database
    :return: dict
    --------
        APM loadings of the last computed date:
        {'date': [...], 'id': [...], 'factorvalue': [...]}
    """
    # 1. Build the trading-day series and load the security master table.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # 2. Iterate over trading days, computing APM loadings per date.
    dict_apm = None
    for calc_date in trading_days_series:
        dict_apm = {'date': [], 'id': [], 'factorvalue': []}
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # 2.1. Universe: stocks listed at least 90 calendar days before
        # calc_date (list_date compared as a YYYYMMDD string). Workers
        # return (symbol, stat, 20-day return) tuples via a shared queue.
        s = (calc_date - datetime.timedelta(days=90)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        stat_lst = []    # APM stat statistic per stock
        ret20_lst = []   # trailing 20-day return per stock
        symbol_lst = []  # symbols, aligned with the two lists above
        # NOTE(review): a single-process alternative loop was removed here;
        # it was commented out in the original source.
        q = Manager().Queue()
        p = Pool(4)  # at most 4 worker processes
        for _, stock_info in stock_basics.iterrows():
            p.apply_async(cls._calc_factor_loading_proc,
                          args=(
                              stock_info.symbol,
                              calc_date,
                              q,
                          ))
        p.close()
        p.join()
        while not q.empty():
            apm_value = q.get(True)
            symbol_lst.append(apm_value[0])
            stat_lst.append(apm_value[1])
            ret20_lst.append(apm_value[2])
        assert len(stat_lst) == len(ret20_lst)
        assert len(stat_lst) == len(symbol_lst)
        # 2.2. Build the APM factor.
        # 2.2.1. Cross-sectional regression of stat on the 20-day momentum:
        # stat_j = beta * Ret20_j + epsilon_j; the residuals are the APM
        # loadings. Winsorize and standardize both sides first.
        stat_arr = np.array(stat_lst).reshape((len(stat_lst), 1))
        ret20_arr = np.array(ret20_lst).reshape((len(ret20_lst), 1))
        stat_arr = Utils.clean_extreme_value(stat_arr)
        stat_arr = Utils.normalize_data(stat_arr)
        ret20_arr = Utils.clean_extreme_value(ret20_arr)
        ret20_arr = Utils.normalize_data(ret20_arr)
        # Regression without intercept (add_constant intentionally disabled).
        # ret20_arr = sm.add_constant(ret20_arr)
        apm_model = sm.OLS(stat_arr, ret20_arr)
        apm_result = apm_model.fit()
        apm_lst = list(np.around(apm_result.resid, 6))  # round to 6 decimals
        assert len(apm_lst) == len(symbol_lst)
        # 2.2.2. Assemble the APM dict (stamped with the next trading day)
        # and persist it.
        date_label = Utils.get_trading_days(calc_date, ndays=2)[1]
        dict_apm = {
            'date': [date_label] * len(symbol_lst),
            'id': symbol_lst,
            'factorvalue': apm_lst
        }
        if save:
            Utils.factor_loading_persistent(cls._db_file,
                                            calc_date.strftime('%Y%m%d'),
                                            dict_apm)
        # 2.3. Build the PureAPM factor: regress the (already winsorized and
        # standardized) stat values on the dependent factors and keep the
        # residuals.
        df_stat = DataFrame(stat_arr, index=symbol_lst, columns=['stat'])
        df_dependent_factor = cls.get_dependent_factors(calc_date)
        # Inner join keeps only symbols present in both frames.
        df_data = pd.concat([df_stat, df_dependent_factor],
                            axis=1,
                            join='inner')
        arr_data = np.array(df_data)
        # Column 0 is stat; the remaining columns are the dependent factors.
        pure_apm_model = sm.OLS(arr_data[:, 0], arr_data[:, 1:])
        pure_apm_result = pure_apm_model.fit()
        pure_apm_lst = list(np.around(pure_apm_result.resid, 6))
        pure_symbol_lst = list(df_data.index)
        assert len(pure_apm_lst) == len(pure_symbol_lst)
        # Assemble and persist the PureAPM dict.
        dict_pure_apm = {
            'date': [date_label] * len(pure_symbol_lst),
            'id': pure_symbol_lst,
            'factorvalue': pure_apm_lst
        }
        pure_apm_db_file = os.path.join(factor_ct.FACTOR_DB.db_path,
                                        factor_ct.APM_CT.pure_apm_db_file)
        if save:
            Utils.factor_loading_persistent(pure_apm_db_file,
                                            calc_date.strftime('%Y%m%d'),
                                            dict_pure_apm)
        # Throttle between dates (presumably to limit data-source load).
        logging.info('Suspended for 360s.')
        time.sleep(360)
    return dict_apm
def _calc_mvpfp_performance(factor_name, start_date, end_date):
    """
    Compute the performance of a factor's minimum-volatility pure factor
    portfolio (mvpfp) over a date window.

    Holdings CSVs under the factor's 'mvpfp' directory are loaded into a
    weight-holding portfolio; for each holding period the daily return, NAV
    and accumulated return are computed, plus one monthly-return row per
    rebalancing date. Results are appended via _save_mvpfp_performance.

    Parameters:
    --------
    :param factor_name: str
        Factor name, e.g: SmartMoney
    :param start_date: datetime-like, str
        Start date, e.g: YYYY-MM-DD, YYYYMMDD
    :param end_date: datetime-like, str
        End date, e.g: YYYY-MM-DD, YYYYMMDD
    :return: None
        Daily and monthly performance frames are persisted (append mode).
    :raises NotADirectoryError:
        When the factor's mvpfp directory does not exist.
    """
    start_date = Utils.to_date(start_date)
    end_date = Utils.to_date(end_date)
    # Load mvpfp holdings into a portfolio.
    # BUG FIX: '.CT' -> '_CT' to match the constant-naming convention used by
    # every other eval in this module (e.g. alphafactor_ct.SMARTMONEY_CT).
    mvpfp_path = os.path.join(
        SETTINGS.FACTOR_DB_PATH,
        eval('alphafactor_ct.' + factor_name.upper() + '_CT')['db_file'],
        'mvpfp')
    if not os.path.isdir(mvpfp_path):
        # BUG FIX: the original format string had no bound argument, so the
        # message always rendered a literal '%s'.
        raise NotADirectoryError("%s因子的mvpfp组合文件夹不存在." % factor_name)
    mvpfp_port = CPortfolio('weight_holding')
    for mvpfp_filename in os.listdir(mvpfp_path):
        # Only CSV holding files whose date-named stem falls in the window.
        if os.path.splitext(mvpfp_filename)[1] != '.csv':
            continue
        mvpfp_date = Utils.to_date(mvpfp_filename.split('.')[0])
        if mvpfp_date < start_date or mvpfp_date > end_date:
            continue
        mvpfp_filepath = os.path.join(mvpfp_path, mvpfp_filename)
        mvpfp_port.load_holdings_fromfile(mvpfp_filepath)
    # Walk the holdings, computing portfolio performance.
    df_daily_performance = pd.DataFrame(
        columns=alphamodel_ct.FACTOR_PERFORMANCE_HEADER['daily_performance']
    )  # daily performance
    df_monthly_performance = pd.DataFrame(
        columns=alphamodel_ct.FACTOR_PERFORMANCE_HEADER['monthly_performance']
    )  # monthly performance
    # Seed row: NAV starts at 1.0 on the first holding date.
    df_daily_performance.loc[0, 'daily_ret'] = 0.0
    df_daily_performance.loc[0, 'nav'] = 1.0
    df_daily_performance.loc[0, 'accu_ret'] = 0.0
    mvpfp_holdings = mvpfp_port.holdings
    prev_holdingdate = curr_holding_date = None
    prevmonth_idx = 0
    holding_dates = list(mvpfp_holdings.keys())
    df_daily_performance.loc[0, 'date'] = holding_dates[0]
    # Extend the last holding period up to end_date when needed.
    if end_date > holding_dates[-1]:
        holding_dates += [end_date]
    mvpfp_daily_performance = pd.Series(
        index=alphamodel_ct.FACTOR_PERFORMANCE_HEADER['daily_performance'])
    mvpfp_monthly_performance = pd.Series(
        index=alphamodel_ct.FACTOR_PERFORMANCE_HEADER['monthly_performance'])
    for holding_date in holding_dates:
        # Each iteration values the holdings established at prev_holdingdate
        # over (prev_holdingdate, curr_holding_date].
        prev_holdingdate = curr_holding_date
        curr_holding_date = holding_date
        if prev_holdingdate is None:
            continue
        prevmonth_idx = df_daily_performance.index[-1]
        holding_data = mvpfp_holdings[prev_holdingdate]
        trading_days_series = Utils.get_trading_days(
            start=prev_holdingdate + datetime.timedelta(days=1),
            end=curr_holding_date)
        for calc_date in trading_days_series:
            mvpfp_daily_performance['date'] = calc_date
            daily_ret = 0
            # TODO add parallel computation of per-security returns
            for _, holding in holding_data.holding.iterrows():
                ret = Utils.calc_interval_ret(holding['code'],
                                              start=trading_days_series[0],
                                              end=calc_date)
                if ret is not None:
                    daily_ret += ret * holding['weight']
            mvpfp_daily_performance['daily_ret'] = daily_ret
            mvpfp_daily_performance[
                'nav'] = df_daily_performance.iloc[-1]['nav'] * (1 + daily_ret)
            mvpfp_daily_performance[
                'accu_ret'] = mvpfp_daily_performance['nav'] - 1
            df_daily_performance = df_daily_performance.append(
                mvpfp_daily_performance, ignore_index=True)
        # Monthly return = NAV growth since the previous rebalancing row.
        mvpfp_monthly_performance['date'] = curr_holding_date
        mvpfp_monthly_performance['monthly_ret'] = df_daily_performance.iloc[
            -1]['nav'] / df_daily_performance.loc[prevmonth_idx, 'nav'] - 1.0
        df_monthly_performance = df_monthly_performance.append(
            mvpfp_monthly_performance, ignore_index=True)
    # Persist (append mode).
    _save_mvpfp_performance(df_daily_performance, factor_name, 'daily', 'a')
    _save_mvpfp_performance(df_monthly_performance, factor_name, 'monthly',
                            'a')
def _calc_synthetic_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """
    Compute a synthetic (composite) factor's loadings for the sample universe
    and optionally persist them to the factor database.

    Each component factor is (re)computed, winsorized and standardized, its
    missing values are filled with the industry mean, and the components are
    combined with the weights declared in the factor's risk_ct constants.

    Parameters
    --------
    :param start_date: datetime-like, str
        Start date
    :param end_date: datetime-like, str, default None
        End date; if None only start_date is computed
    :param month_end: bool, default True
        Only compute month-end dates (effective when end_date is not None)
    :param save: bool, default False
        Whether to persist to the factor database
    :param kwargs:
        'multi_proc': bool, True = multi-process, False = single-process, default False
        'com_factors': list, component factor class instances
    :return: None
        (The docstring of the original advertised a DataFrame<id, factorvalue>,
        but the function persists via save and returns nothing.)
    """
    # Build the trading-day series.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    if 'multi_proc' not in kwargs:
        kwargs['multi_proc'] = False
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # Recompute each component factor's loadings for calc_date.
        # NOTE(review): an eval-based alternative that instantiated component
        # classes by name was removed here; it was commented out in the
        # original source.
        for com_factor in kwargs['com_factors']:
            com_factor.calc_factor_loading(start_date=calc_date, end_date=None, month_end=month_end, save=save, multi_proc=kwargs['multi_proc'])
        # Combine the components into the synthetic factor.
        synthetic_factor = pd.DataFrame()
        df_industry_classify = Utils.get_industry_classify()  # industry classification per security
        # Component names come from the class's own risk_ct constant block,
        # e.g. risk_ct.<CLASSNAME>_CT['component'].
        for com_factor in eval('risk_ct.' + cls.__name__.upper() + '_CT')['component']:
            factor_path = os.path.join(
                factor_ct.FACTOR_DB.db_path,
                eval('risk_ct.' + com_factor + '_CT')['db_file'])
            factor_loading = Utils.read_factor_loading(
                factor_path, Utils.datetimelike_to_str(calc_date, dash=False))
            factor_loading.drop(columns='date', inplace=True)
            factor_loading.rename(columns={'factorvalue': com_factor},
                                  inplace=True)
            # Attach each security's industry code.
            factor_loading = pd.merge(
                left=factor_loading,
                right=df_industry_classify[['id', 'ind_code']],
                how='inner',
                on='id')
            # Split off rows with missing factor values ...
            missingdata_factor = factor_loading[
                factor_loading[com_factor].isna()]
            # ... and drop them from the main frame.
            factor_loading.dropna(axis='index', how='any', inplace=True)
            # Winsorize and (cap-weighted) standardize the main frame.
            factor_loading = Utils.normalize_data(factor_loading,
                                                  id='id',
                                                  columns=com_factor,
                                                  treat_outlier=True,
                                                  weight='cap',
                                                  calc_date=calc_date)
            # Fill missing values with the industry mean of the normalized data.
            ind_codes = set(missingdata_factor['ind_code'])
            ind_mean_factor = {}
            for ind_code in ind_codes:
                ind_mean_factor[ind_code] = factor_loading[
                    factor_loading['ind_code'] == ind_code][com_factor].mean()
            for idx, missingdata in missingdata_factor.iterrows():
                missingdata_factor.loc[idx, com_factor] = ind_mean_factor[
                    missingdata['ind_code']]
            # Recombine filled rows with the main frame.
            factor_loading = pd.concat(
                [factor_loading, missingdata_factor])
            # The industry code has served its purpose.
            factor_loading.drop(columns='ind_code', inplace=True)
            # Merge this component into the accumulated frame (inner join on id).
            if synthetic_factor.empty:
                synthetic_factor = factor_loading
            else:
                synthetic_factor = pd.merge(left=synthetic_factor,
                                            right=factor_loading,
                                            how='inner',
                                            on='id')
        # Weighted sum of the component columns gives the synthetic loading.
        synthetic_factor.set_index('id', inplace=True)
        weight = pd.Series(
            eval('risk_ct.' + cls.__name__.upper() + '_CT')['weight'])
        synthetic_factor = (synthetic_factor * weight).sum(axis=1)
        synthetic_factor.name = 'factorvalue'
        synthetic_factor.index.name = 'id'
        synthetic_factor = pd.DataFrame(synthetic_factor)
        synthetic_factor.reset_index(inplace=True)
        # Loadings are stamped with the NEXT trading day after calc_date.
        synthetic_factor['date'] = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        # Persist the synthetic factor loadings.
        if save:
            Utils.factor_loading_persistent(
                cls._db_file,
                Utils.datetimelike_to_str(calc_date, dash=False),
                synthetic_factor.to_dict('list'), ['date', 'id', 'factorvalue'])
def _calc_factor_loading(cls, code, calc_date):
    """
    Compute one security's intraday momentum values at several time slots.

    Parameters
    --------
    :param code: str
        Security code, e.g. 600000 or SH600000
    :param calc_date: datetime-like, str
        Calculation date, format YYYY-MM-DD
    :return: pd.Series
    --------
        Intraday momentum values, indexed as:
        0. m0: overnight-slot momentum
        1. m1: first-hour momentum
        2. m2: second-hour momentum
        3. m3: third-hour momentum
        4. m4: fourth-hour momentum
        5. m_normal: conventional momentum (trailing-window log return)
        Returns None when the calculation fails (insufficient minute data).
    """
    # Last 90 trading days, newest first; only the first __days+1 days with
    # minute data are actually consumed.
    trading_days = Utils.get_trading_days(end=calc_date, ndays=90, ascending=False)
    # For each usable day read five intraday prices from 1-minute bars and
    # derive the five slot returns of the NEWER of each adjacent day pair.
    mkt_data = DataFrame()
    mkt_data_header = ['date', 'p0930', 'p1030', 'p1130', 'p1400', 'p1500']
    intra_day_ret = DataFrame()
    ret_header = ['date', 'r0', 'r1', 'r2', 'r3', 'r4']
    k = 0
    for trading_day in trading_days:
        df_1m_mkt = Utils.get_min_mkt(code, trading_day, fq=True)
        if df_1m_mkt is None:
            # No minute data for this day (e.g. suspension): skip it.
            continue
        # Five intraday reference prices:
        # 09:31 open, and closes at 10:30 / 11:30 / 14:00 / 15:00.
        time_label = '%s 09:31:00' % trading_day.strftime('%Y-%m-%d')
        p0930 = df_1m_mkt[df_1m_mkt.datetime == time_label].iloc[0].open
        time_label = '%s 10:30:00' % trading_day.strftime('%Y-%m-%d')
        p1030 = df_1m_mkt[df_1m_mkt.datetime == time_label].iloc[0].close
        time_label = '%s 11:30:00' % trading_day.strftime('%Y-%m-%d')
        p1130 = df_1m_mkt[df_1m_mkt.datetime == time_label].iloc[0].close
        time_label = '%s 14:00:00' % trading_day.strftime('%Y-%m-%d')
        p1400 = df_1m_mkt[df_1m_mkt.datetime == time_label].iloc[0].close
        time_label = '%s 15:00:00' % trading_day.strftime('%Y-%m-%d')
        p1500 = df_1m_mkt[df_1m_mkt.datetime == time_label].iloc[0].close
        s = Series([trading_day, p0930, p1030, p1130, p1400, p1500],
                   index=mkt_data_header)
        mkt_data = mkt_data.append(s, ignore_index=True)
        # Slot log-returns for row k-1 (the newer day; rows are stored newest
        # first, so row k is the previous trading day of row k-1):
        # r0 overnight = open / previous close; r1..r4 successive intraday legs.
        if k > 0:
            r0 = math.log(mkt_data.iloc[k - 1].p0930 / mkt_data.iloc[k].p1500)
            r1 = math.log(mkt_data.iloc[k - 1].p1030 / mkt_data.iloc[k - 1].p0930)
            r2 = math.log(mkt_data.iloc[k - 1].p1130 / mkt_data.iloc[k - 1].p1030)
            r3 = math.log(mkt_data.iloc[k - 1].p1400 / mkt_data.iloc[k - 1].p1130)
            r4 = math.log(mkt_data.iloc[k - 1].p1500 / mkt_data.iloc[k - 1].p1400)
            # NOTE(review): a simple-return variant (x/y - 1.0) was removed
            # here; it was commented out in the original source.
            s = Series([mkt_data.iloc[k - 1].date, r0, r1, r2, r3, r4],
                       index=ret_header)
            intra_day_ret = intra_day_ret.append(s, ignore_index=True)
        k += 1
        if k > cls.__days:
            break
    if k <= cls.__days:
        # Fewer than __days+1 usable days: cannot form __days return rows.
        return None
    intra_day_ret = intra_day_ret.sort_values(by='date')
    # Conventional momentum = trailing log return over the whole window
    # (row 0 is the newest day, row -1 the oldest).
    m_normal = math.log(mkt_data.iloc[0].p1500 / mkt_data.iloc[-1].p1500)
    # NOTE(review): an alternative post-hoc loop computing the slot returns
    # from the sorted mkt_data was removed here; it was commented out in the
    # original source.
    intra_day_ret = intra_day_ret.set_index('date')
    # Each slot momentum = sum of that slot's returns over the window
    # (presumably the last __days == 20 trading days — TODO confirm).
    intra_day_momentum = intra_day_ret.sum()
    intra_day_momentum.index = ['m0', 'm1', 'm2', 'm3', 'm4']
    intra_day_momentum['m_normal'] = m_normal
    return intra_day_momentum
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """
    Compute size-factor loadings for the sample universe and optionally
    persist them to the factor database.

    Parameters:
    --------
    :param start_date: datetime-like or str
        Start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like or str
        End date, format: YYYY-MM-DD or YYYYMMDD
    :param month_end: bool, default True
        If True, only month-end dates are computed; otherwise every trading day
    :param save: bool, default False
        Whether to persist to the factor database
    :return: dict
    --------
        Loadings of the last computed date, with keys:
        0. date: date label (next trading day)
        1. id: security symbol
        2. LnTotalMktCap: log of total market cap
        3. LnLiquidMktCap: log of free-float market cap
    """
    # Build the trading-day series and load the security master table.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate over trading days, computing size-factor values per date.
    dict_scale = None
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        dict_scale = {
            'date': [],
            'id': [],
            'LnTotalMktCap': [],
            'LnLiquidMktCap': []
        }
        # Universe: stocks listed at least 90 calendar days before calc_date
        # (list_date compared as a YYYYMMDD string).
        s = (calc_date - datetime.timedelta(days=90)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        # NOTE(review): a single-process alternative loop was removed here;
        # it was commented out in the original source.
        # Multi-process computation: workers push (symbol, ln_total_cap,
        # ln_liquid_cap) tuples onto a shared queue.
        q = Manager().Queue()  # inter-process queue holding per-stock results
        p = Pool(4)            # at most 4 worker processes
        for _, stock_info in stock_basics.iterrows():
            p.apply_async(cls._calc_factor_loading_proc,
                          args=(
                              stock_info.symbol,
                              calc_date,
                              q,
                          ))
        p.close()
        p.join()
        while not q.empty():
            scale_data = q.get(True)
            dict_scale['id'].append(scale_data[0])
            dict_scale['LnTotalMktCap'].append(round(scale_data[1], 4))
            dict_scale['LnLiquidMktCap'].append(round(scale_data[2], 4))
        # Loadings are stamped with the NEXT trading day after calc_date.
        date_label = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        dict_scale['date'] = [date_label] * len(dict_scale['id'])
        # Persist the size-factor loadings.
        if save:
            Utils.factor_loading_persistent(cls._db_file,
                                            calc_date.strftime('%Y%m%d'),
                                            dict_scale)
        # Throttle between dates (presumably to limit data-source load).
        logging.info('Suspending for 60s.')
        time.sleep(60)
    return dict_scale
def _calc_Orthogonalized_factorloading(factor_name, start_date, end_date=None, month_end=True, save=False):
    """
    Calculate the orthogonalized loading of an alpha factor.

    The standardized target-factor loading is cross-sectionally regressed on
    the risk-model style factors plus all alpha factors preceding it in
    alphafactor_ct.ALPHA_FACTORS; the OLS residual is the orthogonalized
    loading.

    Parameters:
    --------
    :param factor_name: str
        alpha factor name, e.g. SmartMoney
    :param start_date: datetime-like, str
        start date, e.g. YYYY-MM-DD, YYYYMMDD
    :param end_date: datetime-like, str, default None
        end date, e.g. YYYY-MM-DD, YYYYMMDD
    :param month_end: bool, default True
        if True, only month-end dates are calculated
    :param save: bool, default False
        whether to persist the result
    :return: dict
    --------
        orthogonalized factor loading:
        0. date: the next trading day after the calc date
        1. id: security code
        2. factorvalue: factor loading
        If end_date is None, returns the loading for start_date; otherwise the
        loading of the last calculated date. Empty dict if nothing calculated.
    """
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    CRiskModel = Barra()
    orthog_factorloading = {}
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # Read the standardized raw loading of the target factor.
        target_factor_path = os.path.join(SETTINGS.FACTOR_DB_PATH, eval('alphafactor_ct.' + factor_name.upper() + '_CT')['db_file'], 'standardized', factor_name)
        df_targetfactor_loading = Utils.read_factor_loading(target_factor_path, Utils.datetimelike_to_str(calc_date, dash=False), drop_na=True)
        df_targetfactor_loading.drop(columns='date', inplace=True)
        df_targetfactor_loading.rename(columns={'factorvalue': factor_name}, inplace=True)
        # Style-factor loading matrix from the risk model.
        df_stylefactor_loading = CRiskModel.get_StyleFactorloading_matrix(calc_date)
        # BUGFIX: was df_stylefactor_loading.renmae(...), an AttributeError.
        df_stylefactor_loading.rename(columns={'code': 'id'}, inplace=True)
        # Orthogonalized loadings of all alpha factors preceding the target.
        df_alphafactor_loading = pd.DataFrame()
        for alphafactor_name in alphafactor_ct.ALPHA_FACTORS:
            if alphafactor_name == factor_name:
                break
            factorloading_path = os.path.join(SETTINGS.FACTOR_DB_PATH, eval('alphafactor_ct.' + alphafactor_name.upper() + '_CT')['db_file'], 'orthogonalized', alphafactor_name)
            factor_loading = Utils.read_factor_loading(factorloading_path, Utils.datetimelike_to_str(calc_date, dash=False), drop_na=True)
            factor_loading.drop(columns='date', inplace=True)
            factor_loading.rename(columns={'factorvalue': alphafactor_name}, inplace=True)
            if df_alphafactor_loading.empty:
                df_alphafactor_loading = factor_loading
            else:
                df_alphafactor_loading = pd.merge(left=df_alphafactor_loading, right=factor_loading, how='inner', on='id')
        # Merge target, style and alpha factor loadings on security id.
        df_factorloading = pd.merge(left=df_targetfactor_loading, right=df_stylefactor_loading, how='inner', on='id')
        if not df_alphafactor_loading.empty:
            # BUGFIX: was left=df_stylefactor_loading, which dropped the
            # target-factor column and made df_factorloading[factor_name]
            # below raise KeyError.
            df_factorloading = pd.merge(left=df_factorloading, right=df_alphafactor_loading, how='inner', on='id')
        # Build the target vector and the style/alpha regressor matrix.
        df_factorloading.set_index('id', inplace=True)
        arr_targetfactor_loading = np.array(df_factorloading[factor_name])
        stylealphafactor_names = df_factorloading.columns.tolist()
        stylealphafactor_names.remove(factor_name)
        arr_stylealphafactor_loading = np.array(df_factorloading[stylealphafactor_names])
        # Cross-sectional OLS of the target loading on style + alpha loadings;
        # the residual is the orthogonalized loading.
        Y = arr_targetfactor_loading
        X = sm.add_constant(arr_stylealphafactor_loading)
        model = sm.OLS(Y, X)
        results = model.fit()
        datelabel = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        orthog_factorloading = {
            'date': [datelabel] * len(results.resid),
            'id': df_factorloading.index.tolist(),
            'factorvalue': results.resid
        }
        # Persist the orthogonalized loading.
        if save:
            orthog_factorloading_path = os.path.join(SETTINGS.FACTOR_DB_PATH, eval('alphafactor_ct.' + factor_name.upper() + '_CT')['db_file'], 'orthogonalized', factor_name)
            Utils.factor_loading_persistent(orthog_factorloading_path, Utils.datetimelike_to_str(calc_date, dash=False), orthog_factorloading, ['date', 'id', 'factorvalue'])
    return orthog_factorloading
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """
    Calculate the LIQUIDITY risk-factor loading (and its STOM/STOQ/STOA
    components) of the sample stocks on the specified date(s) and optionally
    persist them to the factor database.

    Parameters:
    --------
    :param start_date: datetime-like, str
        start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like, str
        end date; if None, only start_date is calculated,
        format: YYYY-MM-DD or YYYYMMDD
    :param month_end: bool, default True
        if True, only month-end trading days are calculated
    :param save: bool, default False
        whether to persist to the factor database
    :param kwargs:
        'multi_proc': bool, True=multi-process, False=single-process,
        default False
    :return: dict
        raw liquidity loading of the last calculated date
        (keys: date, id, factorvalue); None if no date was calculated
    """
    # Build the trading-day series and load stock basic information.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate over the trading days and compute the LIQUIDITY loading.
    dict_raw_liquidity = None
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        dict_stom = None
        dict_stoq = None
        dict_stoa = None
        dict_raw_liquidity = None
        logging.info('[%s] Calc LIQUIDITY factor loading.' % Utils.datetimelike_to_str(calc_date))
        # Only keep stocks listed at least LIQUID_CT.listed_days days ago.
        s = (calc_date - datetime.timedelta(days=risk_ct.LIQUID_CT.listed_days)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        ids = []
        stoms = []
        stoqs = []
        stoas = []
        raw_liquidities = []
        if 'multi_proc' not in kwargs:
            kwargs['multi_proc'] = False
        if not kwargs['multi_proc']:
            # Single-process computation of the per-stock liquidity data.
            for _, stock_info in stock_basics.iterrows():
                logging.info("[%s] Calc %s's LIQUIDITY factor loading." % (Utils.datetimelike_to_str(calc_date, dash=True), stock_info.symbol))
                liquidity_data = cls._calc_factor_loading(stock_info.symbol, calc_date)
                if liquidity_data is not None:
                    ids.append(liquidity_data['code'])
                    stoms.append(liquidity_data['stom'])
                    stoqs.append(liquidity_data['stoq'])
                    stoas.append(liquidity_data['stoa'])
                    raw_liquidities.append(liquidity_data['liquidity'])
        else:
            # Multi-process computation of the per-stock liquidity data.
            q = Manager().Queue()
            p = Pool(4)
            for _, stock_info in stock_basics.iterrows():
                p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
            p.close()
            p.join()
            # Drain the worker results after the pool has finished.
            while not q.empty():
                liquidity_data = q.get(True)
                ids.append(liquidity_data['code'])
                stoms.append(liquidity_data['stom'])
                stoqs.append(liquidity_data['stoq'])
                stoas.append(liquidity_data['stoa'])
                raw_liquidities.append(liquidity_data['liquidity'])
        # Label the loadings with the next trading day after calc_date.
        date_label = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        dict_stom = dict({
            'date': [date_label] * len(ids),
            'id': ids,
            'factorvalue': stoms
        })
        dict_stoq = dict({
            'date': [date_label] * len(ids),
            'id': ids,
            'factorvalue': stoqs
        })
        dict_stoa = dict({
            'date': [date_label] * len(ids),
            'id': ids,
            'factorvalue': stoas
        })
        dict_raw_liquidity = dict({
            'date': [date_label] * len(ids),
            'id': ids,
            'factorvalue': raw_liquidities
        })
        # Read the Size factor loading and orthogonalize liquidity against
        # size: regress normalized liquidity on normalized size and keep the
        # residual.
        size_factor_path = os.path.join(factor_ct.FACTOR_DB.db_path, risk_ct.SIZE_CT.db_file)
        df_size = Utils.read_factor_loading(size_factor_path, Utils.datetimelike_to_str(calc_date, dash=False))
        df_size.drop(columns='date', inplace=True)
        df_size.rename(columns={'factorvalue': 'size'}, inplace=True)
        df_liquidity = pd.DataFrame(dict({
            'id': ids,
            'liquidity': raw_liquidities
        }))
        df_liquidity = pd.merge(left=df_liquidity, right=df_size, how='inner', on='id')
        arr_liquidity = Utils.normalize_data(Utils.clean_extreme_value(np.array(df_liquidity['liquidity']).reshape((len(df_liquidity), 1))))
        arr_size = Utils.normalize_data(Utils.clean_extreme_value(np.array(df_liquidity['size']).reshape((len(df_liquidity), 1))))
        # NOTE(review): OLS without an intercept — both series are normalized,
        # so a zero intercept is presumably intended; confirm.
        model = sm.OLS(arr_liquidity, arr_size)
        results = model.fit()
        df_liquidity['liquidity'] = results.resid
        df_liquidity.drop(columns='size', inplace=True)
        df_liquidity.rename(columns={'liquidity': 'factorvalue'}, inplace=True)
        df_liquidity['date'] = date_label
        # Persist the component loadings and the orthogonalized composite.
        if save:
            str_date = Utils.datetimelike_to_str(calc_date, dash=False)
            factor_header = ['date', 'id', 'factorvalue']
            Utils.factor_loading_persistent(cls._db_file, 'stom_{}'.format(str_date), dict_stom, factor_header)
            Utils.factor_loading_persistent(cls._db_file, 'stoq_{}'.format(str_date), dict_stoq, factor_header)
            Utils.factor_loading_persistent(cls._db_file, 'stoa_{}'.format(str_date), dict_stoa, factor_header)
            Utils.factor_loading_persistent(cls._db_file, 'rawliquidity_{}'.format(str_date), dict_raw_liquidity, factor_header)
            Utils.factor_loading_persistent(cls._db_file, str_date, df_liquidity.to_dict('list'), factor_header)
        # Throttle: sleep 180 seconds between dates.
        logging.info('Suspending for 180s.')
        time.sleep(180)
    return dict_raw_liquidity
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """
    Calculate the CYQ (chip distribution) relative-position factor loading of
    the sample stocks on the specified date(s) and optionally persist it.

    Parameters:
    --------
    :param start_date: datetime-like or str
        start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like, str
        end date; if None, only start_date is calculated,
        format: YYYY-MM-DD or YYYYMMDD
    :param month_end: bool, default True
        if True, only month-end trading days are calculated
    :param save: bool, default False
        whether to persist to the factor database
    :param kwargs:
    :return: dict
        factor loading of the last calculated date
        (keys: date, id, factorvalue)
    --------
    """
    # Build the trading-day series and load stock basic information.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate over the trading days and compute the CYQ factor loading.
    dict_cyq = {}
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        logging.info('[%s] Calc CYQ factor loading.' % Utils.datetimelike_to_str(calc_date))
        # Only keep stocks listed at least ~180 days before calc_date.
        s = (calc_date - datetime.timedelta(days=180)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        # Directory for the per-stock chip-distribution data of calc_date.
        secu_cyq_path = Path(
            factor_ct.FACTOR_DB.db_path, factor_ct.CYQ_CT.db_file,
            'secu_cyq/%s' % calc_date.strftime('%Y-%m-%d'))
        if not secu_cyq_path.exists():
            secu_cyq_path.mkdir()
        ids = []
        rps = []
        # Single-process variant computing the chip distribution and the
        # relative price position (= (close - avg cost) / avg cost),
        # kept for reference:
        # for _, stock_info in stock_basics.iterrows():
        #     logging.info("[%s] Calc %s's cyq data." % (calc_date.strftime('%Y-%m-%d'), stock_info.symbol))
        #     secu_cyq = cls._calc_factor_loading(stock_info.symbol, calc_date)
        #     if secu_cyq is not None:
        #         secu_code, secu_close, cyq_data = secu_cyq
        #         cyq_data.to_csv(Path(secu_cyq_path, '%s.csv' % secu_code), header=True)
        #         avg_cyq = np.sum(np.array(cyq_data.index) * np.array(cyq_data.values))
        #         relative_position = round((secu_close - avg_cyq) / avg_cyq, 4)
        #         ids.append(secu_code)
        #         rps.append(relative_position)
        # Multi-process computation of the chip distribution and the relative
        # price position (= (close - avg cost) / avg cost).
        q = Manager().Queue()  # queue for inter-process communication; holds each worker's result
        p = Pool(4)  # process pool, at most 4 concurrent workers
        for _, stock_info in stock_basics.iterrows():
            p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
        p.close()
        p.join()
        # Drain the worker results after the pool has finished.
        while not q.empty():
            secu_cyq = q.get(True)
            secu_code, secu_close, cyq_data = secu_cyq
            # Persist the per-stock chip-distribution data.
            cyq_data.to_csv(Path(secu_cyq_path, '%s.csv' % secu_code), header=True)
            # Relative position of the current price versus the average cost
            # implied by the chip distribution.
            avg_cyq = np.sum(np.array(cyq_data.index) * np.array(cyq_data.values))
            relative_position = round((secu_close - avg_cyq) / avg_cyq, 4)
            ids.append(secu_code)
            rps.append(relative_position)
        # Label the loading with the next trading day after calc_date.
        date_label = Utils.get_trading_days(calc_date, ndays=2)[1]
        dict_cyq = {
            'date': [date_label] * len(ids),
            'id': ids,
            'factorvalue': rps
        }
        # Persist the relative-position factor loading.
        if save:
            cyq_data_path = os.path.join(factor_ct.FACTOR_DB.db_path,
                                         factor_ct.CYQ_CT.db_file,
                                         factor_ct.CYQ_CT.CYQ_rp_file)
            Utils.factor_loading_persistent(
                cyq_data_path,
                Utils.datetimelike_to_str(calc_date, dash=False), dict_cyq,
                ['date', 'id', 'factorvalue'])
        # Throttle: sleep 100 seconds between dates.
        logging.info('Suspending for 100s.')
        time.sleep(100)
    return dict_cyq
def _calc_MVPFP(factor_name, start_date, end_date=None, month_end=True, save=False):
    """
    Build the Minimum Volatility Pure Factor Portfolio (MVPFP) of the target
    factor.

    Optimization: take 1 unit of exposure to the target factor, keep zero
    exposure to every style risk factor, and minimize expected volatility:
        Min:  W'VW
        s.t.  W'X_beta = 0
              W'x_target = 1
    where
        W: portfolio weight vector
        V: stock covariance matrix
        X_beta: style-factor loading matrix
        x_target: target-factor loading vector

    Parameters:
    --------
    :param factor_name: str
        alpha factor name, e.g. SmartMoney
    :param start_date: datetime-like, str
        start date, e.g. YYYY-MM-DD, YYYYMMDD
    :param end_date: datetime-like, str, default None
        end date, e.g. YYYY-MM-DD, YYYYMMDD
    :param month_end: bool, default True
        if True, only month-end dates are calculated
    :param save: bool, default False
        whether to persist the holding
    :return: CWeightHolding
        weight data of the minimum-volatility pure factor portfolio
    :raises cvx.SolverError: when the optimization does not reach OPTIMAL
    """
    start_date = Utils.to_date(start_date)
    if end_date is None:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    else:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    CRiskModel = Barra()
    mvpfp_holding = CWeightHolding()
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # Stock covariance matrix for calc_date (order defines stock_codes).
        stock_codes, arr_stocks_covmat = CRiskModel.calc_stocks_covmat(calc_date)
        # Style-factor loading matrix from the risk model.
        df_stylefactor_loading = CRiskModel.get_StyleFactorloading_matrix(calc_date)
        # df_stylefactor_loading.set_index('code', inplace=True)
        # df_stylefactor_loading = df_stylefactor_loading.loc[stock_codes]
        # arr_stylefactor_loading = np.array(df_stylefactor_loading)
        # Target-factor loading vector (orthogonalized loadings).
        df_targetfactor_loading = _get_factorloading(factor_name, calc_date, alphafactor_ct.FACTORLOADING_TYPE['ORTHOGONALIZED'])
        df_targetfactor_loading.drop(columns='date', inplace=True)
        df_targetfactor_loading.rename(columns={
            'id': 'code',
            'factorvalue': factor_name
        }, inplace=True)
        df_factorloading = pd.merge(left=df_stylefactor_loading, right=df_targetfactor_loading, how='inner', on='code')
        df_factorloading.set_index('code', inplace=True)
        # Re-order rows to match stock_codes (must align with the covariance
        # matrix rows).
        df_stylefactor_loading = df_factorloading.loc[stock_codes, riskfactor_ct.STYLE_RISK_FACTORS]
        arr_stylefactor_loading = np.array(df_stylefactor_loading)
        df_targetfactor_loading = df_factorloading.loc[stock_codes, factor_name]
        arr_targetfactor_loading = np.array(df_targetfactor_loading)
        # Solve the constrained minimum-variance problem with cvxpy.
        V = arr_stocks_covmat
        X_beta = arr_stylefactor_loading
        x_target = arr_targetfactor_loading
        N = len(stock_codes)
        w = cvx.Variable((N, 1))
        risk = cvx.quad_form(w, V)
        constraints = [
            cvx.matmul(w.T, X_beta) == 0,
            cvx.matmul(w.T, x_target) == 1
        ]
        prob = cvx.Problem(cvx.Minimize(risk), constraints)
        prob.solve()
        if prob.status == cvx.OPTIMAL:
            datelabel = Utils.datetimelike_to_str(calc_date, dash=False)
            # BUGFIX: w.value has shape (N, 1); flatten it so the DataFrame
            # column is 1-dimensional (2-D data raises ValueError).
            df_holding = pd.DataFrame({
                'date': [datelabel] * len(stock_codes),
                'code': stock_codes,
                'weight': w.value.flatten()
            })
            mvpfp_holding.from_dataframe(df_holding)
            if save:
                # BUGFIX: was "factor_name.upper() + '.CT'", which is not a
                # valid attribute path; the constants are named <FACTOR>_CT
                # (cf. _calc_Orthogonalized_factorloading).
                holding_path = os.path.join(
                    SETTINGS.FACTOR_DB_PATH,
                    eval('alphafactor_ct.' + factor_name.upper() + '_CT')['db_file'],
                    'mvpfp', '{}_{}.csv'.format(factor_name, datelabel))
                mvpfp_holding.save_data(holding_path)
        else:
            raise cvx.SolverError(
                "%s优化计算%s最小纯因子组合失败。" %
                (Utils.datetimelike_to_str(calc_date), factor_name))
    return mvpfp_holding
def _calc_factor_loading1(cls, code, calc_date):
    """
    Calculate the four CYQ (chip distribution) proxy variables of a single
    stock at calc_date, together with the stock's next-month return.

    Parameters
    -------
    :param code: str
        stock code, e.g. 600000 or SH600000
    :param calc_date: datetime-like, str
        calculation date, format YYYY-MM-DD
    :return: pd.Series
    --------
        the four chip-distribution proxies plus the forward return:
        0. arc: weighted mean of the relative capital returns
        1. vrc: weighted variance
        2. src: weighted skewness
        3. krc: weighted kurtosis
        4. next_ret: next-month return
        None if the calculation fails at any step.
    """
    # Adjusted daily quotes for the past cls.__days days.
    df_mkt = Utils.get_secu_daily_mkt(code, end=calc_date, ndays=cls.__days, fq=True, range_lookup=True)
    if df_mkt is None:
        return None
    if len(df_mkt) < 20:
        # Too little history to estimate the distribution moments.
        return None
    # Sort quotes by date, most recent first.
    df_mkt.sort_values(by='date', ascending=False, inplace=True)
    # Build the RC vector (relative capital return) and the ATR vector
    # (adjusted turnover weights), iterating from the latest day backwards.
    arr_rc = np.zeros(len(df_mkt))
    arr_atr = np.zeros(len(df_mkt))
    p_c = df_mkt.iloc[0]['close']  # closing price on the calc date
    for j in range(len(df_mkt)):
        # Volume-weighted average price of day j, adjusted by 'factor'.
        p_avg = df_mkt.iloc[j]['amount'] / df_mkt.iloc[j]['vol'] * df_mkt.iloc[j]['factor']
        arr_rc[j] = (p_c - p_avg) / p_c
        tr_j = df_mkt.iloc[j]['turnover1']
        if j == 0:
            arr_atr[j] = tr_j
        else:
            # Recursive turnover-decay weight; pre_tr is the previous day's
            # (i.e. more recent day's) turnover from the last iteration.
            arr_atr[j] = arr_atr[j - 1] / pre_tr * tr_j * (1. - pre_tr)
        pre_tr = tr_j
    # arc: ATR-weighted mean of the RC vector.
    arc = np.average(arr_rc, weights=arr_atr)
    if np.isnan(arc):
        return None
    rc_dev = arr_rc - arc
    n = len(df_mkt)
    # vrc / src / krc: bias-corrected weighted variance, skewness, kurtosis.
    vrc = n / (n - 1.) * np.sum(arr_atr * rc_dev * rc_dev) / np.sum(arr_atr)
    if np.isnan(vrc):
        return None
    src = n / (n - 1.) * np.sum(arr_atr * np.float_power(rc_dev, 3)) / np.sum(arr_atr) / np.float_power(vrc, 1.5)
    if np.isnan(src):
        return None
    krc = n / (n - 1.) * np.sum(arr_atr * np.float_power(rc_dev, 4)) / np.sum(arr_atr) / np.float_power(vrc, 2)
    if np.isnan(krc):
        return None
    # Next-period return: from the next trading day through the end of that
    # trading day's calendar month.
    # next_date = calc_date + datetime.timedelta(days=1)
    next_date = Utils.get_trading_days(start=calc_date, ndays=2)[1]
    wday, month_range = calendar.monthrange(next_date.year, next_date.month)
    date_end = datetime.datetime(next_date.year, next_date.month, month_range)
    next_ret = Utils.calc_interval_ret(code, start=next_date, end=date_end)
    if next_ret is None:
        return None
    else:
        return pd.Series([arc, vrc, src, krc, next_ret], index=['arc', 'vrc', 'src', 'krc', 'next_ret'])
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """
    Calculate the SmartMoney factor loading of the sample stocks on the
    specified date(s) and optionally persist it to the factor database.

    Parameters
    --------
    :param start_date: datetime-like, str
        start date
    :param end_date: datetime-like, str, default None
        end date; if None, only start_date is calculated
    :param month_end: bool, default True
        only calculate month-end trading days
    :param save: bool, default False
        whether to persist to the factor database
    :param kwargs:
        'multi_proc': bool, True=multi-process, False=single-process,
        default False
    :return: dict
    --------
        factor loading of the last calculated date, with keys:
        0. date: date label (the next trading day after the calc date)
        1. id: security code
        2. factorvalue: factor loading
        If end_date is None, the loading of start_date is returned; otherwise
        the loading of the last calculated date. None if nothing calculated.
    """
    # 0. Build the trading-day series.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    # Sample-stock basics (kept for reference):
    # all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate over the trading days and compute the SmartQ factor loading.
    dict_factor = None
    for calc_date in trading_days_series:
        # NOTE(review): this preliminary dict lacks the 'date' key and is
        # what gets returned when calc_date is skipped by the month_end test.
        dict_factor = {'id': [], 'factorvalue': []}
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        # 1. Trading days used for minute-quote reads (kept for reference):
        # trading_days = _get_trading_days(calc_date, 30)
        # trading_days = Utils.get_trading_days(end=calc_date, ndays=30, ascending=False)
        # 2. Sample stocks listed at least ~90 days before calc_date.
        # stock_basics = ts.get_stock_basics()
        s = (calc_date - datetime.timedelta(days=90)).strftime('%Y%m%d')
        stock_basics = Utils.get_stock_basics(s)
        # 3. Compute the per-stock Smart_Q factor value.
        dict_factor = {'date': None, 'id': [], 'factorvalue': []}
        if 'multi_proc' not in kwargs:
            kwargs['multi_proc'] = False
        if not kwargs['multi_proc']:
            # Single-process computation.
            for _, stock_info in stock_basics.iterrows():
                # code = '%s%s' % ('SH' if code[:2] == '60' else 'SZ', code)
                factor_loading = cls._calc_factor_loading(stock_info.symbol, calc_date)
                print("[%s]Calculating %s's SmartMoney factor loading = %.4f." % (calc_date.strftime('%Y-%m-%d'), stock_info.symbol, -1.0 if factor_loading is None else factor_loading))
                if factor_loading is not None:
                    # df_factor.ix[code, 'factorvalue'] = factor_loading
                    dict_factor['id'].append(Utils.code_to_symbol(stock_info.symbol))
                    dict_factor['factorvalue'].append(factor_loading)
        else:
            # Multi-process parallel computation of the SmartQ loading.
            q = Manager().Queue()  # queue for inter-process communication; holds each worker's result
            p = Pool(4)  # process pool, at most 4 concurrent workers
            for _, stock_info in stock_basics.iterrows():
                p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
            p.close()
            p.join()
            # Drain the worker results after the pool has finished.
            while not q.empty():
                smart_q = q.get(True)
                dict_factor['id'].append(smart_q[0])
                dict_factor['factorvalue'].append(smart_q[1])
        # Label the loading with the next trading day after calc_date.
        date_label = Utils.get_trading_days(calc_date, ndays=2)[1]
        dict_factor['date'] = [date_label] * len(dict_factor['id'])
        # 4. Winsorized and standardized version of the loading.
        df_std_factor = Utils.normalize_data(pd.DataFrame(dict_factor), columns='factorvalue', treat_outlier=True, weight='eq')
        # 5. Persist both raw and standardized loadings.
        if save:
            # Utils.factor_loading_persistent(cls._db_file, calc_date.strftime('%Y%m%d'), dict_factor)
            cls._save_factor_loading(cls._db_file, Utils.datetimelike_to_str(calc_date, dash=False), dict_factor, 'SmartMoney', factor_type='raw', columns=['date', 'id', 'factorvalue'])
            cls._save_factor_loading(cls._db_file, Utils.datetimelike_to_str(calc_date, dash=False), df_std_factor, 'SmartMoney', factor_type='standardized', columns=['date', 'id', 'factorvalue'])
        # Throttle: sleep 360 seconds between dates.
        logging.info('Suspending for 360s.')
        time.sleep(360)
    return dict_factor
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """
    Calculate the growth-factor loading of the sample stocks on the specified
    date(s) and optionally persist it to the factor database.

    Parameters:
    --------
    :param start_date: datetime-like, str
        start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like, str; default None
        end date, format: YYYY-MM-DD or YYYYMMDD;
        if None, only start_date is calculated
    :param month_end: bool, default True
        if True, only month-end trading days are calculated
    :param save: bool, default False
        whether to persist to the factor database
    :return: dict
    --------
        growth-factor loading of the last calculated date, with keys:
        0. date: date label (the next trading day after the calc date)
        1. id: security symbol
        2. npg_ttm: net-profit growth rate (TTM)
        3. opg_ttm: operating-revenue growth rate (TTM)
        None if no date was calculated.
    """
    # Build the trading-day series. CONSISTENCY FIX: convert the dates with
    # Utils.to_date and handle end_date=None exactly like the sibling
    # calc_factor_loading implementations (previously end_date=None was
    # passed straight through to Utils.get_trading_days).
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate over the trading days and compute the growth-factor values.
    dict_growth = None
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        dict_growth = {'date': [], 'id': [], 'npg_ttm': [], 'opg_ttm': []}
        # Only keep stocks listed at least ~90 days before calc_date.
        s = (calc_date - datetime.timedelta(days=90)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        # Single-process variant (kept for reference):
        # for _, stock_info in stock_basics.iterrows():
        #     logging.info("[%s] calc %s's growth factor loading." % (calc_date.strftime('%Y-%m-%d'), stock_info.symbol))
        #     growth_data = cls._calc_factor_loading(stock_info.symbol, calc_date)
        #     if growth_data is not None:
        #         dict_growth['id'].append(Utils.code_to_symbol(stock_info.symbol))
        #         dict_growth['npg_ttm'].append(growth_data['npg_ttm'])
        #         dict_growth['opg_ttm'].append(growth_data['opg_ttm'])
        # Multi-process computation of the growth factor.
        q = Manager().Queue()  # queue for inter-process communication; holds each worker's result
        p = Pool(4)  # process pool, at most 4 concurrent workers
        for _, stock_info in stock_basics.iterrows():
            p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
        p.close()
        p.join()
        # Drain the worker results after the pool has finished.
        while not q.empty():
            growth_data = q.get(True)
            dict_growth['id'].append(growth_data['id'])
            dict_growth['npg_ttm'].append(growth_data['npg_ttm'])
            dict_growth['opg_ttm'].append(growth_data['opg_ttm'])
        # Label the loading with the next trading day after calc_date.
        date_label = Utils.get_trading_days(start=calc_date, ndays=2)[1]
        dict_growth['date'] = [date_label] * len(dict_growth['id'])
        # Persist the growth-factor loading.
        if save:
            columns = ['date', 'id', 'npg_ttm', 'opg_ttm']
            Utils.factor_loading_persistent(cls._db_file, calc_date.strftime('%Y%m%d'), dict_growth, columns)
        # Throttle: sleep 120 seconds between dates.
        logging.info('Suspending for 120s.')
        time.sleep(120)
    return dict_growth
def calc_factor_loading1(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """
    Calculate the CYQ (chip distribution) factor loading of the sample stocks
    on the specified date(s) via the proxy-variable regression approach, and
    optionally persist it to the factor database.

    Workflow per date: compute the four chip-distribution proxies (arc, vrc,
    src, krc) plus next-period return per stock, persist them, then regress
    historical next_ret on the proxies (24 months of history, sign-matched on
    the median arc) to obtain the proxy weights, and combine the proxies into
    the factor loading.

    Parameters:
    --------
    :param start_date: datetime-like or str
        start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like, str
        end date; if None, only start_date is calculated,
        format: YYYY-MM-DD or YYYYMMDD
    :param month_end: bool, default True
        if True, only month-end trading days are calculated
    :param save: bool, default False
        whether to persist to the factor database
    :param kwargs:
    :return: dict
        factor loading of the last calculated date
        (keys: date, id, factorvalue); None when not enough proxy-weight
        history was available to compute a loading
    --------
    """
    # Build the trading-day series and load stock basic information.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate over the trading days and compute the CYQ factor loading.
    dict_cyq = None
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        logging.info('[%s] Calc CYQ factor loading.' % Utils.datetimelike_to_str(calc_date))
        # Per-stock proxy variables for this date.
        df_proxies = DataFrame()
        # Only keep stocks listed at least ~365 days before calc_date.
        s = (calc_date - datetime.timedelta(days=365)).strftime('%Y%m%d')
        stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        trading_day = Utils.get_trading_days(calc_date, ndays=2)[1]
        # Single-process variant (kept for reference):
        # for _, stock_info in stock_basics.iterrows():
        #     cyq_proxies = cls._calc_factor_loading(stock_info.symbol, calc_date)
        #     if cyq_proxies is not None:
        #         logging.info("[%s] %s's cyq proxies = (%0.4f,%0.4f,%0.4f,%0.4f,%0.4f)" % (calc_date.strftime('%Y-%m-%d'), stock_info.symbol, cyq_proxies['arc'], cyq_proxies['vrc'], cyq_proxies['src'], cyq_proxies['krc'], cyq_proxies['next_ret']))
        #         # cyq_proxies['date'] = trading_day
        #         cyq_proxies['id'] = Utils.code_to_symbol(stock_info.symbol)
        #         df_proxies = df_proxies.append(cyq_proxies, ignore_index=True)
        # Multi-process computation of the proxy variables.
        q = Manager().Queue()  # queue for inter-process communication; holds each worker's result
        p = Pool(4)  # process pool, at most 4 concurrent workers
        for _, stock_info in stock_basics.iterrows():
            p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
        p.close()
        p.join()
        while not q.empty():
            cyq_proxies = q.get(True)
            # cyq_proxies['date'] = trading_day
            df_proxies = df_proxies.append(cyq_proxies, ignore_index=True)
        # Persist the proxy-variable data of this date.
        df_proxies['date'] = trading_day
        proxies_file_path = cls._db_proxies_path + '_%s.csv' % Utils.datetimelike_to_str(calc_date, dash=False)
        df_proxies.to_csv(proxies_file_path, index=False, columns=['date', 'id', 'arc', 'vrc', 'src', 'krc', 'next_ret'])
        # Re-import of the proxy data (kept for reference):
        # cyq_proxies_path = cls._db_proxies_path + '_%s.csv' % Utils.datetimelike_to_str(calc_date, dash=False)
        # df_proxies = pd.read_csv(cyq_proxies_path, header=0)
        # Compute marc (median arc), the proxy weights and the factor loading.
        marc = df_proxies['arc'].median()
        proxies_weight_file = Path(factor_ct.FACTOR_DB.db_path, factor_ct.CYQ_CT.proxies_weight_file)
        if proxies_weight_file.exists():
            df_proxies_weight = pd.read_csv(proxies_weight_file, header=0, parse_dates=[0])
            df_proxies_weight = df_proxies_weight[df_proxies_weight.date < calc_date].tail(24)
            if len(df_proxies_weight) < 24:
                # Not enough history yet: record a zero-weight row and move on.
                with open(proxies_weight_file, 'a', newline='') as f:
                    csv_writer = csv.writer(f)
                    csv_writer.writerow([calc_date.strftime('%Y-%m-%d'), marc, 0, 0, 0, 0, 0])
            else:
                # Keep only history whose median arc has the same sign as the
                # current marc, then pool those dates' proxy data.
                df_proxies_data = DataFrame()
                if marc > 0:
                    df_proxies_weight = df_proxies_weight[df_proxies_weight.marc > 0]
                elif marc < 0:
                    df_proxies_weight = df_proxies_weight[df_proxies_weight.marc < 0]
                for _, weight_info in df_proxies_weight.iterrows():
                    proxies_file_path = cls._db_proxies_path + '_%s.csv' % Utils.datetimelike_to_str(weight_info['date'], False)
                    df_proxies_data = df_proxies_data.append(pd.read_csv(proxies_file_path, header=0), ignore_index=True)
                # Regress pooled next-period returns on the four proxies to
                # obtain the proxy weights (with intercept).
                next_ret = np.array(df_proxies_data['next_ret'])
                cyq_data = np.array(df_proxies_data[['arc', 'vrc', 'src', 'krc']])
                cyq_data = sm.add_constant(cyq_data)
                cyq_model = sm.OLS(next_ret, cyq_data)
                cyq_result = cyq_model.fit()
                cyq_weights = np.around(cyq_result.params, 6)
                with open(proxies_weight_file, 'a', newline='') as f:
                    csv_writer = csv.writer(f)
                    csv_writer.writerow([calc_date.strftime('%Y-%m-%d'), marc, cyq_weights[0], cyq_weights[1], cyq_weights[2], cyq_weights[3], cyq_weights[4]])
                # Combine today's proxies with the estimated weights into the
                # CYQ factor loading.
                arr_proxies = np.array(df_proxies[['arc', 'vrc', 'src', 'krc']])
                arr_weight = np.array([cyq_weights[1], cyq_weights[2], cyq_weights[3], cyq_weights[4]]).reshape((4, 1))
                intercept = cyq_weights[0]
                arr_cyq = np.around(np.dot(arr_proxies, arr_weight) + intercept, 6)
                dict_cyq = {
                    'date': list(df_proxies['date']),
                    'id': list(df_proxies['id']),
                    'factorvalue': list(arr_cyq.reshape((len(arr_cyq),)))
                }
                # Persist the factor loading.
                if save:
                    Utils.factor_loading_persistent(cls._db_file, calc_date.strftime('%Y%m%d'), dict_cyq, columns=['date', 'id', 'factorvalue'])
        else:
            # First run: create the weight file with a header and a zero row.
            with open(proxies_weight_file, 'w', newline='') as f:
                csv_writer = csv.writer(f)
                csv_writer.writerow(['date', 'marc', 'intcpt', 'arc_w', 'vrc_w', 'src_w', 'krc_w'])
                csv_writer.writerow([calc_date.strftime('%Y-%m-%d'), marc, 0, 0, 0, 0, 0])
        # Throttle: sleep 200 seconds between dates.
        logging.info('Suspending for 200s.')
        time.sleep(200)
    # BUGFIX: the function documented a dict return value but had no return
    # statement; return the last computed loading (None if none was computed),
    # consistent with the sibling calc_factor_loading implementations.
    return dict_cyq
def calc_factor_loading(cls, start_date, end_date=None, month_end=True, save=False, **kwargs):
    """
    Calculate the BETA factor loading (and the HSIGMA residual-volatility
    by-product) of the sample stocks on the specified date(s) and optionally
    persist both to the factor database.

    Parameters:
    --------
    :param start_date: datetime-like, str
        start date, format: YYYY-MM-DD or YYYYMMDD
    :param end_date: datetime-like, str
        end date; if None, only start_date is calculated,
        format: YYYY-MM-DD or YYYYMMDD
    :param month_end: bool, default True
        if True, only month-end trading days are calculated
    :param save: bool, default False
        whether to persist to the factor database
    :param kwargs:
        'multi_proc': bool, True=multi-process, False=single-process,
        default False
    :return: dict
        BETA factor loading of the last calculated date
        (keys: date, id, factorvalue)
    """
    # Build the trading-day series.
    start_date = Utils.to_date(start_date)
    if end_date is not None:
        end_date = Utils.to_date(end_date)
        trading_days_series = Utils.get_trading_days(start=start_date, end=end_date)
    else:
        trading_days_series = Utils.get_trading_days(end=start_date, ndays=1)
    # all_stock_basics = CDataHandler.DataApi.get_secu_basics()
    # Iterate over the trading days and compute the BETA factor loading.
    dict_beta = {}
    dict_hsigma = {}
    for calc_date in trading_days_series:
        if month_end and (not Utils.is_month_end(calc_date)):
            continue
        logging.info('[%s] Calc BETA factor loading.' % Utils.datetimelike_to_str(calc_date))
        # Only keep stocks listed at least DBETA_CT.listed_days days ago.
        # s = (calc_date - datetime.timedelta(days=risk_ct.DBETA_CT.listed_days)).strftime('%Y%m%d')
        # stock_basics = all_stock_basics[all_stock_basics.list_date < s]
        s = calc_date - datetime.timedelta(days=risk_ct.DBETA_CT.listed_days)
        stock_basics = Utils.get_stock_basics(s, False)
        ids = []  # stock symbols
        betas = []  # BETA factor values
        hsigmas = []  # HSIGMA factor values
        if 'multi_proc' not in kwargs:
            kwargs['multi_proc'] = False
        if not kwargs['multi_proc']:
            # Single-process computation of BETA and HSIGMA.
            for _, stock_info in stock_basics.iterrows():
                logging.debug("[%s] Calc %s's BETA and HSIGMA factor data." % (calc_date.strftime('%Y-%m-%d'), stock_info.symbol))
                beta_data = cls._calc_factor_loading(stock_info.symbol, calc_date)
                if beta_data is None:
                    # Keep the stock in the output with NaN values so the
                    # cross-section stays complete.
                    ids.append(Utils.code_to_symbol(stock_info.symbol))
                    betas.append(np.nan)
                    hsigmas.append(np.nan)
                else:
                    ids.append(beta_data['code'])
                    betas.append(beta_data['beta'])
                    hsigmas.append(beta_data['hsigma'])
        else:
            # Multi-process computation of BETA and HSIGMA.
            q = Manager().Queue()  # queue for inter-process communication; holds each worker's result
            p = Pool(SETTINGS.CONCURRENCY_KERNEL_NUM)  # process pool, size from global settings
            for _, stock_info in stock_basics.iterrows():
                p.apply_async(cls._calc_factor_loading_proc, args=(stock_info.symbol, calc_date, q,))
            p.close()
            p.join()
            # Drain the worker results after the pool has finished.
            while not q.empty():
                beta_data = q.get(True)
                ids.append(beta_data['code'])
                betas.append(beta_data['beta'])
                hsigmas.append(beta_data['hsigma'])
        # Label the loadings with the next trading day after calc_date.
        date_label = Utils.get_trading_days(calc_date, ndays=2)[1]
        dict_beta = {'date': [date_label]*len(ids), 'id': ids, 'factorvalue': betas}
        dict_hsigma = {'date': [date_label]*len(ids), 'id': ids, 'factorvalue': hsigmas}
        # Persist BETA to this factor's db file and HSIGMA to its own.
        if save:
            Utils.factor_loading_persistent(cls._db_file, Utils.datetimelike_to_str(calc_date, dash=False), dict_beta, ['date', 'id', 'factorvalue'])
            hsigma_path = os.path.join(factor_ct.FACTOR_DB.db_path, risk_ct.HSIGMA_CT.db_file)
            Utils.factor_loading_persistent(hsigma_path, Utils.datetimelike_to_str(calc_date, dash=False), dict_hsigma, ['date', 'id', 'factorvalue'])
        # Throttling disabled:
        # logging.info('Suspending for 180s.')
        # time.sleep(180)
    return dict_beta