def feature_models(start_date='20181101', end_date='20181131', data_source=0,
                   feature_types=[], saved_feature=True, bc=None,
                   top_ratio=0.25, bottom_ratio=0.2):
    """
    Run a per-date cross-sectional factor-return regression over the given
    date range and return the factor returns averaged across all dates.

    NOTE(review): this definition is shadowed by a later ``feature_models``
    in the same module; only the last definition is reachable.
    NOTE(review): ``industry_exposure_factors`` is referenced in the
    regression call below but is never assigned in this function, so calling
    this as-is raises NameError -- see ``get_factor_returns`` for the
    apparently intended computation; confirm before wiring it in.

    :param start_date: first trade date, 'YYYYMMDD'
    :param end_date: last trade date, 'YYYYMMDD'
    :param data_source: source id forwarded to the daily-feature query
    :param feature_types: unused here; kept for signature compatibility
    :param saved_feature: unused here; kept for signature compatibility
    :param bc: benchmark index code; when set, its constituents are used
    :param top_ratio: unused here; kept for signature compatibility
    :param bottom_ratio: unused here; kept for signature compatibility
    :return: mean factor returns (numpy array) over all processed dates
    """
    root = get_source_root()
    feature_mapping = _get_source_features()
    date_periods = _get_in_out_dates(
        start_date=start_date, end_date=end_date,
        security_id=bc) or [[start_date, end_date]]
    next_date = datetime_delta(dt=end_date, format='%Y%m%d', days=1)
    idx_labels = get_idx_returns(security_ids=[bc], start_date=start_date,
                                 end_date=next_date, source=0).get(bc)
    all_factor_returns = []
    for _start_date, _end_date in date_periods:
        logger.info('get factor return: processing from {0} to {1}'.format(
            _start_date, _end_date))
        next_date = datetime_delta(dt=_end_date, format='%Y%m%d', days=2)
        if bc:
            security_ids = get_idx_cons_jy(bc, _start_date, _end_date)
        else:
            security_ids = get_security_codes()
        # FIXME HACK FOR TESTING
        security_ids = security_ids[:5]
        ret_features = get_equity_daily_features(security_ids=security_ids,
                                                 features=feature_mapping,
                                                 start_date=_start_date,
                                                 end_date=_end_date,
                                                 source=data_source)
        ret_returns = get_equity_returns(security_ids=security_ids,
                                         start_date=_start_date,
                                         end_date=next_date)
        ret_mv = get_market_value(security_ids=security_ids,
                                  start_date=_start_date,
                                  end_date=next_date)
        for date, val in ret_features.items():
            daily_factors = []
            daily_return = []
            daily_mv = []
            for sec_id, f_lst in val.items():
                _val_lst = list(f_lst.values())
                all_feature_names = list(f_lst.keys())
                daily_factors.append(_val_lst)
                # BUGFIX: initialise mv before the lookup. Previously, if the
                # label lookup raised before mv was assigned, daily_mv.append
                # either hit UnboundLocalError (first security) or silently
                # reused the previous security's market value.
                mv = np.nan
                try:
                    s_label = ret_returns.get(sec_id).get(str(date))
                    i_label = idx_labels.get(str(date))
                    # excess return over the benchmark, in percent
                    label = (s_label - i_label) * 100
                    mv = ret_mv.get(sec_id).get(str(date))
                except Exception as ex:
                    label = np.nan
                    logger.error(
                        'fail to calculate the label with error:{0}'.format(
                            ex))
                daily_return.append(label)
                daily_mv.append(mv)
            try:
                # market-value-weighted preprocessing of the raw factor matrix;
                # on failure the raw list is used as-is (best effort, as before)
                daily_factors = feature_preprocessing(arr=daily_factors,
                                                      fill_none=True,
                                                      weights=daily_mv)
            except Exception as ex:
                logger.error(
                    'fail in feature preprocessing with error:{0}'.format(ex))
            # indus_factors = get_indust_vectors(security_ids)
            # country factor: unit exposure for every security
            courtry_factors = np.ones(len(security_ids)).reshape(
                len(security_ids), 1)
            factor_returns = factor_return_regression(
                courtry_factors, industry_exposure_factors, daily_factors,
                daily_return)
            all_factor_returns.append(factor_returns)
    mean_returns = np.array(all_factor_returns).mean(axis=0)
    return mean_returns
def get_factor_returns(start_date='20181101', end_date='20181131',
                       data_source=0, feature_types=[], saved_feature=True,
                       bc=None, top_ratio=0.25, bottom_ratio=0.2):
    '''
    multi factor framework, regression to calculate the factor returns

    :param start_date: first trade date, 'YYYYMMDD'
    :param end_date: last trade date, 'YYYYMMDD'
    :param data_source: source id forwarded to the daily-feature query
    :param feature_types: feature families forwarded to the mapping lookup
    :param saved_feature: unused here; kept for signature compatibility
    :param bc: benchmark index code; when set, its constituents are used
    :param top_ratio: forwarded to get_source_feature_mappings
    :param bottom_ratio: forwarded to get_source_feature_mappings
    :return: mean factor returns (numpy array) over all processed dates
    '''
    root = get_source_root()
    # NOTE(review): 'souce' looks like a typo for 'source' -- confirm against
    # the get_source_feature_mappings signature before renaming the kwarg.
    feature_mapping = _get_source_features() or get_source_feature_mappings(
        souce=True, feature_types=feature_types, top_ratio=top_ratio,
        bottom_ratio=bottom_ratio)
    date_periods = _get_in_out_dates(
        start_date=start_date, end_date=end_date,
        security_id='000300.XSHG') or [[start_date, end_date]]
    next_date = datetime_delta(dt=end_date, format='%Y%m%d', days=1)
    idx_labels = get_idx_returns(security_ids=[bc], start_date=start_date,
                                 end_date=next_date, source=0).get(bc)
    all_factor_returns = []
    for _start_date, _end_date in date_periods:
        logger.info('get factor return: processing from {0} to {1}'.format(
            _start_date, _end_date))
        next_date = datetime_delta(dt=_end_date, format='%Y%m%d', days=2)
        if bc:
            security_ids = get_idx_cons_jy(bc, _start_date, _end_date)
        else:
            security_ids = get_security_codes()
        # FIXME HACK FOR TESTING
        security_ids = security_ids[:5]
        ret_features = get_equity_daily_features(security_ids=security_ids,
                                                 features=feature_mapping,
                                                 start_date=_start_date,
                                                 end_date=_end_date,
                                                 source=data_source)
        ret_returns = get_equity_returns(security_ids=security_ids,
                                         start_date=_start_date,
                                         end_date=next_date)
        ret_mv = get_market_value(security_ids=security_ids,
                                  start_date=_start_date,
                                  end_date=next_date)
        # (removed unused local: none_factor_dict = defaultdict())
        # FIXME use the future industry return, should be updated to trace
        # back the history data by windows
        industry_exposure_factors = get_indust_exposures_time_series_regression(
            start_date=_start_date, end_date=_end_date,
            stock_returns=ret_returns)
        _industry_exposure_df = pd.DataFrame(industry_exposure_factors)
        _industry_exposure_df.to_csv('{0}_{1}_{2}.csv'.format(
            _start_date, _end_date, bc))
        for date, val in ret_features.items():
            daily_factors = []
            daily_return = []
            daily_mv = []
            for sec_id, f_lst in val.items():
                _val_lst = list(f_lst.values())
                all_feature_names = list(f_lst.keys())
                daily_factors.append(_val_lst)
                # BUGFIX: initialise mv before the lookup. Previously, if the
                # label lookup raised before mv was assigned, daily_mv.append
                # either hit UnboundLocalError (first security) or silently
                # reused the previous security's market value.
                mv = np.nan
                try:
                    s_label = ret_returns.get(sec_id).get(str(date))
                    i_label = idx_labels.get(str(date))
                    # excess return over the benchmark, in percent
                    label = (s_label - i_label) * 100
                    mv = ret_mv.get(sec_id).get(str(date))
                except Exception as ex:
                    label = np.nan
                    logger.error(
                        'fail to calculate the label with error:{0}'.format(
                            ex))
                daily_return.append(label)
                daily_mv.append(mv)
            try:
                # market-value-weighted preprocessing of the raw factor matrix
                daily_factors = feature_preprocessing(arr=daily_factors,
                                                      fill_none=True,
                                                      weights=daily_mv)
            except Exception as ex:
                logger.error(
                    'fail in feature preprocessing with error:{0}'.format(ex))
            # indus_factors = get_indust_vectors(security_ids)
            # country factor: unit exposure for every security
            courtry_factors = np.ones(len(security_ids)).reshape(
                len(security_ids), 1)
            factor_returns = factor_return_regression(
                courtry_factors, industry_exposure_factors, daily_factors,
                daily_return)
            all_factor_returns.append(factor_returns)
    mean_returns = np.array(all_factor_returns).mean(axis=0)
    return mean_returns
def feature_models(start_date='20181101', end_date='20181131', data_source=0,
                   feature_types=[], saved_feature=True, bc=None,
                   top_ratio=0.25, bottom_ratio=0.2):
    """
    Build and train a TFMultiFactor model on daily factor cross-sections:
    for every date in the range, assemble the preprocessed factor matrix,
    per-security excess returns and industry exposures, and feed them to
    the model's train_model.

    NOTE(review): this redefinition shadows the earlier ``feature_models``
    in the same module.

    :param start_date: first trade date, 'YYYYMMDD'
    :param end_date: last trade date, 'YYYYMMDD'
    :param data_source: source id forwarded to the daily-feature query
    :param feature_types: unused here; kept for signature compatibility
    :param saved_feature: unused here; kept for signature compatibility
    :param bc: benchmark index code; when set, its constituents are used
    :param top_ratio: unused here; kept for signature compatibility
    :param bottom_ratio: unused here; kept for signature compatibility
    :return: None (the trained state lives in the TFMultiFactor instance)
    """
    root = get_source_root()
    feature_mapping = _get_source_features()
    # one (1, n) input shape per feature group
    feature_shape = [(1, len(item)) for item in list(feature_mapping.values())]
    # FIXME remove hardcode
    indust_shape = (1, 17)
    date_periods = _get_in_out_dates(
        start_date=start_date, end_date=end_date,
        security_id=bc) or [[start_date, end_date]]
    next_date = datetime_delta(dt=end_date, format='%Y%m%d', days=1)
    idx_labels = get_idx_returns(security_ids=[bc], start_date=start_date,
                                 end_date=next_date, source=0).get(bc)
    all_factor_returns = []
    # FIXME change to load existing model
    m = TFMultiFactor()
    m.build_model(feature_shape=feature_shape, indust_shape=indust_shape)
    for _start_date, _end_date in date_periods:
        logger.info('get factor return: processing from {0} to {1}'.format(
            _start_date, _end_date))
        _next_date = datetime_delta(dt=_end_date, format='%Y%m%d', days=2)
        if bc:
            security_ids = get_idx_cons_jy(bc, _start_date, _end_date)
        else:
            security_ids = get_security_codes()
        # FIXME HACK FOR TESTING
        security_ids = security_ids[:50]
        ret_features = get_equity_daily_features(security_ids=security_ids,
                                                 features=feature_mapping,
                                                 start_date=_start_date,
                                                 end_date=_end_date,
                                                 source=data_source)
        # BUGFIX: query returns through this period's _next_date -- the
        # original passed the stale outer-scope next_date (a leftover from
        # the rename next_date -> _next_date), unlike ret_mv below.
        ret_returns = get_equity_returns(security_ids=security_ids,
                                         start_date=_start_date,
                                         end_date=_next_date)
        ret_mv = get_market_value(security_ids=security_ids,
                                  start_date=_start_date,
                                  end_date=_next_date)
        # FIXME industry vector updates
        industry_exposure_factors, industry_name = get_indust_exposures_corr(
            start_date=_start_date, end_date=_end_date,
            stock_returns=ret_returns, period=120, bc_returns=idx_labels)
        for date, val in ret_features.items():
            daily_factors = []
            daily_return = []
            daily_mv = []
            for sec_id, f_lst in val.items():
                _val_lst = list(f_lst.values())
                all_feature_names = list(f_lst.keys())
                daily_factors.append(_val_lst)
                # BUGFIX: initialise mv before the lookup. Previously, if the
                # label lookup raised before mv was assigned, daily_mv.append
                # either hit UnboundLocalError (first security) or silently
                # reused the previous security's market value.
                mv = np.nan
                try:
                    s_label = ret_returns.get(sec_id).get(str(date))
                    i_label = idx_labels.get(str(date))
                    # excess return over the benchmark, in percent
                    label = (s_label - i_label) * 100
                    mv = ret_mv.get(sec_id).get(str(date))
                except Exception as ex:
                    label = np.nan
                    logger.error(
                        'fail to calculate the label with error:{0}'.format(
                            ex))
                daily_return.append(label)
                daily_mv.append(mv)
            try:
                # market-value-weighted preprocessing of the raw factor matrix
                daily_factors = feature_preprocessing(arr=daily_factors,
                                                      fill_none=True,
                                                      weights=daily_mv)
            except Exception as ex:
                logger.error(
                    'fail in feature preprocessing with error:{0}'.format(ex))
            # TODO country factors TBD
            courtry_factors = np.ones(len(security_ids)).reshape(
                len(security_ids), 1)
            m.train_model(
                daily_factors,
                np.array(daily_return).reshape(len(daily_return), 1),
                industry_exposure_factors,
                2,
                5,
            )
    return
def retrieve_features(start_date='20181101', end_date='20181131',
                      data_source=0, feature_types=[], bc='000300.XSHG'):
    """
    Collect daily equity features and next-trading-day excess-return labels
    for the constituents of benchmark ``bc``, and return them as a single
    DataFrame with SECURITY_ID / TRADE_DATE / LABEL columns appended.

    NOTE(review): this definition is shadowed by a later
    ``retrieve_features`` in the same module.

    :param start_date: first trade date, 'YYYYMMDD'
    :param end_date: last trade date, 'YYYYMMDD'
    :param data_source: source id forwarded to the daily-feature query
    :param feature_types: feature families forwarded to the mapping lookup
    :param bc: benchmark index code (also used for the excess-return label)
    :return: pandas DataFrame of features plus 'LABEL'
    """
    feature_mapping = get_source_feature_mappings(feature_types=feature_types)
    date_periods = _get_in_out_dates(start_date=start_date, end_date=end_date,
                                     security_id='000300.XSHG') or [
        [start_date, end_date]]
    all_labels = []
    all_features = []
    all_feature_names = []
    # next trading day after end_date, via the Wind terminal calendar
    g_next_date = w.tdaysoffset(1, end_date).Data[0][0].strftime('%Y%m%d')
    idx_labels = get_idx_returns(security_ids=[bc], start_date=start_date,
                                 end_date=g_next_date, source=0).get(bc)
    _w_ret = w.tdays(start_date, g_next_date)
    tdays = [item.strftime(config['constants']['standard_date_format'])
             for item in _w_ret.Data[0]]
    tdays = sorted(tdays)
    # BUGFIX: make sure df exists even if the final assembly below fails;
    # previously an exception there left df unbound and `return df` raised
    # NameError instead of returning a (possibly empty) frame.
    df = pd.DataFrame()
    for _start_date, _end_date in date_periods:
        next_date = _next_trading_date(tdays, _end_date)
        if not next_date:
            logger.error(
                "Fail to get the next trading date for :{0}".format(_end_date))
        security_ids = get_idx_cons_dy(bc, _start_date)
        # FIXME add some filter,e.g. halt sec
        logger.info("Start query the features from :{0} to {1}....".format(
            _start_date, _end_date))
        ret_features = get_equity_daily_features(security_ids=security_ids,
                                                 features=feature_mapping,
                                                 start_date=_start_date,
                                                 end_date=_end_date,
                                                 source=data_source)
        logger.info("Complete query the features from :{0} to {1}".format(
            _start_date, _end_date))
        logger.info(
            "Start query the market returns from :{0} to {1}....".format(
                _start_date, next_date))
        ret_labels = get_equity_returns(security_ids=security_ids,
                                        start_date=_start_date,
                                        end_date=next_date)
        logger.info(
            "Complete query the market returns from :{0} to {1}".format(
                _start_date, next_date))
        logger.info(
            "Start calculate the features from :{0} to {1}....".format(
                _start_date, _end_date))
        for date, val in ret_features.items():
            logger.info("Starting processing for date:{0}....".format(date))
            date_features = []
            date_labels = []
            for sec_id, f_lst in val.items():
                _val_lst = list(f_lst.values())
                all_feature_names = list(f_lst.keys())
                date_features.append(_val_lst)
                try:
                    # label: next trading day's return, in percent, minus the
                    # benchmark return
                    _next_date = _next_trading_date(tdays, str(date))
                    s_label = ret_labels.get(sec_id).get(str(_next_date))
                    i_label = idx_labels.get(str(_next_date))
                    label = s_label * 100 - i_label
                except Exception as ex:
                    label = np.nan
                    logger.error(
                        'Fail to calculate the label with error:{0}'.format(
                            ex))
                date_labels.append(label)
            try:
                date_features = feature_preprocessing(arr=date_features,
                                                      fill_none=True,
                                                      trade_date=date,
                                                      sec_ids=list(val.keys()),
                                                      neutralized=False)
            except Exception as ex:
                logger.error(
                    'Fail in feature preprocessing with error:{0}'.format(ex))
            logger.info('Adding sec_id and trade_dates and reshape...')
            try:
                df_shape = date_features.shape
                date_features = np.column_stack(
                    (date_features, list(val.keys()), [date] * df_shape[0]))
                # BUGFIX: removed leftover debug code -- the expression
                # `[date] * df_shape[0][0]` always raised TypeError (int is
                # not subscriptable) inside this try, so the extend calls
                # below never ran and the result frame was always empty.
                all_features.extend(date_features)
                all_labels.extend(date_labels)
            except Exception as ex:
                logger.error(
                    'fail to reshape features features with error:{0}'.format(
                        ex))
            logger.info("Complete processing for date:{0}".format(date))
        logger.info(
            "Complete calculate the features from :{0} to {1}".format(
                _start_date, _end_date))
    try:
        if 'SECURITY_ID' not in all_feature_names:
            all_feature_names.append('SECURITY_ID')
        if 'TRADE_DATE' not in all_feature_names:
            all_feature_names.append('TRADE_DATE')
        df = pd.DataFrame(all_features, columns=all_feature_names)
        df['LABEL'] = all_labels
    except Exception as ex:
        logger.error(ex)
    del ret_features
    del ret_labels
    gc.collect()
    return df
def retrieve_features(start_date='20181101', end_date='20181131',
                      data_source=0, feature_types=[], bc='000300.XSHG'):
    """
    High-frequency variant: collect minute-level features (via a Market
    cache) and next-trading-day absolute-return labels for the constituents
    of ``bc``, dump them to 'hf_feature_6m_abs.csv', and return the frame
    plus correlation matrices of three HF features against the label.

    NOTE(review): this redefinition shadows the earlier
    ``retrieve_features`` in the same module.
    NOTE(review): relies on a module-level ``hf_features`` -- not visible
    in this chunk; confirm it is defined before this runs.

    :param start_date: first trade date, 'YYYYMMDD'
    :param end_date: last trade date, 'YYYYMMDD'
    :param data_source: unused here; kept for signature compatibility
    :param feature_types: unused here; kept for signature compatibility
    :param bc: benchmark index code (used for constituents and index returns)
    :return: (df, corr_var, corr_skew, corr_kurtosis)
    """
    date_periods = _get_in_out_dates(start_date=start_date, end_date=end_date,
                                     security_id='000300.XSHG') or [
        [start_date, end_date]]
    all_labels = []
    all_features = []
    all_feature_names = []
    # next trading day after end_date, via the Wind terminal calendar
    g_next_date = w.tdaysoffset(1, end_date).Data[0][0].strftime('%Y%m%d')
    idx_labels = get_idx_returns(security_ids=[bc], start_date=start_date,
                                 end_date=g_next_date, source=0).get(bc)
    _w_ret = w.tdays(start_date, g_next_date)
    tdays = [item.strftime(config['constants']['standard_date_format'])
             for item in _w_ret.Data[0]]
    tdays = sorted(tdays)
    # BUGFIX: make sure df exists even if the final assembly below fails;
    # previously an exception there left df unbound and the to_csv / corr
    # calls raised NameError.
    df = pd.DataFrame()
    for _start_date, _end_date in date_periods:
        next_date = _next_trading_date(tdays, _end_date)
        if not next_date:
            logger.error(
                "Fail to get the next trading date for :{0}".format(_end_date))
        security_ids = get_idx_cons_dy(bc, _start_date)
        # FIXME add some filter,e.g. halt sec
        logger.info("Start query the features from :{0} to {1}....".format(
            _start_date, _end_date))
        # minute-level market cache backing the HF feature computation
        mkt = Market(security_ids)
        mkt.initialize(start_date=_start_date, end_date=next_date, source=0,
                       tick_cache='')
        ret_features = get_min_features(security_ids=security_ids,
                                        start_date=_start_date,
                                        end_date=_end_date,
                                        start_time=None, end_time=None,
                                        mkt_cache=mkt, features=hf_features)
        logger.info("Complete query the features from :{0} to {1}".format(
            _start_date, _end_date))
        logger.info(
            "Start query the market returns from :{0} to {1}....".format(
                _start_date, next_date))
        ret_labels = get_equity_returns(security_ids=security_ids,
                                        start_date=_start_date,
                                        end_date=next_date)
        logger.info(
            "Complete query the market returns from :{0} to {1}".format(
                _start_date, next_date))
        logger.info(
            "Start calculate the features from :{0} to {1}....".format(
                _start_date, _end_date))
        for date, val in ret_features.items():
            logger.info("Starting processing for date:{0}....".format(date))
            date_features = []
            date_labels = []
            for sec_id, f_lst in val.items():
                _val_lst = list(f_lst.values())
                all_feature_names = list(f_lst.keys())
                date_features.append(_val_lst)
                try:
                    # label: next trading day's ABSOLUTE return in percent
                    # (unlike the daily variant, the index return is not
                    # subtracted here)
                    _next_date = _next_trading_date(tdays, str(date))
                    s_label = ret_labels.get(sec_id).get(str(_next_date))
                    i_label = idx_labels.get(str(_next_date))
                    label = s_label * 100
                except Exception as ex:
                    label = np.nan
                    logger.error(
                        'Fail to calculate the label with error:{0}'.format(
                            ex))
                date_labels.append(label)
            try:
                date_features = feature_preprocessing(arr=date_features,
                                                      fill_none=True,
                                                      trade_date=date,
                                                      sec_ids=list(val.keys()),
                                                      neutralized=False)
            except Exception as ex:
                logger.error(
                    'Fail in feature preprocessing with error:{0}'.format(ex))
            logger.info('Adding sec_id and trade_dates and reshape...')
            try:
                df_shape = date_features.shape
                date_features = np.column_stack(
                    (date_features, list(val.keys()), [date] * df_shape[0]))
                all_features.extend(date_features)
                all_labels.extend(date_labels)
            except Exception as ex:
                logger.error(
                    'fail to reshape features features with error:{0}'.format(
                        ex))
            logger.info("Complete processing for date:{0}".format(date))
        logger.info(
            "Complete calculate the features from :{0} to {1}".format(
                _start_date, _end_date))
    try:
        if 'SECURITY_ID' not in all_feature_names:
            all_feature_names.append('SECURITY_ID')
        if 'TRADE_DATE' not in all_feature_names:
            all_feature_names.append('TRADE_DATE')
        df = pd.DataFrame(all_features, columns=all_feature_names)
        df['LABEL'] = all_labels
    except Exception as ex:
        logger.error(ex)
    del ret_features
    del ret_labels
    gc.collect()
    df.to_csv('hf_feature_6m_abs.csv')
    # correlations of selected HF features against the label; values come
    # back as strings from the stacked array, hence the float() casts
    corr_var = np.corrcoef([float(item) for item in list(df['ret_var'])],
                           df['LABEL'])
    corr_skew = np.corrcoef([float(item) for item in list(df['ret_skew'])],
                            df['LABEL'])
    corr_kurtosis = np.corrcoef(
        [float(item) for item in list(df['ret_kurtosis'])], df['LABEL'])
    print(df.corr())
    return df, corr_var, corr_skew, corr_kurtosis