# Assumed imports, reconstructed for this flattened section (hedged): project
# helpers such as get_source_root, load_json_file, logger and config are
# expected to come from the quant_models package, as in the strategy script
# further below.
import copy
import gc
import os
import pprint
import sqlite3 as sqlite
import time
import traceback
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import talib as ta
from WindPy import w


def get_market_prediction(start_date='20050103', end_date='20181231',
                          input_period=12, predict_period=6):
    root = get_source_root()
    path = os.path.join(root, 'data', 'macro_indicators.xlsx')
    df = pd.read_excel(path)
    # '指标ID' = indicator id, '分项' = sub-category.
    econ_ids, cat = list(df['指标ID']), list(df['分项'])
    factor_cats = dict(zip(econ_ids, cat))
    dates, factors, ids = get_econ_data(start_date=start_date,
                                        end_date=end_date)
    fetcher = DataFetcher(source=0)
    rows, desc = fetcher.get_data_fetcher_obj().get_mkt_equd(
        security_ids=['000300.XSHG'], start_date=start_date,
        end_date=end_date, asset_type='idx',
        fields=['CLOSE_INDEX', 'PRE_CLOSE_INDEX'])
    train_y = []
    ma_lst = []
    # Smooth each macro factor with an input_period-month SMA, dropping the
    # warm-up values and the last (unlabeled) observation.
    for item in factors:
        ma_lst.append(
            ta.SMA(np.array(item, dtype=float),
                   input_period)[input_period - 1:-1])
    ma_lst = np.array(ma_lst).transpose()
    close_idx = desc.index('CLOSE_INDEX')
    trade_date_idx = desc.index('TRADE_DATE')
    monthly_return = {}
    tmp_rows = {}
    mon_dates = get_all_month_start_end_dates(start_date, end_date)
    for item in rows:
        tmp_rows[item[trade_date_idx].strftime('%Y%m%d')] = item[close_idx]
    # Month-start to month-end index return, keyed by 'YYYYMM'.
    for som, eom in mon_dates:
        monthly_return[som[:6]] = \
            (tmp_rows.get(eom) - tmp_rows.get(som)) / tmp_rows.get(som)
    # Binary labels: 1.0 for a non-negative monthly return, 0.0 otherwise.
    for m in dates[input_period:]:
        tmp = monthly_return.get(m[:6]) or 0.0
        train_y.append(0.0 if tmp < 0 else 1.0)
    factor_names = [factor_cats.get(idx) for idx in ids]
    scores = feature_selection_sort(x=ma_lst, y=train_y,
                                    feature_names=factor_names,
                                    sort_type='pearson')
    # '相关性' = correlation, '指标名' = indicator name.
    score_df = pd.DataFrame(scores, columns=['相关性', '指标名'])
    score_df.to_csv('scores_{0}_months.csv'.format(input_period))
    return scores
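
# Hedged usage sketch (illustrative, not part of the original module): how
# get_market_prediction is typically invoked. Assumes the Wind terminal is
# connected (w.start()) and data/macro_indicators.xlsx exists; the
# (score, name) row layout follows the '相关性'/'指标名' columns above.
def _demo_market_prediction():
    scores = get_market_prediction(start_date='20050103',
                                   end_date='20181231', input_period=12)
    # Print the five macro indicators most correlated with next-month
    # CSI 300 direction.
    for corr, name in scores[:5]:
        print(corr, name)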

def get_selected_features():
    root = get_source_root()
    feature_source = os.path.join(os.path.realpath(root), 'data', 'features')
    # TODO pick the score file matching a requested date range instead of
    # the single file named by the strategy config.
    _score_path = os.path.join(
        feature_source,
        'score{0}_{1}_{2}.csv'.format(
            config['feature_mining_strategy']['benchmark'].split('.')[0],
            config['feature_mining_strategy']['start_date'],
            config['feature_mining_strategy']['end_date']))
    score_df = pd.read_csv(_score_path)
    score_df = score_df.sort_values(by='score', ascending=False)
    _feature_names = list(score_df['feature'])
    # Keep the best_feature_ratio share of features, split evenly between the
    # most positively and most negatively correlated ends of the ranking.
    _score_bound = int(
        len(_feature_names)
        * float(config['feature_mining_strategy']['best_feature_ratio']) / 2)
    feature_names = list(
        set(_feature_names[:_score_bound + 1]).union(
            set(_feature_names[-_score_bound:])))
    return feature_names
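
# Hedged usage sketch: get_selected_features reads everything it needs from
# config['feature_mining_strategy'], so the call takes no arguments. The
# benchmark/date values in the comment are illustrative.
def _demo_selected_features():
    # With benchmark='000300.XSHG', start_date='20150103' and
    # end_date='20181231' in the config, this reads
    # data/features/score000300_20150103_20181231.csv.
    names = get_selected_features()
    print(len(names), sorted(names)[:5])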

def get_tick_l2_features(security_id='', trade_date='', start_time='',
                         end_time=''):
    root = get_source_root()
    l2_path = os.path.join(os.path.realpath(root), 'data', 'features',
                           'level2', trade_date)
    tick_path = os.path.join(
        l2_path, 'l2tick',
        '{0}_{1}.csv'.format(security_id.split('.')[0], trade_date))
    tick_df = pd.read_csv(tick_path)
    # Tick timestamps are HHMMSSmmm integers, so scale the HHMMSS bounds.
    _start_time_int = int(start_time) * 1000
    _end_time_int = int(end_time) * 1000
    _tick_df = tick_df[(tick_df.Time >= _start_time_int)
                       & (tick_df.Time < _end_time_int)].copy()
    _tick_df.drop_duplicates(keep='last', inplace=True)
    # Depth-5 accumulated volumes and their imbalance.
    _tick_df['acc_bid_volume5'] = (
        _tick_df.BidVolume1 + _tick_df.BidVolume2 + _tick_df.BidVolume3
        + _tick_df.BidVolume4 + _tick_df.BidVolume5)
    _tick_df['acc_offer_volume5'] = (
        _tick_df.OfferVolume1 + _tick_df.OfferVolume2 + _tick_df.OfferVolume3
        + _tick_df.OfferVolume4 + _tick_df.OfferVolume5)
    _tick_df['quote_imbalance5'] = (_tick_df['acc_bid_volume5']
                                    - _tick_df['acc_offer_volume5'])
    _tick_df['amplitude1'] = ((_tick_df['High'] - _tick_df['Low'])
                              / _tick_df['LastPrice'])
    ret = cal_tick_l2_features(
        df=_tick_df,
        features=['quote_imbalance5', 'quote_imbalance5_std', 'ampl',
                  'ampl_std'])
    return ret, _tick_df
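
# Hedged usage sketch for get_tick_l2_features; the security id and trade date
# below are placeholders. Times are HHMMSS strings, matched against the
# millisecond tick timestamps as above.
def _demo_tick_l2_features():
    feats, tick_df = get_tick_l2_features(security_id='600000.XSHG',
                                          trade_date='20181102',
                                          start_time='093000',
                                          end_time='100000')
    # feats holds the quote-imbalance/amplitude statistics computed by
    # cal_tick_l2_features; tick_df is the filtered tick frame.
    print(feats)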

def __init__(self):
    root = get_source_root()
    # The SQLite feature cache lives alongside the csv feature files.
    feature_source = os.path.join(os.path.realpath(root), 'data', 'features',
                                  'cache_features')
    try:
        self._conn = sqlite.connect(feature_source)
    except Exception:
        traceback.print_exc()

def get_significant_features(top_ratio=0.5, bottom_ratio=0.2):
    ret = defaultdict(dict)
    f_types = set()
    root = get_source_root()
    for start_date, end_date in [('20150103', '20181231')]:
        corr_path = os.path.join(
            os.path.realpath(root), 'conf',
            'score_{0}_{1}.csv'.format(start_date, end_date))
        df = pd.read_csv(corr_path)
        # Group (feature, score) pairs by feature type.
        score_ret = defaultdict(list)
        for idx, k, s, ft in df.values:
            score_ret[ft].append([k, s])
            f_types.add(ft)
        for ft, val in score_ret.items():
            logger.debug('%s %d', ft, len(val))
            val.sort(key=lambda x: x[1], reverse=True)
            t_dict = ret.get(ft) or dict()
            top_dict = t_dict.get('top_features') or list()
            bottom_dict = t_dict.get('bottom_features') or list()
            # Keep at least 5 top and 1 bottom feature per type.
            top_idx = max(int(len(val) * top_ratio), 5)
            bottom_idx = max(int(len(val) * bottom_ratio), 1)
            top_dict.extend(item[0] for item in val[:top_idx])
            bottom_dict.extend(item[0] for item in val[-bottom_idx:])
            t_dict.update({'top_features': top_dict,
                           'bottom_features': bottom_dict})
            ret[ft] = t_dict
    # Flatten to {feature_type: [unique top + bottom feature names]}.
    for ft, val in ret.items():
        top_lst = list(set(val['top_features']))
        top_lst.extend(set(val['bottom_features']))
        ret[ft] = top_lst
    return ret
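
# Hedged sketch of the structure get_significant_features returns:
# {feature_type: [feature names]}, the union of top- and bottom-ranked names.
def _demo_significant_features():
    sig = get_significant_features(top_ratio=0.5, bottom_ratio=0.2)
    for f_type, names in sig.items():
        print(f_type, len(names))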

def result_analysis():
    result = pd.read_pickle('output_result.pkl')
    pprint.pprint(result)
    root = get_source_root()
    # Skip the first entry and dump each remaining result frame to csv.
    for k, v in list(result.items())[1:]:
        result_path = os.path.join(root, 'data', 'results',
                                   '{0}.csv'.format(k))
        print('save the result to:{0}'.format(result_path))
        v.to_csv(result_path)

def get_source_feature_mappings(feature_types=None):
    root = get_source_root()
    feature_mapping = load_json_file(
        os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json'))
    if not feature_types:
        return feature_mapping
    # Drop mappings whose feature type was not requested.
    _tmp = copy.deepcopy(feature_mapping)
    for k in _tmp:
        if k not in feature_types:
            feature_mapping.pop(k)
    return feature_mapping

def get_econ_data(start_date='', end_date='', fields=None):
    path = os.path.join(get_source_root(), 'data', 'macro_indicators.xlsx')
    df = pd.read_excel(path)
    ids = fields or list(df['指标ID'])
    print(len(ids))
    # Pull the indicator series from Wind, forward-filling gaps.
    ret = w.edb(','.join(ids), beginTime=start_date, endTime=end_date,
                options="Fill=Previous")
    dates = [item.strftime('%Y%m%d') for item in ret.Times]
    return dates, ret.Data, ids
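
# Hedged usage sketch: get_econ_data returns parallel structures - 'YYYYMMDD'
# date strings, one data series per indicator id, and the ids themselves.
def _demo_econ_data():
    dates, factors, ids = get_econ_data(start_date='20150101',
                                        end_date='20151231')
    assert len(factors) == len(ids)
    print(len(dates), 'dates,', len(ids), 'indicators')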

def train_features(start_date='', end_date='', bc='000300.XSHG'):
    root = get_source_root()
    df = load_cache_features(start_date, end_date, bc)
    # Correlate every feature column (all but the trailing four bookkeeping
    # columns) against the LABEL column.
    cols = list(df.columns)[:-4]
    cols.append('LABEL')
    df_corr = df[cols].corr()
    score_df = pd.DataFrame(
        {'feature': list(df_corr.iloc[:, -1].index)[:-1],
         'score': list(df_corr.iloc[:, -1].values)[:-1]})
    score_path = os.path.join(
        os.path.realpath(root), 'data', 'features',
        'score{0}_{1}_{2}.csv'.format(bc.split('.')[0], start_date, end_date))
    score_df.to_csv(score_path, index=False)
    return df, score_df
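
# Hedged usage sketch: train_features both returns the scores and writes them
# to data/features/score000300_{start}_{end}.csv for the default benchmark.
def _demo_train_features():
    df, score_df = train_features(start_date='20150103', end_date='20181231')
    # One row per feature with its Pearson correlation against LABEL.
    print(score_df.head())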

def get_sw1_indust_code(sec_ids=None, trade_date=''):
    root = get_source_root()
    indust_path = os.path.join(root, 'data', 'features', 'sw1_indust.csv')
    df = pd.read_csv(indust_path)
    ret = {}
    for sec_id in sec_ids or []:
        _df = df[df.secID == sec_id]
        # Each row is one industry-membership interval; is_new appears to
        # mark the currently effective record (no out_date check needed).
        for idx, sec_id, in_code, into_date, out_date, is_new in _df.values:
            trade_date_str = str(trade_date)
            if not is_new:
                if (into_date.replace('-', '') <= trade_date_str
                        <= out_date.replace('-', '')):
                    ret[sec_id] = in_code
            elif into_date.replace('-', '') <= trade_date_str:
                ret[sec_id] = in_code
    return ret
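
# Hedged usage sketch for get_sw1_indust_code; the security ids below are
# placeholders. The result maps each id to its Shenwan level-1 industry code
# effective on the trade date.
def _demo_sw1_indust():
    codes = get_sw1_indust_code(sec_ids=['600000.XSHG', '000001.XSHE'],
                                trade_date='20181102')
    print(codes)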

def feature_refine():
    root = get_source_root()
    feature_mapping = load_json_file(
        os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json'))
    score_file = os.path.join(
        os.path.realpath(root), 'conf',
        'testing_train_features_score_20160103_20171230.csv')
    df = pd.read_csv(score_file)
    # Tag each scored feature with its feature type from the mapping.
    # NOTE: assumes each feature appears under exactly one feature type,
    # otherwise type_lst and df lengths diverge.
    type_lst = []
    for item in df.values:
        f, s = item[1:]
        for k, v in feature_mapping.items():
            if f in v:
                type_lst.append(k)
    df['feature_type'] = type_lst
    save_file = os.path.join(os.path.realpath(root), 'conf',
                             'score_20160103_20171230.csv')
    df.to_csv(save_file)

def initialize(self, start_date='', end_date='', source=0, tick_cache='qto'):
    _df = g_db_fetcher.get_data_fetcher_obj(source)
    _min_mkt = _df.get_mkt_mins(startdate=start_date, enddate=end_date,
                                sec_codes=self.security_ids,
                                table_name='CUST.EQUITY_PRICEMIN')
    df = pd.DataFrame(_min_mkt[0], columns=_min_mkt[1])
    # Minute-bar returns; the first bar gets a zero return by duplicating
    # the first close.
    close_lst = list(df['CLOSEPRICE'])
    close_lst.insert(0, close_lst[0])
    return_lst = []
    for idx in range(len(close_lst) - 1):
        return_lst.append(
            (close_lst[idx + 1] - close_lst[idx]) / close_lst[idx])
    df['RETURN'] = return_lst
    df['RET2VOL'] = df['RETURN'] / df['VOLUME']
    self.min_cache = df
    # Retrieve the trading calendar from Wind, then cache level-2 tick files.
    _w_ret = w.tdays(start_date, end_date)
    t_dates = list(
        set([item.strftime('%Y%m%d') for item in _w_ret.Data[0]]))
    if 'q' in tick_cache and self.security_ids and t_dates:
        df_lst = []
        root = get_source_root()
        for security_id in self.security_ids:
            for trade_date in t_dates:
                l2_path = os.path.join(os.path.realpath(root), 'data',
                                       'features', 'level2', trade_date)
                tick_path = os.path.join(
                    l2_path, 'l2tick',
                    '{0}_{1}.csv'.format(security_id.split('.')[0],
                                         trade_date))
                df_lst.append(pd.read_csv(tick_path))
        if df_lst:
            # Cache the tick frames for later feature calculation.
            self.q_cache = df_lst

def get_l2_features(security_id='', trade_date='', start_time='', end_time=''):
    root = get_source_root()
    l2_path = os.path.join(os.path.realpath(root), 'data', 'features',
                           'level2', trade_date)
    order_path = os.path.join(
        l2_path, 'l2order',
        '{0}_{1}.csv'.format(security_id.split('.')[0], trade_date))
    tick_path = os.path.join(
        l2_path, 'l2tick',
        '{0}_{1}.csv'.format(security_id.split('.')[0], trade_date))
    trade_path = os.path.join(
        l2_path, 'l2trade',
        '{0}_{1}.csv'.format(security_id.split('.')[0], trade_date))
    tick_df = pd.read_csv(tick_path)
    order_df = pd.read_csv(order_path)
    trade_df = pd.read_csv(trade_path)
    print(order_df.shape)
    _start_time_int = int(start_time)
    _end_time_int = int(end_time)
    # TODO: the order/tick/trade frames and the time bounds are loaded but
    # not yet used.

def load_cache_features(start_date='', end_date='', bc='000300.XSHG'):
    print('Loading features from :{0} to {1}'.format(start_date, end_date))
    _w_ret = w.tdays(start_date, end_date)
    # One cache file per month, named features{benchmark}_{YYYYMM}.csv to
    # match the files written by cache_features.
    t_months = sorted(set(item.strftime('%Y%m') for item in _w_ret.Data[0]))
    root = get_source_root()
    feature_paths = [
        os.path.join(os.path.realpath(root), 'data', 'features',
                     'features{0}_{1}.csv'.format(bc.split('.')[0], m_date))
        for m_date in t_months]
    if feature_paths:
        df = pd.concat(pd.read_csv(p) for p in feature_paths)
        df = df[df.TRADE_DATE >= int(start_date.replace('-', ''))]
        df = df[df.TRADE_DATE < int(end_date.replace('-', ''))]
        return df

def create_features_table():
    db = SQLiteHelper()
    root = get_source_root()
    feature_mapping = load_json_file(
        os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json'))
    fields = []
    for item in feature_mapping.values():
        fields.extend(item)
    # Build a CREATE TABLE statement with one REAL column per feature.
    table_name = 'FEATURE_CACHE'
    s1 = ("CREATE TABLE {0} (TICKER_SYMBOL INT, TRADE_DATE TEXT,"
          "SECURITY_ID TEXT,D_LABEL REAL,M_LABEL REAL, ").format(table_name)
    for f in fields:
        s1 += "{0} REAL,".format(f)
    s1 = s1[:-1] + ')'
    print(s1)
    try:
        db.execute_sql(s1)
    except Exception as ex:
        print(ex)
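
# Hedged illustration of the statement create_features_table builds, with
# FEATURE_A/FEATURE_B standing in for the mapped feature names:
#   CREATE TABLE FEATURE_CACHE (TICKER_SYMBOL INT, TRADE_DATE TEXT,
#       SECURITY_ID TEXT, D_LABEL REAL, M_LABEL REAL,
#       FEATURE_A REAL, FEATURE_B REAL)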

def show_feature_correlate():
    root = get_source_root()
    tops = []
    bottoms = []
    for start_date, end_date in [('20180103', '20181231'),
                                 ('20140603', '20160202'),
                                 ('20160103', '20171231')]:
        corr_path = os.path.join(
            os.path.realpath(root), 'conf',
            'feature_score_{0}_{1}.csv'.format(start_date, end_date))
        df = pd.read_csv(corr_path)
        df = df.sort_values(by='score', ascending=False)
        # Keep features with |score| > 0.1 on either side of zero.
        tops.append(list(df[df.score > 0.1]['feature']))
        bottoms.append(list(df[df.score < -0.1]['feature']))
    # Union across all three periods, positive and negative ends alike.
    t = set().union(*tops).union(*bottoms)
    return list(t)

def get_feature_heatmap(dates=[], bc='000300.XSHG'):
    root = get_source_root()
    source_path = os.path.join(os.path.realpath(root), 'data', 'features')
    _bc = bc.split('.')[0]
    files = sorted(item for item in os.listdir(source_path)
                   if item.startswith('score') and _bc in item)
    # Column labels: the start date embedded in each score file name.
    y_values = [item.split('.')[-2].split('_')[1] for item in files]
    f_mapping = get_source_feature_mappings()
    ret_f_score = defaultdict(list)
    for file in files:
        f_type_score = defaultdict(list)
        df = pd.read_csv(os.path.join(source_path, file))
        for item in list(df.values):
            for _type, _lst in f_mapping.items():
                if item[0] in _lst:
                    f_type_score[_type].append(item[1])
        # Average the per-feature scores within each feature type.
        for k, v in f_type_score.items():
            ret_f_score[k].append(sum(v) / len(v))
    pprint.pprint(ret_f_score)
    x_values = list(ret_f_score.keys())
    x = list(ret_f_score.values())
    ax = sns.heatmap(pd.DataFrame(x, index=x_values, columns=y_values))
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    plt.savefig('feature_heatmap_{0}.jpg'.format(_bc))

def cache_features(start_date='20180101', end_date='20181231', data_source=0,
                   feature_types=[], bc='000300.XSHG', sufix=None):
    '''
    Cache the features to csv, either as a single file named by `sufix`
    or split into one file per month.
    :param start_date:
    :param end_date:
    :param data_source:
    :param feature_types:
    :param bc: benchmark index code
    :param sufix: optional file-name suffix; when given, a single csv is
        written and the function returns early
    :return:
    '''
    logger.info("Start retrieving features from {0} to {1}....".format(
        start_date, end_date))
    df = retrieve_features(start_date=start_date, end_date=end_date,
                           data_source=data_source,
                           feature_types=feature_types, bc=bc)
    logger.info("Completed retrieving features from {0} to {1}".format(
        start_date, end_date))
    root = get_source_root()
    logger.info("Start saving the features....")
    if sufix:
        feature_source = os.path.join(
            os.path.realpath(root), 'data', 'features',
            'features{0}_{1}.csv'.format(bc.split('.')[0], sufix))
        df.to_csv(feature_source)
        del df
        return
    # Otherwise split by month ('YYYYMM' from the 'YYYYMMDD' trade date).
    df['MONTH'] = [item[:6] for item in df['TRADE_DATE']]
    m_dates = set(df['MONTH'])
    n_mdates = len(m_dates)
    for idx, m_date in enumerate(m_dates):
        _df = df[df.MONTH == m_date]
        feature_source = os.path.join(
            os.path.realpath(root), 'data', 'features',
            'features{0}_{1}.csv'.format(bc.split('.')[0], m_date))
        logger.info('Saving feature file {0} of {1}: {2}'.format(
            idx + 1, n_mdates, feature_source))
        _df.to_csv(feature_source, index=False)
    del df
    gc.collect()

import os
from collections import defaultdict

from quant_models.utils.helper import get_source_root
from quant_models.utils.helper import get_config
from quant_models.utils.helper import get_parent_dir
from quant_models.applications.feature_mining.model_selection import get_selected_features
from quant_models.applications.feature_mining.model_selection import train_models
from quant_models.applications.feature_mining.feature_selection import load_cache_features
from quant_models.data_processing.features_calculation import get_sw_indust
from sklearn import decomposition
# Deprecated upstream; newer scikit-learn needs a plain `import joblib`.
from sklearn.externals import joblib

model_name = 'linear'
config = get_config()
strategy_config = config['feature_mining_strategy']
# TODO change the path of the backtesting results
root = get_source_root()
# w.start()


def init(context):
    # The model file name follows train_models' naming scheme:
    # {model}_{start}_{end}_{feature_ratio}_{benchmark}.
    model_path = os.path.join(get_parent_dir(), 'data', 'models',
                              'ridge_20150103_20181231_0.9_000905.ZICN')
    feature_names = get_selected_features()
    # __config__ is presumably injected by the backtesting platform.
    context.features = load_cache_features(
        __config__['base']['start_date'],
        __config__['base']['end_date'],
        __config__['base']['benchmark'])
    context.model = joblib.load(model_path)

def get_equity_daily_features(security_ids=[],
                              features={'ma': ['ACD6', 'ACD20']},
                              start_date=20181101, end_date=20181102,
                              trade_date=None, source=0):
    logger.info(
        'Start calculating features from {0} to {1} for {2} sec_ids and '
        '{3} feature types'.format(start_date, end_date, len(security_ids),
                                   len(features)))
    ret_features = defaultdict(dict)
    # Query a single date when trade_date is given.
    if trade_date:
        start_date = end_date = trade_date
    if isinstance(start_date, str):
        start_date = int(start_date)
    if isinstance(end_date, str):
        end_date = int(end_date)
    retrieve_feature_names = list()
    for f_type, f_val in features.items():
        retrieve_feature_names.extend(f_val)
    retrieve_feature_names = list(set(retrieve_feature_names))
    root = get_source_root()
    feature_mapping = load_json_file(
        os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json'))
    source_features = []
    for item in feature_mapping.values():
        source_features.extend(item)
    # Features not present in the source mapping must be derived locally.
    cal_features = list(set(retrieve_feature_names) - set(source_features))
    _df = g_db_fetcher.get_data_fetcher_obj(source)
    excluded = ['CREATE_TIME', 'UPDATE_TIME', 'TMSTAMP', 'ID',
                'SECURITY_ID_INT', 'SECURITY_ID', 'TRADE_DATE',
                'TICKER_SYMBOL']
    retrieve_feature_names = [item.upper() for item in retrieve_feature_names]
    for f_type, f_fields in features.items():
        if not f_fields:
            continue
        rows, desc = _df.get_equ_factor(fields=f_fields, factor_type=f_type,
                                        security_ids=security_ids,
                                        start_date=start_date,
                                        end_date=end_date)
        id_idx = desc.index('SECURITY_ID')
        date_idx = desc.index('TRADE_DATE')
        # Column positions of the requested features; identical for every
        # row, so compute them once per factor type.
        idx_lst = [idx for idx, val in enumerate(desc)
                   if val.upper() in retrieve_feature_names]
        for item in rows:
            sec_id, date = item[id_idx], item[date_idx]
            curr_dict = ret_features[date].get(sec_id, {})
            tmp_dict = {desc[idx]: item[idx] for idx in idx_lst
                        if desc[idx] not in excluded}
            # Add the pre-defined calculated features.
            tmp_dict = get_cal_features(tmp_dict, cal_features)
            if tmp_dict:
                curr_dict.update(tmp_dict)
            if curr_dict:
                ret_features[date][sec_id] = curr_dict
        del rows
        gc.collect()
        time.sleep(3)
    # Align every security to the same feature set: pad missing features
    # with None and drop unrequested ones.
    for date, val in ret_features.items():
        for sec_id, _val in val.items():
            _keys = set(_val.keys())
            for _k in set(retrieve_feature_names) - _keys:
                _val[_k] = None
            for _k in _keys - set(retrieve_feature_names):
                _val.pop(_k)
    # FIXME verify all securities now share the same feature length
    return ret_features
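
# Hedged usage sketch: querying a single day of 'ma'-type factors. The nested
# result layout {trade_date: {security_id: {feature: value}}} follows the
# population logic above; the security ids are placeholders.
def _demo_daily_features():
    feats = get_equity_daily_features(
        security_ids=['600000.XSHG', '000001.XSHE'],
        features={'ma': ['ACD6', 'ACD20']},
        trade_date=20181102)
    for date, by_sec in feats.items():
        for sec_id, values in by_sec.items():
            print(date, sec_id, values)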

def feature_models(start_date='20181101', end_date='20181131', data_source=0,
                   feature_types=[], saved_feature=True, bc=None,
                   top_ratio=0.25, bottom_ratio=0.2):
    root = get_source_root()
    feature_mapping = _get_source_features()
    date_periods = _get_in_out_dates(
        start_date=start_date, end_date=end_date,
        security_id=bc) or [[start_date, end_date]]
    next_date = datetime_delta(dt=end_date, format='%Y%m%d', days=1)
    idx_labels = get_idx_returns(security_ids=[bc], start_date=start_date,
                                 end_date=next_date, source=0).get(bc)
    all_factor_returns = []
    for _start_date, _end_date in date_periods:
        logger.info('get factor return: processing from {0} to {1}'.format(
            _start_date, _end_date))
        next_date = datetime_delta(dt=_end_date, format='%Y%m%d', days=2)
        if bc:
            security_ids = get_idx_cons_jy(bc, _start_date, _end_date)
        else:
            security_ids = get_security_codes()
        # FIXME HACK FOR TESTING
        security_ids = security_ids[:5]
        ret_features = get_equity_daily_features(security_ids=security_ids,
                                                 features=feature_mapping,
                                                 start_date=_start_date,
                                                 end_date=_end_date,
                                                 source=data_source)
        ret_returns = get_equity_returns(security_ids=security_ids,
                                         start_date=_start_date,
                                         end_date=next_date)
        ret_mv = get_market_value(security_ids=security_ids,
                                  start_date=_start_date, end_date=next_date)
        # Industry exposures are required by the regression below; computed
        # as in get_factor_returns (time-series regression on stock returns).
        industry_exposure_factors = \
            get_indust_exposures_time_series_regression(
                start_date=_start_date, end_date=_end_date,
                stock_returns=ret_returns)
        for date, val in ret_features.items():
            daily_factors = []
            daily_return = []
            daily_mv = []
            for sec_id, f_lst in val.items():
                daily_factors.append(list(f_lst.values()))
                try:
                    s_label = ret_returns.get(sec_id).get(str(date))
                    i_label = idx_labels.get(str(date))
                    # Excess return over the benchmark, in percent.
                    label = (s_label - i_label) * 100
                    mv = ret_mv.get(sec_id).get(str(date))
                except Exception as ex:
                    label = np.nan
                    mv = np.nan
                    logger.error(
                        'fail to calculate the label with error:{0}'.format(
                            ex))
                daily_return.append(label)
                daily_mv.append(mv)
            try:
                daily_factors = feature_preprocessing(arr=daily_factors,
                                                      fill_none=True,
                                                      weights=daily_mv)
            except Exception as ex:
                logger.error(
                    'fail in feature preprocessing with error:{0}'.format(ex))
            country_factors = np.ones(len(security_ids)).reshape(
                len(security_ids), 1)
            factor_returns = factor_return_regression(
                country_factors, industry_exposure_factors, daily_factors,
                daily_return)
            all_factor_returns.append(factor_returns)
    mean_returns = np.array(all_factor_returns).mean(axis=0)
    return mean_returns

def get_factor_returns(start_date='20181101', end_date='20181131',
                       data_source=0, feature_types=[], saved_feature=True,
                       bc=None, top_ratio=0.25, bottom_ratio=0.2):
    '''
    Multi-factor framework: cross-sectional regression to estimate the
    factor returns.
    :param start_date:
    :param end_date:
    :param data_source:
    :param feature_types:
    :param saved_feature:
    :param bc: benchmark index code
    :param top_ratio:
    :param bottom_ratio:
    :return: mean factor returns across all dates
    '''
    root = get_source_root()
    feature_mapping = _get_source_features() or get_source_feature_mappings(
        feature_types=feature_types)
    date_periods = _get_in_out_dates(
        start_date=start_date, end_date=end_date,
        security_id='000300.XSHG') or [[start_date, end_date]]
    next_date = datetime_delta(dt=end_date, format='%Y%m%d', days=1)
    idx_labels = get_idx_returns(security_ids=[bc], start_date=start_date,
                                 end_date=next_date, source=0).get(bc)
    all_factor_returns = []
    for _start_date, _end_date in date_periods:
        logger.info('get factor return: processing from {0} to {1}'.format(
            _start_date, _end_date))
        next_date = datetime_delta(dt=_end_date, format='%Y%m%d', days=2)
        if bc:
            security_ids = get_idx_cons_jy(bc, _start_date, _end_date)
        else:
            security_ids = get_security_codes()
        # FIXME HACK FOR TESTING
        security_ids = security_ids[:5]
        ret_features = get_equity_daily_features(security_ids=security_ids,
                                                 features=feature_mapping,
                                                 start_date=_start_date,
                                                 end_date=_end_date,
                                                 source=data_source)
        ret_returns = get_equity_returns(security_ids=security_ids,
                                         start_date=_start_date,
                                         end_date=next_date)
        ret_mv = get_market_value(security_ids=security_ids,
                                  start_date=_start_date, end_date=next_date)
        # FIXME uses the future industry return; should be updated to trace
        # back the history data by windows
        industry_exposure_factors = \
            get_indust_exposures_time_series_regression(
                start_date=_start_date, end_date=_end_date,
                stock_returns=ret_returns)
        _industry_exposure_df = pd.DataFrame(industry_exposure_factors)
        _industry_exposure_df.to_csv('{0}_{1}_{2}.csv'.format(
            _start_date, _end_date, bc))
        for date, val in ret_features.items():
            daily_factors = []
            daily_return = []
            daily_mv = []
            for sec_id, f_lst in val.items():
                daily_factors.append(list(f_lst.values()))
                try:
                    s_label = ret_returns.get(sec_id).get(str(date))
                    i_label = idx_labels.get(str(date))
                    # Excess return over the benchmark, in percent.
                    label = (s_label - i_label) * 100
                    mv = ret_mv.get(sec_id).get(str(date))
                except Exception as ex:
                    label = np.nan
                    mv = np.nan
                    logger.error(
                        'fail to calculate the label with error:{0}'.format(
                            ex))
                daily_return.append(label)
                daily_mv.append(mv)
            try:
                daily_factors = feature_preprocessing(arr=daily_factors,
                                                      fill_none=True,
                                                      weights=daily_mv)
            except Exception as ex:
                logger.error(
                    'fail in feature preprocessing with error:{0}'.format(ex))
            country_factors = np.ones(len(security_ids)).reshape(
                len(security_ids), 1)
            factor_returns = factor_return_regression(
                country_factors, industry_exposure_factors, daily_factors,
                daily_return)
            all_factor_returns.append(factor_returns)
    mean_returns = np.array(all_factor_returns).mean(axis=0)
    return mean_returns

def train_models(model_name='', start_date='20140603', end_date='20181231',
                 feature_ratio=1.0, bc='000300.XSHG', feature_df=None,
                 score_df=None, cache_df=False):
    '''
    :param model_name:
    :param start_date:
    :param end_date:
    :param feature_ratio: share of features to keep, split between the
        top and bottom of the score ranking
    :param bc: benchmark index code
    :param feature_df: pre-computed feature frame, used when cache_df is True
    :param score_df: pre-computed score frame, used when cache_df is True
    :param cache_df: when True, use feature_df/score_df instead of recomputing
    :return: (mse_scores, r2_scores)
    '''
    m = Ml_Reg_Model(model_name)
    model_full_name = '{0}_{1}_{2}_{3}_{4}'.format(model_name, start_date,
                                                   end_date, feature_ratio,
                                                   bc)
    if not m.load_model(model_full_name):
        m.build_model()
    root = get_source_root()
    if not cache_df:
        df, score_df = train_features(start_date=start_date,
                                      end_date=end_date, bc=bc)
    else:
        df = feature_df
    # Keep the feature_ratio share of features, half from the top and half
    # from the bottom of the score ranking.
    score_df = score_df.sort_values(by='score', ascending=False)
    _feature_names = list(score_df['feature'])
    _score_bound = int(len(_feature_names) * feature_ratio / 2)
    feature_names = list(
        set(_feature_names[:_score_bound + 1]).union(
            set(_feature_names[-_score_bound:])))
    train_X = df[feature_names].values
    train_Y = df.iloc[:, -1]
    decom_ratio = float(config['feature_mining_strategy']['component_ratio'])
    # PCA: cap the component count by both the ratio and the configured limit.
    n_component = min(int(len(feature_names) * decom_ratio),
                      int(config['feature_mining_strategy']['n_component']))
    pca = decomposition.PCA(n_components=n_component)
    train_X = pca.fit_transform(pd.DataFrame(train_X).fillna(method='ffill'))
    train_Y = train_Y.fillna(0.0)
    st = time.time()
    logger.info('start training the models')
    mse_scores, r2_scores = m.train_model(train_X, train_Y)
    et = time.time()
    logger.info('complete training model with time:{0}'.format(et - st))
    m.save_model(model_full_name)
    # Report a 3-sigma range for each cross-validation metric.
    logger.info('Mean squared error: %0.5f - %0.5f'
                % (mse_scores.mean() - mse_scores.std() * 3,
                   mse_scores.mean() + mse_scores.std() * 3))
    logger.info('R2 score: %0.5f - %0.5f'
                % (r2_scores.mean() - r2_scores.std() * 3,
                   r2_scores.mean() + r2_scores.std() * 3))
    result_path = os.path.join(os.path.realpath(root), 'data', 'results',
                               'feature_model_selection.txt')
    with open(result_path, 'a+') as fout:
        fout.write('{0}\n'.format(model_full_name))
        fout.write('mse: {0}\n r2_score:{1}\n'.format(str(list(mse_scores)),
                                                      str(list(r2_scores))))
    return mse_scores, r2_scores
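
# Hedged usage sketch: training a ridge model on the cached CSI 300 features.
# The persisted model name follows the naming scheme above,
# ridge_20150103_20181231_1.0_000300.XSHG in this case.
def _demo_train_models():
    mse_scores, r2_scores = train_models(model_name='ridge',
                                         start_date='20150103',
                                         end_date='20181231',
                                         feature_ratio=1.0,
                                         bc='000300.XSHG')
    print(mse_scores.mean(), r2_scores.mean())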

def feature_models(start_date='20181101', end_date='20181131', data_source=0,
                   feature_types=[], saved_feature=True, bc=None,
                   top_ratio=0.25, bottom_ratio=0.2):
    root = get_source_root()
    feature_mapping = _get_source_features()
    feature_shape = [(1, len(item)) for item in feature_mapping.values()]
    # FIXME remove hardcoded industry count
    indust_shape = (1, 17)
    date_periods = _get_in_out_dates(
        start_date=start_date, end_date=end_date,
        security_id=bc) or [[start_date, end_date]]
    next_date = datetime_delta(dt=end_date, format='%Y%m%d', days=1)
    idx_labels = get_idx_returns(security_ids=[bc], start_date=start_date,
                                 end_date=next_date, source=0).get(bc)
    # FIXME change to load an existing model
    m = TFMultiFactor()
    m.build_model(feature_shape=feature_shape, indust_shape=indust_shape)
    for _start_date, _end_date in date_periods:
        logger.info('get factor return: processing from {0} to {1}'.format(
            _start_date, _end_date))
        _next_date = datetime_delta(dt=_end_date, format='%Y%m%d', days=2)
        if bc:
            security_ids = get_idx_cons_jy(bc, _start_date, _end_date)
        else:
            security_ids = get_security_codes()
        # FIXME HACK FOR TESTING
        security_ids = security_ids[:50]
        ret_features = get_equity_daily_features(security_ids=security_ids,
                                                 features=feature_mapping,
                                                 start_date=_start_date,
                                                 end_date=_end_date,
                                                 source=data_source)
        ret_returns = get_equity_returns(security_ids=security_ids,
                                         start_date=_start_date,
                                         end_date=_next_date)
        ret_mv = get_market_value(security_ids=security_ids,
                                  start_date=_start_date,
                                  end_date=_next_date)
        # FIXME industry vector updates
        industry_exposure_factors, industry_name = get_indust_exposures_corr(
            start_date=_start_date, end_date=_end_date,
            stock_returns=ret_returns, period=120, bc_returns=idx_labels)
        for date, val in ret_features.items():
            daily_factors = []
            daily_return = []
            daily_mv = []
            for sec_id, f_lst in val.items():
                daily_factors.append(list(f_lst.values()))
                try:
                    s_label = ret_returns.get(sec_id).get(str(date))
                    i_label = idx_labels.get(str(date))
                    # Excess return over the benchmark, in percent.
                    label = (s_label - i_label) * 100
                    mv = ret_mv.get(sec_id).get(str(date))
                except Exception as ex:
                    label = np.nan
                    mv = np.nan
                    logger.error(
                        'fail to calculate the label with error:{0}'.format(
                            ex))
                daily_return.append(label)
                daily_mv.append(mv)
            try:
                daily_factors = feature_preprocessing(arr=daily_factors,
                                                      fill_none=True,
                                                      weights=daily_mv)
            except Exception as ex:
                logger.error(
                    'fail in feature preprocessing with error:{0}'.format(ex))
            # TODO country factors TBD
            country_factors = np.ones(len(security_ids)).reshape(
                len(security_ids), 1)
            m.train_model(daily_factors,
                          np.array(daily_return).reshape(len(daily_return),
                                                         1),
                          industry_exposure_factors, 2, 5)
    return