def output_detaild_result(self, position_df, return_series_df, output_prefix, output_suffix):
    """Write detailed risk-on / risk-off performance CSVs to the 'output' directory.

    Parameters
    ----------
    position_df : DataFrame
        Must contain an 'ls' column (+1 = USD long / EM short, -1 = USD short /
        EM long), indexed by ValueDate.
    return_series_df : DataFrame
        Must contain a 'return' column, indexed by ValueDate.
    output_prefix, output_suffix : str
        Fragments used to build each output file name.

    NOTE(review): the method name keeps the historical misspelling 'detaild'
    because callers elsewhere in this file use it.
    """
    merged_df = pd.merge(position_df, return_series_df,
                         right_index=True, left_index=True)
    # Risk-on weeks carry ls < 0 (short USD / long EM); risk-off ls > 0.
    riskon_df = merged_df.query("ls < 0")
    riskoff_df = merged_df.query("ls > 0")

    # Share of weeks with a positive return. Vectorized comparison is
    # equivalent to the previous apply(lambda x: 1 if x > 0 else 0) — NaN
    # compares False and therefore counts as a miss in both formulations.
    riskon_hit_ratio = (riskon_df["return"] > 0).astype(int).mean()
    riskoff_hit_ratio = (riskoff_df["return"] > 0).astype(int).mean()

    # Re-align each subset onto the full date axis so the output series cover
    # every ValueDate (dates belonging to the other regime become NaN).
    date_df = pd.DataFrame(merged_df.index, columns=['ValueDate'])
    riskon_df = pd.merge(date_df, riskon_df.reset_index('ValueDate'),
                         on='ValueDate', how='left').set_index('ValueDate')
    riskoff_df = pd.merge(date_df, riskoff_df.reset_index('ValueDate'),
                          on='ValueDate', how='left').set_index('ValueDate')

    perform_measurer = PerformanceMeasurer()
    riskon_performance = perform_measurer.create_result_summary(
        riskon_df[['return']].fillna(0.0))[['return']]
    riskoff_performance = perform_measurer.create_result_summary(
        riskoff_df[['return']].fillna(0.0))[['return']]

    riskon_performance.to_csv(os.path.join(
        'output', '{0}_em_riskon_{1}.csv'.format(output_prefix, output_suffix)))
    riskoff_performance.to_csv(os.path.join(
        'output', '{0}_em_riskoff_{1}.csv'.format(output_prefix, output_suffix)))

    # FIX: fillna(method='ffill') is deprecated and removed in pandas 2.x;
    # ffill() is the long-supported equivalent.
    riskon_df.ffill().to_csv(os.path.join(
        'output', '{0}_em_riskon_series_{1}.csv'.format(output_prefix, output_suffix)))
    riskoff_df.ffill().to_csv(os.path.join(
        'output', '{0}_em_riskoff_series_{1}.csv'.format(output_prefix, output_suffix)))

    # Compact side-by-side comparison table of the two regimes.
    pd.DataFrame([[riskon_hit_ratio, riskoff_hit_ratio],
                  [riskon_performance.T.MaxDD.iloc[0],
                   riskoff_performance.T.MaxDD.iloc[0]],
                  [riskon_performance.T.AverageReturn.iloc[0],
                   riskoff_performance.T.AverageReturn.iloc[0]],
                  [riskon_performance.T.Volatility.iloc[0],
                   riskoff_performance.T.Volatility.iloc[0]]],
                 index=['HitRatio', 'MaxDD', 'Return', 'Volatility'],
                 columns=['RiskOn', 'RiskOff'])\
        .to_csv(os.path.join(
            'output', '{0}_detailed_result_{1}.csv'.format(output_prefix, output_suffix)))
def output_result(self):
    """Dump aggregated return series and per-algorithm performance CSVs.

    Writes into ``self._config.output_dir``:
      * ``all_return_series_<ts>.csv`` – raw return series per algorithm
      * ``cum_return_series_<ts>.csv`` – cumulative (summed) returns
      * ``<alg>_em_performance_<ts>.csv`` – summary stats per algorithm
      * ``total_performance_<ts>.csv`` – combined performance table
    """
    output_suffix = datetime.now().strftime('%Y%m%d%H%M%S')
    self._all_return_df.to_csv(
        os.path.join(self._config.output_dir,
                     'all_return_series_{0}.csv'.format(output_suffix)))
    self._all_return_df.cumsum().to_csv(
        os.path.join(self._config.output_dir,
                     'cum_return_series_{0}.csv'.format(output_suffix)))
    for alg in self._all_return_df.columns:
        perform_measurer = PerformanceMeasurer()
        perform_measurer.create_result_summary(self._all_return_df[[alg]])[[alg]]\
            .to_csv(os.path.join(
                self._config.output_dir,
                '{0}_em_performance_{1}.csv'.format(alg, output_suffix)))
    # FIX: .format() was previously applied to the result of os.path.join
    # (…'total_performance_{0}.csv').format(suffix) — it only worked because
    # the placeholder survived the join. Format the file name before joining.
    self._summarize_performance(output_suffix).to_csv(
        os.path.join(self._config.output_dir,
                     'total_performance_{0}.csv'.format(output_suffix)))
def output(self, output_prefix='normal', output_suffix=None):
    """Write detailed results, merged return series, the performance summary
    and the normalized FC data as CSVs under the 'output' directory.

    Parameters
    ----------
    output_prefix : str
        Prefix for every output file name (default 'normal').
    output_suffix : str or None
        Suffix for every output file name; when None the current timestamp
        (YYYYmmddHHMMSS) is used.
    """
    self._logger.info("Outputting Result...")
    suffix = (datetime.now().strftime('%Y%m%d%H%M%S')
              if output_suffix is None else output_suffix)

    # Detailed risk-on / risk-off breakdown.
    self.output_detaild_result(self._position_df, self._return_series_df,
                               output_prefix, suffix)

    # Return series merged with the trading signs. (The file name keeps the
    # historical 'reutrn' spelling so downstream consumers keep working.)
    series_path = os.path.join(
        'output', '{0}_em_reutrn_series_{1}.csv'.format(output_prefix, suffix))
    self._return_series_df.merge(self._sign_df, left_index=True,
                                 right_index=True).to_csv(series_path)

    # Overall performance measures.
    summary = PerformanceMeasurer().create_result_summary(self._return_series_df)
    summary[['return']].to_csv(os.path.join(
        'output', '{0}_em_performance_{1}.csv'.format(output_prefix, suffix)))

    self._fc_normalized_df.to_csv(os.path.join(
        'output', '{0}_fc_normalized_{1}.csv'.format(output_prefix, suffix)))
    self._logger.info("Output Process Completed.")
def main():
    """Walk-forward train/predict the AR-style algorithms and dump results.

    Reads features and cointegration weights from the configured input files,
    re-fits each algorithm on a rolling window of ``training_week`` weeks for
    every date in ``date_list``, and writes prediction / summary /
    return-series / performance CSVs into ./output.

    Exits 0 on success, 1 on unexpected failure (after logging the traceback).
    """
    logging.config.fileConfig('./logger_config.ini')
    logger = logging.getLogger("jpbank.quants")
    output_suffix = datetime.now().strftime('%Y%m%d%H%M%S')
    try:
        config = MLConfigParser()
        training_month = config.training_term
        training_week = 52
        exec_pca = config.exec_pca
        is_regression = config.is_regression
        feature_file_name = os.path.join(config.input_dir, config.feature_file)
        weight_file_name = os.path.join(config.input_dir, 'coint_vec.csv')
        check_input(feature_file_name)
        check_input(weight_file_name)
        logger.info("Trainig Term {0}".format(training_month))
        logger.info("Excecute PCA {0}".format(exec_pca))

        date_list = create_date_list(feature_file_name, training_month)
        port_label, notional = create_label(weight_file_name, feature_file_name,
                                            training_week, config.is_regression)
        # FIX: removed leftover `import pdb; pdb.set_trace()` which stopped
        # every run at this point.
        algorithm_list = [alg.ML_AR,
                          #alg.ML_ARMA,
                          #alg.ML_ARIMA
                          ]

        predict_result_df = pd.DataFrame()
        proba_result_df = pd.DataFrame()

        for algo in algorithm_list:
            algo_result_list = []
            ml_algo = algo(start_date=date_list[0] - relativedelta(months=training_month),
                           end_date=date_list[-1] + relativedelta(months=1))
            for i in tqdm(range(len(date_list))):
                value_date = date_list[i]
                logger.info("Trainig/Predicting In {0}...".format(value_date))
                # Rolling training window of `training_week` weeks ending at
                # value_date.
                start_date = value_date - relativedelta(weeks=training_week)
                logger.info("Learing In {0}".format(value_date))
                ml_algo.learn(start_date=start_date, end_date=value_date)
                algo_result_list.append([value_date,
                                         ml_algo.__class__.__name__,
                                         ml_algo.predict(value_date)])
                gc.collect()

            alg_result_df = pd.DataFrame(algo_result_list,
                                         index=date_list,
                                         columns=['ValueDate', 'Algorithm', 'Predict'])
            # FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the
            # backward-compatible equivalent.
            predict_result_df = pd.concat([predict_result_df, alg_result_df])

        # ----- result output process -----
        predict_result_df.index.name = 'ValueDate'
        proba_result_df.index.name = 'ValueDate'
        # Convert the predicted index level into a return on notional.
        predicted_return = pd.DataFrame(
            (predict_result_df.Predict
             - ml_algo.coint_index.loc[predict_result_df.index].Price)
            / notional.Notional.loc[predict_result_df.index],
            columns=['Predict'])
        predicted_return['Algorithm'] = predict_result_df.Algorithm.tolist()

        # Shared tag for every output file name.
        file_tag = '{0}_{1}_{2}_{3}'.format('PCA' if exec_pca else 'NoPCA',
                                            int(training_month),
                                            'Reg' if is_regression else 'Class',
                                            output_suffix)
        predict_result_df.to_csv('./output/predict_result_{0}.csv'.format(file_tag),
                                 index=False)
        ml_algo.coint_index.to_csv('coint_index.csv')

        result_manager = ResultManager(PredictedData=predicted_return,
                                       PredictedLabel=port_label)
        result_manager.create_result().to_csv(
            './output/summary_{0}.csv'.format(file_tag), index=False)
        return_series_df = result_manager.create_return_series()
        return_series_df.to_csv('./output/return_series_{0}.csv'.format(file_tag))

        perform_measurer = PerformanceMeasurer()
        perform_measurer.create_result_summary(return_series_df)\
            .to_csv('./output/performance_summary_{0}.csv'.format(file_tag))
        sys.exit(0)
    except InvalidFileError as ife:
        logger.error(ife.args)
    except Exception:
        # FIX: this clause was `else:` — the success branch of try — so it
        # never ran (sys.exit(0) raises SystemExit inside the try body) and
        # unexpected exceptions propagated unlogged. Log and exit non-zero.
        # SystemExit is not an Exception subclass, so sys.exit(0) still exits 0.
        import traceback
        logger.error(traceback.format_exc())
        sys.exit(1)
def main():
    """Walk-forward train/predict the configured ML algorithms and dump results.

    For every date in ``date_list`` a CointFeatureVectorManager supplies the
    training/predict feature vectors (expanding or rolling window, per
    config.fix_start_date); each algorithm is re-fit and its prediction,
    class probabilities, training metrics and feature importances are
    accumulated, then written as CSVs (plus a classification report per
    algorithm) into ./output.

    NOTE(review): this is the second ``def main()`` in the file and shadows
    the earlier one at import time — confirm which entry point is intended.

    Exits 0 on success, 1 on unexpected failure (after logging the traceback).
    """
    logging.config.fileConfig('./logger_config.ini')
    logger = logging.getLogger("jpbank.quants")
    output_suffix = datetime.now().strftime('%Y%m%d%H%M%S')
    try:
        config = MLConfigParser()
        training_month = config.training_term
        exec_pca = config.exec_pca
        is_regression = config.is_regression
        feature_file_name = os.path.join(config.input_dir, config.feature_file)
        weight_file_name = os.path.join(config.input_dir, 'coint_vec.csv')
        check_input(feature_file_name)
        check_input(weight_file_name)
        logger.info("Trainig Term {0}".format(training_month))
        logger.info("Excecute PCA {0}".format(exec_pca))

        date_list = create_date_list(feature_file_name, training_month)
        port_label = create_label(weight_file_name, feature_file_name,
                                  config.is_regression)
        # Uncomment alternatives below to run other model families.
        algorithm_list = [
            #alg.ML_Adaboost,
            #alg.ML_Bagging,
            #alg.ML_GradientBoost,
            #alg.ML_SVM,
            #alg.ML_RandomForest,
            #alg.ML_LightGBM,
            #alg.ML_XGBoost,
            #alg.ML_HistGradientBoost,
            #alg.ML_kNN,
            #alg.ML_DNN,
            #alg.ML_LSTM,
            #alg.ML_RNN,
            #alg.ML_GRU,
            alg.ML_CNN,
            #alg.ML_LinearRegression,
            #alg.ML_RidgeRegression,
            #alg.ML_LassoRegression,
            #alg.ML_ElasticNet,
            #alg.ML_BasianRegression,
            #alg.ML_ARDRegression,
        ]
        predict_result_df = pd.DataFrame()
        proba_result_df = pd.DataFrame()
        training_result_df = pd.DataFrame()
        importance_df = pd.DataFrame()

        for algo in algorithm_list:
            algo_result_list = []
            proba_result_list = []
            ml_algo = algo(IsRegression=is_regression,
                           with_grid_cv=config.with_grid_cv)
            for value_date in tqdm(date_list):
                logger.info("Trainig/Predicting In {0}...".format(value_date))
                # Fixed start date expands the training data; otherwise roll
                # a training_month window.
                if config.fix_start_date:
                    start_date = date_list[0] - relativedelta(months=training_month)
                else:
                    start_date = value_date - relativedelta(months=training_month)
                # Sequence models (RNN/LSTM/GRU/CNN modules) need MaxLen;
                # everything else gets None.
                is_sequence_model = ('ml_time_series' in ml_algo.__module__
                                     or 'ml_cnn' in ml_algo.__module__)
                feature_manager = CointFeatureVectorManager(
                    FilePath=feature_file_name,
                    TrainingStartDate=start_date,
                    PredictStartDate=value_date,
                    PredictEndDate=value_date,
                    IsRegression=is_regression,
                    ExecPCA=exec_pca,
                    MaxLen=ml_algo.maxlen if is_sequence_model else None)

                logger.info("Learing In {0}".format(value_date))
                ml_algo.dispose()
                training_label = port_label.loc[feature_manager.training_data.index]
                ml_algo.learn(training_data=feature_manager.training_data,
                              training_label=training_label,
                              tunes_param=config.parameter_tuning)
                # FIX: DataFrame.append was removed in pandas 2.0; pd.concat is
                # the backward-compatible equivalent (used throughout below).
                training_result_df = pd.concat(
                    [training_result_df,
                     calc_training_result(ml_algo, feature_manager.training_data,
                                          training_label, value_date)])
                algo_result_list.append([value_date,
                                         ml_algo.__class__.__name__,
                                         ml_algo.predict_one(feature_manager.predict_data)])
                # Classifiers (non-regression modules) also record class
                # probabilities.
                if not is_regression and ml_algo.__class__.__module__[-10:] != 'regression':
                    proba_result_list.append(
                        [value_date, ml_algo.__class__.__name__]
                        + ml_algo.predict_one_proba(feature_manager.predict_data))
                if ml_algo.__class__.__name__ in config.importance_models:
                    importance_df = pd.concat(
                        [importance_df,
                         create_importance(ml_algo.__class__.__name__,
                                           ml_algo.importance,
                                           feature_manager.training_data.columns,
                                           value_date)])
                # Post-process for each week: free the feature manager.
                feature_manager.dispose()
                del feature_manager
                gc.collect()

            alg_result_df = pd.DataFrame(algo_result_list,
                                         index=date_list,
                                         columns=['ValueDate', 'Algorithm', 'Predict'])
            predict_result_df = pd.concat([predict_result_df, alg_result_df])

            common_index = list(set(port_label.index) & set(list(alg_result_df.index)))
            # FIX: report file was opened without with/close protection — the
            # handle leaked if classification_report raised.
            with open('./output/report_{0}_{1}.txt'.format(
                    ml_algo.__class__.__name__, output_suffix), 'w') as f:
                if not is_regression:
                    proba_result_df = pd.concat(
                        [proba_result_df,
                         pd.DataFrame(proba_result_list,
                                      index=date_list,
                                      columns=['ValueDate', 'Algorithm',
                                               'DownProbability', 'UpProbability'])])
                    f.write(classification_report(
                        port_label.loc[common_index].Return,
                        alg_result_df.loc[common_index].Predict))
                else:
                    # Regression output is binarized (sign) for the report.
                    f.write(classification_report(
                        port_label.loc[common_index].Return.apply(lambda x: 1 if x > 0 else 0),
                        alg_result_df.loc[common_index].Predict.apply(lambda x: 1 if x > 0 else 0)))

        # ----- result output process -----
        predict_result_df.index.name = 'ValueDate'
        proba_result_df.index.name = 'ValueDate'
        # Shared tag for most output file names.
        file_tag = '{0}_{1}_{2}_{3}'.format('PCA' if exec_pca else 'NoPCA',
                                            int(training_month),
                                            'Reg' if is_regression else 'Class',
                                            output_suffix)
        predict_result_df.to_csv('./output/predict_result_{0}.csv'.format(file_tag),
                                 index=False)
        if not is_regression:
            proba_result_df.to_csv('./output/proba_result_{0}.csv'.format(file_tag),
                                   index=False)
        training_result_df.to_csv('./output/training_result_{0}.csv'.format(file_tag),
                                  index=False)
        # NOTE: the importance file historically uses 'No' (not 'NoPCA') in
        # its name; preserved for compatibility with downstream consumers.
        importance_df.to_csv('./output/importance_{0}_{1}_{2}_{3}.csv'
                             .format('PCA' if exec_pca else 'No',
                                     int(training_month),
                                     'Reg' if is_regression else 'Class',
                                     output_suffix),
                             index=True)

        result_manager = ResultManager(PredictedData=predict_result_df,
                                       PredictedLabel=port_label)
        result_manager.create_result().to_csv(
            './output/summary_{0}.csv'.format(file_tag), index=False)
        return_series_df = result_manager.create_return_series()
        return_series_df.to_csv('./output/return_series_{0}.csv'.format(file_tag))
        perform_measurer = PerformanceMeasurer()
        perform_measurer.create_result_summary(return_series_df)\
            .to_csv('./output/performance_summary_{0}.csv'.format(file_tag))
        sys.exit(0)
    except InvalidFileError as ife:
        logger.error(ife.args)
    except Exception:
        # FIX: this clause was `else:` — the success branch of try — so it
        # never ran (sys.exit(0) raises SystemExit inside the try body) and
        # unexpected exceptions propagated unlogged. Log and exit non-zero.
        # SystemExit is not an Exception subclass, so sys.exit(0) still exits 0.
        import traceback
        logger.error(traceback.format_exc())
        sys.exit(1)
def simulate(self):
    """Run the EM-currency risk-on/risk-off simulation and write result CSVs.

    Pipeline: compute rate returns and FC diffs, normalize each ticker's
    series per date, pick best/worst EM tickers (by normalized rate score or
    by an external expected-return file), derive a weekly risk-on flag from
    the FC series vs. self._fc_threshold, build positions, compute the
    weekly return series and dump it via the output helpers.
    """
    self._logger.info("Simulation Starting...")
    # Weekly returns of EM rate tickers and first differences of FC tickers,
    # restricted to the simulation dates.
    rate_return_df = self._calc_return(
        self._price_df[self._em_rate_tickers].loc[self._date_list])
    fc_diff_df = self._price_df[self._fc_tickers].loc[
        self._date_list].diff().dropna(axis=0)
    src_return_df = pd.merge(rate_return_df, fc_diff_df,
                             right_index=True, left_index=True)
    # Normalize every ticker series per value date; built ticker-major then
    # transposed so rows are dates, columns are tickers. Dates where any
    # normalization yields NaN are dropped.
    normalized_df = pd.DataFrame(
        [[self._normalize(src_return_df[ticker], value_date)
          for value_date in self._date_list[1:]]
         for ticker in self._em_rate_tickers + self._fc_tickers],
        index=self._em_rate_tickers + self._fc_tickers,
        columns=self._date_list[1:]).T.dropna(axis=0)
    if self._exp_return_file is None:
        # Best/worst EM ticker per date from normalized rate scores, mapped
        # through the rate->price ticker dictionary.
        self._logger.info("Selecting EM Currency Tickers usgin Rate")
        em_prior_tickers = pd.DataFrame(
            [(self._em_rate_price_dic[normalized_df[
                self._em_rate_tickers].iloc[i].idxmax()],
              self._em_rate_price_dic[normalized_df[
                  self._em_rate_tickers].iloc[i].idxmin()])
             for i in range(normalized_df.shape[0])],
            index=normalized_df.index,
            columns=['best', 'worst'])
    else:
        # Best/worst EM ticker per date from an external expected-return CSV.
        self._logger.info(
            "Selecting EM Currency Tickers usgin Expected Return")
        exp_return_df = pd.read_csv(self._exp_return_file)
        exp_return_df = cf.convert_date_format(
            exp_return_df, target_col='ValueDate').set_index('ValueDate')
        em_prior_tickers = pd.DataFrame(
            [(exp_return_df[self._price_tickers].iloc[i].idxmax(),
              exp_return_df[self._price_tickers].iloc[i].idxmin())
             for i in range(exp_return_df.shape[0])],
            index=exp_return_df.index,
            columns=['best', 'worst'])
    # fc_priority is True when the (first) FC ticker's normalized value is
    # below the threshold — interpreted below as a risk-on signal.
    if self._has_indication_diff:
        # one week delay, like Chicago FC: shift the signal forward by one
        # period, with False for the first date.
        fc_prior_tickers = pd.DataFrame([False] + normalized_df[self._fc_tickers[0]].iloc[:-1]\
                                        .apply(lambda x: True if x < self._fc_threshold else False).tolist(),
                                        index = normalized_df.index,
                                        columns = ['fc_priority'])
    else:
        fc_prior_tickers = pd.DataFrame(normalized_df[self._fc_tickers[0]]\
                                        .apply(lambda x: True if x < self._fc_threshold else False).tolist(),
                                        index = normalized_df.index,
                                        columns = ['fc_priority'])
    sign_df = pd.merge(em_prior_tickers, fc_prior_tickers,
                       right_index=True, left_index=True)
    self._logger.info("Building Position...")
    #Risk On: Long EM Ccy of Worst Score ->Position: -1(USD Short, EM Long) of Worst
    #Risk OFF: Short EM Ccy of Best Score ->Position: 1(USD Long, EM Short) of Best
    position_df = pd.DataFrame(
        [(sign_df.iloc[i]['worst'], -1.0) if sign_df.iloc[i]['fc_priority']
         else (sign_df.iloc[i]['best'], 1.0)
         for i in range(sign_df.shape[0])],
        index=sign_df.index,
        columns=['ccy', 'ls'])
    position_df.index.name = 'ValueDate'
    # Price returns either include the forward/swap adjustment or use plain
    # log returns, aligned to the position dates.
    if self._includes_swap:
        price_return_df = self._calc_return_inc_swap(
            self._price_df[self._price_tickers + [
                self._em_price_fwd_dic[k] for k in self._em_price_fwd_dic.keys()
            ]].loc[self._date_list],
            self._price_tickers,
            self._em_price_fwd_dic).loc[position_df.index]
    else:
        price_return_df = self._calc_return(
            self._price_df[self._price_tickers].loc[self._date_list],
            with_log=True).loc[position_df.index]
    self._logger.info("Calculating Perofrmance...")
    # Weekly P&L: the NEXT period's return (iloc[i + 1]) of the chosen
    # currency times the long/short sign taken at period i; the last
    # position has no following return and is excluded.
    return_series_df = pd.DataFrame([
        price_return_df[position_df.iloc[i][0]].iloc[i + 1]
        * position_df.iloc[i][1]
        for i in range(position_df.shape[0] - 1)
    ], index=position_df.index[:-1], columns=['return'])
    return_series_df.index.name = 'ValueDate'
    return_series_df['cum_return'] = return_series_df['return'].cumsum()
    #output result
    output_suffix = datetime.now().strftime('%Y%m%d%H%M%S')
    # NOTE(review): the output_detaild_result defined elsewhere in this file
    # takes (position_df, return_series_df, output_prefix, output_suffix) —
    # this 3-argument call looks inconsistent with that signature; confirm
    # which class/overload this simulate belongs to.
    self.output_detaild_result(position_df, return_series_df, output_suffix)
    pd.merge(return_series_df, sign_df, right_index=True, left_index=True)\
        .to_csv(os.path.join('output', 'em_reutrn_series_{0}.csv'.format(output_suffix)))
    perform_measurer = PerformanceMeasurer()
    perform_measurer.create_result_summary(return_series_df)[['return']]\
        .to_csv(os.path.join('output','em_performance_{0}.csv'.format(output_suffix)))
    self._logger.info("Simulation Complated.")