Esempio n. 1
0
    def output_detaild_result(self, position_df, return_series_df, output_prefix, output_suffix):
        """Write detailed risk-on / risk-off result CSVs to the ``output`` directory.

        Args:
            position_df: DataFrame with (at least) an ``ls`` column holding the
                long/short sign, indexed by date. Rows with ``ls < 0`` are
                treated as risk-on, ``ls > 0`` as risk-off.
            return_series_df: DataFrame with a ``return`` column on the same
                date index.
            output_prefix: string embedded at the start of every file name.
            output_suffix: string embedded at the end of every file name.
        """
        # Align positions and returns on their shared date index.
        merged_df = pd.merge(position_df, return_series_df, right_index=True, left_index=True)
        riskon_df = merged_df.query("ls < 0")
        riskoff_df = merged_df.query("ls > 0")
        # Hit ratio = share of strictly positive returns (NaN counts as a miss,
        # matching the original 1/0 mapping).
        riskon_hit_ratio = (riskon_df["return"] > 0).mean()
        riskoff_hit_ratio = (riskoff_df["return"] > 0).mean()

        date_df = pd.DataFrame(merged_df.index, columns=['ValueDate'])

        # Re-expand each regime subset onto the full date axis so both series
        # cover every date (dates outside the regime become NaN).
        # NOTE(review): reset_index('ValueDate') assumes the merged index is
        # named 'ValueDate' — confirm against the caller.
        riskon_df = pd.merge(date_df, riskon_df.reset_index('ValueDate'),
                             on='ValueDate', how='left').set_index('ValueDate')
        riskoff_df = pd.merge(date_df, riskoff_df.reset_index('ValueDate'),
                              on='ValueDate', how='left').set_index('ValueDate')

        perform_measurer = PerformanceMeasurer()
        # Missing dates contribute a flat 0.0 return to the summary statistics.
        riskon_performance = perform_measurer.create_result_summary(riskon_df[['return']].fillna(0.0))[['return']]
        riskoff_performance = perform_measurer.create_result_summary(riskoff_df[['return']].fillna(0.0))[['return']]
        riskon_performance.to_csv(os.path.join('output', '{0}_em_riskon_{1}.csv'.format(output_prefix, output_suffix)))
        riskoff_performance.to_csv(os.path.join('output', '{0}_em_riskoff_{1}.csv'.format(output_prefix, output_suffix)))

        # fillna(method='ffill') is deprecated (removed in pandas 3.0);
        # DataFrame.ffill() is the drop-in equivalent.
        riskon_df.ffill().to_csv(os.path.join('output', '{0}_em_riskon_series_{1}.csv'.format(output_prefix, output_suffix)))
        riskoff_df.ffill().to_csv(os.path.join('output', '{0}_em_riskoff_series_{1}.csv'.format(output_prefix, output_suffix)))

        # Compact side-by-side comparison table of the two regimes.
        pd.DataFrame([[riskon_hit_ratio, riskoff_hit_ratio],
                      [riskon_performance.T.MaxDD.iloc[0], riskoff_performance.T.MaxDD.iloc[0]],
                      [riskon_performance.T.AverageReturn.iloc[0], riskoff_performance.T.AverageReturn.iloc[0]],
                      [riskon_performance.T.Volatility.iloc[0], riskoff_performance.T.Volatility.iloc[0]]],
                     index=['HitRatio', 'MaxDD', 'Return', 'Volatility'],
                     columns=['RiskOn', 'RiskOff']).to_csv(os.path.join('output', '{0}_detailed_result_{1}.csv'.format(output_prefix, output_suffix)))
Esempio n. 2
0
    def output_result(self):
        """Write all result CSVs (raw returns, cumulative returns, per-algorithm
        performance, and the overall performance summary) to the configured
        output directory, tagged with a timestamp suffix."""
        output_suffix = datetime.now().strftime('%Y%m%d%H%M%S')
        self._all_return_df.to_csv(os.path.join(self._config.output_dir,
                                                'all_return_series_{0}.csv'.format(output_suffix)))
        self._all_return_df.cumsum().to_csv(os.path.join(self._config.output_dir,
                                                'cum_return_series_{0}.csv'.format(output_suffix)))
        # One performance summary per algorithm column.
        for alg in self._all_return_df.columns:
            perform_measurer = PerformanceMeasurer()
            perform_measurer.create_result_summary(self._all_return_df[[alg]])[[alg]]\
                .to_csv(os.path.join(self._config.output_dir,
                                     '{0}_em_performance_{1}.csv'.format(alg, output_suffix)))

        # BUG FIX: .format() was previously applied to the joined path instead
        # of the file-name template; that only worked by accident and would
        # break if output_dir contained '{' or '}'. Format the name first.
        self._summarize_performance(output_suffix).to_csv(
            os.path.join(self._config.output_dir,
                         'total_performance_{0}.csv'.format(output_suffix)))
Esempio n. 3
0
    def output(self, output_prefix='normal', output_suffix=None):
        """Write the detailed results, the merged return/sign series, the
        performance summary and the normalized FC series as CSVs under
        ``output``.

        When *output_suffix* is omitted, a timestamp-based suffix is generated.
        """
        self._logger.info("Outputting Result...")
        suffix = output_suffix if output_suffix is not None else \
            datetime.now().strftime('%Y%m%d%H%M%S')

        # Detailed result files
        self.output_detaild_result(self._position_df, self._return_series_df,
                                   output_prefix, suffix)

        # Return series merged with the trading signs
        # (the 'reutrn' spelling in the file name is historical).
        merged_series = pd.merge(self._return_series_df, self._sign_df,
                                 left_index=True, right_index=True)
        merged_series.to_csv(os.path.join(
            'output', '{0}_em_reutrn_series_{1}.csv'.format(output_prefix, suffix)))

        # Performance measures for the return column
        summary = PerformanceMeasurer().create_result_summary(self._return_series_df)
        summary[['return']].to_csv(os.path.join(
            'output', '{0}_em_performance_{1}.csv'.format(output_prefix, suffix)))

        # Normalized FC series
        self._fc_normalized_df.to_csv(os.path.join(
            'output', '{0}_fc_normalized_{1}.csv'.format(output_prefix, suffix)))
        self._logger.info("Output Process Completed.")
Esempio n. 4
0
def main():
    """Run the rolling train/predict loop for the cointegration AR model and
    write prediction, return-series and performance CSVs to ``./output``.

    Exits with status 0 on success and 1 on an unexpected error; an
    ``InvalidFileError`` is logged and the function returns normally.
    """
    logging.config.fileConfig('./logger_config.ini')
    logger = logging.getLogger("jpbank.quants")
    output_suffix = datetime.now().strftime('%Y%m%d%H%M%S')
    try:
        config = MLConfigParser()
        training_month = config.training_term
        training_week = 52  # rolling training window length, in weeks
        exec_pca = config.exec_pca
        is_regression = config.is_regression
        feature_file_name = os.path.join(config.input_dir, config.feature_file)
        weight_file_name = os.path.join(config.input_dir, 'coint_vec.csv')

        check_input(feature_file_name)
        check_input(weight_file_name)

        logger.info("Trainig Term {0}".format(training_month))
        logger.info("Excecute PCA {0}".format(exec_pca))
        date_list = create_date_list(feature_file_name, training_month)
        port_label, notional = create_label(weight_file_name,
                                            feature_file_name, training_week,
                                            config.is_regression)
        # BUG FIX: a leftover `import pdb; pdb.set_trace()` breakpoint here
        # halted every run; removed.
        algorithm_list = [
            alg.ML_AR,
        ]

        predict_result_df = pd.DataFrame()
        for algo in algorithm_list:
            algo_result_list = []
            ml_algo = algo(start_date=date_list[0] -
                           relativedelta(months=training_month),
                           end_date=date_list[-1] + relativedelta(months=1))
            for value_date in tqdm(date_list):
                logger.info("Trainig/Predicting In {0}...".format(value_date))
                # Fixed-length rolling window ending at value_date.
                start_date = value_date - relativedelta(weeks=training_week)
                logger.info("Learing In {0}".format(value_date))
                ml_algo.learn(start_date=start_date, end_date=value_date)

                algo_result_list.append([
                    value_date, ml_algo.__class__.__name__,
                    ml_algo.predict(value_date)
                ])
                gc.collect()

            alg_result_df = pd.DataFrame(
                algo_result_list,
                index=date_list,
                columns=['ValueDate', 'Algorithm', 'Predict'])

            # DataFrame.append was removed in pandas >= 2.0; concat is the
            # drop-in replacement.
            predict_result_df = pd.concat([predict_result_df, alg_result_df])

        # Result output process
        predict_result_df.index.name = 'ValueDate'
        # Convert the raw price prediction into a notional-scaled return
        # relative to the cointegration index price.
        predicted_return = pd.DataFrame(
            (predict_result_df.Predict - ml_algo.coint_index.loc[predict_result_df.index].Price)
            / notional.Notional.loc[predict_result_df.index],
            columns=['Predict'])
        predicted_return['Algorithm'] = predict_result_df.Algorithm.tolist()

        # Shared file-name tag: PCA flag, window length, task type, timestamp.
        tag = '{0}_{1}_{2}_{3}'.format('PCA' if exec_pca else 'NoPCA',
                                       int(training_month),
                                       'Reg' if is_regression else 'Class',
                                       output_suffix)
        predict_result_df.to_csv('./output/predict_result_{0}.csv'.format(tag),
                                 index=False)
        ml_algo.coint_index.to_csv('coint_index.csv')

        result_manager = ResultManager(PredictedData=predicted_return,
                                       PredictedLabel=port_label)
        result_manager.create_result().to_csv('./output/summary_{0}.csv'.format(tag),
                                              index=False)
        return_series_df = result_manager.create_return_series()
        return_series_df.to_csv('./output/return_series_{0}.csv'.format(tag))
        perform_measurer = PerformanceMeasurer()
        perform_measurer.create_result_summary(return_series_df)\
            .to_csv('./output/performance_summary_{0}.csv'.format(tag))
        sys.exit(0)
    except InvalidFileError as ife:
        logger.error(ife.args)
    except Exception:
        # BUG FIX: this was an `else:` clause, which could never run (the try
        # block always leaves via sys.exit) and left unexpected errors
        # unlogged. A catch-all handler logs the traceback and exits non-zero.
        import traceback
        logger.error(traceback.format_exc())
        sys.exit(1)
Esempio n. 5
0
def main():
    """Weekly train/predict loop over the configured ML algorithms.

    For every date in the feature file, (re)trains each algorithm on a rolling
    or expanding window, records predictions (plus class probabilities and
    feature importances where available), then writes prediction,
    training-result, importance, summary, return-series and performance CSVs
    to ``./output``.

    Exits with status 0 on success and 1 on an unexpected error; an
    ``InvalidFileError`` is logged and the function returns normally.
    """
    logging.config.fileConfig('./logger_config.ini')
    logger = logging.getLogger("jpbank.quants")
    output_suffix = datetime.now().strftime('%Y%m%d%H%M%S')
    try:
        config = MLConfigParser()
        training_month = config.training_term
        exec_pca = config.exec_pca
        is_regression = config.is_regression
        feature_file_name = os.path.join(config.input_dir, config.feature_file)
        weight_file_name = os.path.join(config.input_dir, 'coint_vec.csv')

        check_input(feature_file_name)
        check_input(weight_file_name)

        logger.info("Trainig Term {0}".format(training_month))
        logger.info("Excecute PCA {0}".format(exec_pca))
        date_list = create_date_list(feature_file_name,
                                     training_month)
        port_label = create_label(weight_file_name, feature_file_name, config.is_regression)
        # Other alg.ML_* classes (boosting, SVM, RNN/LSTM/GRU, linear models,
        # ...) can be listed here to run several algorithms in one pass.
        algorithm_list = [
                          alg.ML_CNN,
                          ]

        predict_result_df = pd.DataFrame()
        proba_result_df = pd.DataFrame()
        training_result_df = pd.DataFrame()
        importance_df = pd.DataFrame()
        for algo in algorithm_list:
            algo_result_list = []
            proba_result_list = []
            ml_algo = algo(IsRegression=is_regression, with_grid_cv=config.with_grid_cv)
            for value_date in tqdm(date_list):
                logger.info("Trainig/Predicting In {0}...".format(value_date))

                # Either fix the start date and expand the training data, or
                # roll a fixed-length window ending at value_date.
                if config.fix_start_date:
                    start_date = date_list[0] - relativedelta(months=training_month)
                else:
                    start_date = value_date - relativedelta(months=training_month)

                # Sequence models (time-series / CNN modules) need a fixed
                # input length (maxlen); others get the full window.
                feature_manager = CointFeatureVectorManager(
                    FilePath=feature_file_name,
                    TrainingStartDate=start_date,
                    PredictStartDate=value_date,
                    PredictEndDate=value_date,
                    IsRegression=is_regression,
                    ExecPCA=exec_pca,
                    MaxLen=None if 'ml_time_series' not in ml_algo.__module__
                    and 'ml_cnn' not in ml_algo.__module__ else ml_algo.maxlen)

                logger.info("Learing In {0}".format(value_date))
                ml_algo.dispose()  # drop the previous week's fitted state
                training_label = port_label.loc[feature_manager.training_data.index]
                ml_algo.learn(training_data=feature_manager.training_data,
                              training_label=training_label,
                              tunes_param=config.parameter_tuning)
                # DataFrame.append was removed in pandas >= 2.0; concat is the
                # drop-in replacement (used throughout below).
                training_result_df = pd.concat([
                    training_result_df,
                    calc_training_result(ml_algo, feature_manager.training_data,
                                         training_label,
                                         value_date)])
                algo_result_list.append([value_date,
                                         ml_algo.__class__.__name__,
                                         ml_algo.predict_one(feature_manager.predict_data)])

                # Class probabilities exist only for classifiers.
                if not is_regression and ml_algo.__class__.__module__[-10:] != 'regression':
                    proba_result_list.append([value_date, ml_algo.__class__.__name__]
                                             + ml_algo.predict_one_proba(feature_manager.predict_data))

                if ml_algo.__class__.__name__ in config.importance_models:
                    importance_df = pd.concat([
                        importance_df,
                        create_importance(ml_algo.__class__.__name__,
                                          ml_algo.importance,
                                          feature_manager.training_data.columns,
                                          value_date)])

                # Weekly post-process: release the feature data eagerly.
                feature_manager.dispose()
                del feature_manager
                gc.collect()

            alg_result_df = pd.DataFrame(algo_result_list,
                                         index=date_list,
                                         columns=['ValueDate',
                                                  'Algorithm',
                                                  'Predict'])

            predict_result_df = pd.concat([predict_result_df, alg_result_df])
            common_index = list(set(port_label.index) & set(list(alg_result_df.index)))
            # BUG FIX: the report file was opened without a context manager and
            # leaked if classification_report raised; `with` closes it always.
            with open('./output/report_{0}_{1}.txt'.format(ml_algo.__class__.__name__,
                                                           output_suffix), 'w') as f:
                if not is_regression:
                    proba_result_df = pd.concat([
                        proba_result_df,
                        pd.DataFrame(proba_result_list,
                                     index=date_list,
                                     columns=['ValueDate',
                                              'Algorithm',
                                              'DownProbability',
                                              'UpProbability'])])
                    f.write(classification_report(port_label.loc[common_index].Return,
                                                  alg_result_df.loc[common_index].Predict))
                else:
                    # Binarize regression outputs so the same classification
                    # report applies to both task types.
                    f.write(classification_report(
                        port_label.loc[common_index].Return.apply(lambda x: 1 if x > 0 else 0),
                        alg_result_df.loc[common_index].Predict.apply(lambda x: 1 if x > 0 else 0)))

        # Result output process
        predict_result_df.index.name = 'ValueDate'
        proba_result_df.index.name = 'ValueDate'
        # Shared file-name tag: PCA flag, window length, task type, timestamp.
        tag = '{0}_{1}_{2}_{3}'.format('PCA' if exec_pca else 'NoPCA',
                                       int(training_month),
                                       'Reg' if is_regression else 'Class',
                                       output_suffix)
        predict_result_df.to_csv('./output/predict_result_{0}.csv'.format(tag),
                                 index=False)
        if not is_regression:
            proba_result_df.to_csv('./output/proba_result_{0}.csv'.format(tag),
                                   index=False)
        training_result_df.to_csv('./output/training_result_{0}.csv'.format(tag),
                                  index=False)

        # NOTE(review): the importance file historically used 'No' (not
        # 'NoPCA') in its name; preserved for backward compatibility.
        importance_df.to_csv('./output/importance_{0}_{1}_{2}_{3}.csv'
                             .format('PCA' if exec_pca else 'No',
                                     int(training_month),
                                     'Reg' if is_regression else 'Class',
                                     output_suffix),
                             index=True)
        result_manager = ResultManager(PredictedData=predict_result_df,
                                       PredictedLabel=port_label)
        result_manager.create_result().to_csv('./output/summary_{0}.csv'.format(tag),
                                              index=False)
        return_series_df = result_manager.create_return_series()
        return_series_df.to_csv('./output/return_series_{0}.csv'.format(tag))
        perform_measurer = PerformanceMeasurer()
        perform_measurer.create_result_summary(return_series_df)\
            .to_csv('./output/performance_summary_{0}.csv'.format(tag))
        sys.exit(0)
    except InvalidFileError as ife:
        logger.error(ife.args)
    except Exception:
        # BUG FIX: this was an `else:` clause, which could never run (the try
        # block always leaves via sys.exit) and left unexpected errors
        # unlogged. A catch-all handler logs the traceback and exits non-zero.
        import traceback
        logger.error(traceback.format_exc())
        sys.exit(1)
Esempio n. 6
0
    def simulate(self):
        """Run the EM risk-on/risk-off currency simulation end to end.

        Pipeline: compute EM-rate returns and FC diffs, normalize them per
        date, select the best/worst EM currency (from rates or an external
        expected-return file), derive a long/short position from the FC
        signal, turn it into a return series, and write all result CSVs.
        """
        self._logger.info("Simulation Starting...")
        # Returns of the EM rate tickers over the simulation dates.
        rate_return_df = self._calc_return(
            self._price_df[self._em_rate_tickers].loc[self._date_list])
        # First differences of the FC tickers (levels, not returns).
        fc_diff_df = self._price_df[self._fc_tickers].loc[
            self._date_list].diff().dropna(axis=0)
        src_return_df = pd.merge(rate_return_df,
                                 fc_diff_df,
                                 right_index=True,
                                 left_index=True)
        # Normalize each ticker's series as of each date; the transpose makes
        # rows = dates and columns = tickers, dropping dates with any NaN.
        normalized_df = pd.DataFrame(
            [[
                self._normalize(src_return_df[ticker], value_date)
                for value_date in self._date_list[1:]
            ] for ticker in self._em_rate_tickers + self._fc_tickers],
            index=self._em_rate_tickers + self._fc_tickers,
            columns=self._date_list[1:]).T.dropna(axis=0)
        if self._exp_return_file is None:
            # Rank EM currencies by their normalized rate score; map the rate
            # ticker back to its price ticker via _em_rate_price_dic.
            self._logger.info("Selecting EM Currency Tickers usgin Rate")
            em_prior_tickers = pd.DataFrame(
                [(self._em_rate_price_dic[normalized_df[
                    self._em_rate_tickers].iloc[i].idxmax()],
                  self._em_rate_price_dic[normalized_df[
                      self._em_rate_tickers].iloc[i].idxmin()])
                 for i in range(normalized_df.shape[0])],
                index=normalized_df.index,
                columns=['best', 'worst'])
        else:
            # Rank EM currencies by an externally supplied expected return.
            self._logger.info(
                "Selecting EM Currency Tickers usgin Expected Return")
            exp_return_df = pd.read_csv(self._exp_return_file)
            exp_return_df = cf.convert_date_format(
                exp_return_df, target_col='ValueDate').set_index('ValueDate')
            em_prior_tickers = pd.DataFrame(
                [(exp_return_df[self._price_tickers].iloc[i].idxmax(),
                  exp_return_df[self._price_tickers].iloc[i].idxmin())
                 for i in range(exp_return_df.shape[0])],
                index=exp_return_df.index,
                columns=['best', 'worst'])

        # fc_priority is True when the (first) FC signal is below threshold.
        if self._has_indication_diff:  #one week delay, like Chicago FC
            # Shift the signal by one date (first date gets False).
            fc_prior_tickers = pd.DataFrame([False] + normalized_df[self._fc_tickers[0]].iloc[:-1]\
                                                    .apply(lambda x: True if x < self._fc_threshold else False).tolist(),
                                            index = normalized_df.index,
                                            columns = ['fc_priority'])
        else:
            fc_prior_tickers = pd.DataFrame(normalized_df[self._fc_tickers[0]]\
                                           .apply(lambda x: True if x < self._fc_threshold else False).tolist(),
                                            index = normalized_df.index,
                                            columns = ['fc_priority'])

        sign_df = pd.merge(em_prior_tickers,
                           fc_prior_tickers,
                           right_index=True,
                           left_index=True)

        self._logger.info("Building Position...")
        #Risk On: Long EM Ccy of Worst Score ->Position: -1(USD Short, EM Long)
        #of Worst
        #Risk OFF: Short EM Ccy of Best Score ->Position: 1(USD Long, EM Short)
        #of Best
        position_df = pd.DataFrame(
            [(sign_df.iloc[i]['worst'],
              -1.0) if sign_df.iloc[i]['fc_priority'] else
             (sign_df.iloc[i]['best'], 1.0) for i in range(sign_df.shape[0])],
            index=sign_df.index,
            columns=['ccy', 'ls'])
        position_df.index.name = 'ValueDate'
        # Price returns either include the FX forward (swap) carry or not.
        if self._includes_swap:
            price_return_df = self._calc_return_inc_swap(
                self._price_df[self._price_tickers + [
                    self._em_price_fwd_dic[k]
                    for k in self._em_price_fwd_dic.keys()
                ]].loc[self._date_list], self._price_tickers,
                self._em_price_fwd_dic).loc[position_df.index]
        else:
            price_return_df = self._calc_return(
                self._price_df[self._price_tickers].loc[self._date_list],
                with_log=True).loc[position_df.index]

        self._logger.info("Calculating Perofrmance...")
        # Realized return: NEXT period's return (iloc[i + 1]) of the currency
        # selected at period i, times the long/short sign chosen at period i.
        return_series_df = pd.DataFrame([
            price_return_df[position_df.iloc[i][0]].iloc[i + 1] *
            position_df.iloc[i][1] for i in range(position_df.shape[0] - 1)
        ],
                                        index=position_df.index[:-1],
                                        columns=['return'])
        return_series_df.index.name = 'ValueDate'
        return_series_df['cum_return'] = return_series_df['return'].cumsum()

        # Output result files, tagged with a timestamp suffix.
        output_suffix = datetime.now().strftime('%Y%m%d%H%M%S')
        # NOTE(review): output_detaild_result is called here with three
        # arguments — confirm this class's signature matches (other variants
        # of this method also take an output_prefix).
        self.output_detaild_result(position_df, return_series_df,
                                   output_suffix)
        pd.merge(return_series_df, sign_df, right_index=True, left_index=True)\
          .to_csv(os.path.join('output', 'em_reutrn_series_{0}.csv'.format(output_suffix)))
        perform_measurer = PerformanceMeasurer()
        perform_measurer.create_result_summary(return_series_df)[['return']]\
            .to_csv(os.path.join('output','em_performance_{0}.csv'.format(output_suffix)))

        self._logger.info("Simulation Complated.")