Ejemplo n.º 1
0
                isModelTrained = os.path.isfile(f'models/{bin_model}_trained')
                print(f'isModelTrained {bin_model}: {isModelTrained}')
                if TRAIN and not isModelTrained:
                    print(
                        f'Number of candidates ({split}) for training in {ct_bins[0]} <= ct < {ct_bins[1]} cm: {len(train_test_data[0])}'
                    )
                    print(
                        f'signal candidates: {np.count_nonzero(train_test_data[1] == 1)}; background candidates: {np.count_nonzero(train_test_data[1] == 0)}; n_cand_bkg / n_cand_signal = {np.count_nonzero(train_test_data[1] == 0) / np.count_nonzero(train_test_data[1] == 1)}'
                    )
                    model_hdl.train_test_model(train_test_data,
                                               return_prediction=True)
                    model_file_name = str(f'models/{bin_model}_trained')
                    if OPTIMIZE:
                        model_file_name = str(
                            f'models/{bin_model}_optimized_trained')
                    model_hdl.dump_model_handler(model_file_name)
                elif COMPUTE_SCORES_FROM_EFF and isModelTrained:
                    print('Model trained...')
                    if OPTIMIZED:
                        model_hdl.load_model_handler(
                            f'models/{bin_model}_trained')
                    else:
                        model_hdl.load_model_handler(
                            f'models/{bin_model}_trained')
                else:
                    continue

                ct_bins_df_index = int(ct_bins[0] / 5 - 1)
                for ct_bins_df in zip(
                        CT_BINS_APPLY[i_cent_bins][ct_bins_df_index][:-1],
                        CT_BINS_APPLY[i_cent_bins][ct_bins_df_index][1:]):
def train_test(inputCfg, PtBin, OutPutDirPt, TrainTestData, iBin):  #pylint: disable=too-many-statements, too-many-branches
    '''
    function for model training and testing
    '''
    n_classes = len(np.unique(TrainTestData[3]))
    modelClf = xgb.XGBClassifier(use_label_encoder=False)
    TrainCols = inputCfg['ml']['training_columns']
    HyperPars = inputCfg['ml']['hyper_par'][iBin]
    if not isinstance(TrainCols, list):
        print('\033[91mERROR: training columns must be defined!\033[0m')
        sys.exit()
    if not isinstance(HyperPars, dict):
        print(
            '\033[91mERROR: hyper-parameters must be defined or be an empty dict!\033[0m'
        )
        sys.exit()
    ModelHandl = ModelHandler(modelClf, TrainCols, HyperPars)

    # hyperparams optimization
    if inputCfg['ml']['hyper_par_opt']['do_hyp_opt']:
        print('Perform bayesian optimization')

        BayesOptConfig = inputCfg['ml']['hyper_par_opt']['bayes_opt_config']
        if not isinstance(BayesOptConfig, dict):
            print('\033[91mERROR: bayes_opt_config must be defined!\033[0m')
            sys.exit()

        if n_classes > 2:
            average_method = inputCfg['ml']['roc_auc_average']
            roc_method = inputCfg['ml']['roc_auc_approach']
            if not (average_method in ['macro', 'weighted']
                    and roc_method in ['ovo', 'ovr']):
                print(
                    '\033[91mERROR: selected ROC configuration is not valid!\033[0m'
                )
                sys.exit()

            if average_method == 'weighted':
                metric = f'roc_auc_{roc_method}_{average_method}'
            else:
                metric = f'roc_auc_{roc_method}'
        else:
            metric = 'roc_auc'

        print('Performing hyper-parameters optimisation: ...', end='\r')
        OutFileHypPars = open(
            f'{OutPutDirPt}/HyperParOpt_pT_{PtBin[0]}_{PtBin[1]}.txt', 'wt')
        sys.stdout = OutFileHypPars
        ModelHandl.optimize_params_bayes(
            TrainTestData,
            BayesOptConfig,
            metric,
            nfold=inputCfg['ml']['hyper_par_opt']['nfolds'],
            init_points=inputCfg['ml']['hyper_par_opt']['initpoints'],
            n_iter=inputCfg['ml']['hyper_par_opt']['niter'],
            njobs=inputCfg['ml']['hyper_par_opt']['njobs'])
        OutFileHypPars.close()
        sys.stdout = sys.__stdout__
        print('Performing hyper-parameters optimisation: Done!')
        print(
            f'Output saved in {OutPutDirPt}/HyperParOpt_pT_{PtBin[0]}_{PtBin[1]}.txt'
        )
        print(f'Best hyper-parameters:\n{ModelHandl.get_model_params()}')
    else:
        ModelHandl.set_model_params(HyperPars)

    # train and test the model with the updated hyper-parameters
    yPredTest = ModelHandl.train_test_model(
        TrainTestData,
        True,
        output_margin=inputCfg['ml']['raw_output'],
        average=inputCfg['ml']['roc_auc_average'],
        multi_class_opt=inputCfg['ml']['roc_auc_approach'])
    yPredTrain = ModelHandl.predict(TrainTestData[0],
                                    inputCfg['ml']['raw_output'])

    # save model handler in pickle
    ModelHandl.dump_model_handler(
        f'{OutPutDirPt}/ModelHandler_pT_{PtBin[0]}_{PtBin[1]}.pickle')
    ModelHandl.dump_original_model(
        f'{OutPutDirPt}/XGBoostModel_pT_{PtBin[0]}_{PtBin[1]}.model', True)

    #plots
    LegLabels = [
        inputCfg['output']['leg_labels']['Bkg'],
        inputCfg['output']['leg_labels']['Prompt']
    ]
    if inputCfg['output']['leg_labels']['FD'] is not None:
        LegLabels.append(inputCfg['output']['leg_labels']['FD'])
    OutputLabels = [
        inputCfg['output']['out_labels']['Bkg'],
        inputCfg['output']['out_labels']['Prompt']
    ]
    if inputCfg['output']['out_labels']['FD'] is not None:
        OutputLabels.append(inputCfg['output']['out_labels']['FD'])
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 7)
    MLOutputFig = plot_utils.plot_output_train_test(
        ModelHandl,
        TrainTestData,
        80,
        inputCfg['ml']['raw_output'],
        LegLabels,
        inputCfg['plots']['train_test_log'],
        density=True)
    if n_classes > 2:
        for Fig, Lab in zip(MLOutputFig, OutputLabels):
            Fig.savefig(
                f'{OutPutDirPt}/MLOutputDistr{Lab}_pT_{PtBin[0]}_{PtBin[1]}.pdf'
            )
    else:
        MLOutputFig.savefig(
            f'{OutPutDirPt}/MLOutputDistr_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 9)
    ROCCurveFig = plot_utils.plot_roc(TrainTestData[3], yPredTest, None,
                                      LegLabels,
                                      inputCfg['ml']['roc_auc_average'],
                                      inputCfg['ml']['roc_auc_approach'])
    ROCCurveFig.savefig(
        f'{OutPutDirPt}/ROCCurveAll_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    pickle.dump(
        ROCCurveFig,
        open(f'{OutPutDirPt}/ROCCurveAll_pT_{PtBin[0]}_{PtBin[1]}.pkl', 'wb'))
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 9)
    ROCCurveTTFig = plot_utils.plot_roc_train_test(
        TrainTestData[3], yPredTest, TrainTestData[1], yPredTrain, None,
        LegLabels, inputCfg['ml']['roc_auc_average'],
        inputCfg['ml']['roc_auc_approach'])
    ROCCurveTTFig.savefig(
        f'{OutPutDirPt}/ROCCurveTrainTest_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    #_____________________________________________
    PrecisionRecallFig = plot_utils.plot_precision_recall(
        TrainTestData[3], yPredTest, LegLabels)
    PrecisionRecallFig.savefig(
        f'{OutPutDirPt}/PrecisionRecallAll_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (12, 7)
    FeaturesImportanceFig = plot_utils.plot_feature_imp(
        TrainTestData[2][TrainCols], TrainTestData[3], ModelHandl, LegLabels)
    n_plot = n_classes if n_classes > 2 else 1
    for iFig, Fig in enumerate(FeaturesImportanceFig):
        if iFig < n_plot:
            label = OutputLabels[iFig] if n_classes > 2 else ''
            Fig.savefig(
                f'{OutPutDirPt}/FeatureImportance{label}_pT_{PtBin[0]}_{PtBin[1]}.pdf'
            )
        else:
            Fig.savefig(
                f'{OutPutDirPt}/FeatureImportanceAll_pT_{PtBin[0]}_{PtBin[1]}.pdf'
            )

    return ModelHandl
class Optimiserhipe4mltree:
    # Class Attribute
    species = "optimiser_hipe4mltree"

    def __init__(self, data_param, binmin, binmax, training_var, bkg_sel,
                 hyper_pars):

        self.logger = get_logger()

        # directory
        #self.do_mlprefilter = datap.get("doml_asprefilter", None)
        self.dirmlout = data_param["ml"]["mlout"]
        self.dirmlplot = data_param["ml"]["mlplot"]
        #if self.do_mlprefilter is True:
        #    self.dirmodel = self.dirmodel + "/prefilter"
        #    self.dirmlplot = self.dirmlplot + "/prefilter"
        #if self.do_mlprefilter is False:
        #    self.dirmodel = self.dirmodel + "/analysis"
        #    self.dirmlplot = self.dirmlplot + "/analysis"

        self.inputtreedata = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/data.root"
        self.inputtreemc = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/prompt.root"
        self.v_train = None
        self.p_binmin = binmin
        self.p_binmax = binmax

        self.s_selsigml = ""
        self.s_selbkgml = bkg_sel  #"inv_mass < 1.82 or 1.92 < inv_mass < 2.00"
        self.v_bkgoversigfrac = 3
        self.v_sig = 1
        self.v_bkg = 0
        self.rnd_splt = data_param["ml"]["rnd_splt"]
        self.test_frac = data_param["ml"]["test_frac"]

        self.prompthandler = None
        self.datahandler = None
        self.bkghandler = None
        self.traintestdata = None
        self.ypredtrain_hipe4ml = None
        self.ypredtest_hipe4ml = None

        self.preparesample()

        self.p_hipe4ml_model = None
        self.v_hipe4ml_pars = hyper_pars
        self.load_hipe4mlmodel()

        self.bayesoptconfig_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"][
            "bayes_opt_config"]
        self.average_method_hipe4ml = data_param["hipe4ml"]["roc_auc_average"]
        self.nfold_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["nfolds"]
        self.init_points = data_param["hipe4ml"]["hyper_par_opt"]["initpoints"]
        self.n_iter_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["niter"]
        self.njobs_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["njobs"]
        self.roc_method_hipe4ml = data_param["hipe4ml"]["roc_auc_approach"]
        self.raw_output_hipe4ml = data_param["hipe4ml"]["raw_output"]
        self.train_test_log_hipe4ml = data_param["hipe4ml"]["train_test_log"]

        self.multiclass_labels = data_param["ml"].get("multiclass_labels",
                                                      None)

        self.logger.info("Using the following training variables: %s",
                         self.v_train)

    def preparesample(self):
        self.logger.info("Prepare Sample for hipe4ml")

        self.signalhandler = TreeHandler(self.inputtreemc, 'treeMLDplus')
        nsigcand = self.signalhandler.get_n_cand()
        self.datahandler = TreeHandler(self.inputtreedata, 'treeMLDplus')
        self.bkghandler = self.datahandler.get_subset(self.s_selbkgml,
                                                      size=nsigcand *
                                                      self.v_bkgoversigfrac)
        self.traintestdata = train_test_generator(
            [self.signalhandler, self.bkghandler], [self.v_sig, self.v_bkg],
            test_size=self.test_frac,
            random_state=self.rnd_splt)

    def load_hipe4mlmodel(self):
        self.logger.info("Loading hipe4ml model")
        self.v_train = self.signalhandler.get_var_names()
        self.v_train.remove('inv_mass')
        self.v_train.remove('pt_cand')

        model_xgboost = xgb.XGBClassifier()
        self.p_hipe4ml_model = ModelHandler(model_xgboost, self.v_train)

    def set_hipe4ml_modelpar(self):
        self.logger.info("Setting hipe4ml hyperparameters")
        self.p_hipe4ml_model.set_model_params(self.v_hipe4ml_pars)

    def do_hipe4mlhyperparopti(self):
        self.logger.info("Optimising hipe4ml hyperparameters (Bayesian)")

        if not (self.average_method_hipe4ml in ['macro', 'weighted']
                and self.roc_method_hipe4ml in ['ovo', 'ovr']):
            self.logger.fatal("Selected ROC configuration is not valid!")

        if self.average_method_hipe4ml == 'weighted':
            metric = f'roc_auc_{self.roc_method_hipe4ml}_{self.average_method_hipe4ml}'
        else:
            metric = f'roc_auc_{self.roc_method_hipe4ml}'

        hypparsfile = f'{self.dirmlout}/HyperParOpt_pT_{self.p_binmin}_{self.p_binmax}.txt'
        outfilehyppars = open(hypparsfile, 'wt')
        sys.stdout = outfilehyppars
        self.p_hipe4ml_model.optimize_params_bayes(self.traintestdata,
                                                   self.bayesoptconfig_hipe4ml,
                                                   metric, self.nfold_hipe4ml,
                                                   self.init_points,
                                                   self.n_iter_hipe4ml,
                                                   self.njobs_hipe4ml)
        outfilehyppars.close()
        sys.stdout = sys.__stdout__
        self.logger.info("Performing hyper-parameters optimisation: Done!")

    def do_hipe4mltrain(self):
        self.logger.info("Training + testing hipe4ml model")
        t0 = time.time()

        self.p_hipe4ml_model.train_test_model(self.traintestdata,
                                              self.average_method_hipe4ml,
                                              self.roc_method_hipe4ml)
        self.ypredtrain_hipe4ml = self.p_hipe4ml_model.predict(
            self.traintestdata[0], self.raw_output_hipe4ml)
        self.ypredtest_hipe4ml = self.p_hipe4ml_model.predict(
            self.traintestdata[2], self.raw_output_hipe4ml)

        modelhandlerfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.pkl'
        self.p_hipe4ml_model.dump_model_handler(modelhandlerfile)
        modelfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.model'
        self.p_hipe4ml_model.dump_original_model(modelfile)

        self.logger.info("Training + testing hipe4ml: Done!")
        self.logger.info("Time elapsed = %.3f", time.time() - t0)

    def do_hipe4mlplot(self):
        self.logger.info("Plotting hipe4ml model")

        leglabels = ["Background", "Prompt signal"]
        outputlabels = ["Bkg", "SigPrompt"]

        # _____________________________________________
        plot_utils.plot_distr([self.bkghandler, self.signalhandler],
                              self.v_train, 100, leglabels)
        plt.subplots_adjust(left=0.06,
                            bottom=0.06,
                            right=0.99,
                            top=0.96,
                            hspace=0.55,
                            wspace=0.55)
        figname = f'{self.dirmlplot}/DistributionsAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        plt.savefig(figname)
        plt.close('all')
        # _____________________________________________
        corrmatrixfig = plot_utils.plot_corr(
            [self.bkghandler, self.signalhandler], self.v_train, leglabels)
        for figg, labb in zip(corrmatrixfig, outputlabels):
            plt.figure(figg.number)
            plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
            figname = f'{self.dirmlplot}/CorrMatrix{labb}_pT_{self.p_binmin}_{self.p_binmax}.pdf'
            figg.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 7)
        mloutputfig = plot_utils.plot_output_train_test(
            self.p_hipe4ml_model,
            self.traintestdata,
            80,
            self.raw_output_hipe4ml,
            leglabels,
            self.train_test_log_hipe4ml,
            density=True)
        figname = f'{self.dirmlplot}/MLOutputDistr_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        mloutputfig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvefig = plot_utils.plot_roc(self.traintestdata[3],
                                          self.ypredtest_hipe4ml, None,
                                          leglabels,
                                          self.average_method_hipe4ml,
                                          self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        roccurvefig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvettfig = plot_utils.plot_roc_train_test(
            self.traintestdata[3], self.ypredtest_hipe4ml,
            self.traintestdata[1], self.ypredtrain_hipe4ml, None, leglabels,
            self.average_method_hipe4ml, self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveTrainTest_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        roccurvettfig.savefig(figname)
        # _____________________________________________
        precisionrecallfig = plot_utils.plot_precision_recall(
            self.traintestdata[3], self.ypredtest_hipe4ml, leglabels)
        figname = f'{self.dirmlplot}/PrecisionRecallAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        precisionrecallfig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (12, 7)
        featuresimportancefig = plot_utils.plot_feature_imp(
            self.traintestdata[2][self.v_train], self.traintestdata[3],
            self.p_hipe4ml_model, leglabels)
        for i in range(0, len(featuresimportancefig)):
            figname = (f'{self.dirmlplot}/FeatureImportanceOpt{i}_'
                       f'pT_{self.p_binmin}_{self.p_binmax}.pdf')
            featuresimportancefig[i].savefig(figname)
Ejemplo n.º 4
0
def train_test(inputCfg, PtMin, PtMax, OutPutDirPt, TrainTestData):
    '''
    function for model training and testing
    '''
    modelClf = xgb.XGBClassifier()
    TrainCols = inputCfg['ml']['training_columns']
    HyperPars = inputCfg['ml']['hyper_par']
    if not isinstance(TrainCols, list):
        print('ERROR: training columns must be defined!')
        sys.exit()
    if not isinstance(HyperPars, dict):
        print('ERROR: hyper-parameters must be defined or be an empty dict!')
        sys.exit()
    ModelHandl = ModelHandler(modelClf, TrainCols, HyperPars)

    # hyperparams optimization --> not working with multi-class classification at the moment
    #HypRanges = {
    #    # # defines the maximum depth of a single tree (regularization)
    #    'max_depth': (1, 30),
    #    'learning_rate': (0.01, 0.3),  # learning rate
    #    'n_estimators': (50, 1000)  # number of boosting trees
    #}
    #ModelHandl.optimize_params_bayes(TrainTestData, HypRanges, None)

    # train and test the model with the updated hyperparameters
    ModelHandl.train_test_model(TrainTestData)
    yPredTest = ModelHandl.predict(TrainTestData[2],
                                   inputCfg['ml']['raw_output'], True)

    # save model handler in pickle
    ModelHandl.dump_model_handler(
        f'{OutPutDirPt}/ModelHandler_pT_{PtMin}_{PtMax}.pickle')

    #plots
    LegLabels = inputCfg['output']['leg_labels']
    OutputLabels = inputCfg['output']['out_labels']
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 7)
    MLOutputFig = plot_utils.plot_output_train_test(
        ModelHandl,
        TrainTestData,
        80,
        inputCfg['ml']['raw_output'],
        LegLabels,
        True,
        inputCfg['plots']['train_test_log'],
        density=True)
    for Fig, Lab in zip(MLOutputFig, OutputLabels):
        Fig.savefig(f'{OutPutDirPt}/MLOutputDistr{Lab}_pT_{PtMin}_{PtMax}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (8, 7)
    ROCCurveFig = plot_utils.plot_roc(TrainTestData[3], yPredTest, LegLabels)
    ROCCurveFig.savefig(f'{OutPutDirPt}/ROCCurveAll_pT_{PtMin}_{PtMax}.pdf')
    #_____________________________________________
    PrecisionRecallFig = plot_utils.plot_precision_recall(
        TrainTestData[3], yPredTest, LegLabels)
    PrecisionRecallFig.savefig(
        f'{OutPutDirPt}/PrecisionRecallAll_pT_{PtMin}_{PtMax}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (12, 7)
    FeaturesImportanceFig = plot_utils.plot_feature_imp(
        TrainTestData[2][TrainCols], TrainTestData[3], ModelHandl)
    for iFig, Fig in enumerate(FeaturesImportanceFig):
        if iFig < 3:
            Fig.savefig(
                f'{OutPutDirPt}/FeatureImportance{OutputLabels[iFig]}_pT_{PtMin}_{PtMax}.pdf'
            )
        else:
            Fig.savefig(
                f'{OutPutDirPt}/FeatureImportanceAll_pT_{PtMin}_{PtMax}.pdf')

    return ModelHandl
                                        init_points=10,
                                        n_iter=20)

    y_pred_test = model_hdl.train_test_model(train_test_data, True, True)

    bdt_out_plot = pu.plot_output_train_test(model_hdl,
                                             train_test_data,
                                             100,
                                             True, ["Signal", "Background"],
                                             True,
                                             density=True)
    bdt_out_plot.savefig(results_ml_path + "/bdt_output.png")

    if not os.path.exists(ml_model_path):
        os.makedirs(ml_model_path)
    model_hdl.dump_model_handler(ml_model_path + "/model_hndl.pkl")

    feature_importance_plot = pu.plot_feature_imp(train_test_data[2],
                                                  train_test_data[3],
                                                  model_hdl)
    feature_importance_plot[0].savefig(results_ml_path +
                                       "/feature_importance_1.png")
    feature_importance_plot[1].savefig(results_ml_path +
                                       "/feature_importance_2.png")

    eff_arr = np.round(np.arange(0.5, 0.99, 0.01), 2)
    score_eff_arr = au.score_from_efficiency_array(train_test_data[3],
                                                   y_pred_test, eff_arr)

    if not os.path.exists(efficiencies_path):
        os.makedirs(efficiencies_path)