isModelTrained = os.path.isfile(f'models/{bin_model}_trained') print(f'isModelTrained {bin_model}: {isModelTrained}') if TRAIN and not isModelTrained: print( f'Number of candidates ({split}) for training in {ct_bins[0]} <= ct < {ct_bins[1]} cm: {len(train_test_data[0])}' ) print( f'signal candidates: {np.count_nonzero(train_test_data[1] == 1)}; background candidates: {np.count_nonzero(train_test_data[1] == 0)}; n_cand_bkg / n_cand_signal = {np.count_nonzero(train_test_data[1] == 0) / np.count_nonzero(train_test_data[1] == 1)}' ) model_hdl.train_test_model(train_test_data, return_prediction=True) model_file_name = str(f'models/{bin_model}_trained') if OPTIMIZE: model_file_name = str( f'models/{bin_model}_optimized_trained') model_hdl.dump_model_handler(model_file_name) elif COMPUTE_SCORES_FROM_EFF and isModelTrained: print('Model trained...') if OPTIMIZED: model_hdl.load_model_handler( f'models/{bin_model}_trained') else: model_hdl.load_model_handler( f'models/{bin_model}_trained') else: continue ct_bins_df_index = int(ct_bins[0] / 5 - 1) for ct_bins_df in zip( CT_BINS_APPLY[i_cent_bins][ct_bins_df_index][:-1], CT_BINS_APPLY[i_cent_bins][ct_bins_df_index][1:]):
def train_test(inputCfg, PtBin, OutPutDirPt, TrainTestData, iBin): #pylint: disable=too-many-statements, too-many-branches
    '''
    Train and test an XGBoost model for one pT bin, then save model and plots.

    Parameters
    ----------
    inputCfg: dict
        configuration; the keys read here are under 'ml', 'output' and 'plots'
    PtBin: sequence
        PtBin[0] and PtBin[1] are only used to build the output file names
    OutPutDirPt: str
        directory in which all output files are written
    TrainTestData: sequence
        indexed as [0] (input of the training-set prediction), [1] (truth
        compared with the training-set prediction), [2] (indexed with the
        training columns for the feature importance) and [3] (truth compared
        with the test-set prediction)
        NOTE(review): presumably the output of hipe4ml train_test_generator - confirm
    iBin: int
        index of the entry of inputCfg['ml']['hyper_par'] to be used

    Returns
    -------
    ModelHandl: ModelHandler
        the model handler after training

    Notes
    -----
    The process is terminated with sys.exit() if the configuration is invalid.
    '''
    n_classes = len(np.unique(TrainTestData[3]))
    modelClf = xgb.XGBClassifier(use_label_encoder=False)
    TrainCols = inputCfg['ml']['training_columns']
    HyperPars = inputCfg['ml']['hyper_par'][iBin]
    if not isinstance(TrainCols, list):
        print('\033[91mERROR: training columns must be defined!\033[0m')
        sys.exit()
    if not isinstance(HyperPars, dict):
        print(
            '\033[91mERROR: hyper-parameters must be defined or be an empty dict!\033[0m'
        )
        sys.exit()
    ModelHandl = ModelHandler(modelClf, TrainCols, HyperPars)

    # hyperparams optimization
    if inputCfg['ml']['hyper_par_opt']['do_hyp_opt']:
        print('Perform bayesian optimization')

        BayesOptConfig = inputCfg['ml']['hyper_par_opt']['bayes_opt_config']
        if not isinstance(BayesOptConfig, dict):
            print('\033[91mERROR: bayes_opt_config must be defined!\033[0m')
            sys.exit()

        if n_classes > 2:
            average_method = inputCfg['ml']['roc_auc_average']
            roc_method = inputCfg['ml']['roc_auc_approach']
            if not (average_method in ['macro', 'weighted']
                    and roc_method in ['ovo', 'ovr']):
                print(
                    '\033[91mERROR: selected ROC configuration is not valid!\033[0m'
                )
                sys.exit()

            if average_method == 'weighted':
                metric = f'roc_auc_{roc_method}_{average_method}'
            else:
                metric = f'roc_auc_{roc_method}'
        else:
            metric = 'roc_auc'

        print('Performing hyper-parameters optimisation: ...', end='\r')
        HypParsFileName = f'{OutPutDirPt}/HyperParOpt_pT_{PtBin[0]}_{PtBin[1]}.txt'
        # Everything printed during the optimisation goes to the text file.
        # The previous stream is remembered (instead of assuming sys.__stdout__)
        # so that a redirection set up by the caller survives, and it is put
        # back even if the optimisation raises.
        PreviousStdOut = sys.stdout
        with open(HypParsFileName, 'wt') as OutFileHypPars:
            sys.stdout = OutFileHypPars
            try:
                ModelHandl.optimize_params_bayes(
                    TrainTestData,
                    BayesOptConfig,
                    metric,
                    nfold=inputCfg['ml']['hyper_par_opt']['nfolds'],
                    init_points=inputCfg['ml']['hyper_par_opt']['initpoints'],
                    n_iter=inputCfg['ml']['hyper_par_opt']['niter'],
                    njobs=inputCfg['ml']['hyper_par_opt']['njobs'])
            finally:
                sys.stdout = PreviousStdOut
        print('Performing hyper-parameters optimisation: Done!')
        print(f'Output saved in {HypParsFileName}')
        print(f'Best hyper-parameters:\n{ModelHandl.get_model_params()}')
    else:
        ModelHandl.set_model_params(HyperPars)

    # train and test the model with the updated hyper-parameters
    yPredTest = ModelHandl.train_test_model(
        TrainTestData,
        True,
        output_margin=inputCfg['ml']['raw_output'],
        average=inputCfg['ml']['roc_auc_average'],
        multi_class_opt=inputCfg['ml']['roc_auc_approach'])
    yPredTrain = ModelHandl.predict(TrainTestData[0],
                                    inputCfg['ml']['raw_output'])

    # save model handler in pickle
    ModelHandl.dump_model_handler(
        f'{OutPutDirPt}/ModelHandler_pT_{PtBin[0]}_{PtBin[1]}.pickle')
    ModelHandl.dump_original_model(
        f'{OutPutDirPt}/XGBoostModel_pT_{PtBin[0]}_{PtBin[1]}.model', True)

    #plots
    # the 'FD' entries are optional: they are appended only when not None
    LegLabels = [
        inputCfg['output']['leg_labels']['Bkg'],
        inputCfg['output']['leg_labels']['Prompt']
    ]
    if inputCfg['output']['leg_labels']['FD'] is not None:
        LegLabels.append(inputCfg['output']['leg_labels']['FD'])
    OutputLabels = [
        inputCfg['output']['out_labels']['Bkg'],
        inputCfg['output']['out_labels']['Prompt']
    ]
    if inputCfg['output']['out_labels']['FD'] is not None:
        OutputLabels.append(inputCfg['output']['out_labels']['FD'])
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 7)
    MLOutputFig = plot_utils.plot_output_train_test(
        ModelHandl,
        TrainTestData,
        80,
        inputCfg['ml']['raw_output'],
        LegLabels,
        inputCfg['plots']['train_test_log'],
        density=True)
    if n_classes > 2:
        # with more than two classes the result is iterated, one figure per label
        for Fig, Lab in zip(MLOutputFig, OutputLabels):
            Fig.savefig(
                f'{OutPutDirPt}/MLOutputDistr{Lab}_pT_{PtBin[0]}_{PtBin[1]}.pdf'
            )
    else:
        MLOutputFig.savefig(
            f'{OutPutDirPt}/MLOutputDistr_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 9)
    ROCCurveFig = plot_utils.plot_roc(TrainTestData[3], yPredTest, None,
                                      LegLabels,
                                      inputCfg['ml']['roc_auc_average'],
                                      inputCfg['ml']['roc_auc_approach'])
    ROCCurveFig.savefig(
        f'{OutPutDirPt}/ROCCurveAll_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    # the figure object is pickled as well; the file is closed right after the dump
    with open(f'{OutPutDirPt}/ROCCurveAll_pT_{PtBin[0]}_{PtBin[1]}.pkl',
              'wb') as ROCCurvePickle:
        pickle.dump(ROCCurveFig, ROCCurvePickle)
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 9)
    ROCCurveTTFig = plot_utils.plot_roc_train_test(
        TrainTestData[3], yPredTest, TrainTestData[1], yPredTrain, None,
        LegLabels, inputCfg['ml']['roc_auc_average'],
        inputCfg['ml']['roc_auc_approach'])
    ROCCurveTTFig.savefig(
        f'{OutPutDirPt}/ROCCurveTrainTest_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    #_____________________________________________
    PrecisionRecallFig = plot_utils.plot_precision_recall(
        TrainTestData[3], yPredTest, LegLabels)
    PrecisionRecallFig.savefig(
        f'{OutPutDirPt}/PrecisionRecallAll_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (12, 7)
    FeaturesImportanceFig = plot_utils.plot_feature_imp(
        TrainTestData[2][TrainCols], TrainTestData[3], ModelHandl, LegLabels)
    # the first n_plot figures are saved per class (no label in the binary
    # case), any further figure is saved as the 'All' summary
    n_plot = n_classes if n_classes > 2 else 1
    for iFig, Fig in enumerate(FeaturesImportanceFig):
        if iFig < n_plot:
            label = OutputLabels[iFig] if n_classes > 2 else ''
            Fig.savefig(
                f'{OutPutDirPt}/FeatureImportance{label}_pT_{PtBin[0]}_{PtBin[1]}.pdf'
            )
        else:
            Fig.savefig(
                f'{OutPutDirPt}/FeatureImportanceAll_pT_{PtBin[0]}_{PtBin[1]}.pdf'
            )

    return ModelHandl
class Optimiserhipe4mltree:
    """
    Steer a signal-vs-background hipe4ml/XGBoost training for one bin.

    The constructor reads the signal and data trees, builds the train/test
    sample and creates the ModelHandler. The do_* methods then run the
    hyper-parameter optimisation, the training and the plotting.
    """
    # Class Attribute
    species = "optimiser_hipe4mltree"

    def __init__(self, data_param, binmin, binmax, training_var, bkg_sel,
                 hyper_pars):
        """
        Read the configuration, prepare the samples and create the model.

        Parameters
        ----------
        data_param: dict
            configuration; the "ml" and "hipe4ml" sections are read
        binmin, binmax:
            bin limits, only used to build the output file names
        training_var:
            currently unused: the training variables are taken from the
            signal tree in load_hipe4mlmodel
        bkg_sel: str
            selection passed to get_subset to define the background sample
        hyper_pars: dict
            hyper-parameters applied by set_hipe4ml_modelpar
        """
        self.logger = get_logger()

        # directory
        self.dirmlout = data_param["ml"]["mlout"]
        self.dirmlplot = data_param["ml"]["mlplot"]
        # NOTE(review): the input files are hard-coded absolute paths
        self.inputtreedata = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/data.root"
        self.inputtreemc = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/prompt.root"
        self.v_train = None
        self.p_binmin = binmin
        self.p_binmax = binmax

        self.s_selsigml = ""
        self.s_selbkgml = bkg_sel  #"inv_mass < 1.82 or 1.92 < inv_mass < 2.00"
        # background sample size requested = v_bkgoversigfrac * signal candidates
        self.v_bkgoversigfrac = 3
        # labels given to the signal and background samples
        self.v_sig = 1
        self.v_bkg = 0

        self.rnd_splt = data_param["ml"]["rnd_splt"]
        self.test_frac = data_param["ml"]["test_frac"]

        self.prompthandler = None
        self.signalhandler = None
        self.datahandler = None
        self.bkghandler = None
        self.traintestdata = None
        self.ypredtrain_hipe4ml = None
        self.ypredtest_hipe4ml = None

        # must run before load_hipe4mlmodel, which needs the signal handler
        self.preparesample()

        self.p_hipe4ml_model = None
        self.v_hipe4ml_pars = hyper_pars
        self.load_hipe4mlmodel()

        self.bayesoptconfig_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"][
            "bayes_opt_config"]
        self.average_method_hipe4ml = data_param["hipe4ml"]["roc_auc_average"]
        self.nfold_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["nfolds"]
        self.init_points = data_param["hipe4ml"]["hyper_par_opt"]["initpoints"]
        self.n_iter_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["niter"]
        self.njobs_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["njobs"]
        self.roc_method_hipe4ml = data_param["hipe4ml"]["roc_auc_approach"]
        self.raw_output_hipe4ml = data_param["hipe4ml"]["raw_output"]
        self.train_test_log_hipe4ml = data_param["hipe4ml"]["train_test_log"]

        self.multiclass_labels = data_param["ml"].get("multiclass_labels",
                                                      None)

        self.logger.info("Using the following training variables: %s",
                         self.v_train)

    def preparesample(self):
        """Load the signal and data trees and build the train/test sample."""
        self.logger.info("Prepare Sample for hipe4ml")

        self.signalhandler = TreeHandler(self.inputtreemc, 'treeMLDplus')
        nsigcand = self.signalhandler.get_n_cand()
        self.datahandler = TreeHandler(self.inputtreedata, 'treeMLDplus')
        # background: subset of the data tree passing the background selection
        self.bkghandler = self.datahandler.get_subset(
            self.s_selbkgml, size=nsigcand * self.v_bkgoversigfrac)
        self.traintestdata = train_test_generator(
            [self.signalhandler, self.bkghandler], [self.v_sig, self.v_bkg],
            test_size=self.test_frac,
            random_state=self.rnd_splt)

    def load_hipe4mlmodel(self):
        """Create the ModelHandler using the signal-tree variables."""
        self.logger.info("Loading hipe4ml model")
        # every variable of the signal tree is used for the training,
        # except inv_mass and pt_cand
        self.v_train = self.signalhandler.get_var_names()
        self.v_train.remove('inv_mass')
        self.v_train.remove('pt_cand')

        model_xgboost = xgb.XGBClassifier()
        self.p_hipe4ml_model = ModelHandler(model_xgboost, self.v_train)

    def set_hipe4ml_modelpar(self):
        """Apply the hyper-parameters given to the constructor."""
        self.logger.info("Setting hipe4ml hyperparameters")
        self.p_hipe4ml_model.set_model_params(self.v_hipe4ml_pars)

    def do_hipe4mlhyperparopti(self):
        """
        Run the Bayesian hyper-parameter optimisation.

        Everything printed during the optimisation is written to
        HyperParOpt_pT_<binmin>_<binmax>.txt in the ml output directory.
        """
        self.logger.info("Optimising hipe4ml hyperparameters (Bayesian)")

        if not (self.average_method_hipe4ml in ['macro', 'weighted']
                and self.roc_method_hipe4ml in ['ovo', 'ovr']):
            # NOTE(review): execution continues here unless the project
            # logger aborts on fatal - confirm
            self.logger.fatal("Selected ROC configuration is not valid!")

        if self.average_method_hipe4ml == 'weighted':
            metric = f'roc_auc_{self.roc_method_hipe4ml}_{self.average_method_hipe4ml}'
        else:
            metric = f'roc_auc_{self.roc_method_hipe4ml}'

        hypparsfile = f'{self.dirmlout}/HyperParOpt_pT_{self.p_binmin}_{self.p_binmax}.txt'
        # The previous stream is remembered (instead of assuming
        # sys.__stdout__) and restored even if the optimisation raises; the
        # file is closed by the context manager in every case.
        previous_stdout = sys.stdout
        with open(hypparsfile, 'wt') as outfilehyppars:
            sys.stdout = outfilehyppars
            try:
                self.p_hipe4ml_model.optimize_params_bayes(
                    self.traintestdata,
                    self.bayesoptconfig_hipe4ml,
                    metric,
                    nfold=self.nfold_hipe4ml,
                    init_points=self.init_points,
                    n_iter=self.n_iter_hipe4ml,
                    njobs=self.njobs_hipe4ml)
            finally:
                sys.stdout = previous_stdout
        self.logger.info("Performing hyper-parameters optimisation: Done!")

    def do_hipe4mltrain(self):
        """Train and test the model, store the predictions, dump the model."""
        self.logger.info("Training + testing hipe4ml model")
        t0 = time.time()

        # The ROC options are passed by keyword: passed positionally they
        # would land on whatever parameters follow the data argument, which
        # are not guaranteed to be the averaging options.
        self.p_hipe4ml_model.train_test_model(
            self.traintestdata,
            average=self.average_method_hipe4ml,
            multi_class_opt=self.roc_method_hipe4ml)
        self.ypredtrain_hipe4ml = self.p_hipe4ml_model.predict(
            self.traintestdata[0], self.raw_output_hipe4ml)
        self.ypredtest_hipe4ml = self.p_hipe4ml_model.predict(
            self.traintestdata[2], self.raw_output_hipe4ml)

        modelhandlerfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.pkl'
        self.p_hipe4ml_model.dump_model_handler(modelhandlerfile)
        modelfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.model'
        self.p_hipe4ml_model.dump_original_model(modelfile)

        self.logger.info("Training + testing hipe4ml: Done!")
        self.logger.info("Time elapsed = %.3f", time.time() - t0)

    def do_hipe4mlplot(self):
        """Produce the control plots and save them in the ml plot directory."""
        self.logger.info("Plotting hipe4ml model")

        leglabels = ["Background", "Prompt signal"]
        outputlabels = ["Bkg", "SigPrompt"]

        # _____________________________________________
        plot_utils.plot_distr([self.bkghandler, self.signalhandler],
                              self.v_train, 100, leglabels)
        plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96,
                            hspace=0.55, wspace=0.55)
        figname = f'{self.dirmlplot}/DistributionsAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        plt.savefig(figname)
        plt.close('all')
        # _____________________________________________
        corrmatrixfig = plot_utils.plot_corr(
            [self.bkghandler, self.signalhandler], self.v_train, leglabels)
        for figg, labb in zip(corrmatrixfig, outputlabels):
            # make the figure current so that subplots_adjust acts on it
            plt.figure(figg.number)
            plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
            figname = f'{self.dirmlplot}/CorrMatrix{labb}_pT_{self.p_binmin}_{self.p_binmax}.pdf'
            figg.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 7)
        mloutputfig = plot_utils.plot_output_train_test(
            self.p_hipe4ml_model, self.traintestdata, 80,
            self.raw_output_hipe4ml, leglabels, self.train_test_log_hipe4ml,
            density=True)
        figname = f'{self.dirmlplot}/MLOutputDistr_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        mloutputfig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvefig = plot_utils.plot_roc(self.traintestdata[3],
                                          self.ypredtest_hipe4ml, None,
                                          leglabels,
                                          self.average_method_hipe4ml,
                                          self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        roccurvefig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvettfig = plot_utils.plot_roc_train_test(
            self.traintestdata[3], self.ypredtest_hipe4ml,
            self.traintestdata[1], self.ypredtrain_hipe4ml, None, leglabels,
            self.average_method_hipe4ml, self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveTrainTest_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        roccurvettfig.savefig(figname)
        # _____________________________________________
        precisionrecallfig = plot_utils.plot_precision_recall(
            self.traintestdata[3], self.ypredtest_hipe4ml, leglabels)
        figname = f'{self.dirmlplot}/PrecisionRecallAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        precisionrecallfig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (12, 7)
        featuresimportancefig = plot_utils.plot_feature_imp(
            self.traintestdata[2][self.v_train], self.traintestdata[3],
            self.p_hipe4ml_model, leglabels)
        for i, fig in enumerate(featuresimportancefig):
            figname = (f'{self.dirmlplot}/FeatureImportanceOpt{i}_'
                       f'pT_{self.p_binmin}_{self.p_binmax}.pdf')
            fig.savefig(figname)
def train_test(inputCfg, PtMin, PtMax, OutPutDirPt, TrainTestData):
    '''
    Train an XGBoost classifier for one pT interval and save model and plots.

    The training columns and hyper-parameters are read from inputCfg['ml'];
    the process is terminated with sys.exit() if they have the wrong type.
    PtMin and PtMax are only used to build the names of the files written
    in OutPutDirPt. The trained ModelHandler is returned.
    '''
    classifier = xgb.XGBClassifier()
    ml_cfg = inputCfg['ml']
    TrainCols = ml_cfg['training_columns']
    HyperPars = ml_cfg['hyper_par']

    # configuration sanity checks, evaluated in this order
    requirements = (
        (TrainCols, list, 'ERROR: training columns must be defined!'),
        (HyperPars, dict,
         'ERROR: hyper-parameters must be defined or be an empty dict!'),
    )
    for value, wanted_type, message in requirements:
        if isinstance(value, wanted_type):
            continue
        print(message)
        sys.exit()

    handler = ModelHandler(classifier, TrainCols, HyperPars)

    # No Bayesian hyper-parameter optimisation is run here: it was disabled
    # as not working with multi-class classification at the moment.

    # fit, then evaluate the model on TrainTestData[2]
    handler.train_test_model(TrainTestData)
    test_scores = handler.predict(TrainTestData[2],
                                  inputCfg['ml']['raw_output'], True)

    # persist the handler as a pickle
    handler.dump_model_handler(
        f'{OutPutDirPt}/ModelHandler_pT_{PtMin}_{PtMax}.pickle')

    # labels for the plots and for the output file names
    legend_labels = inputCfg['output']['leg_labels']
    file_labels = inputCfg['output']['out_labels']
    test_truth = TrainTestData[3]

    # --- model output distributions, one figure per output label
    plt.rcParams["figure.figsize"] = (10, 7)
    output_figs = plot_utils.plot_output_train_test(
        handler, TrainTestData, 80, inputCfg['ml']['raw_output'],
        legend_labels, True, inputCfg['plots']['train_test_log'],
        density=True)
    for figure, tag in zip(output_figs, file_labels):
        figure.savefig(f'{OutPutDirPt}/MLOutputDistr{tag}_pT_{PtMin}_{PtMax}.pdf')

    # --- ROC curve
    plt.rcParams["figure.figsize"] = (8, 7)
    roc_fig = plot_utils.plot_roc(test_truth, test_scores, legend_labels)
    roc_fig.savefig(f'{OutPutDirPt}/ROCCurveAll_pT_{PtMin}_{PtMax}.pdf')

    # --- precision-recall curve
    pr_fig = plot_utils.plot_precision_recall(test_truth, test_scores,
                                              legend_labels)
    pr_fig.savefig(f'{OutPutDirPt}/PrecisionRecallAll_pT_{PtMin}_{PtMax}.pdf')

    # --- feature importance: the first three figures are saved per output
    #     label, every further one as the 'All' summary
    plt.rcParams["figure.figsize"] = (12, 7)
    importance_figs = plot_utils.plot_feature_imp(
        TrainTestData[2][TrainCols], test_truth, handler)
    for index, figure in enumerate(importance_figs):
        if index >= 3:
            figure.savefig(
                f'{OutPutDirPt}/FeatureImportanceAll_pT_{PtMin}_{PtMax}.pdf')
            continue
        figure.savefig(
            f'{OutPutDirPt}/FeatureImportance{OutputLabels[iFig]}_pT_{PtMin}_{PtMax}.pdf'
            if False else
            f'{OutPutDirPt}/FeatureImportance{file_labels[index]}_pT_{PtMin}_{PtMax}.pdf'
        )

    return handler
init_points=10, n_iter=20) y_pred_test = model_hdl.train_test_model(train_test_data, True, True) bdt_out_plot = pu.plot_output_train_test(model_hdl, train_test_data, 100, True, ["Signal", "Background"], True, density=True) bdt_out_plot.savefig(results_ml_path + "/bdt_output.png") if not os.path.exists(ml_model_path): os.makedirs(ml_model_path) model_hdl.dump_model_handler(ml_model_path + "/model_hndl.pkl") feature_importance_plot = pu.plot_feature_imp(train_test_data[2], train_test_data[3], model_hdl) feature_importance_plot[0].savefig(results_ml_path + "/feature_importance_1.png") feature_importance_plot[1].savefig(results_ml_path + "/feature_importance_2.png") eff_arr = np.round(np.arange(0.5, 0.99, 0.01), 2) score_eff_arr = au.score_from_efficiency_array(train_test_data[3], y_pred_test, eff_arr) if not os.path.exists(efficiencies_path): os.makedirs(efficiencies_path)