def test_plot_feature_imp():
    """
    Check that the feature importance plot is returned as a list
    """
    # DATA[0] / DATA[1] are handed over as the sample and its labels, followed
    # by the model, its training columns and the value 20 as last argument
    training_columns = MODEL.get_training_columns()
    feature_imp_figs = plot_utils.plot_feature_imp(
        DATA[0], DATA[1], MODEL, training_columns, 20)
    assert isinstance(feature_imp_figs, list)
def save_ML_plots(self, model_handler, data, eff_score_array, cent_class, pt_range, ct_range, split=''):
    """
    Save the ML diagnostic plots (BDT score, BDT efficiency, feature importance) as PDF files.

    The output root directory is read from the environment variable
    ``HYPERML_FIGURES_<self.mode>``; the plots are written to its
    ``TrainTest``, ``Efficiency`` and ``FeatureImp`` sub-directories, which
    are created if missing.

    Parameters
    ----------
    model_handler: model handler passed to the plot_utils functions; its
        original model must expose ``get_booster().feature_names``
    data: indexable train/test data set; ``data`` as a whole is used for the
        score plot, ``data[2]`` / ``data[3]`` for the feature importance
    eff_score_array: indexable; element 1 and element 0 are passed (in this
        order) to ``plot_utils.plot_bdt_eff``
    cent_class, pt_range, ct_range: two-element sequences, only used to build
        the file-name suffix
    split: optional extra suffix appended to the file names

    Raises
    ------
    KeyError: if the ``HYPERML_FIGURES_<self.mode>`` variable is not set
    """
    fig_path = os.environ['HYPERML_FIGURES_{}'.format(self.mode)]
    info_string = f'_{cent_class[0]}{cent_class[1]}_{pt_range[0]}{pt_range[1]}_{ct_range[0]}{ct_range[1]}{split}'

    bdt_score_dir = fig_path + '/TrainTest'
    bdt_eff_dir = fig_path + '/Efficiency'
    feat_imp_dir = fig_path + '/FeatureImp'

    # --- BDT score distribution (train vs test) ---
    bdt_score_plot = plot_utils.plot_output_train_test(model_handler, data, bins=100, log=True)
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(bdt_score_dir, exist_ok=True)
    bdt_score_plot.savefig(bdt_score_dir + '/BDT_Score' + info_string + '.pdf')
    # release the figure: this method is typically called once per bin
    plt.close(bdt_score_plot)

    # --- BDT efficiency as a function of the score ---
    bdt_eff_plot = plot_utils.plot_bdt_eff(eff_score_array[1], eff_score_array[0])
    os.makedirs(bdt_eff_dir, exist_ok=True)
    bdt_eff_plot.savefig(bdt_eff_dir + '/BDT_Eff' + info_string + '.pdf')
    plt.close(bdt_eff_plot)

    # --- feature importance ---
    feature_names = model_handler.get_original_model().get_booster().feature_names
    feat_imp = plot_utils.plot_feature_imp(data[2][feature_names], data[3], model_handler)
    os.makedirs(feat_imp_dir, exist_ok=True)
    # as before, the *current* figure (the last one opened) is the one saved
    plt.savefig(feat_imp_dir + '/FeatImp' + info_string + '.pdf')
    plt.close()
    # when a list of figures is returned, close the ones that are still open
    if isinstance(feat_imp, (list, tuple)):
        for fig in feat_imp:
            plt.close(fig)

    print('ML plots saved.\n')
train_y_score = model_hdl.predict(train_test_data_cent[0]) # second condition needed because of issue with Qt libraries if MAKE_TRAIN_TEST_PLOT and not MAKE_PRESELECTION_EFFICIENCY: if not os.path.isdir(f'{PLOT_DIR}/train_test_out'): os.mkdir(f'{PLOT_DIR}/train_test_out') plot_utils.plot_output_train_test(model_hdl, train_test_data_cent, logscale=True, density=True, labels=leg_labels) plt.savefig( f'{PLOT_DIR}/train_test_out/{bin_df}_out.pdf') plot_utils.plot_feature_imp(train_test_data_cent[0], train_test_data_cent[1], model_hdl) plt.savefig( f'{PLOT_DIR}/train_test_out/feature_imp_training_{bin_df}.pdf' ) plot_utils.plot_roc_train_test(train_test_data_cent[3], test_y_score, train_test_data_cent[1], train_y_score, labels=leg_labels) plt.savefig( f'{PLOT_DIR}/train_test_out/roc_train_test_{bin_df}.pdf' ) plt.close('all') if COMPUTE_SCORES_FROM_EFF:
def train_test(inputCfg, PtBin, OutPutDirPt, TrainTestData, iBin): #pylint: disable=too-many-statements, too-many-branches
    '''
    function for model training and testing

    Optionally runs a bayesian optimisation of the hyper-parameters, trains and
    tests the model, dumps the model handler / model and saves the control
    plots (ML output, ROC curves, precision-recall, feature importance) in
    OutPutDirPt.

    Parameters
    -----------------
    - inputCfg: nested configuration dictionary; the 'ml', 'output' and
      'plots' sections are read
    - PtBin: two-element sequence, only used to build the output file names
    - OutPutDirPt: output directory (it is not created here)
    - TrainTestData: indexable data set; elements 0/1 are used as training
      sample/labels and elements 2/3 as test sample/labels
    - iBin: index of the entry of inputCfg['ml']['hyper_par'] to be used

    Returns
    -----------------
    - ModelHandl: the trained ModelHandler

    The process exits with status 1 if the configuration is not valid.
    '''
    n_classes = len(np.unique(TrainTestData[3]))
    modelClf = xgb.XGBClassifier(use_label_encoder=False)
    TrainCols = inputCfg['ml']['training_columns']
    HyperPars = inputCfg['ml']['hyper_par'][iBin]
    # configuration errors exit with a non-zero status, so that calling
    # scripts can detect the failure
    if not isinstance(TrainCols, list):
        print('\033[91mERROR: training columns must be defined!\033[0m')
        sys.exit(1)
    if not isinstance(HyperPars, dict):
        print(
            '\033[91mERROR: hyper-parameters must be defined or be an empty dict!\033[0m'
        )
        sys.exit(1)
    ModelHandl = ModelHandler(modelClf, TrainCols, HyperPars)

    # hyperparams optimization
    if inputCfg['ml']['hyper_par_opt']['do_hyp_opt']:
        print('Perform bayesian optimization')

        BayesOptConfig = inputCfg['ml']['hyper_par_opt']['bayes_opt_config']
        if not isinstance(BayesOptConfig, dict):
            print('\033[91mERROR: bayes_opt_config must be defined!\033[0m')
            sys.exit(1)

        if n_classes > 2:
            average_method = inputCfg['ml']['roc_auc_average']
            roc_method = inputCfg['ml']['roc_auc_approach']
            if not (average_method in ['macro', 'weighted']
                    and roc_method in ['ovo', 'ovr']):
                print(
                    '\033[91mERROR: selected ROC configuration is not valid!\033[0m'
                )
                sys.exit(1)

            if average_method == 'weighted':
                metric = f'roc_auc_{roc_method}_{average_method}'
            else:
                metric = f'roc_auc_{roc_method}'
        else:
            metric = 'roc_auc'

        print('Performing hyper-parameters optimisation: ...', end='\r')
        HypParsFileName = f'{OutPutDirPt}/HyperParOpt_pT_{PtBin[0]}_{PtBin[1]}.txt'
        # the optimiser output is redirected to file; try/finally and the
        # context manager guarantee that stdout is restored and the file is
        # closed even if the optimisation raises
        with open(HypParsFileName, 'wt') as OutFileHypPars:
            sys.stdout = OutFileHypPars
            try:
                ModelHandl.optimize_params_bayes(
                    TrainTestData,
                    BayesOptConfig,
                    metric,
                    nfold=inputCfg['ml']['hyper_par_opt']['nfolds'],
                    init_points=inputCfg['ml']['hyper_par_opt']['initpoints'],
                    n_iter=inputCfg['ml']['hyper_par_opt']['niter'],
                    njobs=inputCfg['ml']['hyper_par_opt']['njobs'])
            finally:
                sys.stdout = sys.__stdout__
        print('Performing hyper-parameters optimisation: Done!')
        print(
            f'Output saved in {OutPutDirPt}/HyperParOpt_pT_{PtBin[0]}_{PtBin[1]}.txt'
        )
        print(f'Best hyper-parameters:\n{ModelHandl.get_model_params()}')
    else:
        ModelHandl.set_model_params(HyperPars)

    # train and test the model with the updated hyper-parameters
    yPredTest = ModelHandl.train_test_model(
        TrainTestData,
        True,
        output_margin=inputCfg['ml']['raw_output'],
        average=inputCfg['ml']['roc_auc_average'],
        multi_class_opt=inputCfg['ml']['roc_auc_approach'])
    yPredTrain = ModelHandl.predict(TrainTestData[0],
                                    inputCfg['ml']['raw_output'])

    # save model handler in pickle
    ModelHandl.dump_model_handler(
        f'{OutPutDirPt}/ModelHandler_pT_{PtBin[0]}_{PtBin[1]}.pickle')
    ModelHandl.dump_original_model(
        f'{OutPutDirPt}/XGBoostModel_pT_{PtBin[0]}_{PtBin[1]}.model', True)

    #plots
    # the FD entry is optional: it is appended only when it is configured
    LegLabels = [
        inputCfg['output']['leg_labels']['Bkg'],
        inputCfg['output']['leg_labels']['Prompt']
    ]
    if inputCfg['output']['leg_labels']['FD'] is not None:
        LegLabels.append(inputCfg['output']['leg_labels']['FD'])
    OutputLabels = [
        inputCfg['output']['out_labels']['Bkg'],
        inputCfg['output']['out_labels']['Prompt']
    ]
    if inputCfg['output']['out_labels']['FD'] is not None:
        OutputLabels.append(inputCfg['output']['out_labels']['FD'])
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 7)
    MLOutputFig = plot_utils.plot_output_train_test(
        ModelHandl,
        TrainTestData,
        80,
        inputCfg['ml']['raw_output'],
        LegLabels,
        inputCfg['plots']['train_test_log'],
        density=True)
    # with more than two classes the result is iterated (one figure per
    # label), otherwise it is used as a single figure
    if n_classes > 2:
        for Fig, Lab in zip(MLOutputFig, OutputLabels):
            Fig.savefig(
                f'{OutPutDirPt}/MLOutputDistr{Lab}_pT_{PtBin[0]}_{PtBin[1]}.pdf'
            )
    else:
        MLOutputFig.savefig(
            f'{OutPutDirPt}/MLOutputDistr_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 9)
    ROCCurveFig = plot_utils.plot_roc(TrainTestData[3], yPredTest, None,
                                      LegLabels,
                                      inputCfg['ml']['roc_auc_average'],
                                      inputCfg['ml']['roc_auc_approach'])
    ROCCurveFig.savefig(
        f'{OutPutDirPt}/ROCCurveAll_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    # the figure object is also pickled; the context manager closes the file
    with open(f'{OutPutDirPt}/ROCCurveAll_pT_{PtBin[0]}_{PtBin[1]}.pkl',
              'wb') as ROCCurveFile:
        pickle.dump(ROCCurveFig, ROCCurveFile)
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 9)
    ROCCurveTTFig = plot_utils.plot_roc_train_test(
        TrainTestData[3], yPredTest, TrainTestData[1], yPredTrain, None,
        LegLabels, inputCfg['ml']['roc_auc_average'],
        inputCfg['ml']['roc_auc_approach'])
    ROCCurveTTFig.savefig(
        f'{OutPutDirPt}/ROCCurveTrainTest_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    #_____________________________________________
    PrecisionRecallFig = plot_utils.plot_precision_recall(
        TrainTestData[3], yPredTest, LegLabels)
    PrecisionRecallFig.savefig(
        f'{OutPutDirPt}/PrecisionRecallAll_pT_{PtBin[0]}_{PtBin[1]}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (12, 7)
    FeaturesImportanceFig = plot_utils.plot_feature_imp(
        TrainTestData[2][TrainCols], TrainTestData[3], ModelHandl, LegLabels)
    # the first n_plot figures are saved with a per-class label (no label in
    # the binary case), any further figure is saved as the "All" one
    n_plot = n_classes if n_classes > 2 else 1
    for iFig, Fig in enumerate(FeaturesImportanceFig):
        if iFig < n_plot:
            label = OutputLabels[iFig] if n_classes > 2 else ''
            Fig.savefig(
                f'{OutPutDirPt}/FeatureImportance{label}_pT_{PtBin[0]}_{PtBin[1]}.pdf'
            )
        else:
            Fig.savefig(
                f'{OutPutDirPt}/FeatureImportanceAll_pT_{PtBin[0]}_{PtBin[1]}.pdf'
            )

    return ModelHandl
def test_plot_feature_imp():
    """
    Check that the feature importance plot is returned as a list
    """
    # DATA[0] / DATA[1] are handed over as the sample and its labels, followed
    # by the model and the value 50 as last argument
    feature_imp_figs = plot_utils.plot_feature_imp(DATA[0], DATA[1], MODEL, 50)
    assert isinstance(feature_imp_figs, list)
def train_test(inputCfg, PtMin, PtMax, OutPutDirPt, TrainTestData):
    '''
    function for model training and testing

    Trains and tests the model, dumps the model handler and saves the control
    plots (ML output, ROC curve, precision-recall, feature importance) in
    OutPutDirPt.

    Parameters
    -----------------
    - inputCfg: nested configuration dictionary; the 'ml', 'output' and
      'plots' sections are read
    - PtMin, PtMax: pT limits, only used to build the output file names
    - OutPutDirPt: output directory (it is not created here)
    - TrainTestData: indexable data set; element 2 is used as test sample and
      element 3 as test labels

    Returns
    -----------------
    - ModelHandl: the trained ModelHandler

    The process exits with status 1 if the configuration is not valid.
    '''
    modelClf = xgb.XGBClassifier()
    TrainCols = inputCfg['ml']['training_columns']
    HyperPars = inputCfg['ml']['hyper_par']
    # configuration errors exit with a non-zero status, so that calling
    # scripts can detect the failure
    if not isinstance(TrainCols, list):
        print('ERROR: training columns must be defined!')
        sys.exit(1)
    if not isinstance(HyperPars, dict):
        print('ERROR: hyper-parameters must be defined or be an empty dict!')
        sys.exit(1)
    ModelHandl = ModelHandler(modelClf, TrainCols, HyperPars)

    # hyperparams optimization --> not working with multi-class classification at the moment
    #HypRanges = {
    #    # # defines the maximum depth of a single tree (regularization)
    #    'max_depth': (1, 30),
    #    'learning_rate': (0.01, 0.3),  # learning rate
    #    'n_estimators': (50, 1000)  # number of boosting trees
    #}
    #ModelHandl.optimize_params_bayes(TrainTestData, HypRanges, None)

    # train and test the model with the updated hyperparameters
    ModelHandl.train_test_model(TrainTestData)
    yPredTest = ModelHandl.predict(TrainTestData[2],
                                   inputCfg['ml']['raw_output'], True)

    # save model handler in pickle
    ModelHandl.dump_model_handler(
        f'{OutPutDirPt}/ModelHandler_pT_{PtMin}_{PtMax}.pickle')

    #plots
    LegLabels = inputCfg['output']['leg_labels']
    OutputLabels = inputCfg['output']['out_labels']
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (10, 7)
    MLOutputFig = plot_utils.plot_output_train_test(
        ModelHandl,
        TrainTestData,
        80,
        inputCfg['ml']['raw_output'],
        LegLabels,
        True,
        inputCfg['plots']['train_test_log'],
        density=True)
    # one output figure per label
    for Fig, Lab in zip(MLOutputFig, OutputLabels):
        Fig.savefig(f'{OutPutDirPt}/MLOutputDistr{Lab}_pT_{PtMin}_{PtMax}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (8, 7)
    ROCCurveFig = plot_utils.plot_roc(TrainTestData[3], yPredTest, LegLabels)
    ROCCurveFig.savefig(f'{OutPutDirPt}/ROCCurveAll_pT_{PtMin}_{PtMax}.pdf')
    #_____________________________________________
    PrecisionRecallFig = plot_utils.plot_precision_recall(
        TrainTestData[3], yPredTest, LegLabels)
    PrecisionRecallFig.savefig(
        f'{OutPutDirPt}/PrecisionRecallAll_pT_{PtMin}_{PtMax}.pdf')
    #_____________________________________________
    plt.rcParams["figure.figsize"] = (12, 7)
    FeaturesImportanceFig = plot_utils.plot_feature_imp(
        TrainTestData[2][TrainCols], TrainTestData[3], ModelHandl)
    # the first three figures are saved with the corresponding output label,
    # any further figure as the "All" one
    # NOTE(review): the hard-coded 3 presumably is the number of classes -
    # confirm before using a different number of output labels
    for iFig, Fig in enumerate(FeaturesImportanceFig):
        if iFig < 3:
            Fig.savefig(
                f'{OutPutDirPt}/FeatureImportance{OutputLabels[iFig]}_pT_{PtMin}_{PtMax}.pdf'
            )
        else:
            Fig.savefig(
                f'{OutPutDirPt}/FeatureImportanceAll_pT_{PtMin}_{PtMax}.pdf')

    return ModelHandl
def train_xgboost_model(signal, background, filename_dict, params, params_range, flag_dict, training_variables='', testsize=0.5):
    '''
    Trains an XGBOOST model using hipe4ml and plot output distribution and feature importance

    Parameters
    ----------
    signal, background: samples passed to train_test_generator with labels 1
        (signal) and 0 (background)
    filename_dict: dictionary; filename_dict['analysis_path'] is the base path
        for the output plots and for the Optuna study
    params: dictionary of model parameters, set on the model unless
        flag_dict['use_default_param'] is true; its string-valued entries are
        copied into params_range before an Optuna optimisation
    params_range: dictionary of hyper-parameter ranges for the optimisers
        (modified in place when Optuna is used)
    flag_dict: dictionary of options; the keys 'use_default_param',
        'benchmark_opt', 'optimize_bayes', 'optimize_optuna', 'plot_optim',
        'timeout' and 'n_jobs' are read
    training_variables: list of training columns; with the default '' all the
        columns of the training set are used
    testsize: test_size passed to train_test_generator

    Returns
    -------
    (train_test_data, y_pred_test, model_hdl)
    '''
    print('Training XGBOOST model')

    training_fig_path = filename_dict['analysis_path'] + "/images/training"

    train_test_data = train_test_generator([signal, background], [1, 0], test_size=testsize)

    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)
    if not flag_dict['use_default_param']:
        model_hdl.set_model_params(params)

    if flag_dict['benchmark_opt']:
        print('Benchamarking optimizers\n')
        import time
        from sklearn.metrics import roc_auc_score
        times_sk = []
        roc_sk = []

        for _ in range(1):
            start = time.time()

            model_hdl.optimize_params_bayes(train_test_data, params_range, 'roc_auc', njobs=-1)
            model_hdl.train_test_model(train_test_data, )

            y_pred_test = model_hdl.predict(train_test_data[2], True)  #used to evaluate model performance

            roc_sk.append(roc_auc_score(train_test_data[3], y_pred_test))

            times_sk.append(time.time() - start)

        print('\nBAYES OPTIMIZATION WITH SKLEARN')
        print('Mean time : ' + str(np.mean(times_sk)))
        print('Mean ROC : ' + str(np.mean(roc_sk)))
        print('--------------\n')
        print('OPTUNA')

        # a dedicated list is used for the Optuna timings: the previous code
        # rebound the name `time` to an empty list, which shadowed the time
        # module and made the printed mean a nan
        times_optuna = []
        roc = []

        for _ in range(1):
            start = time.time()

            # string-valued parameters are passed to the optimiser as they are
            for key in params:
                if isinstance(params[key], str):
                    params_range[key] = params[key]

            model_hdl.optimize_params_optuna(train_test_data, params_range, 'roc_auc',
                                             timeout=flag_dict['timeout'], n_jobs=flag_dict['n_jobs'])
            model_hdl.train_test_model(train_test_data, )

            y_pred_test = model_hdl.predict(train_test_data[2], True)  #used to evaluate model performance

            roc.append(roc_auc_score(train_test_data[3], y_pred_test))

            times_optuna.append(time.time() - start)

        print('\nBAYES OPTIMIZATION WITH SKLEARN')
        print('Mean time : ' + str(np.mean(times_sk)))
        print('Mean ROC : ' + str(np.mean(roc_sk)))
        print('--------------\n')
        print('OPTUNA')
        print('Fixed time : ' + str(np.mean(times_optuna)))
        print('Mean ROC : ' + str(np.mean(roc)))
        print('--------------\n')

    if flag_dict['optimize_bayes']:
        import time
        print('Doing Bayes optimization of hyperparameters\n')
        start = time.time()
        model_hdl.optimize_params_bayes(train_test_data, params_range, 'roc_auc',
                                        n_iter=700, njobs=flag_dict['n_jobs'])
        print('Elapsed time: ' + str(time.time() - start))

    if flag_dict['optimize_optuna']:
        print('Doing Optuna optimization of hyperparameters\n')
        # string-valued parameters are passed to the optimiser as they are
        for key in params:
            if isinstance(params[key], str):
                params_range[key] = params[key]
        study = model_hdl.optimize_params_optuna(train_test_data, params_range, scoring='roc_auc',
                                                 timeout=flag_dict['timeout'], n_jobs=flag_dict['n_jobs'],
                                                 n_trials=None)

        print('Parameters optimization done!\n')

        # the study exists only when the Optuna optimisation ran, hence its
        # plots and its dump are handled inside this branch
        if flag_dict['plot_optim']:
            print('Saving optimization plots')
            fig = optuna.visualization.plot_slice(study)
            fig.write_image(training_fig_path + '/optuna_slice.png')

            fig = optuna.visualization.plot_optimization_history(study)
            fig.write_image(training_fig_path + '/optuna_history.png')

            # disabled plots:
            # fig = optuna.visualization.plot_param_importances(study)
            # fig.write_image(training_fig_path + '/optuna_param_importance.png')
            # fig = optuna.visualization.plot_contour(study)
            # fig.write_image(training_fig_path + '/optuna_contour.png')
            print('Done\n')

        import joblib
        # NOTE(review): no '/' is inserted between analysis_path and "model"
        # here, unlike for the images path above - confirm that analysis_path
        # ends with a path separator
        joblib.dump(study, filename_dict['analysis_path'] + "model/study.pkl")

    model_hdl.train_test_model(train_test_data, )
    print(model_hdl.get_model_params())

    print('Predicting values on training and test datas')
    y_pred_train = model_hdl.predict(train_test_data[0], True)
    y_pred_test = model_hdl.predict(train_test_data[2], True)  #used to evaluate model performance
    print('Prediction done\n')

    plt.rcParams["figure.figsize"] = (10, 7)
    leg_labels = ['background', 'signal']

    print('Saving Output comparison plot')
    plt.figure()
    # the current figure is the one saved and closed below
    plot_utils.plot_output_train_test(model_hdl, train_test_data, 100, True, leg_labels, True, density=False)
    plt.savefig(training_fig_path + '/output_train_test.png', dpi=300, facecolor='white')
    plt.close()
    print('Done\n')

    print('Saving ROC AUC plot')
    plt.figure()
    roc_train_test_fig = plot_utils.plot_roc_train_test(
        train_test_data[3], y_pred_test, train_test_data[1], y_pred_train, None, leg_labels)  #ROC AUC plot
    plt.savefig(training_fig_path + '/ROC_AUC_train_test.png', dpi=300, facecolor='white')
    # the figure object is also pickled next to the image
    import pickle
    with open(training_fig_path + '/ROC_AUC_train_test.pickle', 'wb') as f:
        pickle.dump(roc_train_test_fig, f)
    plt.close()
    print('Done\n')

    print('Saving feature importance plots')
    plt.figure()
    # two figures are expected; going by the output file names, the first is
    # the violin plot and the second the bar plot
    feat_imp_1, feat_imp_2 = plot_utils.plot_feature_imp(train_test_data[2], train_test_data[3],
                                                         model_hdl, approximate=True)
    feat_imp_1.savefig(training_fig_path + '/feature_importance_HIPE4ML_violin.png', dpi=300, facecolor='white')
    feat_imp_2.savefig(training_fig_path + '/feature_importance_HIPE4ML_bar.png', dpi=300, facecolor='white')
    plt.close()
    print('Done\n')

    efficiency_score_conversion(train_test_data, y_pred_test, filename_dict)

    return train_test_data, y_pred_test, model_hdl
# Ranges explored by the bayesian optimisation, one (min, max) tuple per
# hyper-parameter
HYP_RANGES = {
    # # defines the maximum depth of a single tree (regularization)
    'max_depth': (5, 15),
    # 'learning_rate': (0.01, 0.3),  # learning rate
    'n_estimators': (5, 10),  # number of boosting trees
}
# optimise the hyper-parameters listed above, using 'roc_auc' as scoring metric
MODEL.optimize_params_bayes(DATA, HYP_RANGES, 'roc_auc')

# train and test the model with the updated hyperparameters
MODEL.train_test_model(DATA)
# model prediction for DATA[2]
# (presumably the test sample, with DATA[3] as its labels - confirm)
Y_PRED = MODEL.predict(DATA[2])

# Calculate the BDT efficiency as a function of the BDT score
EFFICIENCY, THRESHOLD = analysis_utils.bdt_efficiency_array(
    DATA[3], Y_PRED, n_points=10)
# --------------------------------------------

# PLOTTING
# --------------------------------------------

# distributions and correlations of the columns of the signal data frame,
# drawn for both the signal and the background samples
FEATURES_DISTRIBUTIONS_PLOT = plot_utils.plot_distr(
    [SIG_DF, BKG_DF], SIG_DF.columns)
CORRELATION_MATRIX_PLOT = plot_utils.plot_corr([SIG_DF, BKG_DF],
                                               SIG_DF.columns)
# model performance plots
BDT_OUTPUT_PLOT = plot_utils.plot_output_train_test(MODEL, DATA)
ROC_CURVE_PLOT = plot_utils.plot_roc(DATA[3], Y_PRED)
PRECISION_RECALL_PLOT = plot_utils.plot_precision_recall(DATA[3], Y_PRED)
BDT_EFFICIENCY_PLOT = plot_utils.plot_bdt_eff(THRESHOLD, EFFICIENCY)
# NOTE(review): TEST_SET and Y_TEST are defined outside this excerpt;
# presumably they are the same sample/labels as DATA[2] / DATA[3] - confirm
FEATURES_IMPORTANCE = plot_utils.plot_feature_imp(TEST_SET, Y_TEST, MODEL)
# display the figures created above
plt.show()
# ---------------------------------------------
def do_hipe4mlplot(self):
    """
    Produce and save the hipe4ml control plots for the current pT bin.

    The plots (variable distributions, correlation matrices, ML output, ROC
    curves, precision-recall and feature importance) are saved as PDF files
    in self.dirmlplot.
    """
    self.logger.info("Plotting hipe4ml model")

    leglabels = ["Background", "Prompt signal"]
    outputlabels = ["Bkg", "SigPrompt"]
    handlers = [self.bkghandler, self.signalhandler]

    # _____________________________________________
    # distributions of the training variables
    plot_utils.plot_distr(handlers, self.v_train, 100, leglabels)
    plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99,
                        top=0.96, hspace=0.55, wspace=0.55)
    plt.savefig(f'{self.dirmlplot}/DistributionsAll_pT_{self.p_binmin}_{self.p_binmax}.pdf')
    plt.close('all')

    # _____________________________________________
    # correlation matrices, one figure per output label
    corr_figs = plot_utils.plot_corr(handlers, self.v_train, leglabels)
    for corr_fig, out_label in zip(corr_figs, outputlabels):
        # make the figure current, so that the margins are applied to it
        plt.figure(corr_fig.number)
        plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
        corr_fig.savefig(
            f'{self.dirmlplot}/CorrMatrix{out_label}_pT_{self.p_binmin}_{self.p_binmax}.pdf')

    # _____________________________________________
    # ML output distribution for training and test samples
    plt.rcParams["figure.figsize"] = (10, 7)
    output_fig = plot_utils.plot_output_train_test(
        self.p_hipe4ml_model, self.traintestdata, 80, self.raw_output_hipe4ml,
        leglabels, self.train_test_log_hipe4ml, density=True)
    output_fig.savefig(
        f'{self.dirmlplot}/MLOutputDistr_pT_{self.p_binmin}_{self.p_binmax}.pdf')

    # _____________________________________________
    # ROC curve
    plt.rcParams["figure.figsize"] = (10, 9)
    roc_fig = plot_utils.plot_roc(
        self.traintestdata[3], self.ypredtest_hipe4ml, None, leglabels,
        self.average_method_hipe4ml, self.roc_method_hipe4ml)
    roc_fig.savefig(
        f'{self.dirmlplot}/ROCCurveAll_pT_{self.p_binmin}_{self.p_binmax}.pdf')

    # _____________________________________________
    # ROC curves for training and test samples
    plt.rcParams["figure.figsize"] = (10, 9)
    roc_tt_fig = plot_utils.plot_roc_train_test(
        self.traintestdata[3], self.ypredtest_hipe4ml,
        self.traintestdata[1], self.ypredtrain_hipe4ml, None, leglabels,
        self.average_method_hipe4ml, self.roc_method_hipe4ml)
    roc_tt_fig.savefig(
        f'{self.dirmlplot}/ROCCurveTrainTest_pT_{self.p_binmin}_{self.p_binmax}.pdf')

    # _____________________________________________
    # precision-recall curve
    prec_rec_fig = plot_utils.plot_precision_recall(
        self.traintestdata[3], self.ypredtest_hipe4ml, leglabels)
    prec_rec_fig.savefig(
        f'{self.dirmlplot}/PrecisionRecallAll_pT_{self.p_binmin}_{self.p_binmax}.pdf')

    # _____________________________________________
    # feature importance, one file per returned figure
    plt.rcParams["figure.figsize"] = (12, 7)
    feat_imp_figs = plot_utils.plot_feature_imp(
        self.traintestdata[2][self.v_train], self.traintestdata[3],
        self.p_hipe4ml_model, leglabels)
    for i, feat_imp_fig in enumerate(feat_imp_figs):
        feat_imp_fig.savefig(
            f'{self.dirmlplot}/FeatureImportanceOpt{i}_pT_{self.p_binmin}_{self.p_binmax}.pdf')
# train the model and keep the value returned by train_test_model
# (presumably the test-set predictions: it is compared with the labels in
# train_test_data[3] below)
y_pred_test = model_hdl.train_test_model(train_test_data, True, True)

# model output distribution for training and test samples
bdt_out_plot = pu.plot_output_train_test(model_hdl, train_test_data, 100, True,
                                         ["Signal", "Background"], True, density=True)
bdt_out_plot.savefig(results_ml_path + "/bdt_output.png")

# exist_ok=True replaces the exists-then-create check, which is race-prone
os.makedirs(ml_model_path, exist_ok=True)
model_hdl.dump_model_handler(ml_model_path + "/model_hndl.pkl")

# feature importance: the first two returned figures are saved
feature_importance_plot = pu.plot_feature_imp(train_test_data[2], train_test_data[3], model_hdl)
feature_importance_plot[0].savefig(results_ml_path + "/feature_importance_1.png")
feature_importance_plot[1].savefig(results_ml_path + "/feature_importance_2.png")

# efficiencies from 0.50 to 0.98 in steps of 0.01 (rounded to avoid floating
# point noise) and the corresponding values from score_from_efficiency_array
eff_arr = np.round(np.arange(0.5, 0.99, 0.01), 2)
score_eff_arr = au.score_from_efficiency_array(train_test_data[3], y_pred_test, eff_arr)

os.makedirs(efficiencies_path, exist_ok=True)
np.save(efficiencies_path + "/efficiency_arr.npy", eff_arr)
np.save(efficiencies_path + "/score_efficiency_arr.npy", score_eff_arr)