def train_xgboost_model(signal, background, filename_dict, params, params_range,
                        flag_dict, training_variables='', testsize=0.5):
    '''
    Trains an XGBOOST model using hipe4ml and plots output distribution and
    feature importance.

    Parameters
    ----------
    signal, background : hipe4ml TreeHandler-like samples used to build the
        train/test split (label 1 = signal, 0 = background).
    filename_dict : dict
        Uses 'analysis_path' as the root for output images and the Optuna
        study dump.
    params : dict
        Fixed model hyperparameters, applied unless
        flag_dict['use_default_param'] is set.
    params_range : dict
        Hyperparameter ranges for the Bayes/Optuna optimization; string-valued
        entries of `params` are copied in as fixed values.
    flag_dict : dict
        Behaviour switches: 'use_default_param', 'benchmark_opt',
        'optimize_bayes', 'optimize_optuna', 'plot_optim', 'timeout', 'n_jobs'.
    training_variables : list or '', optional
        Columns to train on; '' means every column of the training dataframe.
    testsize : float, optional
        Fraction of candidates reserved for the test set.

    Returns
    -------
    tuple
        (train_test_data, y_pred_test, model_hdl)
    '''
    print('Training XGBOOST model')
    training_fig_path = filename_dict['analysis_path'] + "/images/training"

    train_test_data = train_test_generator([signal, background], [1, 0],
                                           test_size=testsize)
    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)
    if not flag_dict['use_default_param']:
        model_hdl.set_model_params(params)

    if flag_dict['benchmark_opt']:
        print('Benchamarking optimizers\n')
        import time
        from sklearn.metrics import roc_auc_score

        # --- sklearn Bayes optimization benchmark ---
        times_sk = []
        roc_sk = []
        for _ in range(1):
            start = time.time()
            model_hdl.optimize_params_bayes(train_test_data, params_range,
                                            'roc_auc', njobs=-1)
            model_hdl.train_test_model(train_test_data)
            # used to evaluate model performance
            y_pred_test = model_hdl.predict(train_test_data[2], True)
            roc_sk.append(roc_auc_score(train_test_data[3], y_pred_test))
            times_sk.append(time.time() - start)
        print('\nBAYES OPTIMIZATION WITH SKLEARN')
        print('Mean time : ' + str(np.mean(times_sk)))
        print('Mean ROC : ' + str(np.mean(roc_sk)))
        print('--------------\n')

        # --- Optuna benchmark ---
        # FIX: the original rebound `time` to an empty list (shadowing the
        # `time` module and yielding nan from np.mean) and re-printed the
        # sklearn results under a duplicated SKLEARN header.
        roc_optuna = []
        for _ in range(1):
            # String-valued parameters cannot be scanned: pin them to their
            # fixed value inside the range dict.
            for key in params:
                if isinstance(params[key], str):
                    params_range[key] = params[key]
            model_hdl.optimize_params_optuna(train_test_data, params_range,
                                             'roc_auc',
                                             timeout=flag_dict['timeout'],
                                             n_jobs=flag_dict['n_jobs'])
            model_hdl.train_test_model(train_test_data)
            # used to evaluate model performance
            y_pred_test = model_hdl.predict(train_test_data[2], True)
            roc_optuna.append(roc_auc_score(train_test_data[3], y_pred_test))
        print('OPTUNA')
        # The Optuna run is bounded by the configured timeout, not measured.
        print('Fixed time : ' + str(flag_dict['timeout']))
        print('Mean ROC : ' + str(np.mean(roc_optuna)))
        print('--------------\n')

    if flag_dict['optimize_bayes']:
        import time
        print('Doing Bayes optimization of hyperparameters\n')
        start = time.time()
        model_hdl.optimize_params_bayes(train_test_data, params_range,
                                        'roc_auc', n_iter=700,
                                        njobs=flag_dict['n_jobs'])
        print('Elapsed time: ' + str(time.time() - start))

    if flag_dict['optimize_optuna']:
        print('Doing Optuna optimization of hyperparameters\n')
        # Pin string-valued (non-scannable) parameters, as above.
        for key in params:
            if isinstance(params[key], str):
                params_range[key] = params[key]
        study = model_hdl.optimize_params_optuna(train_test_data, params_range,
                                                 scoring='roc_auc',
                                                 timeout=flag_dict['timeout'],
                                                 n_jobs=flag_dict['n_jobs'],
                                                 n_trials=None)
        print('Parameters optimization done!\n')

        if flag_dict['plot_optim']:
            print('Saving optimization plots')
            fig = optuna.visualization.plot_slice(study)
            fig.write_image(training_fig_path + '/optuna_slice.png')
            fig = optuna.visualization.plot_optimization_history(study)
            fig.write_image(training_fig_path + '/optuna_history.png')
            print('Done\n')

        import joblib
        # FIX: the original path was missing the '/' separator
        # ("...analysis_pathmodel/study.pkl").
        joblib.dump(study, filename_dict['analysis_path'] + "/model/study.pkl")

    model_hdl.train_test_model(train_test_data)
    print(model_hdl.get_model_params())

    print('Predicting values on training and test datas')
    y_pred_train = model_hdl.predict(train_test_data[0], True)
    # used to evaluate model performance
    y_pred_test = model_hdl.predict(train_test_data[2], True)
    print('Prediction done\n')

    plt.rcParams["figure.figsize"] = (10, 7)
    leg_labels = ['background', 'signal']

    print('Saving Output comparison plot')
    plt.figure()
    ml_out_fig = plot_utils.plot_output_train_test(model_hdl, train_test_data,
                                                   100, True, leg_labels, True,
                                                   density=False)
    plt.savefig(training_fig_path + '/output_train_test.png', dpi=300,
                facecolor='white')
    plt.close()
    print('Done\n')

    print('Saving ROC AUC plot')
    plt.figure()
    roc_train_test_fig = plot_utils.plot_roc_train_test(
        train_test_data[3], y_pred_test, train_test_data[1], y_pred_train,
        None, leg_labels)  # ROC AUC plot
    plt.savefig(training_fig_path + '/ROC_AUC_train_test.png', dpi=300,
                facecolor='white')
    # Pickle the figure as well so it can be reopened interactively later.
    import pickle
    with open(training_fig_path + '/ROC_AUC_train_test.pickle', 'wb') as f:
        pickle.dump(roc_train_test_fig, f)
    plt.close()
    print('Done\n')

    print('Saving feature importance plots')
    plt.figure()
    feat_imp_1, feat_imp_2 = plot_utils.plot_feature_imp(
        train_test_data[2], train_test_data[3], model_hdl, approximate=True)
    feat_imp_1.savefig(training_fig_path + '/feature_importance_HIPE4ML_violin.png',
                       dpi=300, facecolor='white')
    feat_imp_2.savefig(training_fig_path + '/feature_importance_HIPE4ML_bar.png',
                       dpi=300, facecolor='white')
    plt.close()
    print('Done\n')

    efficiency_score_conversion(train_test_data, y_pred_test, filename_dict)
    return train_test_data, y_pred_test, model_hdl
def benchmark_hyperparam_optimizers(filename_dict, params, params_range,
                                    flag_dict, presel_dict,
                                    training_variables='', testsize=0.75):
    '''
    Benchmark sklearn Bayes optimization against Optuna on the same sample.

    Loads the MC signal and like-sign background, builds a train/test split,
    runs the sklearn Bayes optimizer (timing it), then gives Optuna the same
    mean time budget and compares the resulting test ROC AUCs. Results are
    printed to stdout; nothing is returned.

    Parameters
    ----------
    filename_dict : dict
        Uses 'data_path', 'MC_signal_filename'/'MC_signal_table' and
        'train_bckg_filename'/'train_bckg_table'.
    params : dict
        Unused here, kept for signature compatibility with the caller.
    params_range : dict
        Hyperparameter ranges handed to both optimizers.
    flag_dict : dict
        Unused here, kept for signature compatibility with the caller.
    presel_dict : dict
        Uses 'train_bckg_presel' to preselect the background sample.
    training_variables : list or '', optional
        Columns to train on; '' means every column of the training dataframe.
    testsize : float, optional
        Fraction of candidates reserved for the test set.
    '''
    import time
    from sklearn.metrics import roc_auc_score

    N_run = 1
    data_path = filename_dict['data_path']

    print('Loading MC signal')
    mc_signal = TreeHandler()
    mc_signal.get_handler_from_large_file(
        file_name=data_path + filename_dict['MC_signal_filename'],
        tree_name=filename_dict['MC_signal_table'])
    print('MC signal loaded\n')

    print('Loading background data for training')
    background_ls = TreeHandler()
    background_ls.get_handler_from_large_file(
        file_name=data_path + filename_dict['train_bckg_filename'],
        tree_name=filename_dict['train_bckg_table'])
    background_ls.apply_preselections(presel_dict['train_bckg_presel'])
    # Cap the background at 4x the signal size to keep classes balanced.
    background_ls.shuffle_data_frame(
        size=min(background_ls.get_n_cand(), mc_signal.get_n_cand() * 4))
    print('Done\n')

    train_test_data = train_test_generator([mc_signal, background_ls], [1, 0],
                                           test_size=testsize)
    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)

    # --- sklearn Bayes optimization, timed ---
    times = []
    roc_bayes = []
    for _ in range(N_run):
        start = time.time()
        model_hdl.optimize_params_bayes(train_test_data, params_range,
                                        'roc_auc', njobs=-1)
        model_hdl.train_test_model(train_test_data)
        # used to evaluate model performance
        y_pred_test = model_hdl.predict(train_test_data[2], True)
        roc_bayes.append(roc_auc_score(train_test_data[3], y_pred_test))
        times.append(time.time() - start)
    print('BAYES OPTIMIZATION WITH SKLEARN')
    # FIX: the original called np.mean on the `time` module (TypeError);
    # the measured durations live in `times`.
    print('Mean time : ' + str(np.mean(times)))
    print('Mean ROC : ' + str(np.mean(roc_bayes)))
    print('--------------\n')

    # --- Optuna, given the same (mean) time budget ---
    # FIX: use a fresh ROC list — the original appended to the Bayes list,
    # contaminating the Optuna mean. Also fixed the keyword name: hipe4ml's
    # optimize_params_optuna takes `n_jobs`, not `njobs`.
    roc_optuna = []
    for _ in range(N_run):
        model_hdl.optimize_params_optuna(train_test_data, params_range,
                                         'roc_auc', timeout=np.mean(times),
                                         n_jobs=-1)
        model_hdl.train_test_model(train_test_data)
        # used to evaluate model performance
        y_pred_test = model_hdl.predict(train_test_data[2], True)
        roc_optuna.append(roc_auc_score(train_test_data[3], y_pred_test))
    print('OPTUNA')
    print('Fixed time : ' + str(np.mean(times)))
    print('Mean ROC : ' + str(np.mean(roc_optuna)))
    print('--------------\n')
# features plot leg_labels = ['background', 'non_prompt', 'prompt'] model_clf = xgb.XGBClassifier(use_label_encoder=False, n_jobs=4) model_hdl = ModelHandler(model_clf, TRAINING_COLUMNS_LIST) model_hdl.set_model_params(HYPERPARAMS) # hyperparameters optimization and model training if not os.path.isdir('models'): os.mkdir('models') bin_model = bin if MERGE_CENTRALITY: bin_model = f'all_0_90_{ct_bins[0]}_{ct_bins[1]}' if OPTIMIZE and TRAIN: model_hdl.optimize_params_optuna(train_test_data, HYPERPARAMS_RANGES, 'roc_auc_ovr', nfold=5, timeout=30) isModelTrained = os.path.isfile(f'models/{bin_model}_trained') print(f'isModelTrained {bin_model}: {isModelTrained}') if TRAIN and not isModelTrained: print( f'Number of candidates ({split}) for training in {ct_bins[0]} <= ct < {ct_bins[1]} cm: {len(train_test_data[0])}') print( f'prompt candidates: {np.count_nonzero(train_test_data[1] == 2)}; non-prompt candidates: {np.count_nonzero(train_test_data[1] == 1)}; background candidates: {np.count_nonzero(train_test_data[1] == 0)}; n_cand_bkg / n_cand_signal = {np.count_nonzero(train_test_data[1] == 0) / np.count_nonzero(train_test_data[1] == 1)}') #weights={0:1,1:2,2:1} #sample_weights = compute_sample_weight(class_weight=weights,y=train_test_data[0]['y_true']) model_hdl.train_test_model(train_test_data, multi_class_opt="ovr", return_prediction=True, output_margin=False) #, sample_weight=sample_weights) model_file_name = str(f'models/{bin_model}_trained') if OPTIMIZE: model_file_name = str(f'models/{bin_model}_optimized_trained') model_hdl.dump_model_handler(model_file_name)