# Feature-importance figures: one file per entry of leg_labels, plus the figure
# at index 3, which is written out under the "_all" suffix.
# NOTE(review): the hard-coded index 3 presumably assumes three labels - confirm.
feat_imp = plot_utils.plot_feature_imp(
    train_test_data_cent[0], train_test_data_cent[1], model_hdl)
for i_label, label in enumerate(leg_labels):
    feat_imp[i_label].savefig(
        f'{PLOT_DIR}/train_test_out/feature_imp_training_{bin_df}_{label}.pdf')
feat_imp[3].savefig(
    f'{PLOT_DIR}/train_test_out/feature_imp_training_{bin_df}_all.pdf')

# ROC curves: train_test_data_cent[3] is paired with the test scores and
# train_test_data_cent[1] with the training scores. The current pyplot figure
# is then saved and every open figure is closed.
plot_utils.plot_roc_train_test(
    train_test_data_cent[3],
    test_y_score,
    train_test_data_cent[1],
    train_y_score,
    labels=leg_labels,
    multi_class_opt="ovr",
)
plt.savefig(f'{PLOT_DIR}/train_test_out/roc_train_test_{bin_df}.pdf')
plt.close('all')

if COMPUTE_SCORES_FROM_EFF:
    # get scores corresponding to BDT prompt efficiencies using test set
    eff_selected = np.arange(0.1, MAX_EFF, 0.01)
    eff, score = analysis_utils.bdt_efficiency_array(
        train_test_data_cent[3], test_y_score, keep_lower=False)
    # For each requested efficiency, interpolate (eff[2] - target) as a
    # function of the score and keep the first root of the spline.
    # NOTE(review): eff[2] is presumably the efficiency of the class with
    # index 2 - confirm. roots()[0] raises IndexError when the spline has no
    # root for a given target.
    score_list = [
        scipy.interpolate.InterpolatedUnivariateSpline(
            score, eff[2] - eff_val).roots()[0]
        for eff_val in eff_selected
    ]
    score_array = np.array(score_list)
    score_eff_arrays_dict[bin_df] = score_array

# write test set data frame (currently disabled)
# train_test_data_cent[2]['model_output'] = test_y_score
# train_test_data_cent[2]['y_true'] = train_test_data_cent[3]
# train_test_data_cent_tmp = train_test_data_cent[2].query(f'y_true > 1.5 and ct >= {ct_bins_df[0]} and ct < {ct_bins_df[1]}')
# train_test_data_cent_tmp.to_parquet(f'df/mc_prompt_{bin_df}.parquet.gzip', compression='gzip')
# Keep only the first ten feature columns of the digits data set.
DIGITS = pd.DataFrame(DIGITS_DATA.data[:, 0:10])     # pylint: disable=E1101
Y_DIGITS = DIGITS_DATA.target       # pylint: disable=E1101

# Rows labelled 1 are used as signal, rows labelled 0 as background.
SIG_DF = DIGITS[Y_DIGITS == 1]
BKG_DF = DIGITS[Y_DIGITS == 0]

# Even train/test split with a fixed seed so the tests are reproducible.
TRAIN_SET, TEST_SET, Y_TRAIN, Y_TEST = train_test_split(
    DIGITS,
    Y_DIGITS,
    test_size=0.5,
    random_state=42,
)
DATA = [TRAIN_SET, Y_TRAIN, TEST_SET, Y_TEST]
# --------------------------------------------

# training and testing
INPUT_MODEL = xgb.XGBClassifier()
MODEL = ModelHandler(INPUT_MODEL)
MODEL.train_test_model(DATA)
Y_PRED = MODEL.predict(DATA[2])
Y_PRED_TRAIN = MODEL.predict(DATA[0])
EFFICIENCY, THRESHOLD = analysis_utils.bdt_efficiency_array(
    DATA[3], Y_PRED, n_points=10)
# --------------------------------------------


def test_plot_distr():
    """
    Check that the feature distribution plot is returned as a numpy array
    """
    distr_plot = plot_utils.plot_distr([SIG_DF, BKG_DF], SIG_DF.columns)
    assert isinstance(distr_plot, np.ndarray)


def test_plot_corr():
    """
    Test the correlation matrix plot
    """
# Optional Bayesian search over HYPERPARAMS_RANGE (scored with 'roc_auc')
# before the training step.
if OPTIMIZE:
    model_handler.optimize_params_bayes(
        data,
        HYPERPARAMS_RANGE,
        'roc_auc',
        init_points=10,
        n_iter=10,
    )

model_handler.train_test_model(data)
print("train test model")
# Elapsed time is measured from part_time, which is set outside this fragment.
print(
    f'--- model trained and tested in {((time.time() - part_time) / 60):.2f} minutes ---\n'
)

# Predict on data[2] and store the prediction as its first column, 'score'.
y_pred = model_handler.predict(data[2])
data[2].insert(0, 'score', y_pred)

# Efficiency scan over 1000 points, then the scores matching FIX_EFF_ARRAY.
eff, tsd = analysis_utils.bdt_efficiency_array(
    data[3], y_pred, n_points=1000)
score_from_eff_array = analysis_utils.score_from_efficiency_array(
    data[3], y_pred, FIX_EFF_ARRAY)

# Row 0: requested efficiencies; row 1: the scores computed for them.
fixed_eff_array = np.vstack(
    (FIX_EFF_ARRAY, score_from_eff_array))

if SIGMA_MC:
    ml_analysis.MC_sigma_array(
        data, fixed_eff_array, cclass, ptbin, ctbin, split)

ml_analysis.save_ML_analysis(
    model_handler,
    fixed_eff_array,
    cent_class=cclass,
    pt_range=ptbin,
    ct_range=ctbin,
    split=split,
)