Example #1
0
                        # Feature-importance figures built from entries [0] and [1] of
                        # train_test_data_cent and the trained model handler.
                        # (presumably [0]/[1] are the training features/labels -- confirm)
                        feat_imp = plot_utils.plot_feature_imp(train_test_data_cent[0], train_test_data_cent[1], model_hdl)
                        # Save one figure per legend label, named after that label.
                        for i_label, label in enumerate(leg_labels):
                            feat_imp[i_label].savefig(f'{PLOT_DIR}/train_test_out/feature_imp_training_{bin_df}_{label}.pdf')
                        # NOTE(review): hard-coded index 3 is saved as the "all" plot; this
                        # presumably assumes exactly three entries in leg_labels -- confirm.
                        feat_imp[3].savefig(f'{PLOT_DIR}/train_test_out/feature_imp_training_{bin_df}_all.pdf')
                        # ROC curves comparing test and train scores, using the
                        # one-vs-rest ("ovr") multi-class option.
                        plot_utils.plot_roc_train_test(
                            train_test_data_cent[3],
                            test_y_score, train_test_data_cent[1],
                            train_y_score, labels=leg_labels, multi_class_opt="ovr")
                        plt.savefig(f'{PLOT_DIR}/train_test_out/roc_train_test_{bin_df}.pdf')
                        # Close every open figure before moving on.
                        plt.close('all')

                    if COMPUTE_SCORES_FROM_EFF:
                        # get scores corresponding to BDT prompt efficiencies using test set
                        # Target efficiencies: 0.1 up to (but excluding) MAX_EFF in steps of 0.01.
                        eff_selected = np.arange(0.1, MAX_EFF, 0.01)
                        eff, score = analysis_utils.bdt_efficiency_array(
                            train_test_data_cent[3], test_y_score, keep_lower=False)

                        # For each target efficiency, interpolate (eff[2] - target) as a
                        # function of the score and take the first zero crossing as the
                        # score threshold.
                        # (eff[2] is presumably the efficiency of class index 2 -- confirm)
                        # NOTE(review): InterpolatedUnivariateSpline requires strictly
                        # increasing x, so this assumes `score` is returned sorted that way.
                        # NOTE(review): roots()[0] raises IndexError when the spline never
                        # reaches the target efficiency.
                        score_list = []
                        for eff_val in eff_selected:
                            interp = scipy.interpolate.InterpolatedUnivariateSpline(score, eff[2]-eff_val)
                            score_list.append(interp.roots()[0])
                        score_array = np.array(score_list)

                        # Store the thresholds for this bin, keyed by bin_df.
                        score_eff_arrays_dict[bin_df] = score_array

                        # write test set data frame (currently disabled)
                        # train_test_data_cent[2]['model_output'] = test_y_score
                        # train_test_data_cent[2]['y_true'] = train_test_data_cent[3]
                        # train_test_data_cent_tmp = train_test_data_cent[2].query(f'y_true > 1.5 and ct >= {ct_bins_df[0]} and ct < {ct_bins_df[1]}')
                        # train_test_data_cent_tmp.to_parquet(f'df/mc_prompt_{bin_df}.parquet.gzip', compression='gzip')
Example #2
0
# Shared fixtures for the tests in this module.
# Only the first 10 feature columns of DIGITS_DATA are kept.
DIGITS = pd.DataFrame(DIGITS_DATA.data[:, 0:10])     # pylint: disable=E1101
Y_DIGITS = DIGITS_DATA.target       # pylint: disable=E1101
# Rows labelled 1 form the signal sample, rows labelled 0 the background sample.
SIG_DF = DIGITS[Y_DIGITS == 1]
BKG_DF = DIGITS[Y_DIGITS == 0]
# 50/50 train/test split with a fixed seed so the fixtures are reproducible.
TRAIN_SET, TEST_SET, Y_TRAIN, Y_TEST = train_test_split(
    DIGITS, Y_DIGITS, test_size=0.5, random_state=42)
# Layout: [train features, train labels, test features, test labels].
DATA = [TRAIN_SET, Y_TRAIN, TEST_SET, Y_TEST]
# --------------------------------------------

# training and testing
# An XGBoost classifier with default parameters, wrapped in a ModelHandler.
INPUT_MODEL = xgb.XGBClassifier()
MODEL = ModelHandler(INPUT_MODEL)
MODEL.train_test_model(DATA)
# Model predictions on the test set (DATA[2]) and on the training set (DATA[0]).
Y_PRED = MODEL.predict(DATA[2])
Y_PRED_TRAIN = MODEL.predict(DATA[0])
# Efficiency and threshold arrays from the test labels and test predictions,
# requested with n_points=10.
EFFICIENCY, THRESHOLD = analysis_utils.bdt_efficiency_array(DATA[3], Y_PRED, n_points=10)
# --------------------------------------------
# --------------------------------------------


def test_plot_distr():
    """
    Check that the feature distribution plot of the signal and
    background samples is returned as a numpy array
    """
    samples = [SIG_DF, BKG_DF]
    plots = plot_utils.plot_distr(samples, SIG_DF.columns)
    assert isinstance(plots, np.ndarray)


def test_plot_corr():
    """
    Test the correlation matrix plot
    """
Example #3
0
                    # Optional Bayesian hyperparameter optimisation over
                    # HYPERPARAMS_RANGE, scored with 'roc_auc'.
                    if OPTIMIZE:
                        model_handler.optimize_params_bayes(data,
                                                            HYPERPARAMS_RANGE,
                                                            'roc_auc',
                                                            init_points=10,
                                                            n_iter=10)

                    # Train and test the model on `data`, then report the elapsed
                    # time since part_time (set outside this excerpt).
                    model_handler.train_test_model(data)
                    print("train test model")
                    print(
                        f'--- model trained and tested in {((time.time() - part_time) / 60):.2f} minutes ---\n'
                    )

                    # Model output for data[2]; it is also stored in place as the
                    # first column ('score') of data[2].
                    # (presumably data[2] is the test-set DataFrame -- confirm)
                    y_pred = model_handler.predict(data[2])
                    data[2].insert(0, 'score', y_pred)
                    # NOTE(review): eff and tsd are not used in the visible part of
                    # this block; check whether later code needs them.
                    eff, tsd = analysis_utils.bdt_efficiency_array(
                        data[3], y_pred, n_points=1000)
                    # Scores matching each efficiency in FIX_EFF_ARRAY, stacked
                    # below the efficiencies themselves.
                    score_from_eff_array = analysis_utils.score_from_efficiency_array(
                        data[3], y_pred, FIX_EFF_ARRAY)
                    fixed_eff_array = np.vstack(
                        (FIX_EFF_ARRAY, score_from_eff_array))

                    # Optional MC sigma step for this centrality/pt/ct/split bin.
                    if SIGMA_MC:
                        ml_analysis.MC_sigma_array(data, fixed_eff_array,
                                                   cclass, ptbin, ctbin, split)

                    # Save the model handler and the efficiency/score table for
                    # this bin.
                    ml_analysis.save_ML_analysis(model_handler,
                                                 fixed_eff_array,
                                                 cent_class=cclass,
                                                 pt_range=ptbin,
                                                 ct_range=ctbin,
                                                 split=split)