Beispiel #1
0
# Save the optimizer-comparison figure built above (white background,
# 100 dpi) and close it to release the matplotlib figure memory.
plt.savefig('../opt_comp.png', dpi = 100, facecolor = 'white')
plt.close()

##################################################################################

# BEST HYPERPARAMETERS FOR EACH METHOD

# Result directories of the optimization runs whose models we may inspect.
names = [
    'Opt_test_OPTUNA',
    'Opt_test_BAYES',
    'Opt_test_DEFAULT',
    'Opt_test_PbPb',
]

# Dead-code gate: flip to True to print the best hyperparameters found
# by each optimization method (loads each saved ModelHandler from disk).
if False:
    for run_name in names:
        handler = ModelHandler()
        handler.load_model_handler('../analysis_results/' + run_name + '/model/model_hdl')
        print(run_name)
        print(handler.get_model_params())
        print('\n---------------\n')

##################################################################################

# PLOT SUPERIMPOSED ROC
'''
plt.close()
objects = []

for n in names:
    with (open('../analysis_results/' + n + '/images/training/ROC_AUC_train_test.pickle', "rb")) as openfile:
        while True:
            try:
                objects.append(pickle.load(openfile))
            except EOFError:
Beispiel #2
0
                        # write test set data frame
                        train_test_data_cent[2]['model_output'] = test_y_score
                        train_test_data_cent[2][
                            'y_true'] = train_test_data_cent[3]
                        train_test_data_cent_tmp = train_test_data_cent[2].query(
                            f'y_true > 0.5 and ct >= {ct_bins_df[0]} and ct < {ct_bins_df[1]}'
                        )
                        train_test_data_cent_tmp.to_parquet(f'df/mc_{bin_df}',
                                                            compression='gzip')

                # get the model hyperparameters
                if DUMP_HYPERPARAMS and TRAIN:
                    if not os.path.isdir('hyperparams'):
                        os.mkdir('hyperparams')
                    model_params_dict = model_hdl.get_model_params()
                    with open(f'hyperparams/model_params_{bin}.yml',
                              'w') as outfile:
                        yaml.dump(model_params_dict,
                                  outfile,
                                  default_flow_style=False)

                    # save roc-auc
                del train_test_data_cent
                ##############################################################

    if COMPUTE_SCORES_FROM_EFF and TRAIN:
        pickle.dump(score_eff_arrays_dict, open("file_score_eff_dict", "wb"))

# apply model to data
if APPLICATION:
Beispiel #3
0
def _freeze_string_params(params, params_range):
    '''
    Copy string-valued (i.e. non-tunable) hyperparameters from `params`
    into `params_range`, so the Optuna optimizer treats them as fixed
    values instead of ranges. NOTE: mutates `params_range` in place.
    '''
    for key in params:
        if isinstance(params[key], str):
            params_range[key] = params[key]


def train_xgboost_model(signal,
                        background,
                        filename_dict,
                        params,
                        params_range,
                        flag_dict,
                        training_variables='',
                        testsize=0.5):
    '''
    Trains an XGBOOST model using hipe4ml and plots the output
    distribution, ROC curves and feature importance.

    Parameters
    ----------
    signal, background :
        Signal and background samples fed to hipe4ml's
        train_test_generator (labelled 1 and 0 respectively).
    filename_dict : dict
        Must contain 'analysis_path', the output root directory
        (without a trailing slash — paths below prepend their own '/').
    params : dict
        Fixed XGBoost hyperparameters; string-valued entries are also
        treated as fixed during the Optuna optimization.
    params_range : dict
        Hyperparameter search ranges for the optimizers.
        NOTE: mutated in place (string-valued params are copied into it).
    flag_dict : dict
        Switches: 'use_default_param', 'benchmark_opt', 'optimize_bayes',
        'optimize_optuna', 'plot_optim', 'timeout', 'n_jobs'.
    training_variables : list or ''
        Columns to train on; '' (default) means every column of the
        training dataframe.
    testsize : float
        Fraction of the sample reserved for the test set.

    Returns
    -------
    (train_test_data, y_pred_test, model_hdl)
    '''

    print('Training XGBOOST model')

    training_fig_path = filename_dict['analysis_path'] + "/images/training"

    train_test_data = train_test_generator([signal, background], [1, 0],
                                           test_size=testsize)

    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)
    if not flag_dict['use_default_param']:
        model_hdl.set_model_params(params)

    if flag_dict['benchmark_opt']:

        # Compare the sklearn-Bayes and Optuna optimizers on time and ROC AUC.
        print('Benchmarking optimizers\n')
        import time
        from sklearn.metrics import roc_auc_score
        times_sk = []
        roc_sk = []

        for _ in range(1):
            start = time.time()

            model_hdl.optimize_params_bayes(train_test_data,
                                            params_range,
                                            'roc_auc',
                                            njobs=-1)
            model_hdl.train_test_model(train_test_data)

            # used to evaluate model performance
            y_pred_test = model_hdl.predict(train_test_data[2], True)

            roc_sk.append(roc_auc_score(train_test_data[3], y_pred_test))

            times_sk.append(time.time() - start)

        print('\nBAYES OPTIMIZATION WITH SKLEARN')
        print('Mean time : ' + str(np.mean(times_sk)))
        print('Mean ROC : ' + str(np.mean(roc_sk)))
        print('--------------\n')
        print('OPTUNA')

        # BUG FIX: the original rebound `time = []`, shadowing the `time`
        # module, and never appended to it, so the final report printed
        # np.mean([]) == nan. Use dedicated lists and record elapsed time.
        times_optuna = []
        roc_optuna = []

        for _ in range(1):

            _freeze_string_params(params, params_range)

            start = time.time()
            model_hdl.optimize_params_optuna(train_test_data,
                                             params_range,
                                             'roc_auc',
                                             timeout=flag_dict['timeout'],
                                             n_jobs=flag_dict['n_jobs'])
            model_hdl.train_test_model(train_test_data)
            times_optuna.append(time.time() - start)

            # used to evaluate model performance
            y_pred_test = model_hdl.predict(train_test_data[2], True)

            roc_optuna.append(roc_auc_score(train_test_data[3], y_pred_test))

        # Final side-by-side summary of both optimizers.
        print('\nBAYES OPTIMIZATION WITH SKLEARN')
        print('Mean time : ' + str(np.mean(times_sk)))
        print('Mean ROC : ' + str(np.mean(roc_sk)))
        print('--------------\n')
        print('OPTUNA')
        print('Mean time : ' + str(np.mean(times_optuna)))
        print('Mean ROC : ' + str(np.mean(roc_optuna)))
        print('--------------\n')

    if flag_dict['optimize_bayes']:
        import time
        print('Doing Bayes optimization of hyperparameters\n')
        start = time.time()
        model_hdl.optimize_params_bayes(train_test_data,
                                        params_range,
                                        'roc_auc',
                                        n_iter=700,
                                        njobs=flag_dict['n_jobs'])
        print('Elapsed time: ' + str(time.time() - start))

    if flag_dict['optimize_optuna']:
        print('Doing Optuna optimization of hyperparameters\n')
        _freeze_string_params(params, params_range)
        study = model_hdl.optimize_params_optuna(train_test_data,
                                                 params_range,
                                                 scoring='roc_auc',
                                                 timeout=flag_dict['timeout'],
                                                 n_jobs=flag_dict['n_jobs'],
                                                 n_trials=None)

        print('Parameters optimization done!\n')

        if flag_dict['plot_optim']:
            print('Saving optimization plots')
            fig = optuna.visualization.plot_slice(study)
            fig.write_image(training_fig_path + '/optuna_slice.png')
            fig = optuna.visualization.plot_optimization_history(study)
            fig.write_image(training_fig_path + '/optuna_history.png')
            '''fig = optuna.visualization.plot_param_importances(study)
            fig.write_image(training_fig_path + '/optuna_param_importance.png')
            fig = optuna.visualization.plot_contour(study)
            fig.write_image(training_fig_path + '/optuna_contour.png')'''
            print('Done\n')

        import joblib

        # BUG FIX: original concatenated analysis_path + "model/study.pkl"
        # without a '/' separator (compare training_fig_path above).
        joblib.dump(study, filename_dict['analysis_path'] + "/model/study.pkl")

    model_hdl.train_test_model(train_test_data)
    print(model_hdl.get_model_params())

    print('Predicting values on training and test datas')
    y_pred_train = model_hdl.predict(train_test_data[0], True)
    # used to evaluate model performance
    y_pred_test = model_hdl.predict(train_test_data[2], True)
    print('Prediction done\n')

    plt.rcParams["figure.figsize"] = (10, 7)
    leg_labels = ['background', 'signal']

    print('Saving Output comparison plot')
    plt.figure()
    ml_out_fig = plot_utils.plot_output_train_test(model_hdl,
                                                   train_test_data,
                                                   100,
                                                   True,
                                                   leg_labels,
                                                   True,
                                                   density=False)
    plt.savefig(training_fig_path + '/output_train_test.png',
                dpi=300,
                facecolor='white')
    plt.close()
    print('Done\n')

    print('Saving ROC AUC plot')
    plt.figure()
    roc_train_test_fig = plot_utils.plot_roc_train_test(
        train_test_data[3], y_pred_test, train_test_data[1], y_pred_train,
        None, leg_labels)  # ROC AUC plot
    plt.savefig(training_fig_path + '/ROC_AUC_train_test.png',
                dpi=300,
                facecolor='white')

    import pickle
    # Pickle the figure itself so it can be re-loaded and superimposed
    # with the ROC curves of other runs later.
    with open(training_fig_path + '/ROC_AUC_train_test.pickle', 'wb') as f:
        pickle.dump(roc_train_test_fig, f)
    plt.close()

    print('Done\n')

    print('Saving feature importance plots')
    plt.figure()
    feat_imp_1, feat_imp_2 = plot_utils.plot_feature_imp(train_test_data[2],
                                                         train_test_data[3],
                                                         model_hdl,
                                                         approximate=True)
    feat_imp_1.savefig(training_fig_path +
                       '/feature_importance_HIPE4ML_violin.png',
                       dpi=300,
                       facecolor='white')
    feat_imp_2.savefig(training_fig_path +
                       '/feature_importance_HIPE4ML_bar.png',
                       dpi=300,
                       facecolor='white')
    plt.close()
    print('Done\n')

    efficiency_score_conversion(train_test_data, y_pred_test, filename_dict)

    return train_test_data, y_pred_test, model_hdl