Beispiel #1
0
def train_xgboost_model(signal,
                        background,
                        filename_dict,
                        params,
                        params_range,
                        flag_dict,
                        training_variables='',
                        testsize=0.5):
    '''
    Train an XGBoost classifier with hipe4ml and save training diagnostics.

    Plots (output distribution, ROC AUC, feature importance) are written to
    '<analysis_path>/images/training'.

    Parameters
    ----------
    signal, background : samples accepted by hipe4ml's train_test_generator,
        labelled 1 (signal) and 0 (background).
    filename_dict : dict
        Must contain 'analysis_path'.
    params : dict
        Model hyperparameters, applied unless flag_dict['use_default_param'].
    params_range : dict
        Hyperparameter search ranges for the optimizers (string-valued
        entries of `params` are copied in as fixed choices).
    flag_dict : dict
        Switches: 'use_default_param', 'benchmark_opt', 'optimize_bayes',
        'optimize_optuna', 'plot_optim'; Optuna settings 'timeout', 'n_jobs'.
    training_variables : list or '' (default '')
        Feature names to train on; '' selects every column of the train set.
    testsize : float
        Fraction of candidates reserved for the test set.

    Returns
    -------
    tuple
        (train_test_data, y_pred_test, model_hdl)
    '''
    print('Training XGBOOST model')

    training_fig_path = filename_dict['analysis_path'] + "/images/training"

    train_test_data = train_test_generator([signal, background], [1, 0],
                                           test_size=testsize)

    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)
    if not flag_dict['use_default_param']:
        model_hdl.set_model_params(params)

    if flag_dict['benchmark_opt']:
        print('Benchmarking optimizers\n')
        import time
        from sklearn.metrics import roc_auc_score

        times_sk = []
        roc_sk = []
        for _ in range(1):
            start = time.time()
            model_hdl.optimize_params_bayes(train_test_data,
                                            params_range,
                                            'roc_auc',
                                            njobs=-1)
            model_hdl.train_test_model(train_test_data)
            # Test-set scores evaluate the optimized model.
            y_pred_test = model_hdl.predict(train_test_data[2], True)
            roc_sk.append(roc_auc_score(train_test_data[3], y_pred_test))
            times_sk.append(time.time() - start)

        # BUGFIX: the original stored Optuna timings in a list named `time`,
        # shadowing the `time` module, and never appended to it, so the
        # reported Optuna time was np.mean([]) == nan.
        times_optuna = []
        roc_optuna = []
        for _ in range(1):
            # Optuna cannot sample string-valued hyperparameters as ranges:
            # forward them unchanged as fixed values.
            for key in params:
                if isinstance(params[key], str):
                    params_range[key] = params[key]

            start = time.time()
            model_hdl.optimize_params_optuna(train_test_data,
                                             params_range,
                                             'roc_auc',
                                             timeout=flag_dict['timeout'],
                                             n_jobs=flag_dict['n_jobs'])
            model_hdl.train_test_model(train_test_data)
            y_pred_test = model_hdl.predict(train_test_data[2], True)
            roc_optuna.append(roc_auc_score(train_test_data[3], y_pred_test))
            times_optuna.append(time.time() - start)

        # BUGFIX: the original printed the sklearn results twice and the
        # Optuna block with nan timing; print each optimizer's summary once.
        print('\nBAYES OPTIMIZATION WITH SKLEARN')
        print('Mean time : ' + str(np.mean(times_sk)))
        print('Mean ROC : ' + str(np.mean(roc_sk)))
        print('--------------\n')
        print('OPTUNA')
        print('Mean time : ' + str(np.mean(times_optuna)))
        print('Mean ROC : ' + str(np.mean(roc_optuna)))
        print('--------------\n')

    if flag_dict['optimize_bayes']:
        import time
        print('Doing Bayes optimization of hyperparameters\n')
        start = time.time()
        model_hdl.optimize_params_bayes(train_test_data,
                                        params_range,
                                        'roc_auc',
                                        n_iter=700,
                                        njobs=flag_dict['n_jobs'])
        print('Elapsed time: ' + str(time.time() - start))

    if flag_dict['optimize_optuna']:
        print('Doing Optuna optimization of hyperparameters\n')
        # String-valued hyperparameters are fixed choices, not ranges.
        for key in params:
            if isinstance(params[key], str):
                params_range[key] = params[key]
        study = model_hdl.optimize_params_optuna(train_test_data,
                                                 params_range,
                                                 scoring='roc_auc',
                                                 timeout=flag_dict['timeout'],
                                                 n_jobs=flag_dict['n_jobs'],
                                                 n_trials=None)
        print('Parameters optimization done!\n')

        if flag_dict['plot_optim']:
            print('Saving optimization plots')
            fig = optuna.visualization.plot_slice(study)
            fig.write_image(training_fig_path + '/optuna_slice.png')
            fig = optuna.visualization.plot_optimization_history(study)
            fig.write_image(training_fig_path + '/optuna_history.png')
            print('Done\n')

        import joblib
        # BUGFIX: added the missing '/' so the study is saved under
        # '<analysis_path>/model/' instead of a sibling '...model' path.
        joblib.dump(study, filename_dict['analysis_path'] + "/model/study.pkl")

    model_hdl.train_test_model(train_test_data)
    print(model_hdl.get_model_params())

    print('Predicting values on training and test datas')
    y_pred_train = model_hdl.predict(train_test_data[0], True)
    # Test-set scores are used for all performance plots below.
    y_pred_test = model_hdl.predict(train_test_data[2], True)
    print('Prediction done\n')

    plt.rcParams["figure.figsize"] = (10, 7)
    leg_labels = ['background', 'signal']

    print('Saving Output comparison plot')
    plt.figure()
    plot_utils.plot_output_train_test(model_hdl,
                                      train_test_data,
                                      100,
                                      True,
                                      leg_labels,
                                      True,
                                      density=False)
    plt.savefig(training_fig_path + '/output_train_test.png',
                dpi=300,
                facecolor='white')
    plt.close()
    print('Done\n')

    print('Saving ROC AUC plot')
    plt.figure()
    roc_train_test_fig = plot_utils.plot_roc_train_test(
        train_test_data[3], y_pred_test, train_test_data[1], y_pred_train,
        None, leg_labels)  # ROC AUC plot
    plt.savefig(training_fig_path + '/ROC_AUC_train_test.png',
                dpi=300,
                facecolor='white')

    # Pickle the figure object so it can be reopened interactively later.
    import pickle
    with open(training_fig_path + '/ROC_AUC_train_test.pickle', 'wb') as f:
        pickle.dump(roc_train_test_fig, f)
    plt.close()
    print('Done\n')

    print('Saving feature importance plots')
    plt.figure()
    feat_imp_violin, feat_imp_bar = plot_utils.plot_feature_imp(
        train_test_data[2],
        train_test_data[3],
        model_hdl,
        approximate=True)
    feat_imp_violin.savefig(training_fig_path +
                            '/feature_importance_HIPE4ML_violin.png',
                            dpi=300,
                            facecolor='white')
    feat_imp_bar.savefig(training_fig_path +
                         '/feature_importance_HIPE4ML_bar.png',
                         dpi=300,
                         facecolor='white')
    plt.close()
    print('Done\n')

    efficiency_score_conversion(train_test_data, y_pred_test, filename_dict)

    return train_test_data, y_pred_test, model_hdl
Beispiel #2
0
def benchmark_hyperparam_optimizers(filename_dict,
                                    params,
                                    params_range,
                                    flag_dict,
                                    presel_dict,
                                    training_variables='',
                                    testsize=0.75):
    '''
    Compare sklearn Bayes optimization against Optuna on the same dataset.

    Runs N_run rounds of hipe4ml's Bayes optimizer, recording wall time and
    test-set ROC AUC, then gives Optuna the mean sklearn time as its budget
    and records its ROC AUC. Results are printed; nothing is returned.

    Parameters
    ----------
    filename_dict : dict
        Needs 'data_path', 'analysis_path', 'MC_signal_filename',
        'MC_signal_table', 'train_bckg_filename', 'train_bckg_table'.
    params : dict
        Model hyperparameters (currently unused by this benchmark).
    params_range : dict
        Hyperparameter search ranges for both optimizers.
    flag_dict : dict
        Flags (currently unused by this benchmark).
    presel_dict : dict
        Needs 'train_bckg_presel', a preselection string for the background.
    training_variables : list or '' (default '')
        Feature names to train on; '' selects every column of the train set.
    testsize : float
        Fraction of candidates reserved for the test set.
    '''
    import time
    from sklearn.metrics import roc_auc_score

    N_run = 1

    data_path = filename_dict['data_path']
    analysis_path = filename_dict['analysis_path']

    print('Loading MC signal')
    mc_signal = TreeHandler()
    mc_signal.get_handler_from_large_file(
        file_name=data_path + filename_dict['MC_signal_filename'],
        tree_name=filename_dict['MC_signal_table'])
    print('MC signal loaded\n')

    print('Loading background data for training')
    background_ls = TreeHandler()
    background_ls.get_handler_from_large_file(
        file_name=data_path + filename_dict['train_bckg_filename'],
        tree_name=filename_dict['train_bckg_table'])
    background_ls.apply_preselections(presel_dict['train_bckg_presel'])
    # Cap the background at 4x the signal to keep classes balanced.
    background_ls.shuffle_data_frame(size=min(background_ls.get_n_cand(),
                                              mc_signal.get_n_cand() * 4))
    print('Done\n')

    train_test_data = train_test_generator([mc_signal, background_ls], [1, 0],
                                           test_size=testsize)

    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)

    times = []
    roc = []

    for _ in range(N_run):
        start = time.time()
        model_hdl.optimize_params_bayes(train_test_data,
                                        params_range,
                                        'roc_auc',
                                        njobs=-1)
        model_hdl.train_test_model(train_test_data)
        # Test-set scores evaluate the optimized model.
        y_pred_test = model_hdl.predict(train_test_data[2], True)
        roc.append(roc_auc_score(train_test_data[3], y_pred_test))
        times.append(time.time() - start)

    print('BAYES OPTIMIZATION WITH SKLEARN')
    # BUGFIX: the original printed np.mean(time) -- the `time` module --
    # instead of the `times` list collected above.
    print('Mean time : ' + str(np.mean(times)))
    print('Mean ROC : ' + str(np.mean(roc)))
    print('--------------\n')

    # BUGFIX: use a separate list; the original appended Optuna scores to
    # `roc`, so the printed Optuna mean was contaminated by sklearn results.
    roc_optuna = []
    for _ in range(N_run):
        # Give Optuna the same average time budget sklearn consumed.
        # BUGFIX: keyword is n_jobs (as used elsewhere in this file),
        # not njobs, for optimize_params_optuna.
        model_hdl.optimize_params_optuna(train_test_data,
                                         params_range,
                                         'roc_auc',
                                         timeout=np.mean(times),
                                         n_jobs=-1)
        model_hdl.train_test_model(train_test_data)
        y_pred_test = model_hdl.predict(train_test_data[2], True)
        roc_optuna.append(roc_auc_score(train_test_data[3], y_pred_test))

    print('OPTUNA')
    print('Fixed time : ' + str(np.mean(times)))
    print('Mean ROC : ' + str(np.mean(roc_optuna)))
    print('--------------\n')
Beispiel #3
0
                # --- Model setup and training for one analysis bin ---
                # NOTE(review): interior of a larger loop; `bin`, `ct_bins`,
                # `split`, `train_test_data` and the ALL-CAPS settings are
                # defined outside this fragment -- confirm against caller.
                # features plot
                # Three classes, matching the label checks below:
                # 0 = background, 1 = non_prompt, 2 = prompt.
                leg_labels = ['background', 'non_prompt', 'prompt']
                
                model_clf = xgb.XGBClassifier(use_label_encoder=False, n_jobs=4)
                model_hdl = ModelHandler(model_clf, TRAINING_COLUMNS_LIST)
                model_hdl.set_model_params(HYPERPARAMS)

                # hyperparameters optimization and model training
                if not os.path.isdir('models'):
                    os.mkdir('models')
                # Saved model files are keyed by the (centrality, ct) bin.
                bin_model = bin
                if MERGE_CENTRALITY:
                    bin_model = f'all_0_90_{ct_bins[0]}_{ct_bins[1]}'

                if OPTIMIZE and TRAIN:
                    # 'roc_auc_ovr': one-vs-rest ROC AUC for the 3-class task.
                    model_hdl.optimize_params_optuna(train_test_data, HYPERPARAMS_RANGES,
                                                    'roc_auc_ovr', nfold=5, timeout=30)

                # Skip retraining when a trained model file already exists.
                isModelTrained = os.path.isfile(f'models/{bin_model}_trained')
                print(f'isModelTrained {bin_model}: {isModelTrained}')
                if TRAIN and not isModelTrained:
                    print(
                    f'Number of candidates ({split}) for training in {ct_bins[0]} <= ct < {ct_bins[1]} cm: {len(train_test_data[0])}')
                    print(
                    f'prompt candidates: {np.count_nonzero(train_test_data[1] == 2)}; non-prompt candidates: {np.count_nonzero(train_test_data[1] == 1)}; background candidates: {np.count_nonzero(train_test_data[1] == 0)}; n_cand_bkg / n_cand_signal = {np.count_nonzero(train_test_data[1] == 0) / np.count_nonzero(train_test_data[1] == 1)}')
                    #weights={0:1,1:2,2:1}
                    #sample_weights = compute_sample_weight(class_weight=weights,y=train_test_data[0]['y_true'])
                    model_hdl.train_test_model(train_test_data, multi_class_opt="ovr", return_prediction=True, output_margin=False) #, sample_weight=sample_weights)
                    model_file_name = str(f'models/{bin_model}_trained')
                    if OPTIMIZE:
                        model_file_name = str(f'models/{bin_model}_optimized_trained')
                    model_hdl.dump_model_handler(model_file_name)