Example #1
def test_HistGradientBoostingClassifier_proba():
    import numpy as np
    import shap
    # imported for its side effect: enables HistGradientBoostingClassifier on
    # scikit-learn versions where it is still experimental
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingClassifier

    # train a tree-based model
    X, y = shap.datasets.adult()
    model = HistGradientBoostingClassifier(max_iter=10, max_depth=6).fit(X, y)
    explainer = shap.TreeExplainer(model, shap.sample(X, 10), model_output="predict_proba")
    shap_values = explainer.shap_values(X)
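    # the per-class SHAP values plus the expected value should reconstruct the
    # model's predicted probabilities (checked here for class 0)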
    assert np.max(np.abs(shap_values[0].sum(1) + explainer.expected_value[0] - model.predict_proba(X)[:,0])) < 1e-4
Example #2
# (The head of this snippet is truncated in the source; clf1's estimator class
# and its first arguments are assumptions based on the parallel clf2 line.)
clf1 = RandomForestClassifier(n_estimators=50, max_depth=10,
                              min_samples_leaf=10)
clf2 = ExtraTreesClassifier(n_estimators=50, max_depth=10, min_samples_leaf=10)
clf3 = HistGradientBoostingClassifier(l2_regularization=1,
                                      min_samples_leaf=17,
                                      max_iter=215)
clf4 = HistGradientBoostingClassifier(l2_regularization=1)
clf5 = KNeighborsClassifier(n_neighbors=20)
clf6 = DecisionTreeClassifier(splitter='random', min_samples_split=20)

runTest(clf1, every)
runTest(clf2, every)
runTest(clf3, every, True)
runTest(clf4, every)
runTest(clf6, every, lower=0.01, upper=0.99)

sub = clf4.predict_proba(fin[every])

# take just the `id` column (that's all we need to build the submission)
submission = fin[['id']].copy()

tmp = []
for i in sub:
    if i[1] > 1:
        tmp.append(0.99)
    elif i[1] < 0:
        tmp.append(0.01)
    else:
        tmp.append(i[1])
submission['Predicted'] = tmp
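# (The loop above maps out-of-range class-1 probabilities back into (0, 1)
# while leaving in-range values untouched. A vectorized
# np.clip(sub[:, 1], 0.01, 0.99) is close but not identical: it would also
# clip legitimate values in [0, 0.01) and (0.99, 1].)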

# IMPORTANT: Kaggle expects you to name the columns `Id` and `Predicted`, so let's make sure here
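# (The snippet is cut off at this point; a minimal sketch of the renaming step
# the comment above describes, with the output file name assumed:)
submission = submission.rename(columns={'id': 'Id'})
submission.to_csv('submission.csv', index=False)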
Example #3
    # (Truncated in the source: this is the tail of a list comprehension that
    # keeps every categorical column except the target.)
    categorical_columns = [
        column for column in categorical_columns
        if column != target_column_name
    ]
    model = CatBoostClassifier(cat_features=categorical_columns,
                               grow_policy='Lossguide',
                               learning_rate=0.1,
                               n_estimators=100,
                               num_leaves=255,
                               train_dir='data/catboost_info',
                               verbose=False)
    model.fit(features_train, labels_train, silent=True)

# Make predictions on the test data.
if args.library == 'h2o':
    predictions_proba = model.predict(data_test).as_data_frame()['Y']
else:
    predictions_proba = model.predict_proba(features_test)[:, 1]

# Compute metrics.
auc_roc = roc_auc_score(labels_test, predictions_proba)

# Compute memory usage.
# VmHWM ("high water mark") in /proc/self/status is the process's peak
# resident set size
with open("/proc/self/status", "r") as f:
    for line in f.readlines():
        if line.startswith("VmHWM"):
            memory = line.split(":")[1].strip()

print(json.dumps({
    'auc_roc': auc_roc,
    'memory': memory,
}))
Example #4
    print(": SGD - ROC AUC Score = {}".format(roc_auc_score(y_valid, train_oof_preds, average="micro")))
    
    hgbc_model = HistGradientBoostingClassifier(
        l2_regularization=1.766059063693552,
        learning_rate=0.10675193678150449,
        max_bins=128,
        max_depth=31,
        max_leaf_nodes=185,
        random_state=2021
    )
    hgbc_model.fit(
        hgbc_x_train,
        y_train,
    )

    train_oof_preds = hgbc_model.predict_proba(hgbc_x_valid)[:,-1]
    test_oof_preds = hgbc_model.predict_proba(test[hgbc_features])[:,-1]
    hgbc_train_preds[test_index] = train_oof_preds
    hgbc_test_preds += test_oof_preds / n_folds
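    # dividing by n_folds makes hgbc_test_preds the average of the per-fold
    # test-set predictions once all folds have run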
    print(": HGBC - ROC AUC Score = {}".format(roc_auc_score(y_valid, train_oof_preds, average="micro")))
    print("")
    
print("--> Overall metrics")
print(": XGB - ROC AUC Score = {}".format(roc_auc_score(target, xgb_train_preds, average="micro")))
print(": LGB - ROC AUC Score = {}".format(roc_auc_score(target, lgb_train_preds, average="micro")))
print(": CB - ROC AUC Score = {}".format(roc_auc_score(target, cb_train_preds, average="micro")))
print(": Ridge - ROC AUC Score = {}".format(roc_auc_score(target, ridge_train_preds, average="micro")))
print(": SGD - ROC AUC Score = {}".format(roc_auc_score(target, sgd_train_preds, average="micro")))
print(": HGBC - ROC AUC Score = {}".format(roc_auc_score(target, hgbc_train_preds, average="micro")))

# !SECTION cross validation
Example #5
print(f"Training set with {n_samples} records with {n_features} features.")

print("Fitting a sklearn model...")
tic = time()
est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     learning_rate=lr,
                                     max_iter=n_trees,
                                     max_bins=max_bins,
                                     max_leaf_nodes=n_leaf_nodes,
                                     early_stopping=False,
                                     random_state=0,
                                     verbose=1)
est.fit(data_train, target_train)
toc = time()
predicted_test = est.predict(data_test)
predicted_proba_test = est.predict_proba(data_test)
roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
acc = accuracy_score(target_test, predicted_test)
print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")

if args.lightgbm:
    print("Fitting a LightGBM model...")
    tic = time()
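    # build a LightGBM estimator whose hyperparameters match `est`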
    lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
    lightgbm_est.fit(data_train, target_train)
    toc = time()
    predicted_test = lightgbm_est.predict(data_test)
    predicted_proba_test = lightgbm_est.predict_proba(data_test)
    roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
    acc = accuracy_score(target_test, predicted_test)
    print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")
Example #6
def test_same_predictions_multiclass_classification(seed, min_samples_leaf,
                                                    n_samples, max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255
    lr = 1

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=3,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        n_clusters_per_class=1,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="categorical_crossentropy",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=lr,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_sklearn = est_sklearn.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_sklearn = est_sklearn.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
Example #7
def test_same_predictions_multiclass_classification(
        seed, min_samples_leaf, n_samples, max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256
    lr = 1

    X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5,
                               n_informative=5, n_redundant=0,
                               n_clusters_per_class=1, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='categorical_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=lr,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_sklearn = est_sklearn.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_sklearn = est_sklearn.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
Example #8
    # (Truncated snippet: `alpha`, `data`, `wPep_t`, `wXL_t`, `X`, `y` and the
    # `best_*` trackers are defined in the omitted outer scope.)
    for beta in np.arange(0.2, 1.01, 0.2):
        all_sample_weights = getWeights(data, alpha * wPep_t, beta * wXL_t)

        clf = HistGradientBoostingClassifier(
            scoring="f1",
            monotonic_cst=monotonic_cst,
            tol=1e-7,
            random_state=42,
            validation_fraction=None)  # (early_stopping=True left disabled)
        clf.fit(X, y, sample_weight=all_sample_weights)
        print("alpha,beta: " + str(alpha) + "\t" + str(beta))
        print("Loss on all data (sample weight): {:.2f}".format(
            clf.score(X, y, sample_weight=all_sample_weights)))

        # probability of class 1 (the target class)
        p = clf.predict_proba(data.loc[:, data.columns != 'Label'])[:, 1]
        p = pd.DataFrame({'p-value': p})
        data.reset_index(drop=True, inplace=True)
        p.reset_index(drop=True, inplace=True)
        data2 = pd.concat([data, p], axis=1)
        data2 = calcQ(data2, scoreColName="p-value")
        data2["Rank"] = 1
        # store best fit
        nXLauc, XLauc = evalXL(data2, plot=False, maxQ=0.1)
        print("pAUC(peptides), pAUC(XLs): " + str(nXLauc) + "\t" + str(XLauc))
        print("sum(pAUC): " + str(nXLauc + XLauc))
        print("Confusion matrix:")
        print(confusion_matrix(y, clf.predict(X)))

        if nXLauc + 10.0 * XLauc > best_nXLauc + 10.0 * best_XLauc:  # we weight XL auc higher than peptide auc
Example #9
# (Truncated in the source: only the final argument of the GridSearchCV call
# survives. A hypothetical reconstruction; the estimator and grid below are
# assumptions, not the original code.)
grid_search = GridSearchCV(estimator = HistGradientBoostingClassifier(),
                           param_grid = {'max_iter': [400, 600]},  # assumed grid
                           scoring = 'roc_auc',
                           n_jobs = -1)

start_time = time.time()
grid_search = grid_search.fit(X_train, y_train)
print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
grid_search.best_params_, grid_search.best_score_


# last step
clf_hgb = grid_search.best_estimator_
clf_hgb.fit(X_train, y_train)

y_pred = clf_hgb.predict(X_test)
print(classification_report(y_test, y_pred))

y_pred = clf_hgb.predict_proba(X_test)[:, 1]
print('HGB AUC_ROC: %.3f' % roc_auc_score(y_test, y_pred))


# KF & RS
parameters = {'learning_rate': uniform(0,0.1), 
              'max_depth':sp_randint(3, 11),
              'max_leaf_nodes':sp_randint(2, 32),
              'min_samples_leaf':sp_randint(1, 11),
              'max_iter':[400,600,800,1000,1200],
              'l2_regularization':uniform(0,0.1)}

rand_search = RandomizedSearchCV(estimator = clf_hgb,
                                 param_distributions = parameters,
                                 scoring='roc_auc',
                                 n_iter=100,
Example #10
    # model = HistGradientBoostingClassifier(max_iter=300)

    # GDSCV = GridSearchCV(estimator=model, param_grid=parameters, cv=5, scoring='roc_auc', n_jobs=-1)
    # GDSCV.fit(X_train, y_train)
    # print(GDSCV.best_params_)

    # cv_score = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1)
    # print("Cross-validation score is {score:.3f},"
    #     " standard deviation is {err:.3f}"
    #     .format(score = cv_score.mean(), err = cv_score.std()))

    model = HistGradientBoostingClassifier(max_iter=300,
                                           l2_regularization=params[i][0],
                                           learning_rate=params[i][1],
                                           max_depth=int(params[i][2]))

    model = model.fit(X_train, y_train)
    prob = model.predict_proba(X_test)
    prob = np.array(prob[:, 1])
    y_pred[label] = prob

    print(label, ': finished!\n')


# Task 3
for i in range(11, len(labels)):
    label = labels[i]
    y_train = df_label[label]

    # # grid search
    # parameters = {
    #     'learning_rate':[0.05, 0.10, 0.15, 0.20],
    #     'max_depth':[3, 4, 5, 6, 7, 8, 9], 
print(f"Training set with {n_samples} records with {n_features} features.")

print("Fitting a sklearn model...")
tic = time()
est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     learning_rate=lr,
                                     max_iter=n_trees,
                                     max_bins=max_bins,
                                     max_leaf_nodes=n_leaf_nodes,
                                     n_iter_no_change=None,
                                     random_state=0,
                                     verbose=1)
est.fit(data_train, target_train)
toc = time()
predicted_test = est.predict(data_test)
predicted_proba_test = est.predict_proba(data_test)
roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
acc = accuracy_score(target_test, predicted_test)
print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")

if args.lightgbm:
    print("Fitting a LightGBM model...")
    tic = time()
    lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
    lightgbm_est.fit(data_train, target_train)
    toc = time()
    predicted_test = lightgbm_est.predict(data_test)
    predicted_proba_test = lightgbm_est.predict_proba(data_test)
    roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
    acc = accuracy_score(target_test, predicted_test)
    print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")
Example #12
class tuned_HGB(BaseEstimator):
    """
    Scikit-learn histogram gradient-boosted tree models, tuned with nested
    cross-validation to minimize the error on an unseen table.
    
    Parameters
    ----------
    
    task : str
        The estimation task to perform, either 'salary', 'quantile', or 'sex'.
    learning_rate : None or float
        The learning rate of the model. If None, a nested cross-validation
        procedure is used to determine the best one.
    fit_on : str
        If fit_on = 'all', all the validation data is used to compute the
        validation error. Set fit_on = 'seen' or 'unseen' to optimize the
        learning rate on seen or unseen categories only, respectively.
        
    """
    def __init__(self, task, learning_rate=None, fit_on='all'):

        self.task = task
        self.learning_rate = learning_rate
        self.fit_on = fit_on

    def param_tuning(self, X1, y1):

        D_var = make_D_var(self.X1_nem, self.X1_mem, n_jobs=1)
        n_var = n_variants(self.X1_nem,
                           self.X1_mem,
                           y1,
                           self.groups1,
                           n_splits=None,
                           test_size=None,
                           D_var=D_var,
                           n_jobs=1,
                           nested_cross_val=True)
        lr_list = np.logspace(-2, -0.5, 4)
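        # four candidate learning rates, log-spaced from 0.01 to ~0.32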
        res = np.zeros(len(lr_list))
        for k in range(len(lr_list)):
            if self.task == "salary":
                self2 = HistGradientBoostingRegressor(learning_rate=lr_list[k])
            else:
                self2 = HistGradientBoostingClassifier(
                    learning_rate=lr_list[k])
            cv_err = cv_errors(self.task,
                               self2,
                               X1,
                               self.X1_nem,
                               self.X1_mem,
                               y1,
                               self.groups1,
                               n_splits=None,
                               test_size=None,
                               n_jobs=1,
                               nested_cross_val=True)
            if self.task != 'quantile':
                cv_err = cv_err**2
            if self.fit_on == 'unseen':
                res[k] = cv_err[n_var == 0].mean()
            elif self.fit_on == 'seen':
                res[k] = cv_err[n_var >= 1].mean()
            else:
                res[k] = cv_err.mean()
        self.learning_rate = lr_list[np.argmin(res)]
        # fraction of samples with n_var == 0 (i.e. unseen categories),
        # rounded down to two decimals
        print(int(sum(n_var == 0) / len(n_var) * 100) / 100)
        return

    def fit(self, X1, y1):

        # Parameter tuning
        if self.learning_rate is None:
            self.param_tuning(X1, y1)
            print(self.learning_rate)
        # Fit on all train data with tuned params
        if self.task == "salary":
            self.model = HistGradientBoostingRegressor(
                learning_rate=self.learning_rate)
        else:
            self.model = HistGradientBoostingClassifier(
                learning_rate=self.learning_rate)
        self.model.fit(X1, y1)
        return

    def predict(self, X2):
        return self.model.predict(X2)

    def predict_proba(self, X2):
        return self.model.predict_proba(X2)
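
# A minimal usage sketch (hedged; not part of the original source). Passing an
# explicit learning_rate skips the nested-CV tuning step, which depends on
# attributes (X1_nem, X1_mem, groups1) that are set elsewhere in the project.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
est = tuned_HGB(task='sex', learning_rate=0.1)  # any task but 'salary' uses the classifier
est.fit(X_demo, y_demo)
proba_demo = est.predict_proba(X_demo)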
Example #13
def main():
    #====================================================
    #     DATA PREPARATION
    #====================================================

    #Let's have a look at the dataset:
    data_full = pd.read_csv('dataset_higgs_challenge.csv')

    #For this classification I used only the "t" (training data), "b" (validation data) and "v" (test data) sets of variables:
    print('Total number of events: ', len(data_full), '\n')
    for KaggleSetID in ['t', 'b', 'v', 'u']:
        print('Number of events in the {} KaggleSet: {}'.format(
            KaggleSetID,
            len(data_full['KaggleSet'][data_full['KaggleSet'] ==
                                       KaggleSetID])))

    #Description of the sub-dataset in each line:
    #1) Splitting of the dataset into train, test and validation set.
    #2) Extracting the weights of the validation and test set.
    #3) Extracting the binary arrays for my networks.
    #4) Extracting the binary arrays for my BDT
    #Within the splitting of the dataset, some feature-engineering operations are applied to each subset. The problem is that the "phi" variables have a signal distribution very similar to the background one, so it is better to use a linear combination of them (here, the difference) to make them useful for classification.
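    # (Hedged illustration, not the actual `splitting` implementation: the phi
    # difference could be formed e.g. as
    #     data_full['delta_phi'] = data_full['PRI_tau_phi'] - data_full['PRI_lep_phi']
    # before the raw phi columns are dropped.)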
    X, df_empty, y_train, y_train_BDT = splitting(data_full, "t")
    X_val, weights_val, y_val, y_val_BDT = splitting(data_full, "b")
    X_test, weights_test, y_test, y_test_BDT = splitting(data_full, "v")
    del (data_full)

    #====================================================
    #     BDT
    #====================================================

    #Let's first scale my data:
    standard = StandardScaler()
    standard.fit(X)
    X_standard = standard.transform(X)
    X_val_standard = standard.transform(X_val)
    X_test_standard = standard.transform(X_test)

    #BDT classification:
    BDT = HistGradientBoostingClassifier(max_iter=90,
                                         verbose=1,
                                         l2_regularization=0.5,
                                         learning_rate=.1,
                                         max_leaf_nodes=50,
                                         random_state=45,
                                         max_depth=15,
                                         max_bins=50)
    BDT.fit(X_standard, y_train_BDT)

    y_pred_val = BDT.predict_proba(X_val_standard)
    y_pred_test = BDT.predict_proba(X_test_standard)

    del X_standard, X_val_standard, X_test_standard

    #I will split the results just to be able to combine them with the DNN result later:
    BDT_0jets_val = y_pred_val[X_val['PRI_jet_num'] == 0]
    BDT_1jet_val = y_pred_val[X_val['PRI_jet_num'] == 1]
    BDT_2jets_val = y_pred_val[X_val['PRI_jet_num'] >= 2]

    y_pred_BDT_val = np.concatenate(
        (BDT_0jets_val, BDT_1jet_val, BDT_2jets_val))

    BDT_0jets_test = y_pred_test[X_test['PRI_jet_num'] == 0]
    BDT_1jet_test = y_pred_test[X_test['PRI_jet_num'] == 1]
    BDT_2jets_test = y_pred_test[X_test['PRI_jet_num'] >= 2]

    y_pred_BDT_test = np.concatenate(
        (BDT_0jets_test, BDT_1jet_test, BDT_2jets_test))

    #====================================================
    #     DATA PROCESSING
    #====================================================

    #Let's construct the data for the case with 0 jets:
    X_0jets, y_train_0jets, empty_0 = splitting_jets(X, y_train, df_empty, 0)
    X_val_0jets, y_val_0jets, weights_0jets_val = splitting_jets(
        X_val, y_val, weights_val, 0)
    X_test_0jets, y_test_0jets, weights_0jets_test = splitting_jets(
        X_test, y_test, weights_test, 0)

    #Let's construct the data for the case with 1 jet:
    X_1jet, y_train_1jet, empty_1 = splitting_jets(X, y_train, df_empty, 1)
    X_val_1jet, y_val_1jet, weights_1jet_val = splitting_jets(
        X_val, y_val, weights_val, 1)
    X_test_1jet, y_test_1jet, weights_1jet_test = splitting_jets(
        X_test, y_test, weights_test, 1)

    #Let's construct the data for the case with 2 jets:
    X_2jets, y_train_2jets, empty_2 = splitting_jets(X, y_train, df_empty, 2)
    X_val_2jets, y_val_2jets, weights_2jets_val = splitting_jets(
        X_val, y_val, weights_val, 2)
    X_test_2jets, y_test_2jets, weights_2jets_test = splitting_jets(
        X_test, y_test, weights_test, 2)

    del empty_0, empty_1, empty_2

    #====================================================
    #     2-JETS DNN
    #====================================================

    #Scaling data:
    standard_2jets = StandardScaler()
    standard_2jets.fit(X_2jets)
    X_2jets_standard = standard_2jets.transform(X_2jets)
    X_val_2jets_standard = standard_2jets.transform(X_val_2jets)
    X_test_2jets_standard = standard_2jets.transform(X_test_2jets)

    #DNN:
    np.random.seed(42)
    DNN_2jets = make_model([64, 128, 64, 64, 32, 8], 'relu', 0.1, 'Adam', 'L2',
                           0.0001, X_2jets.shape[-1])

    early_stopping = EarlyStopping(monitor='val_accuracy',
                                   min_delta=0,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   baseline=None,
                                   restore_best_weights=True)

    history = DNN_2jets.fit(X_2jets_standard,
                            y_train_2jets,
                            batch_size=256,
                            epochs=50,
                            verbose=1,
                            validation_data=(X_val_2jets_standard,
                                             y_val_2jets),
                            callbacks=[early_stopping],
                            class_weight=None)

    y_pred_2jets_val = DNN_2jets.predict(X_val_2jets_standard)
    y_pred_2jets_test = DNN_2jets.predict(X_test_2jets_standard)

    del X_2jets_standard, X_val_2jets_standard, X_2jets, X_val_2jets, X_test_2jets_standard, X_test_2jets

    #====================================================
    #     1-JET DNN
    #====================================================

    #Scaling data:
    standard_1jet = StandardScaler()
    standard_1jet.fit(X_1jet)
    X_1jet_standard = standard_1jet.transform(X_1jet)
    X_val_1jet_standard = standard_1jet.transform(X_val_1jet)
    X_test_1jet_standard = standard_1jet.transform(X_test_1jet)

    #DNN:
    np.random.seed(42)
    DNN_1jet = make_model([64, 64, 64, 32, 8], 'relu', 0.1, 'Adagrad', 'L1',
                          0.0001, X_1jet.shape[-1])

    early_stopping = EarlyStopping(monitor='val_accuracy',
                                   min_delta=0,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   baseline=None,
                                   restore_best_weights=True)

    history = DNN_1jet.fit(X_1jet_standard,
                           y_train_1jet,
                           batch_size=256,
                           epochs=50,
                           verbose=1,
                           validation_data=(X_val_1jet_standard, y_val_1jet),
                           callbacks=[early_stopping],
                           class_weight=None)

    y_pred_1jet_val = DNN_1jet.predict(X_val_1jet_standard)
    y_pred_1jet_test = DNN_1jet.predict(X_test_1jet_standard)

    del X_1jet_standard, X_val_1jet_standard, X_1jet, X_val_1jet, X_test_1jet_standard, X_test_1jet

    #====================================================
    #     0-JET DNN
    #====================================================

    #Scaling data:
    standard_0jets = StandardScaler()
    standard_0jets.fit(X_0jets)
    X_0jets_standard = standard_0jets.transform(X_0jets)
    X_val_0jets_standard = standard_0jets.transform(X_val_0jets)
    X_test_0jets_standard = standard_0jets.transform(X_test_0jets)

    #DNN:
    np.random.seed(42)
    DNN_0jets = make_model([32, 64, 128, 64, 32, 8], 'elu', 0.1, 'Adagrad',
                           'L1', 0.0001, X_0jets.shape[-1])

    early_stopping = EarlyStopping(monitor='val_accuracy',
                                   min_delta=0,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   baseline=None,
                                   restore_best_weights=True)

    history = DNN_0jets.fit(X_0jets_standard,
                            y_train_0jets,
                            batch_size=256,
                            epochs=50,
                            verbose=1,
                            validation_data=(X_val_0jets_standard,
                                             y_val_0jets),
                            callbacks=[early_stopping],
                            class_weight=None)

    y_pred_0jets_val = DNN_0jets.predict(X_val_0jets_standard)
    y_pred_0jets_test = DNN_0jets.predict(X_test_0jets_standard)

    del X_0jets_standard, X_val_0jets_standard, X_0jets, X_val_0jets, X_test_0jets_standard, X_test_0jets

    #====================================================
    #     TOTAL AMS SCORE OF DNNs
    #====================================================

    #Total AMS score considering all the AMS of each subset:
    y_pred_DNN_val = np.concatenate(
        (y_pred_0jets_val, y_pred_1jet_val, y_pred_2jets_val))
    y_val_total = np.concatenate((y_val_0jets, y_val_1jet, y_val_2jets))
    weights_total_val = np.concatenate(
        (weights_0jets_val, weights_1jet_val, weights_2jets_val))

    y_pred_DNN_test = np.concatenate(
        (y_pred_0jets_test, y_pred_1jet_test, y_pred_2jets_test))
    y_test_total = np.concatenate((y_test_0jets, y_test_1jet, y_test_2jets))
    weights_total_test = np.concatenate(
        (weights_0jets_test, weights_1jet_test, weights_2jets_test))

    #====================================================
    #     COMBINING DNNs AND BDT AMS
    #====================================================

    dataset_blend_val = np.append(y_pred_DNN_val[:, 1].reshape(-1, 1),
                                  y_pred_BDT_val[:, 1].reshape(-1, 1),
                                  axis=1)
    dataset_blend_test = np.append(y_pred_DNN_test[:, 1].reshape(-1, 1),
                                   y_pred_BDT_test[:, 1].reshape(-1, 1),
                                   axis=1)
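    # the blend features are just the two class-1 scores (DNN and BDT); a
    # logistic-regression stacker is fit on the validation set below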
    blend = LogisticRegression(solver='lbfgs')
    blend.fit(dataset_blend_val, y_val_total[:, 1])
    blended_val = blend.predict_proba(dataset_blend_val)
    blended_test = blend.predict_proba(dataset_blend_test)

    #====================================================
    #     FINAL RESULTS
    #====================================================

    print('DNN:')
    plot_AMS(y_pred_DNN_test, y_test_total, weights_total_test)
    print('BDT:')
    plot_AMS(y_pred_BDT_test, y_test_total, weights_total_test)
    print('Combination:')
    plot_AMS(blended_test, y_test_total, weights_total_test)
    plt.legend(['DNN', 'BDT', 'DNN + BDT'])
    plt.ylim(2.8, )
    plt.savefig('AMS_total.png', dpi=300)
    plt.show()

    plot_distributions_final(blended_val, blended_test, y_val_total, 50, False,
                             weights_total_val, weights_total_test)
    plt.savefig('Final_distribution_unweighted.png', dpi=300)
    plt.show()

    plot_distributions_final(blended_val, blended_test, y_val_total, 50, True,
                             weights_total_val, weights_total_test)
    plt.savefig('Final_distribution_weighted.png', dpi=300)
    plt.show()
Example #14
class Network():
    """Represent a model configuration and let us operate on it.

    The estimator built in compile_model is a HistGradientBoostingClassifier.
    """
    def __init__(
        self,
        nn_param_choices=None,
    ):
        self.accuracy = 0.
        self.nn_param_choices = nn_param_choices
        self.network_params = {}  # (dic): represents MLP network parameters
        self.model = None
        self.best_threshold = 0.5

    def compile_model(self, bFinal=False):
        # Get our network parameters.
        max_iter = 150 if bFinal else 60
        max_features = None if bFinal else "auto"  # note: not used by HistGradientBoostingClassifier
        self.best_threshold = 0.5
        #self.model = RandomForestClassifier(n_estimators=n_estimators, verbose=2)
        f_scorer = make_scorer(fbeta_score, beta=0.125)
        self.model = HistGradientBoostingClassifier(
            scoring=f_scorer,
            #learning_rate=0.1,max_bins=50, max_depth=3,n_iter_no_change=10,
            max_iter=max_iter,
            verbose=2)  #,
        #validation_fraction=0.08)

    def create_random(self):
        for key in self.nn_param_choices:
            self.network_params[key] = random.choice(
                self.nn_param_choices[key])

    def create_set(self, network):
        self.network_params = network

    def train(self, dataset_dict):
        if self.accuracy == 0.:
            self.accuracy = self.train_net(dataset_dict)

    def print_network(self):
        logging.info(self.network_params)
        logging.info("RF threshold: %.2f%%" % (self.best_threshold))
        logging.info("RF accuracy: %.2f%%" % (self.accuracy * 100))

    def update_best_threshold(self, y_val_proba, y_validation, y_train_proba,
                              y_train):
        self.best_threshold = 0.5
        best_fbeta_score_valid = 0
        best_fbeta_score_train = 0
        beta = 0.25
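        # scan thresholds in [0.5, 0.8) and keep the one maximizing validation
        # F-beta; beta=0.25 weights precision far more heavily than recall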
        for threshold in np.arange(0.5, 0.8, 0.001):
            y_val_pred = np.where(y_val_proba[:, 1] > threshold, 1, 0)
            # y_train_pred = np.where(y_train_proba[:, 1] > threshold, 1, 0)

            curr_validation_beta_score = fbeta_score(y_validation,
                                                     y_val_pred,
                                                     beta=beta)
            # curr_train_beta_score = fbeta_score(y_train, y_train_pred, beta=beta)

            if curr_validation_beta_score >= best_fbeta_score_valid:  # and curr_train_beta_score >= best_fbeta_score_train:

                best_fbeta_score_valid = curr_validation_beta_score
                # best_fbeta_score_train = curr_train_beta_score
                self.best_threshold = threshold

        header_note = "#" * 80
        print(header_note)
        print(f'#### improve thres:{self.best_threshold} With:')
        print(f'validation f-beta-{beta} score {best_fbeta_score_valid}')
        # print(f'train f-beta-{beta} score {best_fbeta_score_train}')
        print(header_note)

    def train_net(self, dataset_dict):
        self.compile_model(False)
        num_of_rows = self.network_params["Network_train_sample_size"]
        rows_index = np.random.choice(dataset_dict["X_train"].shape[0],
                                      size=num_of_rows,
                                      replace=False)
        print(f"train_net with param{self.network_params}")
        self.model.fit(dataset_dict["X_train"][rows_index, :],
                       dataset_dict["y_train"][rows_index])

        y_val_proba = self.model.predict_proba(dataset_dict["X_validation"])
        y_train_proba = self.model.predict_proba(
            dataset_dict["X_train"][rows_index, :])
        self.update_best_threshold(y_val_proba, dataset_dict["y_validation"],
                                   y_train_proba,
                                   dataset_dict["y_train"][rows_index])

        y_train_pred = np.where(y_train_proba[:, 1] > self.best_threshold, 1,
                                0)
        y_val_pred = np.where(y_val_proba[:, 1] > self.best_threshold, 1, 0)

        print(
            'Train accuracy',
            accuracy_score(dataset_dict["y_train"][rows_index], y_train_pred))
        print('Validation accuracy',
              accuracy_score(dataset_dict["y_validation"], y_val_pred))

        print(
            'Train precision',
            precision_score(dataset_dict["y_train"][rows_index], y_train_pred))
        print('Validation precision',
              precision_score(dataset_dict["y_validation"], y_val_pred))

        print('Train recall',
              recall_score(dataset_dict["y_train"][rows_index], y_train_pred))
        print('Validation recall',
              recall_score(dataset_dict["y_validation"], y_val_pred))

        print(
            'Train f-beta score',
            fbeta_score(dataset_dict["y_train"][rows_index],
                        y_train_pred,
                        beta=0.25))
        validation_beta_score = fbeta_score(dataset_dict["y_validation"],
                                            y_val_pred,
                                            beta=0.25)
        print(f'Validation f-beta score {validation_beta_score}')

        return validation_beta_score

    def train_final_net(self, dataset_dict):
        str_header = "#" * 80
        print(str_header)
        print(f"best RF.. train_final_net  with param{self.network_params}")
        print(str_header)
        self.compile_model(bFinal=True)
        self.model.fit(dataset_dict["X_train"], dataset_dict["y_train"])

        y_val_proba = self.model.predict_proba(dataset_dict["X_validation"])
        y_train_proba = self.model.predict_proba(dataset_dict["X_train"])
        self.update_best_threshold(y_val_proba, dataset_dict["y_validation"],
                                   y_train_proba, dataset_dict["y_train"])

        y_train_pred = np.where(y_train_proba[:, 1] > self.best_threshold, 1,
                                0)
        y_val_pred = np.where(y_val_proba[:, 1] > self.best_threshold, 1, 0)
        print(str_header)
        print(str_header)
        print('Train accuracy',
              accuracy_score(dataset_dict["y_train"], y_train_pred))
        print('Validation accuracy',
              accuracy_score(dataset_dict["y_validation"], y_val_pred))

        print('Train precision',
              precision_score(dataset_dict["y_train"], y_train_pred))
        print('Validation precision',
              precision_score(dataset_dict["y_validation"], y_val_pred))

        print('Train recall',
              recall_score(dataset_dict["y_train"], y_train_pred))
        print('Validation recall',
              recall_score(dataset_dict["y_validation"], y_val_pred))

        print('Train f-beta score',
              fbeta_score(dataset_dict["y_train"], y_train_pred, beta=0.25))
        validation_beta_score = fbeta_score(dataset_dict["y_validation"],
                                            y_val_pred,
                                            beta=0.25)
        print(f'Validation f-beta score {validation_beta_score}')
        print(str_header)
        print(str_header)
        self.accuracy = validation_beta_score
        return validation_beta_score

    def WriteModelToFile(self):
        print("save net to model")
        print("Network accuracy: %.2f%%" % (self.accuracy * 100))
        print(self.network_params)
        self.print_network()
        # TODO: use pickle
        # self.model.save("model.h5")

    def WriteResToFile(self, ds_class, file_name):
        """Train the model, return test loss.

        Args:
            network (dict): the parameters of the network
            dataset (str): Dataset to use for training/evaluating

        """
        print(f"Write tests results to File {file_name}..")

        y_test_pred = np.where(
            self.model.predict_proba(ds_class["X_test"])[:, 1] >
            self.best_threshold, 1, 0)
        np.savetxt(file_name,
                   y_test_pred.astype(int),
                   fmt='%i',
                   delimiter='\n')