Esempio n. 1
0
def xgb_diagnostics(X_train,
                    X_test,
                    y_train,
                    y_test,
                    params,
                    n_repeats=30,
                    plot_title=''):
    """Refit an XGBoost classifier under several seeds and collect test AUC-PR.

    For every seed in ``range(n_repeats)`` a fresh ``XGBClassifier`` is built
    from ``params`` and fitted with the train and test splits as evaluation
    sets; the last recorded ``aucpr`` value of each run is kept.

    Returns:
        A list holding the final test-set ``aucpr`` of each run, in seed
        order.  The train-set values are collected too but not returned, and
        ``plot_title`` is accepted without being used.
    """
    metric_name = 'aucpr'
    watchlist = [(X_train, y_train), (X_test, y_test)]

    final_test_scores = []
    final_train_scores = []
    for seed in range(n_repeats):
        clf = XGBClassifier(**params, random_state=seed, n_jobs=-1)
        clf.fit(X_train,
                y_train,
                eval_metric=[metric_name],
                eval_set=watchlist,
                verbose=False)

        # 'validation_0' is the first eval set (train), 'validation_1' the
        # second (test).
        history = clf.evals_result()
        final_test_scores.append(history['validation_1'][metric_name][-1])
        final_train_scores.append(history['validation_0'][metric_name][-1])

    return final_test_scores
def xgb_clf(training, training_label, test, test_label):
    """Train an XGBoost classifier, plot its log-loss history and score it.

    The model is fitted twice: once for the full number of rounds, whose
    evaluation history is what gets plotted, and once more with early
    stopping, whose ``best_iteration`` is drawn as a dashed vertical line.
    The figure is written to ``evolutionXGB<rounds>.pdf``.

    Returns:
        ``(model, acc)`` where ``model`` is the early-stopped classifier and
        ``acc`` its accuracy on ``test`` / ``test_label``.
    """
    # Hyperparameters set with inspiration from:
    # https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
    n_rounds = 400
    learning_rate = 0.2  # 0.01-2
    tree_depth = 7  # 3-10 default: 6
    min_split_loss = 0.1  # default 0, but should be tuned

    # Both splits are monitored: validation_0 = train, validation_1 = test.
    watchlist = [(training, training_label), (test, test_label)]

    model = XGBClassifier(seed=42,
                          eta=learning_rate,
                          max_depth=tree_depth,
                          gamma=min_split_loss,
                          n_estimators=n_rounds,
                          verbose=False)

    # First pass: no early stopping, keeps the complete loss curves.
    model.fit(training,
              training_label,
              eval_metric='logloss',
              eval_set=watchlist)
    results = model.evals_result()

    # Second pass: early stopping determines model.best_iteration.
    model.fit(training,
              training_label,
              eval_metric='logloss',
              eval_set=watchlist,
              early_stopping_rounds=5,
              verbose=False)

    train_curve = results["validation_0"]['logloss']
    EPOCHS = len(train_curve)
    iterations = np.arange(0, EPOCHS)

    # Saving evolution of metrics throughout the iterations
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(iterations,
            train_curve,
            label="Train log loss",
            color='darkblue')
    ax.plot(iterations,
            results["validation_1"]['logloss'],
            label="Test log loss",
            color='darkorange')

    # Vertical marker at the iteration chosen by early stopping.
    best_at = [model.best_iteration, model.best_iteration]
    ax.plot(best_at, [0, 1], '--r')
    ax.set(xlabel="Iteration", ylabel=("Logarithmic loss"))
    ax.grid(True)
    ax.legend()
    fig.savefig('evolutionXGB' + str(EPOCHS) + '.pdf')

    predictions = list(map(round, model.predict(test)))
    acc = accuracy_score(test_label, predictions)

    return model, acc
def hyperopt_train_test(params):
    """Fit an XGBoost classifier for one hyperopt trial and return its loss.

    ``params`` is the sampled hyper-parameter dict; ``max_depth`` is cast to
    ``int`` in place before the model is built.  The model is trained on the
    module-level ``train`` frame and monitored on ``valid`` with early
    stopping after 8 rounds without improvement.

    Returns:
        The lowest validation log loss seen during training.
    """
    params['max_depth'] = int(params['max_depth'])
    xgb = XGBClassifier(**params)
    xgb.fit(train[X_vars],
            train[y_var],
            early_stopping_rounds=8,
            eval_metric='logloss',
            eval_set=[(train[X_vars], train[y_var]),
                      (valid[X_vars], valid[y_var])])
    valid_losses = xgb.evals_result()['validation_1']['logloss']
    # Indexing with [-8] was off by one when early stopping fired (the best
    # round sits 8 entries before the last one, i.e. at [-9]) and failed when
    # fewer than 8 rounds were run; the minimum is the best score either way.
    return min(valid_losses)
Esempio n. 4
0
    def test_classifier(self):
        """Fit a tiny multi-class XGBClassifier and check eval history exists."""
        n_rows, n_cols, n_labels = 100, 28, 10

        # Draw order (train X, train y, test X, test y) is kept so a seeded
        # global RNG yields the same data as before.
        X_train = np.random.random((n_rows, n_cols))
        y_train = np.random.randint(n_labels, size=(n_rows, 1))
        X_test = np.random.random((n_rows, n_cols))
        y_test = np.random.randint(n_labels, size=(n_rows, 1))

        watchlist = [(X_train, y_train), (X_test, y_test)]
        xgb1 = XGBClassifier(n_estimators=3, use_label_encoder=False)
        xgb1.fit(X_train,
                 y_train,
                 eval_set=watchlist,
                 eval_metric='mlogloss')

        history = xgb1.evals_result()
        self.assertIn("validation_0", history)
Esempio n. 5
0
def performance(model: XGBClassifier):
    """Show the log-loss curves and the feature importance of a fitted model.

    Expects ``model`` to have been fitted with two evaluation sets (train
    first, test second) and the ``logloss`` metric.  Opens two matplotlib
    windows and prints the model's repr in between.
    """
    # retrieve performance metrics
    history = model.evals_result()
    train_loss = history['validation_0']['logloss']
    test_loss = history['validation_1']['logloss']
    rounds = range(0, len(train_loss))

    # plot log loss
    fig, ax = plt.subplots()
    ax.plot(rounds, train_loss, label='Train')
    ax.plot(rounds, test_loss, label='Test')
    ax.legend()
    plt.ylabel('Log Loss')
    plt.title('XGBoost Log Loss')
    plt.show()

    print(model)
    plot_importance(model)
    plt.show()
Esempio n. 6
0
def XGB_learning(data, labels):
    """Train a default XGBClassifier on an 80/20 split and report accuracy.

    Returns:
        ``(accuracy, learn_curve)`` where ``accuracy`` is the hold-out
        accuracy of the final model and ``learn_curve`` is ``1 - error`` on
        the hold-out set after each boosting round.
    """
    split = train_test_split(data, labels, test_size=0.2, random_state=7)
    data_train, data_test, labels_train, labels_test = split

    # fit model on training data, tracking both splits
    model = XGBClassifier()
    model.fit(data_train,
              labels_train,
              eval_metric=["error", "logloss"],
              eval_set=[(data_train, labels_train),
                        (data_test, labels_test)],
              verbose=False)

    predictions = list(map(round, model.predict(data_test)))
    accuracy = accuracy_score(labels_test, predictions)

    # validation_1 is the hold-out split (second entry of eval_set)
    holdout_error = np.array(model.evals_result()['validation_1']['error'])
    learn_curve = 1 - holdout_error

    return accuracy, learn_curve
Esempio n. 7
0
def final_xgb(X_train, y_train, X_test, y_test, scale_pos_weight, best_params,
              analysis):
    """Fit the final binary XGBoost model and save its learning curves.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: validation features and labels (second eval set).
        scale_pos_weight: value forwarded to the classifier parameter of the
            same name.
        best_params: dict of tuned hyper-parameters for ``XGBClassifier``.
        analysis: label used in the output file name.

    Returns:
        The fitted ``XGBClassifier``.

    Side effects:
        Writes ``<fig_dir>/<analysis>_final_xgb_model.png`` (``fig_dir`` is a
        module-level name) with the error and AUC curves of both splits.
    """
    xgb = XGBClassifier(**best_params)
    # The parameter is spelled ``n_jobs``; the previous ``njobs`` was not a
    # recognised estimator parameter, so the thread count was never applied.
    xgb.set_params(n_jobs=4,
                   random_state=0,
                   objective='binary:logistic',
                   scale_pos_weight=scale_pos_weight)

    eval_set = [(X_train, y_train), (X_test, y_test)]
    eval_metric = ["error", "auc"]

    xgb.fit(X_train,
            y_train,
            eval_metric=eval_metric,
            eval_set=eval_set,
            verbose=0)

    # validation_0 = train split, validation_1 = validation split
    results = xgb.evals_result()

    fig1, axes1 = plt.subplots(figsize=(10, 8), nrows=1, ncols=2)
    axes1[0].plot(results['validation_0']['error'], label='Train Error')
    axes1[0].plot(results['validation_1']['error'], label='Validation Error')
    axes1[0].set_title("Final XGBoost Error")
    axes1[0].set_xlabel("Iteration")
    axes1[0].set_ylabel("Error")
    axes1[0].legend()

    axes1[1].plot(results['validation_0']['auc'], label='Train AUC-ROC')
    axes1[1].plot(results['validation_1']['auc'], label='Validation AUC-ROC')
    axes1[1].set_title("Final XGBoost AUC-ROC")
    axes1[1].set_xlabel("Iteration")
    axes1[1].set_ylabel("AUC")
    axes1[1].legend()

    fig1.tight_layout()

    fig1.savefig(fig_dir + '/{}_final_xgb_model.png'.format(analysis),
                 format='png',
                 dpi=300,
                 transparent=False)

    return xgb
Esempio n. 8
0
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.1)

# Train while recording RMSE on both splits, with a 20-round early-stopping
# patience.
eval_pairs = [(x_train, y_train), (x_test, y_test)]
xgb.fit(x_train,
        y_train,
        eval_set=eval_pairs,
        eval_metric="rmse",
        early_stopping_rounds=20,
        verbose=True)
# other metric names: rmse, mae, logloss, error, auc

y_pre = xgb.predict(x_test)

score = xgb.score(x_test, y_test)
acc = accuracy_score(y_test, y_pre)
result = xgb.evals_result()

print(__file__)
print(result)
for caption, value in (("acc", acc), ("score", score)):
    print(caption)
    print(value)

# import pickle  # ships with Python
# pickle.dump(xgb,open("./model/xgb_save/cancer.plckle.dat","wb"))
import joblib

# Persist the fitted model with joblib.
joblib.dump(xgb, "./model/xgb_save/cancer.joblib.dat")

print("start")
Esempio n. 9
0
    X_train, y_train, test_size=test_size, random_state=seed)

# Number of boosting rounds; also used as the x-axis length of the plots
# below, which is valid because no early stopping is requested.
num_round = 100
bst = XGBClassifier(max_depth=2,
                    learning_rate=0.1,
                    n_estimators=num_round,
                    silent=True,
                    objective='binary:logistic')
# validation_0 = training part, validation_1 = validation part
eval_set = [(X_train_part, y_train_part), (X_validate, y_validate)]
bst.fit(X_train_part,
        y_train_part,
        eval_metric=["error", "logloss"],
        eval_set=eval_set,
        verbose=False)

# Per-round metric history for both eval sets.
results = bst.evals_result()
# epochs = len(results['validation_0']['error'])
x_axis = range(0, num_round)
# plot log loss
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
ax.legend()
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost Log Loss')
pyplot.show()

# plot classification error
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Train')
ax.plot(x_axis, results['validation_1']['error'], label='Test')
Esempio n. 10
0
# y = dataset.target

x, y = load_breast_cancer(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.8,
                                                    random_state=66)

# model = XGBRegressor(n_estimators=5,learning_rate=0.1)
model = XGBClassifier(n_estimators=5,learning_rate=0.1)

# model.fit(x_train,y_train, verbose=True, eval_metric='error',eval_set=[(x_train, y_train), (x_test, y_test)])
model.fit(x_train,y_train, verbose=True, eval_metric='rmse',eval_set=[(x_train, y_train), (x_test, y_test)])

# Available metrics: rmse, mae, logloss, error, auc
# (open question from the author: is 'error' the counterpart of accuracy?)

result = model.evals_result()  # per-round evaluation history
# print(f'result : {result}')

y_pred = model.predict(x_test)
acc = accuracy_score(y_pred, y_test)
print(f'acc1 : {acc}')

# score = model.score(x_test,y_test)
# print(f"r2 : {score}")

import pickle

# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the bare open() call previously leaked the handle.
with open('./model/xgb_save/cancer.pickle.dat', 'wb') as model_file:
    pickle.dump(model, model_file)

print("save complete!!")
Esempio n. 11
0
class XGBooster(object):
    """
        The main class to train/encode/explain XGBoost models.
    """
    def __init__(self,
                 options,
                 from_data=None,
                 from_model=None,
                 from_encoding=None):
        """
            Constructor.

            The object is populated from the first source that is given,
            checked in the order from_data, from_model, from_encoding:

            - from_data: data object exposing `samps` and `names` and, when
              categorical handling is enabled, `categorical_features`,
              `categorical_names` and `class_names`. A new XGBClassifier is
              configured and the samples are split into train/test parts.
            - from_model: path of a pickled model; model and companion data
              are restored through load_datainfo().
            - from_encoding: path of a saved encoding read via SMTEncoder.

            `options` supplies seed, use_categorical, n_estimators, maxdepth,
            testsplit, output and files.

            Side effects: seeds numpy's global RNG, creates the output
            directories and prints a round-trip check of the encoding.
        """

        assert from_data or from_model or from_encoding, \
                'At least one input file should be specified'

        # CPU time already consumed by this process and by its children
        self.init_stime = resource.getrusage(resource.RUSAGE_SELF).ru_utime
        self.init_ctime = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime

        # saving command-line options
        self.options = options
        self.seed = self.options.seed
        np.random.seed(self.seed)

        if from_data:
            self.use_categorical = self.options.use_categorical
            # saving data
            self.data = from_data
            dataset = np.asarray(self.data.samps, dtype=np.float32)

            # split data into X and y: the last name/column is the label
            self.feature_names = self.data.names[:-1]
            self.nb_features = len(self.feature_names)

            self.X = dataset[:, 0:self.nb_features]
            self.Y = dataset[:, self.nb_features]
            self.num_class = len(set(self.Y))
            self.target_name = list(range(self.num_class))

            param_dist = {
                'n_estimators': self.options.n_estimators,
                'max_depth': self.options.maxdepth
            }

            if (self.num_class == 2):
                param_dist['objective'] = 'binary:logistic'

            self.model = XGBClassifier(**param_dist)

            # split data into train and test sets
            self.test_size = self.options.testsplit
            if (self.test_size > 0):
                self.X_train, self.X_test, self.Y_train, self.Y_test = \
                        train_test_split(self.X, self.Y, test_size=self.test_size,
                                random_state=self.seed)
            else:
                self.X_train = self.X
                self.X_test = []  # need a fix
                self.Y_train = self.Y
                self.Y_test = []  # need a fix

            # check if we have info about categorical features
            if (self.use_categorical):
                self.categorical_features = from_data.categorical_features
                self.categorical_names = from_data.categorical_names
                self.target_name = from_data.class_names

                ####################################
                # this is a set of checks to make sure that we use the same as anchor encoding
                cat_names = sorted(self.categorical_names.keys())
                assert (cat_names == self.categorical_features)
                # one single-column one-hot encoder per categorical feature
                self.encoder = {}
                for i in self.categorical_features:
                    self.encoder.update(
                        {i: OneHotEncoder(categories='auto',
                                          sparse=False)})  #,
                    self.encoder[i].fit(self.X[:, [i]])

            else:
                self.categorical_features = []
                self.categorical_names = []
                self.encoder = []

        elif from_model:
            self.load_datainfo(from_model)
            if (self.use_categorical is
                    False) and (self.options.use_categorical is True):
                print(
                    "Error: Note that the model is trained without categorical features info. Please do not use -c option for predictions"
                )
                exit()
            # load model

        elif from_encoding:
            # encoding, feature names, and number of classes
            # are read from an input file
            # NOTE(review): this branch does not assign use_categorical or
            # X_train itself, yet mapping_features() and
            # test_encoding_transformes() below read them -- confirm that
            # SMTEncoder populates them or that this path is still in use.
            enc = SMTEncoder(None, None, None, self, from_encoding)
            self.enc, self.intvs, self.imaps, self.ivars, self.feature_names, \
                    self.num_class = enc.access()

        # create the output directory if needed (replaces a bare
        # try/except around os.stat + os.mkdir)
        os.makedirs(options.output, exist_ok=True)

        self.mapping_features()
        #################
        self.test_encoding_transformes()

        # every benchmark gets its own sub-directory named after the input
        bench_name = os.path.splitext(os.path.basename(options.files[0]))[0]
        bench_dir_name = options.output + "/" + bench_name
        os.makedirs(bench_dir_name, exist_ok=True)

        # common prefix of all files produced for this configuration
        self.basename = (os.path.join(
            bench_dir_name, bench_name + "_nbestim_" +
            str(options.n_estimators) + "_maxdepth_" + str(options.maxdepth) +
            "_testsplit_" + str(options.testsplit)))

        self.modfile = self.basename + '.mod.pkl'

        self.mod_plainfile = self.basename + '.mod.txt'

        self.resfile = self.basename + '.res.txt'
        self.encfile = self.basename + '.enc.txt'
        self.expfile = self.basename + '.exp.txt'

    def form_datefile_name(self, modfile):
        data_suffix = '.splitdata.pkl'
        return modfile + data_suffix

    def pickle_save_file(self, filename, data):
        try:
            f = open(filename, "wb")
            pickle.dump(data, f)
            f.close()
        except:
            print("Cannot save to file", filename)
            exit()

    def pickle_load_file(self, filename):
        try:
            f = open(filename, "rb")
            data = pickle.load(f)
            f.close()
            return data
        except:
            print("Cannot load from file", filename)
            exit()

    def save_datainfo(self, filename):

        print("saving  model to ", filename)
        self.pickle_save_file(filename, self.model)

        filename_data = self.form_datefile_name(filename)
        print("saving  data to ", filename_data)
        samples = {}
        samples["X"] = self.X
        samples["Y"] = self.Y
        samples["X_train"] = self.X_train
        samples["Y_train"] = self.Y_train
        samples["X_test"] = self.X_test
        samples["Y_test"] = self.Y_test
        samples["feature_names"] = self.feature_names
        samples["target_name"] = self.target_name
        samples["num_class"] = self.num_class
        samples["categorical_features"] = self.categorical_features
        samples["categorical_names"] = self.categorical_names
        samples["encoder"] = self.encoder
        samples["use_categorical"] = self.use_categorical

        self.pickle_save_file(filename_data, samples)

    def load_datainfo(self, filename):
        print("loading model from ", filename)
        self.model = XGBClassifier()
        self.model = self.pickle_load_file(filename)

        datafile = self.form_datefile_name(filename)
        print("loading data from ", datafile)
        loaded_data = self.pickle_load_file(datafile)
        self.X = loaded_data["X"]
        self.Y = loaded_data["Y"]
        self.X_train = loaded_data["X_train"]
        self.X_test = loaded_data["X_test"]
        self.Y_train = loaded_data["Y_train"]
        self.Y_test = loaded_data["Y_test"]
        self.feature_names = loaded_data["feature_names"]
        self.target_name = loaded_data["target_name"]
        self.num_class = loaded_data["num_class"]
        self.nb_features = len(self.feature_names)
        self.categorical_features = loaded_data["categorical_features"]
        self.categorical_names = loaded_data["categorical_names"]
        self.encoder = loaded_data["encoder"]
        self.use_categorical = loaded_data["use_categorical"]

    def train(self, outfile=None):
        """
            Train a tree ensemble using XGBoost.
        """

        return self.build_xgbtree(outfile)

    def encode(self, test_on=None):
        """
            Encode a tree ensemble trained previously.

            The encoding and its auxiliary maps are stored on the object
            and written to self.encfile. If `test_on` is given (and
            non-empty), the encoder is additionally checked on that sample.
        """

        smt_encoder = SMTEncoder(self.model, self.feature_names,
                                 self.num_class, self)
        encoded = smt_encoder.encode()
        self.enc, self.intvs, self.imaps, self.ivars = encoded

        if test_on:
            sample = np.array(test_on)
            smt_encoder.test_sample(sample)

        smt_encoder.save_to(self.encfile)

    def explain(self,
                sample,
                use_lime=False,
                use_anchor=False,
                use_shap=False,
                expl_ext=None,
                prefer_ext=False,
                nof_feats=5):
        """
            Explain a prediction made for a given sample with a previously
            trained tree ensemble.
        """

        if use_lime:
            expl = use_lime(self,
                            sample=sample,
                            nb_samples=5,
                            nb_features_in_exp=nof_feats)
        elif use_anchor:
            expl = use_anchor(self,
                              sample=sample,
                              nb_samples=5,
                              nb_features_in_exp=nof_feats,
                              threshold=0.95)
        elif use_shap:
            expl = use_shap(self, sample=sample, nb_features_in_exp=nof_feats)
        else:
            if 'x' not in dir(self):
                self.x = SMTExplainer(self.enc, self.intvs, self.imaps,
                                      self.ivars, self.feature_names,
                                      self.num_class, self.options, self)

            expl = self.x.explain(np.array(sample), self.options.smallest,
                                  expl_ext, prefer_ext)

        # returning the explanation
        return expl

    def validate(self, sample, expl):
        """
            Make an attempt to show that a given explanation is optimistic.
        """

        # there must exist an encoding
        if 'enc' not in dir(self):
            encoder = SMTEncoder(self.model, self.feature_names,
                                 self.num_class, self)
            self.enc, _, _, _ = encoder.encode()

        if 'v' not in dir(self):
            self.v = SMTValidator(self.enc, self.feature_names, self.num_class,
                                  self)

        # try to compute a counterexample
        return self.v.validate(np.array(sample), expl)

    def transform(self, x):
        if (len(x) == 0):
            return x
        if (len(x.shape) == 1):
            x = np.expand_dims(x, axis=0)
        if (self.use_categorical):
            assert (self.encoder != [])
            tx = []
            for i in range(self.nb_features):
                self.encoder[i].drop = None
                if (i in self.categorical_features):
                    tx_aux = self.encoder[i].transform(x[:, [i]])
                    tx_aux = np.vstack(tx_aux)
                    tx.append(tx_aux)
                else:
                    tx.append(x[:, [i]])
            tx = np.hstack(tx)
            return tx
        else:
            return x

    def transform_inverse(self, x):
        if (len(x) == 0):
            return x
        if (len(x.shape) == 1):
            x = np.expand_dims(x, axis=0)
        if (self.use_categorical):
            assert (self.encoder != [])
            inverse_x = []
            for i, xi in enumerate(x):
                inverse_xi = np.zeros(self.nb_features)
                for f in range(self.nb_features):
                    if f in self.categorical_features:
                        nb_values = len(self.categorical_names[f])
                        v = xi[:nb_values]
                        v = np.expand_dims(v, axis=0)
                        iv = self.encoder[f].inverse_transform(v)
                        inverse_xi[f] = iv
                        xi = xi[nb_values:]

                    else:
                        inverse_xi[f] = xi[0]
                        xi = xi[1:]
                inverse_x.append(inverse_xi)
            return inverse_x
        else:
            return x

    def transform_inverse_by_index(self, idx):
        if (idx in self.extended_feature_names):
            return self.extended_feature_names[idx]
        else:
            print("Warning there is no feature {} in the internal mapping".
                  format(idx))
            return None

    def transform_by_value(self, feat_value_pair):
        if (feat_value_pair in self.extended_feature_names.values()):
            keys = (list(self.extended_feature_names.keys())[list(
                self.extended_feature_names.values()).index(feat_value_pair)])
            return keys
        else:
            print(
                "Warning there is no value {} in the internal mapping".format(
                    feat_value_pair))
            return None

    def mapping_features(self):
        self.extended_feature_names = {}
        self.extended_feature_names_as_array_strings = []
        counter = 0
        if (self.use_categorical):
            for i in range(self.nb_features):
                if (i in self.categorical_features):
                    for j, _ in enumerate(self.encoder[i].categories_[0]):
                        self.extended_feature_names.update(
                            {counter: (self.feature_names[i], j)})
                        self.extended_feature_names_as_array_strings.append(
                            "f{}_{}".format(
                                i, j))  # str(self.feature_names[i]), j))
                        counter = counter + 1
                else:
                    self.extended_feature_names.update(
                        {counter: (self.feature_names[i], None)})
                    self.extended_feature_names_as_array_strings.append(
                        "f{}".format(i))  #(self.feature_names[i])
                    counter = counter + 1
        else:
            for i in range(self.nb_features):
                self.extended_feature_names.update(
                    {counter: (self.feature_names[i], None)})
                self.extended_feature_names_as_array_strings.append(
                    "f{}".format(i))  #(self.feature_names[i])
                counter = counter + 1

    def readable_sample(self, x):
        readable_x = []
        for i, v in enumerate(x):
            if (i in self.categorical_features):
                readable_x.append(self.categorical_names[i][int(v)])
            else:
                readable_x.append(v)
        return np.asarray(readable_x)

    def test_encoding_transformes(self):
        # test encoding

        X = self.X_train[[0], :]

        print("Sample of length", len(X[0]), " : ", X)
        enc_X = self.transform(X)
        print("Encoded sample of length", len(enc_X[0]), " : ", enc_X)
        inv_X = self.transform_inverse(enc_X)
        print("Back to sample", inv_X)
        print("Readable sample", self.readable_sample(inv_X[0]))
        assert ((inv_X == X).all())

        if (self.options.verb > 1):
            for i in range(len(self.extended_feature_names)):
                print(i, self.transform_inverse_by_index(i))
            for key, value in self.extended_feature_names.items():
                print(value, self.transform_by_value(value))

    def transfomed_sample_info(self, i):
        print(enc.categories_)

    def build_xgbtree(self, outfile=None):
        """
            Build an ensemble of trees.

            Fits self.model on the transformed training data, saves the
            model and data (save_datainfo), dumps the plain-text model,
            checks that TreeEnsemble reproduces the model's probabilities
            on the first ten training samples, and writes a one-line
            summary to self.resfile.

            `outfile` is the path for the pickled model; it defaults to
            self.modfile.

            Returns (train_accuracy, test_accuracy, model); test accuracy
            is 0 when no test results are available.
        """

        if (outfile is None):
            outfile = self.modfile
        else:
            # was `sefl.`, a NameError whenever an outfile was supplied
            self.datafile = self.form_datefile_name(outfile)

        # fit model on training data

        if (len(self.X_test) > 0):
            eval_set = [(self.transform(self.X_train), self.Y_train),
                        (self.transform(self.X_test), self.Y_test)]
        else:
            eval_set = [(self.transform(self.X_train), self.Y_train)]

        print("start xgb")
        self.model.fit(
            self.transform(self.X_train),
            self.Y_train,
            eval_set=eval_set,
            verbose=self.options.verb)  # eval_set=[(X_test, Y_test)],
        print("end xgb")

        evals_result = self.model.evals_result()
        ########## saving model
        self.save_datainfo(outfile)
        print("saving plain model to ", self.mod_plainfile)
        self.model._Booster.dump_model(self.mod_plainfile)

        ensemble = TreeEnsemble(self.model,
                                self.extended_feature_names_as_array_strings,
                                nb_classes=self.num_class)

        # the re-implemented ensemble must agree with the trained model
        y_pred_prob = self.model.predict_proba(
            self.transform(self.X_train[:10]))
        y_pred_prob_compute = ensemble.predict(
            self.transform(self.X_train[:10]), self.num_class)

        assert (np.absolute(y_pred_prob_compute - y_pred_prob).sum() <
                0.01 * len(y_pred_prob))

        def final_accuracy(split):
            """Accuracy (1 - last error, 2 decimals) recorded for `split`."""
            history = evals_result[split]
            # 'merror' is tried first, then 'error'
            for metric in ('merror', 'error'):
                if metric in history:
                    return round(1 - history[metric][-1], 2)
            raise KeyError('merror/error')

        ### accuracy
        try:
            train_accuracy = final_accuracy('validation_0')
        except (KeyError, IndexError) as err:
            # same exception type as the former `assert (False)`, but not
            # stripped under -O
            raise AssertionError(
                "no error metric recorded for the training data") from err

        try:
            test_accuracy = final_accuracy('validation_1')
        except (KeyError, IndexError):
            print("no results test data")
            test_accuracy = 0

        #### saving
        print("saving results to ", self.resfile)
        # the with-block closes the file; "\\hline" avoids the invalid
        # "\h" escape while producing the same text
        with open(self.resfile, 'w') as f:
            f.write("{} & {} & {} &{}  &{} & {} \\\\ \n \\hline \n".format(
                os.path.basename(self.options.files[0]).replace("_", "-"),
                train_accuracy, test_accuracy, self.options.n_estimators,
                self.options.maxdepth, self.options.testsplit))

        print("Train accuracy: %.2f%%" % (train_accuracy * 100.0))
        print("Test accuracy: %.2f%%" % (test_accuracy * 100.0))

        return train_accuracy, test_accuracy, self.model
    is_final, args.model, args.scale, args.drop, args.remarks
    or args.neighbors)

# Validation report: only produced when not training the final model.
if not args.final:
    eval_predicted_proba = model.predict_proba(eval_data)
    eval_predicted = model.predict(eval_data)
    # Splits into classes from 0-10 (11 classes)
    onehot = to_categorical(eval_labels).astype(int)
    eval_onehot = onehot[:, 1:]  # Trim unnecessary first column (class "0")
    ll = log_loss(eval_onehot, eval_predicted_proba)
    acc = accuracy_score(eval_labels, eval_predicted)
    print("Validation log-loss and accuracy: {:.5f} {:.5f}".format(ll, acc))

    # Plot
    if args.model in ["XGBoost"]:
        # validation_0 / validation_1 are the first / second eval sets
        # given at fit time; plotted below as train / test
        train_metrics = model.evals_result()['validation_0']
        test_metrics = model.evals_result()['validation_1']
        epochs = len(train_metrics['merror'])
        x_axis = range(0, epochs)
        # plot log loss
        fig, ax = plt.subplots()
        ax.plot(x_axis, train_metrics['mlogloss'], label='Train')
        ax.plot(x_axis, test_metrics['mlogloss'], label='Test')
        ax.legend()
        plt.ylabel('Log Loss')
        plt.title('{} - Log Loss'.format(args.model))
        plt.savefig("img/logloss_{}.png".format(uid))
        plt.show()
        # plot classification error
        fig, ax = plt.subplots()
        ax.plot(x_axis, train_metrics['merror'], label='Train')
# Print the wall-clock time at which this cell starts.
currentDT = datetime.datetime.now()
print(currentDT.strftime("%I:%M:%S %p"))

# Features / target and a hold-out split for validation.
X = x_train3[model_features]
y = x_train3.passed
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)


# `cv=10` was dropped: XGBClassifier has no such parameter, so it had no
# effect on training (cross-validation needs e.g. sklearn's cross_val_score).
xgbc_model = XGBClassifier(n_estimators=2500, max_depth=6, learning_rate=.01, n_jobs=-1)
xgbc_model.fit(X_train, y_train,
                    eval_set=[(X_valid, y_valid)],
                    eval_metric='logloss',
                    verbose=False)
prediction = xgbc_model.predict_proba(X_valid)
result = xgbc_model.evals_result()
xgbcloss = log_loss(y_valid, prediction)

# Elapsed minutes since `start_time` (set elsewhere) and the validation loss.
print((time.time() - start_time)/60,': ', f'log loss: {xgbcloss:.3f}')

    
# Best so far: .334-.337 with n_est: 1450, learn_r: .02
# Best so far (7/14): .237-.241 with updated violations_score
# 35050, 0.18083770657518244
# 35100, 0.18083742930751404
# Best so far (7/30): .146-.148 with added one hot encode on extreme words, 
# .122-.124 with adjusted percentage of split
# Back to .243


# In[33]:
Esempio n. 14
0
xgb = XGBClassifier(n_estimators=100, learning_rate=0.1)

# Track three metrics on both splits while training.
tracked_metrics = ["logloss", "rmse", "auc"]
xgb.fit(x_train,
        y_train,
        eval_set=[(x_train, y_train), (x_test, y_test)],
        eval_metric=tracked_metrics,
        early_stopping_rounds=100,
        verbose=True)
# other metric names: rmse, mae, logloss, error, auc

y_pre = xgb.predict(x_test)

score = xgb.score(x_test, y_test)
r2 = r2_score(y_test, y_pre)
results = xgb.evals_result()

print(__file__)
print(results)
for caption, value in (("r2", r2), ("score", score)):
    print(caption)
    print(value)

# Log-loss curves: validation_0 is the train split, validation_1 the test one.
fig, ax = plt.subplots()

train_curve = results["validation_0"]["logloss"]
epochs = len(train_curve)
x_axis = range(epochs)
ax.plot(x_axis, train_curve, label="Train")
ax.plot(x_axis, results["validation_1"]["logloss"], label="Test")
ax.legend()
Esempio n. 15
0
# eval set

from xgboost import XGBClassifier, XGBRegressor
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import r2_score, accuracy_score

# Wine data set, 80/20 shuffled split.
x, y = load_wine(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=66)

# The metric is set on the estimator; fit() only receives the eval sets.
model = XGBClassifier(n_estimators=500,
                      learning_rate=0.01,
                      n_jobs=8,
                      eval_metric='mlogloss')

model.fit(x_train,
          y_train,
          eval_set=[(x_train, y_train), (x_test, y_test)],
          verbose=1)

aaa = model.score(x_test, y_test)
print(aaa)

y_pred = model.predict(x_test)

acc = accuracy_score(y_test, y_pred)
print("acc :",acc)
print("=" * 30)

# Per-round mlogloss history of both eval sets.
results = model.evals_result()
print(results)

# 1.0
# acc : 1.0
def train_and_generate_model():
    """Build the training matrices, fit an XGBoost classifier and save it.

    Steps, in order:

    1. Open a timestamped ``./train_progress_log_*.txt`` file.
    2. Load the feature/label lists from ``./tr_input_mat.pickle`` and
       ``./tr_angle_mat.pickle`` when both exist; otherwise compute them
       from ``exchange_rates`` and ``reverse_exchange_rates`` and write
       both pickle files.
    3. Slice the lists into training (and optionally validation) arrays,
       which are published through the module-level globals
       ``tr_input_arr``, ``tr_angle_arr``, ``val_input_arr`` and
       ``val_angle_arr``.
    4. If ``is_param_tune_with_optuna`` is set, run an Optuna study with the
       module-level ``opt`` objective, log the outcome through
       ``logfile_writeln_opt`` and terminate the process with ``exit()``.
    5. Otherwise fit an ``XGBClassifier``, write ``./xgb.model`` and
       ``./xgb_model.raw.txt``, and log the per-round ``error`` metric and
       the feature importances.

    The training log file is closed on every exit path, including the
    ``exit()`` in the tuning branch and any exception.
    """
    global log_fd_opt
    global tr_input_arr
    global tr_angle_arr
    global val_input_arr
    global val_angle_arr

    input_cache_path = "./tr_input_mat.pickle"
    angle_cache_path = "./tr_angle_mat.pickle"

    def build_feature_row(rates, idx):
        """Return the 17 input features computed from ``rates`` at ``idx``."""
        return [
            rates[idx],
            (rates[idx] - rates[idx - 1]) / rates[idx - 1],
            get_rsi(rates, idx),
            get_ma(rates, idx),
            get_ma_kairi(rates, idx),
            get_bb_1(rates, idx),
            get_bb_2(rates, idx),
            get_ema(rates, idx),
            get_ema_rsi(rates, idx),
            get_cci(rates, idx),
            get_mo(rates, idx),
            get_lw(rates, idx),
            get_ss(rates, idx),
            get_dmi(rates, idx),
            get_vorarity(rates, idx),
            get_macd(rates, idx),
            str(judge_chart_type(rates[idx - CHART_TYPE_JDG_LEN:idx])),
        ]

    data_len = len(exchange_rates)

    log_fd_tr = open("./train_progress_log_" +
                     dt.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt",
                     mode="w")

    def logfile_writeln_tr(log_str):
        """Append one line to the training log and flush it immediately."""
        log_fd_tr.write(log_str + "\n")
        log_fd_tr.flush()

    try:
        print("data size of rates: " + str(data_len))
        print("num of rate datas for tarin: " +
              str(COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR))
        print("input features sets for tarin: " +
              str(COMPETITION_TRAIN_DATA_NUM))

        logfile_writeln_tr("data size of rates: " + str(data_len))
        logfile_writeln_tr("num of rate datas for tarin: " +
                           str(COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR))

        tr_input_mat = []
        tr_angle_mat = []

        # Both cache files are needed; if either one is missing, rebuild
        # (and rewrite) both instead of failing on the second open().
        if (os.path.exists(input_cache_path)
                and os.path.exists(angle_cache_path)):
            # NOTE: pickle.load can execute arbitrary code stored in the
            # file; these paths must only ever hold caches written below.
            with open(input_cache_path, 'rb') as f:
                tr_input_mat = pickle.load(f)
            with open(angle_cache_path, 'rb') as f:
                tr_angle_mat = pickle.load(f)
        else:
            for i in range(DATA_HEAD_ASOBI,
                           len(exchange_rates) - DATA_HEAD_ASOBI - OUTPUT_LEN,
                           SLIDE_IDX_NUM_AT_GEN_INPUTS_AND_COLLECT_LABELS):
                # Every index yields two samples: one from the original
                # series and one from the reversed series, in that order.
                for rates in (exchange_rates, reverse_exchange_rates):
                    tr_input_mat.append(build_feature_row(rates, i))
                    # Label is 1 when the rate OUTPUT_LEN steps ahead is
                    # not lower than the current rate, else 0.
                    diff = rates[i + OUTPUT_LEN] - rates[i]
                    tr_angle_mat.append(1 if diff >= 0 else 0)

            with open(input_cache_path, 'wb') as f:
                pickle.dump(tr_input_mat, f)
            with open(angle_cache_path, 'wb') as f:
                pickle.dump(tr_angle_mat, f)

        tr_input_arr = np.array(tr_input_mat[0:COMPETITION_TRAIN_DATA_NUM])
        tr_angle_arr = np.array(tr_angle_mat[0:COMPETITION_TRAIN_DATA_NUM])

        # The validation slice starts right after the training slice and
        # covers VALIDATION_DATA_RATIO of the remaining samples.
        split_idx = COMPETITION_TRAIN_DATA_NUM + int(
            (len(tr_input_mat) - COMPETITION_TRAIN_DATA_NUM) *
            VALIDATION_DATA_RATIO)

        if VALIDATION_DATA_RATIO != 0.0:
            val_input_arr = np.array(
                tr_input_mat[COMPETITION_TRAIN_DATA_NUM:split_idx])
            val_angle_arr = np.array(
                tr_angle_mat[COMPETITION_TRAIN_DATA_NUM:split_idx])
            watchlist = [(tr_input_arr, tr_angle_arr),
                         (val_input_arr, val_angle_arr)]
        else:
            watchlist = [(tr_input_arr, tr_angle_arr)]

        start = time.time()

        if is_param_tune_with_optuna:
            # log_fd_opt is a module-level global; logfile_writeln_opt is
            # defined elsewhere in this file.
            log_fd_opt = open("./tune_progress_log_" +
                              dt.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt",
                              mode="w")
            try:
                if is_use_db_at_tune:
                    study = optuna.Study(study_name='fxsystrade',
                                         storage='sqlite:///../fxsystrade.db')
                else:
                    study = optuna.create_study()

                parallel_num = RAPTOP_THREAD_NUM * 2
                if is_colab_cpu or is_exec_at_mba:
                    parallel_num = COLAB_CPU_AND_MBA_THREAD_NUM * 2
                if special_optuna_parallel_num != -1:
                    parallel_num = special_optuna_parallel_num

                study.optimize(opt,
                               n_trials=OPTUNA_TRIAL_NUM,
                               n_jobs=parallel_num)

                process_time = time.time() - start

                logfile_writeln_opt("best_params: " + str(study.best_params))
                logfile_writeln_opt("best_value: " + str(study.best_value))
                logfile_writeln_opt("best_trial: " + str(study.best_trial))
                logfile_writeln_opt("excecution time of tune: " +
                                    str(process_time))
            finally:
                # close() flushes pending data before releasing the handle.
                log_fd_opt.close()

            # Tuning runs never continue into training; the enclosing
            # finally still closes the training log.
            exit()

        param = {}
        n_thread = RAPTOP_THREAD_NUM

        if is_use_gpu:
            param['tree_method'] = 'gpu_hist'
            param['max_bin'] = 16
            param['gpu_id'] = 0
            n_thread = COLAB_CPU_AND_MBA_THREAD_NUM

        if is_colab_cpu or is_exec_at_mba:
            n_thread = COLAB_CPU_AND_MBA_THREAD_NUM

        logfile_writeln_tr("training parameters are below...")
        logfile_writeln_tr(str(param))
        logfile_writeln_tr("num_round: " + str(NUM_ROUND))

        clf = XGBClassifier(max_depth=MAX_DEPTH,
                            random_state=42,
                            n_estimators=NUM_ROUND,
                            min_child_weight=18,
                            subsample=0.9,
                            colsample_bytree=0.6,
                            eta=ETA,
                            objective='binary:logistic',
                            # Requested explicitly because the logging loop
                            # below reads the 'error' history.
                            eval_metric='error',
                            verbosity=0,
                            # The scikit-learn wrapper's thread-count
                            # parameter is n_jobs; the previous "n_thread"
                            # keyword is not a recognised parameter.
                            n_jobs=n_thread,
                            **param)

        # Per-round console output is disabled on GPU / Colab CPU runs.
        verbosity = not (is_use_gpu or is_colab_cpu)

        clf.fit(tr_input_arr,
                tr_angle_arr,
                eval_set=watchlist,
                verbose=verbosity)

        process_time = time.time() - start
        logfile_writeln_tr("excecution time of training: " +
                           str(process_time))

        clf.save_model('./xgb.model')
        booster = clf.get_booster()
        booster.dump_model('./xgb_model.raw.txt')

        # One CSV line per boosting round: round index, training error and,
        # when a validation set was used, validation error.
        eval_result_dic = clf.evals_result()
        train_errors = eval_result_dic['validation_0']['error']
        if VALIDATION_DATA_RATIO != 0.0:
            val_errors = eval_result_dic['validation_1']['error']
            for ii, tr_err in enumerate(train_errors):
                logfile_writeln_tr(
                    str(ii) + "," + str(tr_err) + "," + str(val_errors[ii]))
        else:
            for ii, tr_err in enumerate(train_errors):
                logfile_writeln_tr(str(ii) + "," + str(tr_err))

        # Feature Importance
        fti = clf.feature_importances_
        logfile_writeln_tr('Feature Importances:')
        for i, feat in enumerate(FEATURE_NAMES):
            logfile_writeln_tr('\t{0:20s} : {1:>.6f}'.format(feat, fti[i]))
    finally:
        # Runs on normal completion, on exit() and on errors alike.
        log_fd_tr.close()

    print("finished training and saved model.")
    #     # fpr, tpr, _ = roc_curve(train.target, pred)
    #     fpr, tpr, sara = roc_curve(train.target, pred)
    #     plt.plot(fpr, tpr, label='BDT', color='b')
    #
    #     plt.legend(loc='best')
    #     plt.grid()
    #     plt.title('ROC')
    #     plt.tight_layout()
    #     plt.savefig('results/roc_train_%s.pdf' %(tag))

    ##########################################################################################
    #####   OVERTRAINING SCORE
    ##########################################################################################
    # Start from an empty matplotlib figure.
    plt.clf()

    # Per-iteration 'auc' histories recorded by the fitted model for its
    # first ('validation_0') and second ('validation_1') evaluation sets.
    auc_train = clf.evals_result()['validation_0']['auc']
    auc_test = clf.evals_result()['validation_1']['auc']

    # One x position per recorded iteration: 0 .. len(auc_train) - 1.
    n_estimators = np.arange(len(auc_train))

    plt.plot(n_estimators, auc_train, color='r', label='AUC train')
    plt.plot(n_estimators, auc_test, color='b', label='AUC test')

    plt.xlabel('# tree')
    plt.ylabel('Area Under ROC')

    # NOTE(review): the x values start at 0, which has no position on a
    # logarithmic axis - confirm the first point is drawn as intended.
    plt.xscale('log')
    plt.grid()

    # plt.xlim([1, 1000])
    # plt.ylim([0.985, 1.0])
Esempio n. 18
0
def main():

  # Start timer
  t_start = time.time()

  # Command line options
  parser = argparse.ArgumentParser()
  group_model = parser.add_mutually_exclusive_group() 
  group_model.add_argument('-x', '--xgboost', action='store_true', help='Run gradient BDT')
  group_model.add_argument('-n', '--nn', action='store_true', help='Run neural network')
  group_model.add_argument('-p', '--prepare_hdf5', type=str, nargs='?', default='', help='Prepare input datasets for ML and store in HDF5 file; options: "2L2J" or "2L3J+"')
  group_read_dataset = parser.add_mutually_exclusive_group() 
  group_read_dataset.add_argument('-r', '--read_hdf5', action='store_true', help='Read prepared datasets from HDF5 file')
  #group_read_dataset.add_argument('-d', '--direct_read', action='store_true', help='Read unprepared datasets from ROOT file')
  parser.add_argument('-l', '--load_pretrained_model', action='store_true', help='Load pre-trained classifier model, i.e. only run on test data')
  #parser.add_argument('-B', '--N_sig_events', type=lambda x: int(float(x)), default=0, help='Number of signal events to read from the dataset')
  #parser.add_argument('-S', '--N_bkg_events', type=lambda x: int(float(x)), default=0, help='Number of background events to read from the dataset for each class')
  parser.add_argument('-s', '--signal_region', type=str, nargs='?', default='int', help='Choose signal region: low-2J, int-2J, high-2J, low-3J+, int-3J+, high-3J+')
  parser.add_argument('-b', '--balanced', type=int, nargs='?', default=-1, help='Balance dataset for training; 0: oversample signal, 1: undersample background')
  parser.add_argument('-m', '--multiclass', action='store_true', help='Use multiple background classes in addition to the signal class')
  parser.add_argument('-w', '--event_weight', action='store_true', help='Apply event weights during training')
  parser.add_argument('-c', '--class_weight', action='store_true', help='Apply class weights to account for unbalanced dataset')
  parser.add_argument('-t', '--do_train', action='store_true', help='Train the classifier')
  parser.add_argument('-T', '--do_test', action='store_true', help='Test the classifier on data it has not been trained on')
  parser.add_argument('-e', '--train_even', action='store_true', help='Use even run numbers for training and odd run numbers for testing')
  parser.add_argument('-o', '--train_odd', action='store_true', help='Use odd run numbers for training and even run numbers for testing')
  parser.add_argument('-C', '--doCV', action='store_true', help='Perform a k-fold cross-validation on the training set during training')
  parser.add_argument('-O', '--oversample', action='store_true', help='Balance imbalanced dataset using oversampling')
  parser.add_argument('-U', '--undersample', action='store_true', help='Balance imbalanced dataset using undersampling')
  parser.add_argument('--n_nodes', type=int, nargs='?', default=20, help='Number of nodes in each hidden neural network layer')
  parser.add_argument('--n_hidden_layers', type=int, nargs='?', default=1, help='Number of nodes in each hidden neural network layer')
  parser.add_argument('--dropout', type=float, nargs='?', default=0., help='Use dropout regularization on neural network layers to reduce overfitting')
  parser.add_argument('--L1', type=float, nargs='?', default=0., help='Use L1 regularization on neural network weights to reduce overfitting')
  parser.add_argument('--L2', type=float, nargs='?', default=0., help='Use L2 regularization (weights decay) on neural network weights to reduce overfitting')
  parser.add_argument('--lr', type=float, nargs='?', default=0.001, help='Set learning rate for the neural network or BDT optimizer')
  parser.add_argument('--batch_size', type=int, nargs='?', default=32, help='Number of events to use for each weight update')
  parser.add_argument('--epochs', type=lambda x: int(float(x)), nargs='?', default=1, help='Number of passes through the training set')
  parser.add_argument('--max_depth', type=int, nargs='?', default=3, help='Maximum tree depth for BDT')
  parser.add_argument('--n_estimators', type=lambda x: int(float(x)), nargs='?', default=100, help='Number of trees in BDT ensemble')
  parser.add_argument('--gamma', type=float, nargs='?', default=0, help='Minimum loss reduction required to make a further partition on a leaf node of the XGBoost tree')
  parser.add_argument('--min_child_weight', type=float, nargs='?', default=1, help='Minimum sum of instance weight(hessian) needed in a child')
  parser.add_argument('--max_delta_step', type=float, nargs='?', default=0, help='Maximum delta step we allow each tree’s weight estimation to be')
  parser.add_argument('--subsample', type=float, nargs='?', default=1, help='Subsample ratio of the training instance')
  parser.add_argument('--colsample_bytree', type=float, nargs='?', default=1, help='Subsample ratio of columns when constructing each tree')
  parser.add_argument('--colsample_bylevel', type=float, nargs='?', default=1, help='Subsample ratio of columns for each level')
  parser.add_argument('--colsample_bynode', type=float, nargs='?', default=1, help='Subsample ratio of columns for each node')
  parser.add_argument('-G', '--doGridSearchCV', action='store_true', help='Perform a grid search for optimal hyperparameter values using cross-validation')
  parser.add_argument('-V', '--plot_validation_curve', action='store_true', help='Calculate and plot perforance score as function of number of training events')
  parser.add_argument('-L', '--plot_learning_curve', action='store_true', help='Calculate and plot perforance score for different values of a chosen hyperparameter')
  args = parser.parse_args()

  # Set which sample types to prepare HDF5s for
  use_sig = 1
  use_bkg = 1
  use_data = 0

  # Where to put preprocessed datasets
  preproc_dir = 'preprocessed_datasets/'
  preproc_suffix = ''
  if args.prepare_hdf5:
    preproc_suffix = '_group_{}_preprocessed.h5'.format(args.prepare_hdf5)
  elif '2J' in args.signal_region:
    preproc_suffix = '_group_2L2J_preprocessed.h5'
  elif '3J+' in args.signal_region:
    preproc_suffix = '_group_2L3J+_preprocessed.h5'
  filename_sig_low_preprocessed = preproc_dir + 'sig_low' + preproc_suffix
  filename_sig_int_preprocessed = preproc_dir + 'sig_int' + preproc_suffix
  filename_sig_high_preprocessed = preproc_dir + 'sig_high' + preproc_suffix
  filename_sig_preprocessed = filename_sig_low_preprocessed
  filename_bkg_preprocessed = preproc_dir + 'bkg' + preproc_suffix
  filename_data_preprocessed = preproc_dir + 'data' + preproc_suffix

  # Where to put output
  output_dir = 'output/'
  #trained_model_dir = 'trained_models/'
  trained_model_dir = output_dir
  trained_model_xgb_suffix = '2LJets_trained_model.joblib'
  trained_model_nn_suffix = '2LJets_trained_model.h5'

  # Counters
  n_events_read = n_events_kept = 0
  n_events_read_sample = n_events_kept_sample = 0
  n_events_read_sample_type = n_events_kept_sample_type = 0

  if args.xgboost:
    output_dir += 'xgboost/latest/xgb_'
    trained_model_dir += 'xgboost/latest/xgb_'
  elif args.nn:
    output_dir += 'neural_network/latest/nn_'
    trained_model_dir += 'neural_network/latest/nn_'

  if 'low' in args.signal_region:
    output_dir += 'low_'
    trained_model_dir += 'low_'
  elif 'int' in args.signal_region:
    output_dir += 'int_'
    trained_model_dir += 'int_'
  elif 'high' in args.signal_region:
    output_dir += 'high_'
    trained_model_dir += 'high_'

  if args.train_even:
    output_dir += 'trainEven_'
    trained_model_dir += 'trainEven_'
  elif args.train_odd:
    output_dir += 'trainOdd_'
    trained_model_dir += 'trainOdd_'

  if args.xgboost:
    trained_model_path = trained_model_dir + trained_model_xgb_suffix
  elif args.nn:
    trained_model_path = trained_model_dir + trained_model_nn_suffix

  global df_sig_feat, df_bkg_feat, df_data_feat

  l_sig = []
  if use_sig:
    if 'low' in args.signal_region:
      l_sig = d_sig['low']
      filename_sig_preprocessed = filename_sig_low_preprocessed
    elif 'int' in args.signal_region:
    #elif args.signal_region == 'int':
      l_sig = d_sig['int']
      filename_sig_preprocessed = filename_sig_int_preprocessed
    elif 'high' in args.signal_region:
      l_sig = d_sig['high']
      filename_sig_preprocessed = filename_sig_high_preprocessed

    d_sig_infile = {'low': filename_sig_low_preprocessed, 
                    'int': filename_sig_int_preprocessed, 
                    'high': filename_sig_high_preprocessed}

  class Logger(object):
      """Tee-style stand-in for ``sys.stdout``.

      Every message written to an instance is forwarded to the stream that
      was ``sys.stdout`` at construction time and also appended to the
      file ``output_dir + ".log"``, which is opened for writing (and thus
      truncated) when the instance is created.
      """

      def __init__(self):
          # Remember the real stdout so console output is still shown.
          self.terminal = sys.stdout
          self.log = open(output_dir+".log", "w")

      def write(self, message):
          """Write ``message`` to the console stream and to the log file."""
          self.terminal.write(message)
          self.log.write(message)

      def flush(self):
          """Flush the console stream and, if still open, the log file.

          File-like objects are expected to provide ``flush``; forwarding
          it (instead of ignoring it) makes ``print(..., flush=True)`` and
          explicit ``sys.stdout.flush()`` calls reach both destinations.
          """
          self.terminal.flush()
          # Guard against a flush request arriving after the log was closed.
          if not self.log.closed:
              self.log.flush()

  sys.stdout = Logger()

  if args.prepare_hdf5:
    """Read input dataset in chunks, select features and perform cuts,
    before storing DataFrame in HDF5 file"""

    # Prepare and store signal dataset
    if use_sig:
      prepareHDF5(filename_sig_low_preprocessed, d_sig['low'], sample_type='sig', selection=args.prepare_hdf5, chunk_size=1e5, n_chunks=None, entrystart=0)
      prepareHDF5(filename_sig_int_preprocessed, d_sig['int'], sample_type='sig', selection=args.prepare_hdf5, chunk_size=1e5, n_chunks=None, entrystart=0)
      prepareHDF5(filename_sig_high_preprocessed, d_sig['high'], sample_type='sig', selection=args.prepare_hdf5, chunk_size=1e5, n_chunks=None, entrystart=0)

    # Prepare and store background dataset
    if use_bkg:
      prepareHDF5(filename_bkg_preprocessed, l_bkg, sample_type='bkg', selection=args.prepare_hdf5, chunk_size=1e6, n_chunks=None, entrystart=0)
      #prepareHDF5(filename_bkg_preprocessed, l_bkg, sample_type='bkg', selection=args.prepare_hdf5, chunk_size=1e4, n_chunks=1, entrystart=0)

    # Prepare and store real dataset
    if use_data:
      prepareHDF5(filename_data_preprocessed, l_data, sample_type='data', selection=args.prepare_hdf5, chunk_size=1e5, n_chunks=None, entrystart=0)

    return

  elif args.read_hdf5:

    if use_sig:
      # Read in preprocessed signal DataFrame from HDF5 file
      df_sig_feat = pd.DataFrame({})

      for key_sig, value_sig_infile in d_sig_infile.items():
        if key_sig in args.signal_region:
          print("\nReading in file:", value_sig_infile)
          sig_store = pd.HDFStore(value_sig_infile)
          for i_sig in sig_store.keys(): #d_sig[key_sig]:
            if len(df_sig_feat) is 0:
              df_sig_feat = sig_store[i_sig]#.astype('float64')
              df_sig_feat['group'] = i_sig
            else:
              df_sig_sample = sig_store[i_sig]#.astype('float64')
              df_sig_sample['group'] = i_sig
              df_sig_feat = df_sig_feat.append(df_sig_sample)

      if 'mTl3' in df_sig_feat:
        df_sig_feat.drop(columns='mTl3', inplace=True)

      print("\ndf_sig_feat.head():\n", df_sig_feat.head())
      sig_store.close()
      print("Closed store")

    if use_bkg:
      # Read in preprocessed background DataFrame from HDF5 file
      df_bkg_feat = pd.DataFrame({})

      print("\nReading in file:", filename_bkg_preprocessed)
      bkg_store = pd.HDFStore(filename_bkg_preprocessed)
      for i_bkg in bkg_store.keys(): #l_bkg:
        if len(df_bkg_feat) is 0:
          df_bkg_feat = bkg_store[i_bkg]#.astype('float64')
          df_bkg_feat['group'] = i_bkg
        else:
          df_bkg_sample = bkg_store[i_bkg]#.astype('float64')
          df_bkg_sample['group'] = i_bkg
          df_bkg_feat = df_bkg_feat.append(df_bkg_sample)

      if 'mTl3' in df_bkg_feat:
        df_bkg_feat.drop(columns='mTl3', inplace=True)

      print("\ndf_bkg_feat.head():\n", df_bkg_feat.head())
      bkg_store.close()
      print("Closed store")

    if use_data:
      # Read in preprocessed DataFrame of real data from HDF5 file
      data_store = pd.HDFStore(filename_data_preprocessed)
      df_data_feat = data_store['data']
      print("\ndf_data_feat.head():\n", df_data_feat.head())
      data_store.close()
      print("Closed store")

  elif args.direct_read:
    """Read the input dataset for direct use, without reading in chunks
    and storing to output file"""

    print("Not available at the moment")
    return

    #entry_start = 0
    #sig_entry_stop = 1e4
    #bkg_entry_stop = 1e4

    ## import signal dataset
    #df_sig = importOpenData(sample_type="sig", entrystart=entry_start, entrystop=sig_entry_stop)
    #df_sig = shuffle(df_sig)  # shuffle the rows/events
    #df_sig_feat = selectFeatures(df_sig, l_features)
    #df_sig_feat = df_sig_feat*1  # multiplying by 1 to convert booleans to integers
    #df_sig_feat["eventweight"] = getEventWeights(df_sig, l_eventweights)

    ## import background dataset
    #df_bkg = importOpenData(sample_type="bkg", entrystart=entry_start, entrystop=bkg_entry_stop)
    #df_bkg = shuffle(df_bkg)  # shuffle the rows/events
    #df_bkg_feat = selectFeatures(df_bkg, l_features)
    #df_bkg_feat = df_bkg_feat*1  # multiplying by 1 to convert booleans to integers
    #df_bkg_feat["eventweight"] = getEventWeights(df_bkg, l_eventweights)

    ## import data
    ##df_data = importOpenData(sample_type="data", entrystart=entry_start, entrystop=entry_stop)

  if 'low' in args.signal_region:
    print('\nBefore xsec correction: df_sig_feat.query("DatasetNumber == 396210").loc[:,"eventweight"]\n', df_sig_feat.query("DatasetNumber == 396210").loc[:,"eventweight"].head())
    df_sig_feat.loc[df_sig_feat.DatasetNumber==396210,'eventweight'] = df_sig_feat.loc[df_sig_feat.DatasetNumber==396210,'eventweight'] * 0.08836675497457203
    print('\nAfter xsec correction: df_sig_feat.query("DatasetNumber == 396210").loc[:,"eventweight"]\n', df_sig_feat.query("DatasetNumber == 396210").loc[:,"eventweight"].head())

  # Preselection cuts
  l_presel = ['met_Sign > 2', 'mt2leplsp_0 > 10']
  #df_sig_feat.query('&'.join(l_presel), inplace=True)

  print("\n======================================")
  print("df_sig_feat.shape =", df_sig_feat.shape)
  print("df_bkg_feat.shape =", df_bkg_feat.shape)
  print("======================================")

  # make array of features
  df_X = pd.concat([df_bkg_feat, df_sig_feat], axis=0)#, sort=False)

  print("\ndf_X.isna().sum().sum()", df_X.isna().sum().sum())

  #print("\ndf_X.dtypes", df_X.dtypes)
  #col_float32 = (df_X.dtypes == 'float32').values
  #df_X.iloc[:, col_float32] = df_X.iloc[:, col_float32].astype('float64')
  #print("\nAfter converting all columns to float64:\ndf_X.dtypes", df_X.dtypes)

  # make array of labels
  y_bkg = np.zeros(len(df_bkg_feat))
  y_sig = np.ones(len(df_sig_feat))
  y = np.concatenate((y_bkg, y_sig), axis=0).astype(int)
  df_X['ylabel'] = y

  if args.multiclass:
    df_X.loc[df_X.group=='Zjets', 'ylabel'] = 2
    df_X.loc[df_X.group=='diboson', 'ylabel'] = 3
    df_X = df_X.query('group=="diboson" | group=="Zjets" | ylabel==1')
    Y = df_X.ylabel
    # encode class values as integers
    encoder = LabelEncoder()
    encoder.fit(Y)
    encoded_Y = encoder.transform(Y)
    # convert integers to dummy variables (i.e. one hot encoded)
    y_multi = np_utils.to_categorical(encoded_Y)

  # Split the dataset in train and test sets
  test_size = 0.5
  seed = 42

  df_X_even = df_X.query("RandomRunNumber % 2 == 0")
  df_X_odd  = df_X.query("RandomRunNumber % 2 == 1")

  df_X_even = shuffle(df_X_even)
  df_X_odd = shuffle(df_X_odd)

  if args.train_even:
    X_train = df_X_even
    X_test = df_X_odd
  elif args.train_odd:
    X_train = df_X_odd
    X_test = df_X_even

  # Balance dataset by resampling: equal number of signal and background events
  if args.balanced >= 0:
    # Oversample signal
    if args.balanced is 0:
      N_train_sig = len(X_train.query('ylabel==0'))
    # Undersample background
    elif args.balanced is 1:
      N_train_sig = len(X_train.query('ylabel==1'))
    N_train_bkg = N_train_sig
    # Draw balanced training datasets where the number of signal and background events are equal
    X_train_sig = resample(X_train.query('ylabel==1'), replace=True, n_samples=N_train_sig, random_state=42)#, stratify=None)
    X_train_bkg = resample(X_train.query('ylabel==0'), replace=True, n_samples=N_train_bkg, random_state=42)#, stratify=None)
    X_train = pd.concat([X_train_bkg, X_train_sig], axis=0)

  print("\n---------- After balancing ----------")
  print("args.balanced =", args.balanced)
  print("X_train.query('ylabel==1').shape =", X_train.query('ylabel==1').shape)
  print("X_train.query('ylabel==1').shape =", X_train.query('ylabel==0').shape)
  print("---------------------------------------")

  #X_train_bkg = resample(X_train.query('group==Zjets'), replace=True, n_samples=N_train_bkg, random_state=42)#, stratify=None)
  #X_train = X_train.query('group=="diboson" | ylabel==1')

  # Draw validation set as subsample of test set, for quicker evaluation of validation loss during training
  n_val_samples = 1e5
  X_val = resample(X_test, replace=False, n_samples=n_val_samples, random_state=42, stratify=X_test.ylabel)
  y_val = X_val.ylabel

  y_train = X_train.ylabel
  y_test = X_test.ylabel

  # Making a copy of the DFs with only feature columns
  X_train_feat_only = X_train.copy()
  X_test_feat_only = X_test.copy()
  X_val_feat_only = X_val.copy()
  l_non_features = ['DatasetNumber', 'RandomRunNumber', 'eventweight', 'group', 'ylabel']
  X_train_feat_only.drop(l_non_features, axis=1, inplace=True)
  X_test_feat_only.drop(l_non_features, axis=1, inplace=True)
  X_val_feat_only.drop(l_non_features, axis=1, inplace=True)

  print("\nX_train_feat_only:", X_train_feat_only.columns)
  print("X_test_feat_only:", X_test_feat_only.columns)
  print("X_val_feat_only:", X_val_feat_only.columns)

  print("\nX_train_feat_only:", X_train_feat_only.shape)
  print("X_test_feat_only:", X_test_feat_only.shape)
  print("X_val_feat_only:", X_val_feat_only.shape)

  # Feature scaling
  # Scale all variables to the interval [0,1]
  #scaler = preprocessing.MinMaxScaler(feature_range=(0, 1), copy=True)
  scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
  print("\nscaler.fit_transform(X_train_feat_only)")
  X_train_scaled = scaler.fit_transform(X_train_feat_only)
  print("scaler.transform(X_test_feat_only)")
  X_test_scaled = scaler.transform(X_test_feat_only)
  print("scaler.transform(X_val_feat_only)")
  X_val_scaled = scaler.transform(X_val_feat_only)

  
  print("\n\n//////////////////// ML part ////////////////////////")

  global model
  scale_pos_weight = 1
  event_weight = None
  class_weight = None
  class_weight_dict = {}

  if args.event_weight:
    event_weight = X_train.eventweight
    #event_weight = eventweight_train_resampled

  if args.class_weight:
    if args.xgboost:
      # XGBoost: Scale signal events up by a factor n_bkg_train_events / n_sig_train_events
      scale_pos_weight = len(X_train[X_train.ylabel == 0]) / len(X_train[X_train.ylabel == 1]) 
      #scale_pos_weight = 10
    else:
      # scikit-learn: Scale overrepresented sample down (bkg) and underrepresented sample up (sig)
      class_weight = "balanced"
  else:
    class_weight = None

  print("\n# bkg train events / # sig train events = {0:d} / {1:d}".format(len(X_train[X_train.ylabel == 0]), len(X_train[X_train.ylabel == 1])))
  print("scale_pos_weight =", scale_pos_weight)

  classes = np.unique(y)
  class_weight_vect = compute_class_weight(class_weight, classes, y)
  class_weight_dict = {0: class_weight_vect[0], 1: class_weight_vect[1]}

  # Initialize variables for storing CV output
  valid_score = test_score = fit_time = score_time = 0
  # Initialize variables for storing validation and learning curve output
  train_scores_vc_mean = train_scores_vc_std = 0
  valid_scores_vc_mean = valid_scores_vc_std = 0
  train_scores_lc_mean = train_scores_lc_std = 0
  valid_scores_lc_mean = valid_scores_lc_std = 0

  # List of training set sizes for plotting of learning curve
  train_sizes = [0.5, 0.75, 1.0]

  # List of parameter values for hyperparameter grid search
  # XGBoost
  max_depth = [5, 6, 8, 10]
  n_estimators = [50, 100, 200, 500, 1000]
  learning_rate = [0.001, 0.01, 0.1, 0.5, 1.0]
  reg_alpha = [0, 0.001, 0.01, 0.1, 1.]
  reg_lambda = [0, 0.001, 0.01, 0.1, 1.]

  d_param_grid_xgb = {'max_depth': max_depth,
                      'n_estimators': n_estimators,
                      'learning_rate': learning_rate,
                      'reg_alpha': reg_alpha,
                      'reg_lambda': reg_lambda
                      }

  # Specify one of the above parameter lists to plot validation curve for
  param_name_xgb = 'max_depth'
  param_range_xgb = d_param_grid_xgb[param_name_xgb]

  # Neural network
  n_hidden_layers = [1, 3, 5, 7, 10]
  n_nodes = [10, 20, 50, 100, 500]
  batch_size = [8, 16, 32, 64, 128]
  epochs = [10, 50, 100, 500, 1000]
  #kernel_regularizer = [l1_l2(l1=1e-6, l2=1e-6), l1_l2(l1=1e-6, l2=1e-5), l1_l2(l1=1e-5, l2=1e-6), l1_l2(l1=1e-5, l2=1e-5)]
  d_param_grid_nn = {'n_hidden_layers': [1] #n_hidden_layers,
                     #'n_nodes': #n_nodes,
                     #'batch_size': batch_size,
                     #'epochs': epochs,
                     #'kernel_regularizer': kernel_regularizer
                    }

  # Specify one of the above parameter lists to plot validation curve for
  param_name_nn = 'n_hidden_layers'
  param_range_nn = d_param_grid_nn[param_name_nn]

  if args.xgboost:
    param_range = param_range_xgb
    param_name = param_name_xgb
  elif args.nn:
    param_range = param_range_nn
    param_name = param_name_nn


  # Run XGBoost BDT
  if args.xgboost:

    if args.multiclass:
      objective = 'multi:softmax'
      eval_metric = 'mlogloss'
    else:
      objective = 'binary:logistic'
      eval_metric = 'logloss'
      #eval_metric = 'auc'

    max_depth = args.max_depth
    lr = args.lr
    n_estimators = args.n_estimators
    gamma = args.gamma
    min_child_weight = args.min_child_weight
    max_delta_step = args.max_delta_step
    subsample = args.subsample
    colsample_bytree = args.colsample_bytree
    colsample_bylevel = args.colsample_bylevel
    colsample_bynode = args.colsample_bynode
    reg_alpha = args.L1
    reg_lambda = args.L2

    if not args.load_pretrained_model:
      model = XGBClassifier(max_depth=max_depth, 
                            learning_rate=lr,
                            n_estimators=n_estimators, 
                            verbosity=1,
                            objective=objective, 
                            n_jobs=-1,
                            gamma=gamma,
                            min_child_weight=min_child_weight,
                            max_delta_step=max_delta_step,
                            subsample=subsample,
                            colsample_bytree=colsample_bytree,
                            colsample_bylevel=colsample_bylevel,
                            colsample_bynode=colsample_bynode,
                            reg_alpha=reg_alpha,  # L1 regularization
                            reg_lambda=reg_alpha, # L2 regularization
                            scale_pos_weight=scale_pos_weight)

      print("\nmodel.get_params()\n", model.get_params())

      if not args.plot_validation_curve and not args.plot_learning_curve:

        if args.doGridSearchCV:
          model = GridSearchCV(model, d_param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)
    
        print("\nTraining XGBoost BDT...")

        if args.doCV:

          cv_results = cross_validate(model, X_train_scaled, y_train, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1, return_train_score=True)

          valid_score = cv_results['test_score']
          train_score = cv_results['train_score']
          fit_time = cv_results['fit_time']
          score_time = cv_results['score_time']
          fit_time = cv_results['fit_time']

        else:
          model.fit(X_train_scaled, y_train, 
                    sample_weight=event_weight, 
                    eval_set=[(X_train_scaled, y_train), (X_val_scaled, y_val)],
                    #eval_set=[(X_val_scaled, y_val)],
                    eval_metric=eval_metric,
                    early_stopping_rounds=20,
                    verbose=True)

          evals_result = model.evals_result()
          sns.set()
          ax = sns.lineplot(x=range(0, len(evals_result['validation_0'][eval_metric])), y=evals_result['validation_0'][eval_metric], label='Training loss')
          ax = sns.lineplot(x=range(0, len(evals_result['validation_1'][eval_metric])), y=evals_result['validation_1'][eval_metric], label='Validation loss')
          ax.set(xlabel='Epochs', ylabel='Loss')
          plt.show()

        print("\nTraining done!")

        if args.doGridSearchCV:
          joblib.dump(model.best_estimator_, trained_model_path)
        else:
          joblib.dump(model, trained_model_path)
        print("\nSaving the trained XGBoost BDT:", trained_model_path)

    elif args.load_pretrained_model:
      print("\nReading in pre-trained XGBoost BDT:", trained_model_path)
      model = joblib.load(trained_model_path)


  # Run neural network
  elif args.nn:

    n_inputs = X_train_scaled.shape[1]
    n_nodes = args.n_nodes
    n_hidden_layers = args.n_hidden_layers
    dropout_rate = args.dropout
    batch_size = args.batch_size
    epochs = args.epochs
    l1 = args.L1
    l2 = args.L2
    lr = args.lr

    if not args.load_pretrained_model:
      print("\nBuilding and training neural network")

      es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)

      model = KerasClassifier(build_fn=create_model,
                              n_inputs=n_inputs,
                              n_hidden_layers=n_hidden_layers,
                              n_nodes=n_nodes,
                              dropout_rate=dropout_rate,
                              l1=l1,
                              l2=l2,
                              lr=lr,
                              batch_size=batch_size, 
                              epochs=epochs, 
                              verbose=1,
                              )

      if not args.plot_validation_curve and not args.plot_learning_curve:

        if args.doGridSearchCV:
          param_grid = d_param_grid_nn
          model = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)

        history = model.fit(X_train_scaled, y_train, 
                            sample_weight=event_weight, 
                            class_weight=class_weight_dict,
                            verbose=1,
                            callbacks=[es],
                            validation_data=(X_val_scaled, y_val)
                            #validation_data=(X_test_scaled, y_test)
                            )

        print("\nmodel.model.summary()\n", model.model.summary())

        if not args.doGridSearchCV:
          d_val_loss = {'Training loss': history.history['loss'], 'Validation loss': history.history['val_loss']}
          df_val_loss = pd.DataFrame(d_val_loss)

          sns.set()
          ax = sns.lineplot(data=df_val_loss)
          ax.set(xlabel='Epochs', ylabel='Loss')
          plt.show()

        if args.doGridSearchCV:
          model.best_estimator_.model.save(trained_model_path)
        else:
          model.model.save(trained_model_path)
        print("\nSaving the trained neural network:", trained_model_path)

    elif args.load_pretrained_model:
      print("\nReading in pre-trained neural network:", trained_model_path)
      model = load_model(trained_model_path)

  if not args.plot_validation_curve and not args.plot_learning_curve:

    # Print results of grid search
    if args.doGridSearchCV:
      print("Best parameters set found on development set:")
      print("")
      print("model.best_params_", model.best_params_)
      print("")
      print("Grid scores on development set:")
      means = model.cv_results_['mean_test_score']
      stds = model.cv_results_['std_test_score']
      for mean, std, params in zip(means, stds, model.cv_results_['params']):
          print("{0:0.3f} (+/-{1:0.03f}) for {2!r}".format(mean, std, params))
      print("")
      df = pd.DataFrame.from_dict(model.cv_results_)
      print("pandas DataFrame of cv results")
      print(df)
      print("")

    # Get predicted signal probabilities for train and test sets
    output_train = model.predict_proba(X_train_scaled)
    output_test = model.predict_proba(X_test_scaled)
    #X_train = X_train.copy()
    #X_test = X_test.copy()

    if args.multiclass:
      output_test = output_test.reshape(output_test.shape[0], 3)
      print("output_train", len(output_train[0]))

      for i_output in range(len(output_train[0])):
        X_train["output"+str(i_output)] = output_train[:,i_output]
        X_test["output"+str(i_output)] = output_test[:,i_output]

    elif output_train.shape[1] is 2:
      print("output_train[:10,1]", output_train[:10,1])
      X_train["output"] = output_train[:,1]
      X_test["output"] = output_test[:,1]

    else:
      X_train["output"] = output_train
      X_test["output"] = output_test


    print("\n\n//////////////////// Plotting part ////////////////////////\n")

    if not args.multiclass:
      print("len(X_train.query('ylabel==0').loc[:,'eventweight'])", len(X_train.query('ylabel==0').loc[:,'eventweight']))
      print("len(X_train.query('ylabel==0').loc[:,'output'])", len(X_train.query('ylabel==0').loc[:,'output']))
      print("X_train.query('ylabel==0').loc[:,'eventweight']", X_train.query("ylabel==0").loc[:,"eventweight"].head())
      print("X_train.query('ylabel==0').loc[:,'output']", X_train.query("ylabel==0").loc[:,"output"].head())

      print("X_train[['eventweight', 'output']].min(): \n", X_train[['eventweight', 'output']].min())
      print("X_train[['eventweight', 'output']].max(): \n", X_train[['eventweight', 'output']].max())
    
    l_X_train_bkg = [X_train.query('group=="/bkg/'+i_bkg+'"').filter(like='output') for i_bkg in l_bkg]
    l_ew_train_bkg = [X_train.query('group=="/bkg/'+i_bkg+'"').loc[:,'eventweight'] for i_bkg in l_bkg]
    l_X_test_bkg = [X_test.query('group=="/bkg/'+i_bkg+'"').filter(like='output') for i_bkg in l_bkg]
    l_ew_test_bkg = [X_test.query('group=="/bkg/'+i_bkg+'"').loc[:,'eventweight'] for i_bkg in l_bkg]

    l_X_train_sig = [X_train.query('ylabel==1 & group=="/sig/'+i_sig+'"').filter(like='output') for i_sig in l_sig]
    l_ew_train_sig = [X_train.query('ylabel==1 & group=="/sig/'+i_sig+'"').loc[:,'eventweight'] for i_sig in l_sig]
    l_X_test_sig = [X_test.query('ylabel==1 & group=="/sig/'+i_sig+'"').filter(like='output') for i_sig in l_sig]
    l_ew_test_sig = [X_test.query('ylabel==1 & group=="/sig/'+i_sig+'"').loc[:,'eventweight'] for i_sig in l_sig]

    d_X_train_bkg = dict(zip(l_bkg, l_X_train_bkg))
    d_ew_train_bkg = dict(zip(l_bkg, l_ew_train_bkg))
    d_X_test_bkg = dict(zip(l_bkg, l_X_test_bkg))
    d_ew_test_bkg = dict(zip(l_bkg, l_ew_test_bkg))

    # Plot unweighted training and test output
    #plt.figure(1)
    #plotTrainTestOutput(d_X_train_bkg, None,
    #                    X_train.query("ylabel==1").loc[:,"output"], None,
    #                    d_X_test_bkg, None,
    #                    X_test.query("ylabel==1").loc[:,"output"], None)
    #plotTrainTestOutput(d_X_train_bkg, None,
    #                    X_train.query("ylabel==1").loc[:,"output"], None,
    #                    d_X_test_bkg, None,
    #                    X_test.query("ylabel==1").loc[:,"output"], None)
    #plt.savefig(output_dir + 'hist1_train_test_unweighted.pdf')

    # Plot weighted train and test output, with test set multiplied by 2 to match number of events in training set
    plt.figure()
    #for i_output in range(output_train.shape[1]):
    plotTrainTestOutput(d_X_train_bkg, d_ew_train_bkg,
                        X_train.query("ylabel==1").filter(like='output'), X_train.query("ylabel==1").loc[:,"eventweight"],
                        d_X_test_bkg, d_ew_test_bkg,
                        X_test.query("ylabel==1").filter(like='output'), X_test.query("ylabel==1").loc[:,"eventweight"],
                        args.signal_region)
    plt.savefig(output_dir + 'hist_train_test_weighted_comparison.pdf')

    # Plot final signal vs background estimate for test set, scaled to 10.6/fb
    if 'low' in args.signal_region:
      plt.figure()
      plotFinalTestOutput(d_X_test_bkg,
                          d_ew_test_bkg,
                          X_test.query("ylabel==1 & (DatasetNumber==392330 | DatasetNumber==396210)").filter(like='output'),
                          X_test.query("ylabel==1 & (DatasetNumber==392330 | DatasetNumber==396210)").loc[:,"eventweight"],
                          args.signal_region,
                          figure_text='(200, 100) GeV')
      plt.savefig(output_dir + 'hist_test_392330_396210_C1N2_WZ_2L2J_200_100_weighted.pdf')
    elif 'int' in args.signal_region:
      plt.figure()
      plotFinalTestOutput(d_X_test_bkg,
                          d_ew_test_bkg,
                          X_test.query("ylabel==1 & DatasetNumber==392325").loc[:,"output"],
                          X_test.query("ylabel==1 & DatasetNumber==392325").loc[:,"eventweight"],
                          args.signal_region,
                          figure_text='(500, 200) GeV')
      plt.savefig(output_dir + 'hist_test_392325_C1N2_WZ_2L2J_500_200_weighted.pdf')
    elif 'high' in args.signal_region:
      plt.figure()
      plotFinalTestOutput(d_X_test_bkg,
                          d_ew_test_bkg,
                          X_test.query("ylabel==1 & DatasetNumber==392356").loc[:,"output"],
                          X_test.query("ylabel==1 & DatasetNumber==392356").loc[:,"eventweight"],
                          args.signal_region,
                          figure_text='(600, 0) GeV')
      plt.savefig(output_dir + 'hist5_test_392356_C1N2_WZ_2L2J_600_0_weighted.pdf')


    if args.xgboost and not args.doGridSearchCV:
      # Plot feature importance
      print("model.feature_importances_", model.feature_importances_)
      print("np.sum(model.feature_importances_)", np.sum(model.feature_importances_))
      if args.multiclass:
        l_feat_drop = ['DatasetNumber', 'RandomRunNumber', 'eventweight', 'group', 'ylabel', 'output0', 'output1', 'output2']
      else:
        l_feat_drop = ['DatasetNumber', 'RandomRunNumber', 'eventweight', 'group', 'ylabel', 'output']
      s_feat_importance = pd.Series(model.feature_importances_, index=X_train.drop(l_feat_drop, axis=1).columns)
      print("X_train.drop(l_feat_drop, axis=1).columns\n", X_train.drop(l_feat_drop, axis=1).columns)
      s_feat_importance.sort_values(ascending=False, inplace=True)

      plt.figure()
      sns.set(style="ticks", color_codes=True)
      n_top_feat_importance = 20
      ax = sns.barplot(x=s_feat_importance[:n_top_feat_importance]*100, y=s_feat_importance[:n_top_feat_importance].index)#, palette="Blues_r")
      #ax.set_yticklabels(s_feat_importance.index)
      ax.set(xlabel="Feature importance [%]")
      plt.savefig(output_dir + 'feature_importance.pdf')


    if not args.multiclass:
      # Plot ROC curve
      fpr, tpr, thresholds = metrics.roc_curve(X_test.loc[:,"ylabel"], X_test.loc[:,"output"])
      auc = metrics.roc_auc_score(X_test.loc[:,"ylabel"], X_test.loc[:,"output"])

      plt.figure()
      ax = sns.lineplot(x=tpr, y=1-fpr, estimator=None, label='ROC curve: AUC = %0.2f' % auc)
      plt.plot([1,0], [0,1], linestyle="--")
      ax.set(xlabel="Signal efficiency", ylabel="Background efficiency")
      plt.savefig(output_dir + 'ROC_curve_AUC_sigEff_vs_1minBkgEff.pdf')

      plt.figure()
      ax = sns.lineplot(x=tpr, y=1/(fpr), estimator=None, label='ROC curve: AUC = %0.2f' % auc)
      #plt.plot([0,1], [0,1], linestyle="--")
      ax.set(xlabel="Signal efficiency", ylabel="Background rejection = 1/(1 - bkg eff.)", yscale='log')
      plt.savefig(output_dir + 'ROC_curve_AUC_sigEff_vs_bkgRej.pdf')


    plt.show()


    # Signal significance
    print("\n///////////////// Signal significance /////////////////")

    def significance(cut_string_sig, cut_string_bkg, rel_unc=0.3):
      """Return ``[sig_exp, bkg_exp, Z_N_exp]`` for a pair of selections.

      cut_string_sig / cut_string_bkg are pandas ``query`` expressions that
      are AND-ed onto the signal (``ylabel == 1``) and background
      (``ylabel`` 0, 2 or 3) selections of ``X_test``. The yields are the
      sums of the ``eventweight`` column of the selected rows.

      rel_unc is forwarded unchanged as the third argument of
      ``RooStats.NumberCountingUtils.BinomialExpZ`` (presumably the
      relative background uncertainty -- confirm against the ROOT docs).
      """
      # Assemble both selection strings first, then evaluate them.
      sig_query = "ylabel == 1 & " + cut_string_sig
      bkg_query = "(ylabel == 0 | ylabel == 2 | ylabel == 3) & " + cut_string_bkg

      sig_weights = X_test.query(sig_query).loc[:, "eventweight"]
      bkg_weights = X_test.query(bkg_query).loc[:, "eventweight"]

      n_sig = np.sum(sig_weights)
      n_bkg = np.sum(bkg_weights)
      z_value = RooStats.NumberCountingUtils.BinomialExpZ(n_sig, n_bkg, rel_unc)
      return [n_sig, n_bkg, z_value]

    #cut_string_DSID = 'DatasetNumber == {0:d}'.format(dsid)
    if 'low' in args.signal_region: 
      key = '(200, 100)'
      cut_string_DSID = '(DatasetNumber == 392330 | DatasetNumber == 396210)'
    elif 'int' in args.signal_region: 
      key = '(500, 200)'
      cut_string_DSID = 'DatasetNumber == 392325'
    elif 'high' in args.signal_region: 
      key = '(600, 0)'
      cut_string_DSID = 'DatasetNumber == 392356'

    l_cuts = [0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]
    global cut_optimal
    cut_optimal = 0
    Z_N_optimal = 0
    for cut in l_cuts:

      if args.multiclass:
        cut_string_SR = 'output0 > {:f}'.format(cut)
      else:
        cut_string_SR = 'output > {:f}'.format(cut)
      cut_string_bkg = cut_string_SR
      cut_string_sig = cut_string_SR + " & " + cut_string_DSID
      print('\ncut_string_sig:', cut_string_sig)
      print('cut_string_bkg:', cut_string_bkg)

      [sig_exp, bkg_exp, Z_N_exp] = significance(cut_string_sig, cut_string_bkg, rel_unc=0.3)
      print("---", key)
      print("S_exp =", sig_exp)
      print("B_exp =", bkg_exp)
      for i in range(len(l_X_train_bkg)):
        l_cut_strings = ['ylabel == 0', 'group == "/bkg/{}"'.format(l_bkg[i]), cut_string_bkg]
        B_exp_i = np.sum(X_test.query('&'.join(l_cut_strings)).loc[:,"eventweight"])
        print("  {0}: {1}".format(l_bkg[i], B_exp_i))
      print("Z_N_exp =", Z_N_exp)

      if sig_exp >= 3 and bkg_exp >= 1:
        if Z_N_exp > Z_N_optimal:
          Z_N_optimal = Z_N_exp
          cut_optimal = cut

    # Print the optimal SR values
    if args.multiclass:
      cut_string_SR = 'output0 > {:f}'.format(cut_optimal)
    else:
      cut_string_SR = 'output > {:f}'.format(cut_optimal)
    cut_string_bkg = cut_string_SR
    cut_string_sig = cut_string_SR + " & " + cut_string_DSID
    print('\ncut_string_sig:', cut_string_sig)
    print('cut_string_bkg:', cut_string_bkg)


    [sig_exp, bkg_exp, Z_N_exp] = significance(cut_string_sig, cut_string_bkg, rel_unc=0.3)
    print("---", key)
    print("Optimal cut =", cut_optimal)
    print("S_exp =", sig_exp)
    print("B_exp =", bkg_exp)
    for i in range(len(l_X_train_bkg)):
      l_cut_strings = ['ylabel == 0', 'group == "/bkg/{}"'.format(l_bkg[i]), cut_string_bkg]
      B_exp_i = np.sum(X_test.query('&'.join(l_cut_strings)).loc[:,"eventweight"])
      print("  {0}: {1}".format(l_bkg[i], B_exp_i))
    print("Z_N_exp =", Z_N_exp)



  if args.plot_validation_curve:
    print("\nCalculating validation curve...")
    train_scores, valid_scores = validation_curve(model, X_train_scaled, y_train, 
                                                  param_name=param_name, param_range=param_range,
                                                  cv=3, 
                                                  scoring='roc_auc', 
                                                  n_jobs=-1,
                                                  verbose=11)

    train_scores_vc_mean = np.mean(train_scores, axis=1)
    train_scores_vc_std = np.std(train_scores, axis=1)
    valid_scores_vc_mean = np.mean(valid_scores, axis=1)
    valid_scores_vc_std = np.std(valid_scores, axis=1)
 
    # Plot validation curves
    figF, axsF = plt.subplots()
    # Training score
    axsF.plot( param_range, train_scores_vc_mean, 'o-', label="Training score", color="darkorange", lw=2)
    axsF.fill_between( param_range, train_scores_vc_mean - train_scores_vc_std, train_scores_vc_mean + train_scores_vc_std, alpha=0.2, color="darkorange", lw=2)
    # Test score
    axsF.plot( param_range, valid_scores_vc_mean, 'o-', label="Cross-validation score", color="navy", lw=2)
    axsF.fill_between( param_range, valid_scores_vc_mean - valid_scores_vc_std, valid_scores_vc_mean + valid_scores_vc_std, alpha=0.2, color="navy", lw=2)
    axsF.set_xlabel(param_name)
    axsF.set_ylabel('Score')
    axsF.legend(loc="best")
    axsF.set_title('Validation curves')
    #axsF.set_ylim(0., 1.)
    plt.savefig(output_dir + 'validation_curve_{}.pdf'.format(param_name))
    plt.show()

  if args.plot_learning_curve:
    print("\nCalculating learning curve...")
    train_sizes, train_scores, valid_scores = learning_curve(model, X_train_scaled, y_train, train_sizes=train_sizes,
                                                             cv=3, scoring='roc_auc', n_jobs=1, verbose=3)
    train_scores_lc_mean = np.mean(train_scores, axis=1)
    train_scores_lc_std = np.std(train_scores, axis=1)
    valid_scores_lc_mean = np.mean(valid_scores, axis=1)
    valid_scores_lc_std = np.std(valid_scores, axis=1)

    # Plot learning curves
    figG, axsG = plt.subplots()
    # 68% CL bands
    #if runBDT:
    #elif runNN:
    axsG.fill_between( train_sizes, train_scores_lc_mean - train_scores_lc_std, train_scores_lc_mean + train_scores_lc_std, alpha=0.2, color="r", lw=2)
    axsG.fill_between( train_sizes, valid_scores_lc_mean - valid_scores_lc_std, valid_scores_lc_mean + valid_scores_lc_std, alpha=0.2, color="g", lw=2)
    # Training and validation scores
    axsG.plot( train_sizes, train_scores_lc_mean, 'o-', label="Training score", color="r", lw=2)
    axsG.plot( train_sizes, valid_scores_lc_mean, 'o-', label="Cross-validation score", color="g", lw=2)
    axsG.set_xlabel("Training examples")
    axsG.set_ylabel('Score')
    axsG.legend(loc="best")
    axsG.set_title('Learning curves')
    #axsG.set_ylim(0., 1.)
    plt.savefig(output_dir + 'learning_curve.pdf')
    plt.show()


  # Stop timer
  t_end = time.time()
  print("\nProcess time: {:4.2f} s".format(t_end - t_start))
Esempio n. 19
0
def objective_xgb(trial, X_train, X_valid, y_train, y_valid):
    """Fit one XGBClassifier with trial-sampled settings and return its AUC.

    ``trial`` must provide ``suggest_categorical``, ``suggest_float`` and
    ``suggest_int`` (this looks like the Optuna trial interface -- confirm
    with the caller). ``y_train`` and ``y_valid`` must offer ``to_numpy()``;
    the labels are flattened to 1-D before fitting.

    The model is fitted with early stopping (50 rounds) on the eval sets
    ``[(train), (valid)]`` using the module-level ``EVAL_METRIC``. The
    metric values at the best iteration are logged, and the function
    returns ``res["validation_1"]["auc"]``.

    NOTE: the return statement hard-codes the ``"auc"`` key, so
    ``EVAL_METRIC`` has to produce an ``"auc"`` entry or a ``KeyError`` is
    raised.
    """
    # Fixed settings; the sampled ones are appended below in the same order
    # the suggest_* calls are issued.
    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "n_estimators": 1000,
        "tree_method": "exact",  # use exact for small dataset
    }
    # Booster type (no gblinear among the choices).
    param["booster"] = trial.suggest_categorical("booster", ["gbtree", "dart"])
    # L2 / L1 regularization weights.
    param["lambda"] = trial.suggest_float("lambda", 1e-8, 1.0, log=True)
    param["alpha"] = trial.suggest_float("alpha", 1e-8, 1.0, log=True)
    # Row sampling ratio and per-tree column sampling ratio.
    param["subsample"] = trial.suggest_float("subsample", 0.2, 1.0)
    param["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.2,
                                                    1.0)

    booster_kind = param["booster"]

    if booster_kind in ("gbtree", "dart"):
        # Tree-shape and learning-rate settings shared by both tree boosters.
        param.update(
            max_depth=trial.suggest_int("max_depth", 3, 9, step=2),
            min_child_weight=trial.suggest_int("min_child_weight", 2, 10),
            eta=trial.suggest_float("eta", 1e-8, 1e-1, log=True),
            gamma=trial.suggest_float("gamma", 1e-8, 1.0, log=True),
            grow_policy=trial.suggest_categorical(
                "grow_policy", ["depthwise", "lossguide"]),
        )

    if booster_kind == "dart":
        # Dropout-specific settings, only sampled for the dart booster.
        param.update(
            sample_type=trial.suggest_categorical(
                "sample_type", ["uniform", "weighted"]),
            normalize_type=trial.suggest_categorical(
                "normalize_type", ["tree", "forest"]),
            rate_drop=trial.suggest_float("rate_drop", 1e-8, 1.0, log=True),
            skip_drop=trial.suggest_float("skip_drop", 1e-8, 1.0, log=True),
        )

    classifier = XGBClassifier(**param)

    # Flatten the label containers to 1-D arrays once and reuse them.
    train_labels = y_train.to_numpy().reshape(-1)
    valid_labels = y_valid.to_numpy().reshape(-1)
    watch_list = [(X_train, train_labels), (X_valid, valid_labels)]

    classifier.fit(X_train,
                   train_labels,
                   early_stopping_rounds=50,
                   eval_set=watch_list,
                   eval_metric=EVAL_METRIC,
                   verbose=True)

    history = classifier.evals_result()
    best_iteration = classifier.best_iteration
    print(f"Best Iteration: {best_iteration}")

    # For every eval set keep only the metric values at the best iteration.
    res = {}
    for eval_name, metric_curves in history.items():
        res[eval_name] = {
            metric: curve[best_iteration]
            for metric, curve in metric_curves.items()
        }
    logger.info(res)

    return res["validation_1"]["auc"]
Esempio n. 20
0
# Cast both label arrays to int, then map them back through
# T2.inverse_transform.
Y_tt = T2.inverse_transform(Y_tt.astype(int))
Y_tp = T2.inverse_transform(Y_tp.astype(int))


# Print two fixed-width slices of the report text: its first 57 characters
# and its last 175 characters.
print("\n-------------------- Classification report for PART 2 (XGBoost) -----------------------\n")
REPORT = classification_report(Y_tt, Y_tp, digits=3)
print(REPORT[0:57])
print(REPORT[-175:])
print("\n---------------------------------------------------------------------------------------\n")


# Plot the per-round 'mlogloss' and 'merror' curves recorded under
# 'validation_0' by MODEL2.
plt.rcParams["figure.figsize"] = (5, 4)
RES = MODEL2.evals_result()
NUM = len(RES['validation_0']['merror'])
G1 = range(NUM)
GRAPH, G2 = plt.subplots()
for curve_key, curve_label in (('mlogloss', 'Log Loss'), ('merror', 'Error')):
    G2.plot(G1, RES['validation_0'][curve_key], label=curve_label)
G2.legend()
plt.ylabel('Error/Log Loss value')
plt.xlabel('Epochs')
plt.title('XGBoost Error and Loss Values')
plt.show()


# Larger default figure size for what follows; build the confusion matrix
# and the sorted set of distinct labels in Y_tt.
plt.rcParams["figure.figsize"] = (10, 10)
cm1 = confusion_matrix(Y_tt, Y_tp)
labelsi = np.unique(Y_tt)
Esempio n. 21
0
def extract_xgboost_eval(model: xgboost.XGBClassifier) -> pd.DataFrame:
    """Return the ``"validation_0"`` evaluation history as a DataFrame.

    The frame has one row per entry in the recorded metric lists and one
    column per metric stored under ``model.evals_result()["validation_0"]``,
    followed by three added columns:

    * ``iteration`` -- 1-based row counter,
    * ``eval_set``  -- the constant ``"val"``,
    * ``model``     -- the constant ``"xgboost"``.
    """
    metric_history = model.evals_result()["validation_0"]
    frame = pd.DataFrame(metric_history)

    n_rows = frame.shape[0]
    frame["iteration"] = list(range(1, n_rows + 1))
    frame["eval_set"] = "val"
    frame["model"] = "xgboost"
    return frame
Esempio n. 22
0
# One pass per threshold (the original note says this runs once per column).
for thresh in thresholds:
    # Reuse the already-fitted `model` to keep only the features whose
    # importance reaches the current threshold.
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train, select_x_test = (
        selection.transform(part) for part in (x_train, x_test)
    )

    # Fit a small fresh classifier on the reduced feature set, tracking
    # 'error' and 'logloss' on both the train and test splits.
    selection_model = XGBClassifier(n_estimators=5, n_jobs=-1)
    eval_pairs = [(select_x_train, y_train), (select_x_test, y_test)]
    selection_model.fit(
        select_x_train,
        y_train,
        verbose=True,
        eval_metric=['error', 'logloss'],
        eval_set=eval_pairs,
        early_stopping_rounds=100,
    )

    results = selection_model.evals_result()

    # Score the reduced model on the test split and report it.
    y_predict = selection_model.predict(select_x_test)
    score = r2_score(y_test, y_predict)
    n_selected = select_x_train.shape[1]

    print("Thresh=%.3f, n=%d, R2: %.2f%%" % (thresh, n_selected, score * 100.0))
# Columns 0-7 are used as model inputs, column 8 as the label.
X, Y = dataset[:, 0:8], dataset[:, 8]

# Hold out 33% of the rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=7)

# Fit the model on the training data while recording 'error' and 'logloss'
# on both splits after every boosting round.
model = XGBClassifier()
eval_set = [(X_train, y_train), (X_test, y_test)]
model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric=["error", "logloss"],
    verbose=True,
)

# Predict on the test split, round each prediction, and report accuracy.
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Recorded metric curves: 'validation_0' is the first eval_set entry
# (train), 'validation_1' the second (test).
results = model.evals_result()
train_curves = results['validation_0']
test_curves = results['validation_1']
epochs = len(train_curves['error'])
x_axis = range(epochs)

# Figure 1: log loss per boosting round.
fig, ax = pyplot.subplots()
ax.plot(x_axis, train_curves['logloss'], label='Train')
ax.plot(x_axis, test_curves['logloss'], label='Test')
ax.legend()
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost Log Loss')
pyplot.show()

# Figure 2: classification error per boosting round.
fig, ax = pyplot.subplots()
ax.plot(x_axis, train_curves['error'], label='Train')
ax.plot(x_axis, test_curves['error'], label='Test')
ax.legend()
Esempio n. 24
0
                                                    train_size=0.8,
                                                    random_state=1)

model = XGBClassifier(n_estimators=1000, learning_rate=0.1)

# Record log loss and classification error on both splits after every
# boosting round. "loss" is not a metric XGBoost knows and makes fit() fail,
# so "error" is used instead. With several metrics, early stopping monitors
# the last one listed, evaluated on the last eval_set entry (the test split).
# Other built-in metrics include: rmse, mae, logloss, error, auc.
model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric=["logloss", "error"],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)

# Per-round metric history, keyed by 'validation_0' / 'validation_1'.
result = model.evals_result()
print(result)

y_pred = model.predict(x_test)

# r2_score takes (y_true, y_pred). R^2 is not symmetric in its arguments,
# so the ground truth must come first.
r2 = r2_score(y_test, y_pred)
print(f"r2: {r2}")

# Ascending feature importances, used below as feature-selection thresholds.
thresholds = np.sort(model.feature_importances_)
print(thresholds)

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    parameter = {
        'n_estimators': [100, 200, 400],
Esempio n. 25
0
class Xgboost(object):
    """Convenience wrapper around the scikit-learn style XGBoost estimators.

    Depending on ``task`` the wrapped ``self.model`` is an ``XGBClassifier`` or
    an ``XGBRegressor``; the remaining methods forward to that estimator for
    training, prediction, plotting, scoring and saving.
    """

    def __init__(self,
                 task="cla",
                 module_type="performance",
                 compute_task="cpu",
                 **params):
        """Build the underlying estimator.

        :param task: "cla" builds an ``XGBClassifier``, "reg" an
            ``XGBRegressor``.
        :param module_type: selects the thread count: "debug" -> 1,
            "performance" -> ``cpu_count()``, "balance" -> half of that.
        :param compute_task: "gpu" selects ``tree_method="gpu_hist"``,
            "cpu" selects ``tree_method="hist"``.
        :param params: optional hyper-parameters. Recognised keys:
            learning_rate, n_estimators, max_depth, min_child_weight, gamma,
            lambda, alpha, subsample, colsample_bytree, objective, booster,
            scale_pos_weight, random_state. The thread count and tree method
            are always derived from ``module_type`` / ``compute_task``.
        """
        assert task in ["cla", "reg"]
        assert module_type in ["debug", "performance", "balance"]
        assert compute_task in ["cpu", "gpu"]

        self.task = task
        self.module_type = module_type
        self.compute_task = compute_task

        if module_type == "debug":
            n_jobs = 1
        elif module_type == "performance":
            n_jobs = cpu_count()  # one thread per CPU core
        else:  # "balance": half of the cores, but never zero
            n_jobs = max(1, cpu_count() // 2)

        tree_method = "gpu_hist" if compute_task == "gpu" else "hist"

        # Settings shared by the regressor and the classifier.
        common = dict(
            learning_rate=params.get("learning_rate", 0.3),
            n_estimators=params.get("n_estimators", 100),  # boosting rounds
            max_depth=params.get("max_depth", 6),
            min_child_weight=params.get("min_child_weight", 1),
            n_jobs=n_jobs,
            gamma=params.get("gamma", 0),
            reg_lambda=params.get("lambda", 1),
            reg_alpha=params.get("alpha", 0),
            tree_method=tree_method,
            subsample=params.get("subsample", 1),  # row sampling ratio
            # These three used to be hard-coded, so caller-supplied values
            # were silently ignored; the defaults are unchanged.
            colsample_bytree=params.get("colsample_bytree", 1),
            booster=params.get("booster", "gbtree"),
            scale_pos_weight=params.get("scale_pos_weight", 1),
            random_state=params.get("random_state", 27),
        )

        # Objective reference, see
        # https://xgboost.readthedocs.io/en/latest/parameter.html
        #   reg:squarederror     regression with squared error
        #   reg:squaredlogerror  regression with squared log error
        #   reg:logistic         logistic regression
        #   binary:logistic      binary classification, outputs probability
        #   binary:logitraw      binary classification, outputs raw score
        #   binary:hinge         hinge loss, predicts 0 or 1
        #   multi:softmax        multi-class, needs num_class
        #   multi:softprob       multi-class, outputs an ndata * nclass
        #                        matrix of per-class probabilities
        if task == "reg":
            self.model = XGBRegressor(
                objective=params.get("objective", "reg:squarederror"),
                **common)
        else:
            self.model = XGBClassifier(
                objective=params.get("objective", "multi:softmax"),
                **common)

    def train(self,
              x_train,
              y_train=None,
              sample_weight=None,
              base_margin=None,
              eval_set=None,
              eval_metric=None,
              early_stopping_rounds=None,
              verbose=True,
              sample_weight_eval_set=None):
        """Fit the wrapped estimator; every argument is forwarded to ``fit``.

        :param x_train: feature matrix.
        :param y_train: labels / targets.
        :param eval_set: list of ``(X, y)`` pairs to evaluate each round.
        :param eval_metric: metric name or list of names (e.g. "rmse",
            "mae", "logloss", "error", "merror", "mlogloss", "auc",
            "aucpr"). Requires ``eval_set``.
        :param early_stopping_rounds: passed through unchanged.
        :param verbose: passed through unchanged.
        :return: None
        """
        # NOTE(review): newer XGBoost releases expect eval_metric and
        # early_stopping_rounds on the estimator constructor rather than on
        # fit() -- confirm against the installed version.
        if eval_metric:
            # A metric can only be tracked when there is data to evaluate on.
            assert eval_set

        self.model.fit(X=x_train,
                       y=y_train,
                       sample_weight=sample_weight,
                       base_margin=base_margin,
                       eval_set=eval_set,
                       eval_metric=eval_metric,
                       early_stopping_rounds=early_stopping_rounds,
                       verbose=verbose,
                       sample_weight_eval_set=sample_weight_eval_set)

    def plot_loss(self, metric=None):
        """Plot a recorded evaluation metric against the boosting round.

        :param metric: name of the recorded metric to plot. When omitted,
            "rmse" is used if it was recorded (the previous hard-coded
            behaviour), otherwise the first recorded metric.
        :raises ValueError: if no evaluation history was recorded.
        """
        result = self.model.evals_result()
        if not result:
            raise ValueError(
                "no evaluation history: train() must be called with eval_set")

        first_history = next(iter(result.values()))
        if metric is None:
            metric = "rmse" if "rmse" in first_history else next(
                iter(first_history))

        # Eval sets are named by position; the first two keep the labels the
        # plot has always used.
        labels = {"validation_0": "Train", "validation_1": "Test"}
        figure, ax = plt.subplots()
        for set_name, history in result.items():
            values = history[metric]
            ax.plot(range(len(values)),
                    values,
                    label=labels.get(set_name, set_name))
        ax.legend()
        plt.ylabel("loss")
        plt.title("Xgboost Log Loss")
        plt.show()

    def predict(self, x_test):
        """Predict with the wrapped estimator.

        :param x_test: feature matrix (e.g. np.array or scipy.sparse).
        :return: whatever the estimator's ``predict`` returns.
        """
        # The data is passed positionally because the name of the first
        # parameter differs between XGBoost releases (``data`` vs ``X``).
        my_pred = self.model.predict(x_test,
                                     output_margin=False,
                                     validate_features=True,
                                     base_margin=None)
        return my_pred

    @staticmethod
    def _save_figure(figure_path, default_path):
        """Save the current figure to ``figure_path`` or the default path."""
        plt.savefig(figure_path if figure_path else default_path)

    def plt_importance(self, figure_path=None, ifsave=True):
        """Plot feature importance of the wrapped model.

        :param figure_path: where to save the figure; a default path is
            used when empty.
        :param ifsave: whether to save the figure before showing it.
        :return: None
        """
        fig, ax = plt.subplots(figsize=(15, 15))
        plot_importance(self.model, height=0.5, ax=ax,
                        max_num_features=64)  # show at most 64 features
        if ifsave:
            self._save_figure(
                figure_path,
                "../model/XGBboost_model/Xgboost_featute_importance_before.png"
            )
        plt.show()

    def _plt_importance_v1(self,
                           columns_name,
                           figure_path=None,
                           ifsave=True):
        """Plot feature importance labelled with the given column names."""
        fig, ax = plt.subplots(figsize=(15, 15))
        plot_importance_v1(self.model,
                           model_name="xgb",
                           columns_name=columns_name,
                           height=0.5,
                           ax=ax,
                           max_num_features=64)  # show at most 64 features
        if ifsave:
            self._save_figure(
                figure_path,
                "../model/XGBboost_model/Xgboost_featute_importance_after.png"
            )
        plt.show()

    def plt_tree(self, num_tree):
        """Plot one tree of the model.

        :param num_tree: index of the tree to plot.
        :return: None
        """
        plot_tree(booster=self.model, num_trees=num_tree)

    def plot_graphviz(self, num_tree):
        """Build and return the graphviz rendering of one tree.

        :param num_tree: index of the tree to render.
        :return: the object produced by ``to_graphviz`` (it used to be
            discarded, which made this method a no-op for callers).
        """
        return to_graphviz(self.model, num_trees=num_tree)

    def get_importance(self):
        """Return the wrapped model's ``feature_importances_``."""
        return self.model.feature_importances_

    def evaluate(self, y_test, my_pred, evalue_fun="mse"):
        """Score predictions with the selected scikit-learn metric.

        :param y_test: ground-truth labels / targets.
        :param my_pred: predictions (labels, scores or probabilities,
            depending on the metric).
        :param evalue_fun: one of "acc", "auc", "mae", "median_ae",
            "r2_score", "evs", "aps", "bsl", "cmt", "f1_score", "log_loss",
            "precision_score", "recall_score", "roc_auc_score", "roc_curve";
            anything else falls back to mean squared error.
        :return: the metric value; for "roc_curve" a
            ``(fpr, tpr, thresholds)`` tuple, for "cmt" the matrix.
        """
        if evalue_fun == "acc":  # accuracy (classification)
            result = accuracy_score(y_true=y_test, y_pred=my_pred)
            print("accuarcy:%.2f" % (result * 100.0))
        elif evalue_fun == "auc":  # ROC AUC (classification)
            result = roc_auc_score(y_true=y_test, y_score=my_pred)
            print("auc:%.2f" % (result))
        elif evalue_fun == "mae":  # mean absolute error (regression)
            result = mean_absolute_error(y_true=y_test, y_pred=my_pred)
            print("mae:%.2f" % (result))
        elif evalue_fun == "median_ae":  # median absolute error (regression)
            result = median_absolute_error(y_true=y_test, y_pred=my_pred)
            print("median_ae:%.2f" % (result))
        elif evalue_fun == "r2_score":  # R squared (regression)
            result = r2_score(y_true=y_test, y_pred=my_pred)
            print("r2_score:%.2f" % (result))
        elif evalue_fun == "evs":  # explained variance (regression)
            result = explained_variance_score(y_true=y_test, y_pred=my_pred)
            print("explained_variance_score:%.2f" % (result))
        elif evalue_fun == "aps":  # average precision (classification)
            # "maco" was a typo that scikit-learn rejects with ValueError.
            result = average_precision_score(y_true=y_test,
                                             y_score=my_pred,
                                             average="macro",
                                             sample_weight=None)
            print("average_precision_score:%.2f" % (result))
        elif evalue_fun == "bsl":  # Brier score (classification)
            # Positional arguments: the probability keyword was renamed
            # between scikit-learn releases (y_prob -> y_proba).
            result = brier_score_loss(y_test,
                                      my_pred,
                                      sample_weight=None,
                                      pos_label=None)
            print("brier_score_loss:%.2f" % (result))
        elif evalue_fun == "cmt":  # confusion matrix (classification)
            result = confusion_matrix(y_true=y_test,
                                      y_pred=my_pred,
                                      labels=None,
                                      sample_weight=None)
            # The matrix is an array, so "%.2f" cannot format it.
            print("confusion_matrix:\n%s" % (result, ))
        elif evalue_fun == "f1_score":  # F1 (binary classification)
            result = f1_score(y_true=y_test,
                              y_pred=my_pred,
                              labels=None,
                              pos_label=1,
                              average="binary",
                              sample_weight=None)
            print("f1_score:%.2f" % (result))
        elif evalue_fun == "log_loss":  # cross-entropy (classification)
            # ``eps`` is left at the library default: the keyword was
            # deprecated and later removed from scikit-learn.
            result = log_loss(y_true=y_test,
                              y_pred=my_pred,
                              normalize=True,
                              sample_weight=None,
                              labels=None)
            print("log_loss:%.2f" % (result))
        elif evalue_fun == "precision_score":  # precision (binary)
            result = precision_score(y_true=y_test,
                                     y_pred=my_pred,
                                     labels=None,
                                     pos_label=1,
                                     average="binary")
            print("precision_score:%.2f" % (result))
        elif evalue_fun == "recall_score":  # recall (binary)
            result = recall_score(y_true=y_test,
                                  y_pred=my_pred,
                                  labels=None,
                                  pos_label=1,
                                  average="binary",
                                  sample_weight=None)
            print("recall_score:%.2f" % (result))
        elif evalue_fun == "roc_auc_score":  # area under the ROC curve
            result = roc_auc_score(y_true=y_test,
                                   y_score=my_pred,
                                   average="macro",
                                   sample_weight=None)
            print("roc_auc_score:%.2f" % (result))
        elif evalue_fun == "roc_curve":  # points of the ROC curve
            fpr, tpr, thresholds = roc_curve(y_true=y_test,
                                             y_score=my_pred,
                                             pos_label=None,
                                             sample_weight=None,
                                             drop_intermediate=True)
            result = (fpr, tpr, thresholds)
        else:  # default: mean squared error (regression)
            result = mean_squared_error(y_true=y_test, y_pred=my_pred)
            print("mse:%.2f" % (result))
        return result

    def save_model(self, save_params):
        """Save the wrapped model.

        :param save_params: mapping; its "fname" entry is the target path,
            with a default path used when the key is absent.
        """
        self.model.save_model(
            fname=save_params.get(
                "fname", "../model/XGBboost_model/XGboostmodel.model"))
Esempio n. 26
0
y_pred_model2 = model2.predict(X_test)
y22 = np.argmax(y_pred_model2, axis=1)
y_test22 = np.argmax(y_test, axis=1)

# Number of test samples whose arg-max prediction matches the arg-max label.
count = int(np.sum(y22 == y_test22))

from xgboost import XGBClassifier
X_train2, X_test2, y_train2, y_test2 = train_test_split(feature_all, y, test_size=0.3, random_state=20)

model3 = XGBClassifier()
model3.fit(X_train2, y_train2)
# model3.evals_result() was dropped: the model is fitted without an eval_set,
# so there is no evaluation history to read.
cv_scores = cross_val_score(model3, X_train2, y_train2, cv=5)
# Score model3 on its own hold-out split; X_test belongs to the earlier split
# and does not line up with y_test2.
y_pred3 = model3.predict(X_test2)

# Compare by position: y_test2 may carry a shuffled index after the split, in
# which case y_test2[i] would look up a label rather than the i-th sample.
count = int(np.sum(y_pred3 == np.asarray(y_test2).ravel()))
        
# clf = RandomForestClassifier(n_estimators=60,max_features=8,max_depth=None,min_samples_split=3,bootstrap=True,random_state=35)
# clf = clf.fit(X_train, y_train)
# #scores = cross_val_score(clf, X_train, y_train, cv=5)
# #print(scores.mean())
# y_pred = clf.predict(X_test)
# for i in range(np.shape(y_test))
        
Esempio n. 27
0
def gen_sub_by_para():
    """Train the XGBoost age classifier and store its outputs for ensembling.

    Rows of the stable feature table with a non-null 'sex' are used for
    training, rows with a null 'sex' are the ones to predict. The class
    probabilities for both groups, plus the encoded training labels, are
    handed to ``save_result_for_ensemble``.
    """
    # Must stay the first statement so it captures the (empty) argument set.
    args = locals()
    logger.debug(f'Run train dnn:{args}')
    feature_label = get_stable_feature('1011')

    non_feature_cols = ['sex', 'age', 'sex_age', 'device']

    train = feature_label[feature_label['sex'].notnull()]
    test = feature_label[feature_label['sex'].isnull()]

    train_features = train.drop(non_feature_cols, axis=1)
    Y_CAT = pd.Categorical(train['age'])
    X_train, X_test, y_train, y_test = train_test_split(
        train_features, Y_CAT.codes)

    model_kwargs = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': 22,
        'max_depth': 3,
        'reg_alpha': 10,
        'reg_lambda': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.6,
        'n_estimators': 20000,
        'learning_rate': 0.01,
        'seed': 1,
        'missing': None,
        # The remaining entries were labelled "useless" by the author.
        'silent': True,
        'gamma': 0,
        'max_delta_step': 0,
        'min_child_weight': 1,
        'colsample_bylevel': 1,
        'scale_pos_weight': 1,
    }
    gbm = XGBClassifier(**model_kwargs, **gpu_params)

    watch_list = [(X_train, y_train), (X_test, y_test)]
    gbm.fit(X_train,
            y_train,
            eval_set=watch_list,
            early_stopping_rounds=100,
            verbose=True)

    # Best round (1-based) and score on the held-out split ('validation_1').
    holdout_loss = np.array(gbm.evals_result()['validation_1']['mlogloss'])
    best_epoch = holdout_loss.argmin() + 1
    best_score = holdout_loss.min()

    pre_x = test.drop(non_feature_cols, axis=1)

    # The CSV submission export that used to live here is disabled.

    print_imp_list(X_train, gbm)

    # Save result for ensemble
    train_proba = gbm.predict_proba(train_features)
    train_bk = pd.DataFrame(train_proba,
                            index=train.device,
                            columns=Y_CAT.categories)

    test_proba = gbm.predict_proba(pre_x)
    test_bk = pd.DataFrame(test_proba,
                           index=test.device,
                           columns=Y_CAT.categories)

    label_bk = pd.DataFrame({'label': Y_CAT.codes}, index=train.device)

    save_result_for_ensemble(
        f'{best_score}_{best_epoch}_xgb_age_{args}',
        train=train_bk,
        test=test_bk,
        label=label_bk,
    )
Esempio n. 28
0
y = datasets.target

# Hold out 20% of the samples as a test split; shuffled with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)

#2. Modeling
model = XGBClassifier(n_estimators=100, learning_rate=0.01, n_jobs=-1)

#3. Training
# Both splits are passed as eval sets so that 'merror' and 'mlogloss' are
# recorded for each of them.
model.fit(x_train,
          y_train,
          verbose=1,
          eval_metric=['merror', 'mlogloss'],
          eval_set=[(x_train, y_train), (x_test, y_test)])

#4. Evaluation
result1 = model.score(x_test, y_test)
print("result1 : ", result1)

y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("acc : ", acc)

# Metric history recorded during fit for each eval set.
result2 = model.evals_result()
print("result2 : ", result2)

# Output recorded by the author:
# result1 :  0.9722222222222222
# acc :  0.9722222222222222
Esempio n. 29
0
def modeling():
    """Train two fixed-parameter XGBoost classifiers and return them.

    Reads the first 10000 rows of ``./train.csv``, drops zero-weight rows,
    derives the binary ``action`` label (1 when ``weight * resp > 0``) and
    fits two models on the ``feature*`` columns, printing the hold-out
    accuracy of each and the total training time.

    :return: tuple ``(model1, model3)`` of fitted ``XGBClassifier`` objects.
    """
    print("开始建模")
    train = pd.read_csv("./train.csv", nrows=10000)

    # .copy() so that adding the label column does not write into a view of
    # the original frame.
    train = train[train['weight'] != 0].copy()
    train['action'] = ((train['weight'].values * train['resp'].values) >
                       0).astype('int')

    X_train = train.loc[:, train.columns.str.contains('feature')]
    y_train = train.loc[:, 'action']

    X_train, X_test, y_train, y_test = train_test_split(X_train,
                                                        y_train,
                                                        random_state=666,
                                                        test_size=0.2)

    del train

    # Encode missing values the same way in both splits; previously only the
    # training split was filled, so the model saw NaN only at test time.
    X_train = X_train.fillna(-999)
    X_test = X_test.fillna(-999)
    tm = "auto"

    # NOTE(review): the sampler and the two helpers below form a
    # hyper-parameter search scaffold that is never run in this function.
    sampler = TPESampler(seed=666)

    def create_model(trial):
        """Build a classifier from hyper-parameters suggested by ``trial``."""
        max_depth = trial.suggest_int("max_depth", 2, 12)
        n_estimators = trial.suggest_int("n_estimators", 2, 600)
        learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
        subsample = trial.suggest_uniform('subsample', 0.0001, 1.0)
        colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.0000001,
                                                 1)
        model = XGBClassifier(n_estimators=n_estimators,
                              max_depth=max_depth,
                              learning_rate=learning_rate,
                              subsample=subsample,
                              colsample_bytree=colsample_bytree,
                              random_state=666,
                              tree_method=tm,
                              silent=1)

        return model

    def objective(trial):
        """Return the training-set accuracy of a trial's model."""
        model = create_model(trial)
        model.fit(X_train, y_train)
        score = accuracy_score(y_train, model.predict(X_train))
        return score

    def fit_and_report(params, title):
        """Fit one classifier and print its accuracy on the hold-out split."""
        model = XGBClassifier(**params)
        # A single fit: the original trained every model twice in a row, and
        # the second fit discarded the first.
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  eval_metric='auc',
                  verbose=False)
        print(title)
        y_pred = model.predict(X_test)
        print("Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred))
        return model

    params1 = {
        'max_depth': 8,
        'n_estimators': 500,
        'learning_rate': 0.01,
        'subsample': 0.9,
        'tree_method': tm,
        'random_state': 666
    }

    params3 = {
        'max_depth': 10,
        'n_estimators': 500,
        'learning_rate': 0.03,
        'subsample': 0.9,
        'colsample_bytree': 0.7,
        'tree_method': tm,
        'random_state': 666
    }

    start_time = time.time()
    model1 = fit_and_report(params1, "模型1评分")
    model3 = fit_and_report(params3, "模型3评分")
    end_time = time.time()
    print("建模时间:%.2f秒" % (end_time - start_time))

    return (model1, model3)
Esempio n. 30
0
class Classifier:
    """Collect samples one at a time and train/evaluate an ``XGBClassifier``.

    Training and test samples are accumulated in Python lists; they are
    converted to numpy arrays when the model is trained or used, and back to
    lists when more samples are added afterwards.
    """

    # Change method to gpu_hist if you want xgboost to run on a GPU
    def __init__(self, params=None):
        """Create empty train/test buffers and the model.

        :param params: keyword arguments for ``XGBClassifier``. Defaults to
            ``{'objective': 'reg:squarederror', 'verbosity': 0}``.
        """
        # None sentinel instead of a dict literal in the signature, so the
        # default is not one object shared by every call.
        if params is None:
            params = {'objective': 'reg:squarederror', 'verbosity': 0}
        self.X_train = []
        self.X_labels = []
        self.test = []
        self.test_labels = []
        self.model = XGBClassifier(**params)
        self.prediction = 0
        self.error = 0

    @staticmethod
    def _as_list(values):
        """Return ``values`` as a list, converting numpy arrays."""
        if isinstance(values, np.ndarray):
            return values.tolist()
        return values

    def size(self):
        """Return the number of training samples collected so far."""
        # len() counts samples for both lists and arrays; ndarray.size would
        # count every feature value once the data has been converted.
        return len(self.X_train)

    # adding the data points
    def input_train(self, features, feature):
        """Append one training sample.

        :param features: feature vector of the sample.
        :param feature: label of the sample.
        """
        # train() leaves arrays behind (possibly empty ones); switch back to
        # lists so that append works.
        self.X_train = self._as_list(self.X_train)
        self.X_labels = self._as_list(self.X_labels)
        self.X_train.append(features)
        self.X_labels.append(feature)

    # train the data
    def train(self):
        """Fit the model on all collected training samples."""
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        self.model.fit(self.X_train, self.X_labels)

    def train_eval(self, metric='error'):
        """Fit on a random 67/33 split of the training data and score it.

        :param metric: evaluation metric passed to ``fit``.
        :return: for ``'error'``, one minus the error averaged over all
            boosting rounds and over both eval sets; for any other metric,
            a list with the last-round value for each eval set (train
            first, then the held-out part).
        """
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        X_train, X_test, y_train, y_test = train_test_split(self.X_train,
                                                            self.X_labels,
                                                            test_size=0.33)
        self.model.fit(X_train,
                       y_train,
                       eval_set=[(X_train, y_train), (X_test, y_test)],
                       eval_metric=metric)
        histories = [
            per_set.get(metric)
            for per_set in self.model.evals_result().values()
        ]
        if metric == 'error':
            mean_errors = [sum(h) / len(h) for h in histories]
            return 1 - (sum(mean_errors) / len(mean_errors))
        return [h[-1] for h in histories]

    # input test labels if you want to check accuracy
    def label(self, label):
        """Append one ground-truth label for the test samples."""
        # check_error() converts the labels to an array; convert back so that
        # labels can still be added afterwards.
        self.test_labels = self._as_list(self.test_labels)
        self.test_labels.append(label)

    def input_test(self, features):
        """Append one test sample."""
        self.test = self._as_list(self.test)
        self.test.append(features)

    # test data
    def predict(self):
        """Predict for the collected test samples and return the result."""
        if not isinstance(self.test, np.ndarray):
            self.test = np.asarray(self.test)
        self.prediction = self.model.predict(self.test)
        return self.prediction

    def predict_proba(self):
        """Predict probabilities for the collected test samples."""
        if not isinstance(self.test, np.ndarray):
            self.test = np.asarray(self.test)
        self.prediction = self.model.predict_proba(self.test)
        return self.prediction

    # if you have the test labels you can check the error rate (you want error close to 0)
    def check_error(self):
        """Return the mean absolute error of the last prediction."""
        self.test_labels = np.asarray(self.test_labels)
        self.error = metrics.mean_absolute_error(self.test_labels,
                                                 self.prediction)
        return self.error

    # save classifier
    def save_classifier(self, file):
        """Save the model to ``file``."""
        self.model.save_model(file)

    # open saved classifier
    def open_classifier(self, file):
        """Load a previously saved model from ``file``."""
        self.model.load_model(file)

    # removes all training data
    def clean_train(self):
        """Discard all collected training samples and labels."""
        self.X_train = []
        self.X_labels = []

    # removes all testing data
    def clean_test(self):
        """Discard all collected test samples and labels."""
        self.test = []
        self.test_labels = []
Esempio n. 31
0
    y_train,  # labels (Y=1 signal, Y=0 background)
    sample_weight=w_train,  # instance weights
    eval_set=[
        (x_train, y_train), (x_val, y_val)
    ],  # a list of (X,y) tuple pairs to use as validation sets ---> validation_0=train, validation_1=validation
    sample_weight_eval_set=[
        w_train, w_val
    ],  # list of arrays storing instances weights for the i-th validation set
    eval_metric=[
        'auc', 'error'
    ],  # list of parameters under eval_metric: https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
    early_stopping_rounds=
    300,  # validation metric needs to improve at least once in every early_stopping_rounds round(s)
    verbose=100)

results = model.evals_result()  # takes the results from the BDT training above
n_estimators = len(results['validation_0']
                   ['error'])  # number of rounds used for the BDT training
auc_train = results['validation_0']['auc']  # subsample: auc for training
auc_val = results['validation_1']['auc']  # subsample: auc for validation
error_train = results['validation_0']['error']  # subsample: error for training
error_val = results['validation_1']['error']  # subsample: error for validation

plt.figure(figsize=(15, 5))

# --- plot auc for training and validation
plt.subplot(121)
plt.plot(range(0, n_estimators), auc_train, c='blue', label='train')
plt.plot(range(0, n_estimators), auc_val, c='orange', label='validation')
ymin = min(min(auc_train), min(auc_val))
ymax = max(max(auc_train), max(auc_val))