Example no. 1
def test_early_stopping_classification(
    data, scoring, validation_fraction, early_stopping, n_iter_no_change, tol
):

    max_iter = 50

    X, y = data

    gb = HistGradientBoostingClassifier(
        verbose=1,  # just for coverage
        min_samples_leaf=5,  # easier to overfit fast
        scoring=scoring,
        tol=tol,
        early_stopping=early_stopping,
        validation_fraction=validation_fraction,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0,
    )
    gb.fit(X, y)

    if early_stopping is True:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
    else:
        assert gb.n_iter_ == max_iter
Example no. 2
def test_binning_train_validation_are_separated():
    # Make sure training and validation data are binned separately.
    # See issue 13926

    rng = np.random.RandomState(0)
    validation_fraction = .2
    gb = HistGradientBoostingClassifier(
        n_iter_no_change=5,
        validation_fraction=validation_fraction,
        random_state=rng)
    gb.fit(X_classification, y_classification)
    mapper_training_data = gb.bin_mapper_

    # Note that since the data is small there is no subsampling and the
    # random_state doesn't matter
    mapper_whole_data = _BinMapper(random_state=0)
    mapper_whole_data.fit(X_classification)

    n_samples = X_classification.shape[0]
    assert np.all(
        mapper_training_data.actual_n_bins_ == int((1 - validation_fraction) *
                                                   n_samples))
    assert np.all(
        mapper_training_data.actual_n_bins_ != mapper_whole_data.actual_n_bins_
    )
Example no. 3
def hist_gradient_boosting_classifier(x_train, y_train, x_test, y_test):
    from sklearn.experimental import enable_hist_gradient_boosting
    from sklearn.ensemble import HistGradientBoostingClassifier
    ensem = HistGradientBoostingClassifier()
    ensem.fit(x_train, y_train)
    value = ensem.score(x_test, y_test)
    return "{0:.2f}".format(value)
def test_invalid_classification_loss():
    binary_clf = HistGradientBoostingClassifier(loss="binary_crossentropy")
    err_msg = ("loss='binary_crossentropy' is not defined for multiclass "
               "classification with n_classes=3, use "
               "loss='categorical_crossentropy' instead")
    with pytest.raises(ValueError, match=err_msg):
        binary_clf.fit(np.zeros(shape=(3, 2)), np.arange(3))
def test_zero_sample_weights_classification():
    # Make sure setting a SW to zero amounts to ignoring the corresponding
    # sample

    X = [[1, 0],
         [1, 0],
         [1, 0],
         [0, 1]]
    y = [0, 0, 1, 0]
    # ignore the first 2 training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1]
    gb = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                        min_samples_leaf=1)
    gb.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(gb.predict([[1, 0]]), [1])

    X = [[1, 0],
         [1, 0],
         [1, 0],
         [0, 1],
         [1, 1]]
    y = [0, 0, 1, 0, 2]
    # ignore the first 2 training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1, 1]
    gb = HistGradientBoostingClassifier(loss='categorical_crossentropy',
                                        min_samples_leaf=1)
    gb.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(gb.predict([[1, 0]]), [1])
Example no. 7
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=2,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="binary_crossentropy",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
Example no. 8
def test_string_target_early_stopping(scoring):
    # Regression test for #14709 where the targets need to be encoded
    # before computing the score
    rng = np.random.RandomState(42)
    X = rng.randn(100, 10)
    y = np.array(["x"] * 50 + ["y"] * 50, dtype=object)
    gbrt = HistGradientBoostingClassifier(n_iter_no_change=10, scoring=scoring)
    gbrt.fit(X, y)
Example no. 9
def histGradientModel(X_train,Y_train):
    # use Hist Gradient Boosting Classifier
    from sklearn.experimental import enable_hist_gradient_boosting
    from sklearn.ensemble import HistGradientBoostingClassifier
    histgrad=HistGradientBoostingClassifier()
    histgrad.fit(X_train,Y_train)
    print('\nHist Gradient Boosting Training Score:',histgrad.score(X_train,Y_train))
    return histgrad,histgrad.score(X_train,Y_train)
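Note on the experimental import: both Example no. 3 and Example no. 9 import sklearn.experimental.enable_hist_gradient_boosting before using the estimator. That import was only required while the estimator was experimental; since scikit-learn 1.0 it is stable and can be imported directly. A minimal sketch under that assumption (dataset and names chosen here for illustration):

# Minimal sketch, assuming scikit-learn >= 1.0 (no experimental import needed).
from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = HistGradientBoostingClassifier().fit(X_train, y_train)
print("test accuracy: {0:.2f}".format(clf.score(X_test, y_test)))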
Example no. 10
    def common_test_model_hgb_classifier(self, add_nan=False, n_classes=2):
        model = HistGradientBoostingClassifier(max_iter=5, max_depth=2)
        X, y = make_classification(n_features=10,
                                   n_samples=1000,
                                   n_informative=4,
                                   n_classes=n_classes,
                                   random_state=42)
        if add_nan:
            rows = numpy.random.randint(0, X.shape[0] - 1, X.shape[0] // 3)
            cols = numpy.random.randint(0, X.shape[1] - 1, X.shape[0] // 3)
            X[rows, cols] = numpy.nan

        X_train, X_test, y_train, _ = train_test_split(X,
                                                       y,
                                                       test_size=0.5,
                                                       random_state=42)
        model.fit(X_train, y_train)

        model_onnx = convert_sklearn(
            model, "unused", [("input", FloatTensorType([None, X.shape[1]]))])
        self.assertIsNotNone(model_onnx)
        X_test = X_test.astype(numpy.float32)[:5]

        dump_data_and_model(X_test,
                            model,
                            model_onnx,
                            basename="SklearnHGBClassifier%s%d" %
                            ("nan" if add_nan else '', n_classes),
                            verbose=False,
                            allow_failure="StrictVersion(onnx.__version__)"
                            " < StrictVersion('1.2') or "
                            "StrictVersion(onnxruntime.__version__)"
                            " <= StrictVersion('0.2.1')")

        if n_classes == 2:
            model_onnx = convert_sklearn(
                model,
                "unused", [("input", FloatTensorType([None, X.shape[1]]))],
                options={model.__class__: {
                    'raw_scores': True
                }})
            self.assertIsNotNone(model_onnx)
            X_test = X_test.astype(numpy.float32)[:5]

            # There is a bug in onnxruntime <= 1.1.0.
            # Raw scores are always positive.
            dump_data_and_model(
                X_test,
                model,
                model_onnx,
                basename="SklearnHGBClassifierRaw%s%d" %
                ("nan" if add_nan else '', n_classes),
                verbose=False,
                allow_failure="StrictVersion(onnx.__version__)"
                " < StrictVersion('1.2') or "
                "StrictVersion(onnxruntime.__version__)"
                " < StrictVersion('1.2.0')",
                methods=['predict', 'decision_function_binary'])
def test_crossentropy_binary_problem():
    # categorical_crossentropy should only be used if there are more than two
    # classes present. PR #14869
    X = [[1], [0]]
    y = [0, 1]
    gbrt = HistGradientBoostingClassifier(loss='categorical_crossentropy')
    with pytest.raises(ValueError,
                       match="'categorical_crossentropy' is not suitable for"):
        gbrt.fit(X, y)
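The loss names checked above ('binary_crossentropy' and 'categorical_crossentropy') belong to older scikit-learn releases; newer releases (1.1 and later) provide a single 'log_loss' option that covers both the binary and the multiclass case, and the crossentropy names were later removed. A minimal sketch under that assumption:

# Minimal sketch, assuming scikit-learn >= 1.1 where loss="log_loss" replaces
# the binary/categorical crossentropy names used in the tests above.
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier

X = np.random.RandomState(0).randn(30, 2)
y_binary = np.arange(30) % 2       # two classes
y_multiclass = np.arange(30) % 3   # three classes

HistGradientBoostingClassifier(loss="log_loss", max_iter=10).fit(X, y_binary)
HistGradientBoostingClassifier(loss="log_loss", max_iter=10).fit(X, y_multiclass)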
Example no. 12
def model_(X, y, rs):
    osp = RandomUnderSampler(random_state=rs)
    x_train_, y_train_ = osp.fit_sample(X, y)
    # # Base model
    # clf = CatBoostClassifier(loss_function='Logloss',
    #                          logging_level='Silent',
    #                          cat_features=categorical_features_indices)
    clf = HistGradientBoostingClassifier(random_state=10)
    clf.fit(x_train_, y_train_)
    return clf
def k_fold_trainning(rawdata,n_folds=5):

    cv = StratifiedKFold(n_splits=n_folds,shuffle=True)
    target = np.array(rawdata[0].values)
    lure = np.array(rawdata[1].values)
    y = np.array(rawdata['label'].values)

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    fig, ax = plt.subplots()
    for i, (train, test) in enumerate(cv.split(target, lure, y)):
        print('----------------Training Fold %d---------------'%(i+1))
        X_train = pd.DataFrame({0:target[train],1:lure[train]})
        X_test = pd.DataFrame({0:target[test],1:lure[test]})
        pmfm = create_pmfm(X_train,y[train])
        train_feature = X_train.apply(feature_Encoding, axis = 1, args = (0,1,pmfm)).values
        test_feature = X_test.apply(feature_Encoding, axis = 1, args = (0,1,pmfm)).values
        clf = HistGradientBoostingClassifier(max_iter=500, learning_rate=0.9, max_depth=6, l2_regularization=100)
        train_data = np.matrix([train_feature[i] for i in range(train_feature.shape[0])])
        test_data = np.matrix([test_feature[i] for i in range(test_feature.shape[0])])
        clf.fit(train_data, y[train])
        pred = clf.predict(test_data)
        evaluate(y[test], pred)
        viz = plot_roc_curve(clf, test_data, y[test],
                            name='ROC fold {}'.format(i+1),
                            alpha=0.5, lw=1, ax=ax)
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)
    
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Chance', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.3f $\pm$ %0.3f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')

    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
        title="Receiver operating characteristic Curve")
    ax.legend(loc="lower right")
    plt.savefig('roc.png',dpi=300)
def train_model(rawdata, compute_importance=True):
    X,y = create_Input(rawdata)
    pmfm = create_pmfm(X,y)
    np.save("feature.npy",pmfm)
    feature = X.apply(feature_Encoding, axis = 1, args = (0,1,pmfm)).values
    clf = HistGradientBoostingClassifier(max_iter=500, learning_rate=0.9, max_depth=6, l2_regularization=100)
    data = np.matrix([feature[i] for i in range(feature.shape[0])])
    clf.fit(data, y)
    joblib.dump(clf, "train_model.pkl")
    if compute_importance:
        feature_importance(clf, data, y)
def test_zero_division_hessians(data):
    # Non-regression test for issue #14018.
    # Make sure we avoid zero division errors when computing the leaf values.

    # If the learning rate is too high, the raw predictions are bad and will
    # saturate the softmax (or sigmoid in binary classif). This leads to
    # probabilities being exactly 0 or 1, gradients being constant, and
    # hessians being zero.
    X, y = data
    gb = HistGradientBoostingClassifier(learning_rate=100, max_iter=10)
    gb.fit(X, y)
def test_early_stopping_on_test_set_with_warm_start():
    # Non regression test for #16661 where second fit fails with
    # warm_start=True, early_stopping is on, and no validation set
    X, y = make_classification(random_state=0)
    gb = HistGradientBoostingClassifier(
        max_iter=1, scoring='loss', warm_start=True, early_stopping=True,
        n_iter_no_change=1, validation_fraction=None)

    gb.fit(X, y)
    # does not raise on second call
    gb.set_params(max_iter=2)
    gb.fit(X, y)
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                               n_informative=5, n_redundant=0, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
Example no. 18
    def common_test_model_hgb_classifier(self, add_nan=False, n_classes=2):
        model = HistGradientBoostingClassifier(max_iter=5, max_depth=2)
        X, y = make_classification(n_features=10,
                                   n_samples=1000,
                                   n_informative=4,
                                   n_classes=n_classes,
                                   random_state=42)
        if add_nan:
            rows = numpy.random.randint(0, X.shape[0] - 1, X.shape[0] // 3)
            cols = numpy.random.randint(0, X.shape[1] - 1, X.shape[0] // 3)
            X[rows, cols] = numpy.nan

        X_train, X_test, y_train, _ = train_test_split(X,
                                                       y,
                                                       test_size=0.5,
                                                       random_state=42)
        model.fit(X_train, y_train)

        if n_classes == 2:
            model_onnx = convert_sklearn(
                model,
                "unused", [("input", FloatTensorType([None, X.shape[1]]))],
                options={model.__class__: {
                    'raw_scores': True
                }})
            self.assertIsNotNone(model_onnx)
            X_test = X_test.astype(numpy.float32)[:5]

            dump_data_and_model(
                X_test,
                model,
                model_onnx,
                basename="SklearnHGBClassifierRaw%s%d" %
                ("nan" if add_nan else '', n_classes),
                verbose=False,
                intermediate_steps=True,
                methods=['predict', 'decision_function_binary'],
                backend=['python'])

        model_onnx = convert_sklearn(
            model, "unused", [("input", FloatTensorType([None, X.shape[1]]))])
        self.assertIsNotNone(model_onnx)
        X_test = X_test.astype(numpy.float32)[:5]

        dump_data_and_model(X_test,
                            model,
                            model_onnx,
                            basename="SklearnHGBClassifier%s%d" %
                            ("nan" if add_nan else '', n_classes),
                            verbose=False)
Example no. 19
def test_missing_values_trivial():
    # sanity check for missing values support. With only one feature and
    # y == isnan(X), the gbdt is supposed to reach perfect accuracy on the
    # training set.

    n_samples = 100
    n_features = 1
    rng = np.random.RandomState(0)

    X = rng.normal(size=(n_samples, n_features))
    mask = rng.binomial(1, 0.5, size=X.shape).astype(bool)
    X[mask] = np.nan
    y = mask.ravel()
    gb = HistGradientBoostingClassifier()
    gb.fit(X, y)

    assert gb.score(X, y) == pytest.approx(1)
Example no. 20
def k_fold_cross_val(Xs, y_var, k=10):
    clf = tree.DecisionTreeClassifier()
    clf_forest = RandomForestClassifier(n_estimators=10)
    clf_boost = HistGradientBoostingClassifier()

    num_folds = k
    N = Xs.shape[0]
    test_size = int(N / num_folds)

    test_idxs = np.random.permutation(N)[:num_folds * test_size].reshape(
        num_folds, test_size)

    total_score = np.asarray([0., 0., 0.])
    total_F1_score = np.asarray([0., 0., 0.])

    for i in range(num_folds):
        print("Iteration " + str(i) + ":")
        test_i = Xs.index.isin(test_idxs[i])
        df_train, df_test = Xs[~test_i], Xs[test_i]
        y_train, y_test = y_var[~test_i], y_var[test_i]

        clf = clf.fit(df_train.to_numpy(), y_train.to_numpy().ravel())
        score_b = clf.score(df_test.to_numpy(), y_test.to_numpy().ravel())

        clf_forest = clf_forest.fit(df_train.to_numpy(),
                                    y_train.to_numpy().ravel())
        score_f = clf_forest.score(df_test.to_numpy(),
                                   y_test.to_numpy().ravel())

        clf_boost = clf_boost.fit(df_train.to_numpy(),
                                  y_train.to_numpy().ravel())
        score_h = clf_boost.score(df_test.to_numpy(),
                                  y_test.to_numpy().ravel())

        y_hat = clf.predict(df_test.to_numpy())
        f1_b = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (tree):", f1_b)

        y_hat = clf_forest.predict(df_test.to_numpy())
        f1_f = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (forest):", f1_f)

        y_hat = clf_boost.predict(df_test.to_numpy())
        f1_boost = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (boost):", f1_boost)

        print("Prediction scores for (tree,forest,boost):", score_b, score_f,
              score_h)
        total_score += np.asarray([score_b, score_f, score_h])
        total_F1_score += np.asarray([f1_b, f1_f, f1_boost])

    print("Avg. accuracy scores for (tree,forest,boost):",
          total_score / num_folds)
    print("Avg. F1 scores for (tree,forest,boost):",
          total_F1_score / num_folds)

    return clf, clf_forest, clf_boost
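The fold loop in Example no. 20 can also be expressed with scikit-learn's built-in cross-validation, at least for the boosted model; a compact sketch is shown below (it assumes Xs is a DataFrame and y_var a Series/DataFrame as in the function above; the helper name is illustrative):

# Compact alternative to the manual fold loop, for the boosting model only.
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_validate

def k_fold_cross_val_builtin(Xs, y_var, k=10):
    clf_boost = HistGradientBoostingClassifier()
    scores = cross_validate(clf_boost, Xs.to_numpy(), y_var.to_numpy().ravel(),
                            cv=k, scoring=("accuracy", "f1"))
    print("Avg. accuracy (boost):", scores["test_accuracy"].mean())
    print("Avg. F1 (boost):", scores["test_f1"].mean())
    return scores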
Example no. 21
def test_categorical_encoding_strategies():
    # Check native categorical handling vs different encoding strategies. We
    # make sure that native encoding needs only 1 split to achieve a perfect
    # prediction on a simple dataset. In contrast, OneHotEncoded data needs
    # more depth / splits, and treating categories as ordered (just using
    # OrdinalEncoder) requires even more depth.

    # dataset with one random continuous feature, and one categorical feature
    # with values in [0, 5], e.g. from an OrdinalEncoder.
    # class == 1 iff categorical value in {0, 2, 4}
    rng = np.random.RandomState(0)
    n_samples = 10_000
    f1 = rng.rand(n_samples)
    f2 = rng.randint(6, size=n_samples)
    X = np.c_[f1, f2]
    y = np.zeros(shape=n_samples)
    y[X[:, 1] % 2 == 0] = 1

    # make sure dataset is balanced so that the baseline_prediction doesn't
    # influence predictions too much with max_iter = 1
    assert 0.49 < y.mean() < 0.51

    clf_cat = HistGradientBoostingClassifier(
        max_iter=1, max_depth=1, categorical_features=[False, True]
    )

    # Using native categorical encoding, we get perfect predictions with just
    # one split
    assert cross_val_score(clf_cat, X, y).mean() == 1

    # quick sanity check for the bitset: 0, 2, 4 = 2**0 + 2**2 + 2**4 = 21
    expected_left_bitset = [21, 0, 0, 0, 0, 0, 0, 0]
    left_bitset = clf_cat.fit(X, y)._predictors[0][0].raw_left_cat_bitsets[0]
    assert_array_equal(left_bitset, expected_left_bitset)

    # Treating categories as ordered, we need more depth / more splits to get
    # the same predictions
    clf_no_cat = HistGradientBoostingClassifier(
        max_iter=1, max_depth=4, categorical_features=None
    )
    assert cross_val_score(clf_no_cat, X, y).mean() < 0.9

    clf_no_cat.set_params(max_depth=5)
    assert cross_val_score(clf_no_cat, X, y).mean() == 1

    # Using OHEd data, we need fewer splits than with pure OEd data, but we
    # still need more splits than with the native categorical splits
    ct = make_column_transformer(
        (OneHotEncoder(sparse=False), [1]), remainder="passthrough"
    )
    X_ohe = ct.fit_transform(X)
    clf_no_cat.set_params(max_depth=2)
    assert cross_val_score(clf_no_cat, X_ohe, y).mean() < 0.9

    clf_no_cat.set_params(max_depth=3)
    assert cross_val_score(clf_no_cat, X_ohe, y).mean() == 1
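A one-line check of the bitset arithmetic mentioned in the test above (categories {0, 2, 4} set bits 0, 2 and 4 of the first bitset word):

# 2**0 + 2**2 + 2**4 == 21, matching expected_left_bitset[0] above.
assert sum(2 ** c for c in (0, 2, 4)) == 21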
Example no. 22
    def foo(Xs, Xg, ys, yg):

        # For each subset, make splits
        kf = KFold(n_splits=n_splits, random_state=0)
        # Store ITEs
        ite = []
        # For each split
        for idx1, idx2 in kf.split(Xs):
            # Init models
            ms = HistGradientBoostingRegressor()
            mg = HistGradientBoostingClassifier()
            # Train models
            ms.fit(Xs[idx1], ys[idx1])
            mg.fit(Xg[idx1], yg[idx1])
            # Make estimates on test set
            ite.append(
                AIPW_estimator(ms, mg, Xs[idx2], Xg[idx2], ys[idx2], yg[idx2]))
        # Return mean ite and n_employees
        return np.concatenate(ite).mean(), len(Xs)
Example no. 23
def gradient_boost(train_data, test_data):
    train_y = train_data['state']
    train_X = train_data.iloc[:, FEATURES_INDICES]

    test_y = test_data['state']
    test_X = test_data.iloc[:, FEATURES_INDICES]

    #search(train_X, train_y)
    #search_xgboost(train_X, train_y)
    gd = HistGradientBoostingClassifier(loss='auto',
                                        max_bins=200,
                                        max_depth=10,
                                        max_leaf_nodes=35)

    #gd = XGBClassifier()
    gd.fit(train_X, train_y)

    pred_y = gd.predict(test_X)
    evaluate(gd, test_X, test_y, pred_y)
Example no. 24
def main():
    # loading the dataset from sklearn.datasets
    df_cancer = load_breast_cancer()
    print(df_cancer.keys())
    X = df_cancer.data
    y = df_cancer.target
    print("number of classes are: ", np.unique(y))
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)
    # create a HistGradientBoostingClassifier object
    hist = HistGradientBoostingClassifier()
    # training the model
    hist.fit(X_train, y_train)
    y_pred = hist.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("accuracy of the model is: ", accuracy)
    clr = classification_report(y_test, y_pred)
    print("Classification report is:", clr)
Example no. 25
    def fit(
        self,
        params,
        X_train,
        y_train,
        Xy_val,
        sample_weight,
        n_estimators=None,
        seed=None,
    ):
        # Xy_val not used
        if seed is not None:
            params.update({"random_state": seed})
        if n_estimators is not None:
            params.update({"max_iter": n_estimators})
        clf = HistGradientBoostingClassifier(
            **params,
            categorical_features=self.categorical_features,
            early_stopping="auto")
        clf.fit(X_train, y_train, sample_weight=sample_weight)
        return clf, None
def test_on_target(rawdata, sitename):
    print('------------Testing on %s-----------' % sitename)
    target_info = pd.read_csv("target_info.csv")
    if sitename in target_info['Site'].values:
        target_dict = target_info.set_index('Site').T.to_dict()
        sequence = target_dict[sitename]['Sequence']
        train_data = rawdata[rawdata[0]!=sequence]
        test_data = rawdata[rawdata[0]==sequence]
        X_train, y_train = create_Input(train_data)
        X_test, y_test = create_Input(test_data)
        pmfm = create_pmfm(X_train,y_train)
        train_feature = X_train.apply(feature_Encoding, axis = 1, args = (0,1,pmfm)).values
        test_feature = X_test.apply(feature_Encoding, axis = 1, args = (0,1,pmfm)).values
        clf = HistGradientBoostingClassifier(max_iter=500, learning_rate=0.9, max_depth=6, l2_regularization=100)
        train_matrix = np.matrix([train_feature[i] for i in range(train_feature.shape[0])])
        test_matrix = np.matrix([test_feature[i] for i in range(test_feature.shape[0])])
        clf.fit(train_matrix, y_train)
        pred = clf.predict(test_matrix)
        evaluate(y_test, pred)
    else:
        print('ERROR: INCORRECT SITE NAME')
Example no. 27
class MyHGBClassifierModel(BaseModel):
    def __init__(self, model_params, fit_params: Optional[Dict]):
        self.model_params = model_params
        self.fit_params = fit_params
        if self.fit_params is None:
            self.fit_params = {}

    def build_model(self):
        self.model = HistGradientBoostingClassifier(**self.model_params)
        return self.model

    def fit(self, train_x, train_y, valid_x=None, valid_y=None):
        self.model = self.build_model()
        self.model.fit(
            train_x, train_y,
            **self.fit_params
        )
        return self.model

    def predict(self, est, valid_x):
        preds = est.predict_proba(valid_x)[:, 1]
        return preds
def test_early_stopping_classification(data, scoring, validation_fraction,
                                       n_iter_no_change, tol):

    max_iter = 50

    X, y = data

    gb = HistGradientBoostingClassifier(
        verbose=1,  # just for coverage
        min_samples_leaf=5,  # easier to overfit fast
        scoring=scoring,
        tol=tol,
        validation_fraction=validation_fraction,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0
    )
    gb.fit(X, y)

    if n_iter_no_change is not None:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
    else:
        assert gb.n_iter_ == max_iter
def test_binning_train_validation_are_separated():
    # Make sure training and validation data are binned separately.
    # See issue 13926

    rng = np.random.RandomState(0)
    validation_fraction = .2
    gb = HistGradientBoostingClassifier(
        n_iter_no_change=5,
        validation_fraction=validation_fraction,
        random_state=rng
    )
    gb.fit(X_classification, y_classification)
    mapper_training_data = gb.bin_mapper_

    # Note that since the data is small there is no subsampling and the
    # random_state doesn't matter
    mapper_whole_data = _BinMapper(random_state=0)
    mapper_whole_data.fit(X_classification)

    n_samples = X_classification.shape[0]
    assert np.all(mapper_training_data.actual_n_bins_ ==
                  int((1 - validation_fraction) * n_samples))
    assert np.all(mapper_training_data.actual_n_bins_ !=
                  mapper_whole_data.actual_n_bins_)
Example no. 30
    def common_test_model_hgb_classifier(self, add_nan=False, n_classes=2):
        model = HistGradientBoostingClassifier(max_iter=5, max_depth=2)
        X, y = make_classification(n_features=10,
                                   n_samples=1000,
                                   n_informative=4,
                                   n_classes=n_classes,
                                   random_state=42)
        if add_nan:
            rows = numpy.random.randint(0, X.shape[0] - 1, X.shape[0] // 3)
            cols = numpy.random.randint(0, X.shape[1] - 1, X.shape[0] // 3)
            X[rows, cols] = numpy.nan

        X_train, X_test, y_train, _ = train_test_split(X,
                                                       y,
                                                       test_size=0.5,
                                                       random_state=42)
        model.fit(X_train, y_train)

        model_onnx = convert_sklearn(
            model, "unused", [("input", FloatTensorType([None, X.shape[1]]))])
        self.assertIsNotNone(model_onnx)
        X_test = X_test.astype(numpy.float32)[:5]

        dump_data_and_model(X_test, model, model_onnx, folder=self.folder)
Example no. 31
def test_infinite_values_missing_values():
    # High level test making sure that inf and nan values are properly handled
    # when both are present. This is similar to
    # test_split_on_nan_with_infinite_values() in test_grower.py, though we
    # cannot check the predictions for binned values here.

    X = np.asarray([-np.inf, 0, 1, np.inf, np.nan]).reshape(-1, 1)
    y_isnan = np.isnan(X.ravel())
    y_isinf = X.ravel() == np.inf

    stump_clf = HistGradientBoostingClassifier(min_samples_leaf=1, max_iter=1,
                                               learning_rate=1, max_depth=2)

    assert stump_clf.fit(X, y_isinf).score(X, y_isinf) == 1
    assert stump_clf.fit(X, y_isnan).score(X, y_isnan) == 1
Example no. 32
File: p4.py Project: i72sijia/IMD
def clasificar_HistGradientBoostingClassifier(X, y, df, trainInputs,
                                              trainOutputs, testInputs,
                                              testOutputs, graphname):
    print("\n[" + str(graphname) + "]")
    scoreArray = np.array([])
    clf = HistGradientBoostingClassifier()
    scores = cross_val_score(clf, X, y, cv=10)
    clf = clf.fit(trainInputs, trainOutputs)
    precisionTrain = clf.score(trainInputs, trainOutputs)
    precisionTest = clf.score(testInputs, testOutputs)
    print("\tCCR train = %.2f%% | CCR test = %.2f%%" %
          (precisionTrain * 100, precisionTest * 100))
    prediccion_test = clf.predict(testInputs)
    print(prediccion_test)
    print(testOutputs)
    return precisionTest

if subsample is not None:
    data_train, target_train = data_train[:subsample], target_train[:subsample]

n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

print("Fitting a sklearn model...")
tic = time()
est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     learning_rate=lr,
                                     max_iter=n_trees,
                                     max_bins=max_bins,
                                     max_leaf_nodes=n_leaf_nodes,
                                     n_iter_no_change=None,
                                     random_state=0,
                                     verbose=1)
est.fit(data_train, target_train)
toc = time()
predicted_test = est.predict(data_test)
predicted_proba_test = est.predict_proba(data_test)
roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
acc = accuracy_score(target_test, predicted_test)
print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")

if args.lightgbm:
    print("Fitting a LightGBM model...")
    tic = time()
    lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
    lightgbm_est.fit(data_train, target_train)
    toc = time()
    predicted_test = lightgbm_est.predict(data_test)
    predicted_proba_test = lightgbm_est.predict_proba(data_test)
# ############################################################ BaggingClassifier
clf_bc = BaggingClassifier(base_estimator=SVC(),
                           n_estimators=10,
                           random_state=0)
clf_bc.fit(x_train, y_train)
bc_pred = clf_bc.predict(x_test)
bc_matrices = evaluate_preds(clf_bc, x_test, y_test, bc_pred)
# ################################################ ExtraTreesClassifier
clf_etc = ExtraTreesClassifier()
clf_etc.fit(x_train, y_train)
etc_pred = clf_etc.predict(x_test)
et_matrices = evaluate_preds(clf_etc, x_test, y_test, etc_pred)
# ############################################################
# ############################################################ HistGradientBoostingClassifier
clf_hgbc = HistGradientBoostingClassifier()
clf_hgbc.fit(x_train, y_train)
hgbc_pred = clf_hgbc.predict(x_test)
hgb_matrices = evaluate_preds(clf_hgbc, x_test, y_test, hgbc_pred)
# ############################################################
# ############################################################ LogisticRegression
clf_lr = LogisticRegression()
clf_lr.fit(x_train, y_train)
clf_pred = clf_lr.predict(x_test)
lr_matrices = evaluate_preds(clf_lr, x_test, y_test, clf_pred)
# ############################################################
# ############################################################ StackingClassifier
clf_sc = StackingClassifier(estimators=estimators,
                            final_estimator=LogisticRegression())
clf_sc.fit(x_train, y_train)
clf_pred = clf_sc.predict(x_test)
sc_matrices = evaluate_preds(clf_sc, x_test, y_test, clf_pred)
Example no. 35
multi_clf.fit(patient_train_data, train_labels.to_numpy()[:, 1:11])

res = 1 / (1 + np.exp(-multi_clf.decision_function(patient_test_data)))

task1_df = pd.DataFrame(data=res, columns=train_labels.columns[1:11])

task1_df.to_csv('subtask1.csv', index=False, header=True)

t2 = time.time()
print('subtask1, time taken: ', t2 - t1)
print(task1_df)

#subtask 2
t1 = time.time()
clf.fit(patient_train_data, train_labels.to_numpy()[:, 11])

res = 1 / (1 + np.exp(-clf.decision_function(patient_test_data)))

task2_df = pd.DataFrame(data=res, columns=[train_labels.columns[11]])

task2_df.to_csv('subtask2.csv', index=False, header=True)

t2 = time.time()
print('subtask2, time taken: ', t2 - t1)
print(task2_df)

#subtask 3
t1 = time.time()
reg = HistGradientBoostingRegressor(random_state=1510)