Code example #1
# Written against the pre-0.4 imbalanced-learn API, where the resampling
# parameter was still called "ratio" (later renamed "sampling_strategy").
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from imblearn.datasets import make_imbalance
from imblearn.ensemble import BalancedBaggingClassifier

iris = load_iris()


def test_bootstrap_samples():
    # Test that bootstrapping samples generate non-perfect base estimators.
    X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=0)

    base_estimator = DecisionTreeClassifier().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    # disable the resampling by passing an empty dictionary.
    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=False,
        n_estimators=10,
        ratio={},
        random_state=0).fit(X_train, y_train)

    assert (ensemble.score(X_train, y_train) ==
            base_estimator.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=True,
        random_state=0).fit(X_train, y_train)

    assert (ensemble.score(X_train, y_train) <
            base_estimator.score(X_train, y_train))
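
The example above relies on parameter names that newer imbalanced-learn releases have dropped. A minimal sketch of the same bootstrap check against the current API, assuming a release where "ratio" has become "sampling_strategy" and "base_estimator" has become "estimator":

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from imblearn.datasets import make_imbalance
from imblearn.ensemble import BalancedBaggingClassifier

iris = load_iris()
X, y = make_imbalance(iris.data, iris.target,
                      sampling_strategy={0: 20, 1: 25, 2: 50},
                      random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

tree = DecisionTreeClassifier().fit(X_train, y_train)

# An empty dict again disables the balancing step; without bootstrap every
# tree sees the full training set and scores perfectly on it.
ensemble = BalancedBaggingClassifier(estimator=DecisionTreeClassifier(),
                                     max_samples=1.0,
                                     bootstrap=False,
                                     n_estimators=10,
                                     sampling_strategy={},
                                     random_state=0).fit(X_train, y_train)
assert ensemble.score(X_train, y_train) == tree.score(X_train, y_train)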
Code example #3
# Uses the 0.4-era imbalanced-learn API ("sampling_strategy") together with
# assert_warns from sklearn.utils.testing, which recent scikit-learn releases
# no longer ship; see the pytest-based variant after this example.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.testing import assert_warns
from imblearn.datasets import make_imbalance
from imblearn.ensemble import BalancedBaggingClassifier

iris = load_iris()


def test_oob_score_classification():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    X, y = make_imbalance(iris.data,
                          iris.target,
                          sampling_strategy={
                              0: 20,
                              1: 25,
                              2: 50
                          },
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for base_estimator in [DecisionTreeClassifier(), SVC(gamma='scale')]:
        clf = BalancedBaggingClassifier(base_estimator=base_estimator,
                                        n_estimators=100,
                                        bootstrap=True,
                                        oob_score=True,
                                        random_state=0).fit(X_train, y_train)

        test_score = clf.score(X_test, y_test)

        assert abs(test_score - clf.oob_score_) < 0.1

        # Test with few estimators
        assert_warns(
            UserWarning,
            BalancedBaggingClassifier(base_estimator=base_estimator,
                                      n_estimators=1,
                                      bootstrap=True,
                                      oob_score=True,
                                      random_state=0).fit, X_train, y_train)
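
On scikit-learn releases that no longer provide assert_warns, the few-estimators check at the end of this example can be expressed with pytest instead. A minimal self-contained sketch, assuming an imbalanced-learn version that still accepts base_estimator:

import pytest
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from imblearn.datasets import make_imbalance
from imblearn.ensemble import BalancedBaggingClassifier


def test_oob_warning_with_few_estimators():
    # Rebuild the same imbalanced train split as above.
    iris = load_iris()
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, _, y_train, _ = train_test_split(X, y, random_state=0)

    # With a single bootstrapped estimator some samples never end up
    # out-of-bag, so fitting with oob_score=True should warn.
    clf = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                    n_estimators=1,
                                    bootstrap=True,
                                    oob_score=True,
                                    random_state=0)
    with pytest.warns(UserWarning):
        clf.fit(X_train, y_train)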
Code example #4
# Older variant of the same OOB test, written against the pre-0.4
# imbalanced-learn API ("ratio" instead of "sampling_strategy").
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.testing import assert_warns
from imblearn.datasets import make_imbalance
from imblearn.ensemble import BalancedBaggingClassifier

iris = load_iris()


def test_oob_score_classification():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=0)

    for base_estimator in [DecisionTreeClassifier(), SVC()]:
        clf = BalancedBaggingClassifier(
            base_estimator=base_estimator,
            n_estimators=100,
            bootstrap=True,
            oob_score=True,
            random_state=0).fit(X_train, y_train)

        test_score = clf.score(X_test, y_test)

        assert abs(test_score - clf.oob_score_) < 0.1

        # Test with few estimators
        assert_warns(UserWarning,
                     BalancedBaggingClassifier(
                         base_estimator=base_estimator,
                         n_estimators=1,
                         bootstrap=True,
                         oob_score=True,
                         random_state=0).fit,
                     X_train,
                     y_train)
Code example #5
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score
from imblearn.ensemble import BalancedBaggingClassifier

# Fragment of a larger script: the splits (x_train, y_train, x_val, y_val,
# data_features_test, data_labels_test, x_train_res, y_train_res) and the
# fitted random-forest baseline clf_rf are defined earlier in the original
# source and are assumed to exist here.

# Random-forest baseline: accuracy, recall and precision on the
# validation and test splits.
print('Validation Results')
print(clf_rf.score(x_val, y_val))
print(recall_score(y_val, clf_rf.predict(x_val)))
print(precision_score(y_val, clf_rf.predict(x_val)))

print('\nTest Results')
print(clf_rf.score(data_features_test, data_labels_test))
print(recall_score(data_labels_test, clf_rf.predict(data_features_test)))
print(precision_score(data_labels_test, clf_rf.predict(data_features_test)))

print("END")
# Balanced bagging: each bootstrap sample is resampled so every base
# estimator is trained on a balanced class distribution.
bbc = BalancedBaggingClassifier(random_state=12)
bbc.fit(x_train, np.array(y_train.iloc[:, 0]))

print('Validation Results')
print(bbc.score(x_val, y_val))
print(recall_score(y_val, bbc.predict(x_val)))
print(precision_score(y_val, bbc.predict(x_val)))
print('\nTest Results')
print(bbc.score(data_features_test, data_labels_test))
print(recall_score(data_labels_test, bbc.predict(data_features_test)))
print(precision_score(data_labels_test, bbc.predict(data_features_test)))

# Gradient boosting on the resampled training set (despite the name,
# clf_xg is scikit-learn's GradientBoostingClassifier, not XGBoost).
clf_xg = GradientBoostingClassifier(learning_rate=0.15,
                                    n_estimators=70,
                                    min_samples_split=0.5,
                                    min_samples_leaf=45,
                                    max_depth=8,
                                    max_features='sqrt',
                                    subsample=0.8)
clf_xg.fit(x_train_res, y_train_res)
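
The per-metric print blocks above can be condensed into one call. A minimal sketch using sklearn.metrics.classification_report, under the same assumption that the fitted bbc and the validation/test splits from the fragment exist:

from sklearn.metrics import classification_report

# One consolidated report (per-class precision, recall, F1, support)
# instead of separate accuracy/recall/precision prints per split.
print('Validation Results')
print(classification_report(y_val, bbc.predict(x_val)))
print('Test Results')
print(classification_report(data_labels_test,
                            bbc.predict(data_features_test)))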
Code example #6
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.under_sampling import RandomUnderSampler
from src.evaluation_methods import *  # project helper module; provides confusionMatrix

if __name__ == '__main__':
    df = pd.read_excel('../../data_base/excel/datasetV2.xlsx', sheet_name='Casos Dengue')

    # Last column is the target; take it as a 1-D array so the sampler and
    # classifier do not complain about a column-vector y.
    x, y = df.iloc[:, :-1].values, df.iloc[:, -1].values
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0, stratify=y)

    under_sampler = RandomUnderSampler()
    x_train, y_train = under_sampler.fit_resample(x_train, y_train)
    #us = NearMiss(n_neighbors=3, version=2)
    #X_train_res, y_train_res = us.fit_sample(x_train, y_train)

    print("Distribution before resampling {}".format(y_train.shape))

    bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                    sampling_strategy='auto',
                                    replacement=False,
                                    random_state=0)

    # Train the classifier and predict on the held-out test set.
    bbc.fit(x_train, y_train)
    y_predict = bbc.predict(x_test)
    print('Test Accuracy: %.3f' % bbc.score(x_test, y_test))
    confusionMatrix(y_test, y_predict)
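
confusionMatrix comes from the project's src.evaluation_methods module, which is not shown. A minimal stand-in built on sklearn.metrics (the helper's name and output format here are assumptions, not the project's actual code):

from sklearn.metrics import confusion_matrix, classification_report


def confusionMatrix(y_true, y_pred):
    # Hypothetical replacement for the project helper: print the raw
    # confusion matrix plus per-class precision/recall/F1.
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))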