def test_oob_score_classification():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    X, y = make_imbalance(iris.data,
                          iris.target,
                          sampling_strategy={
                              0: 20,
                              1: 25,
                              2: 50
                          },
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for base_estimator in [DecisionTreeClassifier(), SVC(gamma='scale')]:
        clf = BalancedBaggingClassifier(base_estimator=base_estimator,
                                        n_estimators=100,
                                        bootstrap=True,
                                        oob_score=True,
                                        random_state=0).fit(X_train, y_train)

        test_score = clf.score(X_test, y_test)

        assert abs(test_score - clf.oob_score_) < 0.1

        # Test with few estimators
        assert_warns(
            UserWarning,
            BalancedBaggingClassifier(base_estimator=base_estimator,
                                      n_estimators=1,
                                      bootstrap=True,
                                      oob_score=True,
                                      random_state=0).fit, X_train, y_train)
Example #2
def objectiveBalance(params):
    time1 = time.time()
    params = {
        'sampling_strategy': params['sampling_strategy'],
    }

    print("\n############## New Run ################")
    print(f"params = {params}")
    FOLDS = 5
    count = 1
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
    score_mean = 0
    for tr_idx, val_idx in skf.split(X_train, y_train.values.ravel()):
        clf = BalancedBaggingClassifier(**params,
                                        random_state=0,
                                        n_estimators=300,
                                        n_jobs=-1,
                                        verbose=0)

        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        
        clf.fit(X_tr, y_tr.values.ravel())
        score = make_scorer(roc_auc_score, needs_proba=True)(clf, X_vl, y_vl)
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')
        count += 1
    time2 = time.time() - time1
    print(f"Total Time Run: {round(time2 / 60,2)}")
    gc.collect()
    print(f'Mean ROC_AUC: {score_mean / FOLDS}')
    del X_tr, X_vl, y_tr, y_vl, clf, score
    return -(score_mean / FOLDS)
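A minimal sketch of how objectiveBalance might be plugged into a search loop; hyperopt and the search space below are assumptions (not part of the original snippet), and X_train / y_train are expected to exist in the surrounding script:

from hyperopt import Trials, fmin, hp, tpe

# for a binary problem, a float sampling_strategy is the desired
# minority/majority ratio after resampling
space = {'sampling_strategy': hp.uniform('sampling_strategy', 0.1, 1.0)}
trials = Trials()
# fmin minimizes, which is why objectiveBalance returns the negative mean AUC
best = fmin(fn=objectiveBalance, space=space, algo=tpe.suggest,
            max_evals=20, trials=trials)
print(f"best sampling_strategy: {best['sampling_strategy']}")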
def test_warm_start(random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = BalancedBaggingClassifier(n_estimators=n_estimators,
                                               random_state=random_state,
                                               warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert len(clf_ws) == n_estimators

    clf_no_ws = BalancedBaggingClassifier(n_estimators=10,
                                          random_state=random_state,
                                          warm_start=False)
    clf_no_ws.fit(X, y)

    assert ({pipe.steps[-1][1].random_state
             for pipe in clf_ws
             } == {pipe.steps[-1][1].random_state
                   for pipe in clf_no_ws})
def test_oob_score_classification():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=0)

    for base_estimator in [DecisionTreeClassifier(), SVC()]:
        clf = BalancedBaggingClassifier(
            base_estimator=base_estimator,
            n_estimators=100,
            bootstrap=True,
            oob_score=True,
            random_state=0).fit(X_train, y_train)

        test_score = clf.score(X_test, y_test)

        assert abs(test_score - clf.oob_score_) < 0.1

        # Test with few estimators
        assert_warns(UserWarning,
                     BalancedBaggingClassifier(
                         base_estimator=base_estimator,
                         n_estimators=1,
                         bootstrap=True,
                         oob_score=True,
                         random_state=0).fit,
                     X_train,
                     y_train)
Example #5
def test_single_estimator():
    # Check singleton ensembles.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={
            0: 20,
            1: 25,
            2: 50
        },
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf1 = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=0,
    ).fit(X_train, y_train)

    clf2 = make_pipeline(
        RandomUnderSampler(
            random_state=clf1.estimators_[0].steps[0][1].random_state),
        KNeighborsClassifier(),
    ).fit(X_train, y_train)

    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
def test_warm_start_smaller_n_estimators():
    # Test if warm start'ed second fit with smaller n_estimators raises error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    assert_raises(ValueError, clf.fit, X, y)
def test_bootstrap_features():
    # Test that bootstrapping features may generate duplicate features.
    X, y = make_imbalance(iris.data,
                          iris.target,
                          sampling_strategy={
                              0: 20,
                              1: 25,
                              2: 50
                          },
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_features=1.0,
        bootstrap_features=False,
        random_state=0).fit(X_train, y_train)

    for features in ensemble.estimators_features_:
        assert np.unique(features).shape[0] == X.shape[1]

    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_features=1.0,
        bootstrap_features=True,
        random_state=0).fit(X_train, y_train)

    unique_features = [
        np.unique(features).shape[0]
        for features in ensemble.estimators_features_
    ]
    assert np.median(unique_features) < X.shape[1]
Example #8
def cross_validation(name):
    with open('../data/conv_pred/train_data_ad_ignore_' + name + '.pickle',
              'rb') as f:
        data = pickle.load(f)
    v = DictVectorizer()
    X = v.fit_transform(data['X'])
    y = np.array(data['y'])
    kf = KFold(n_splits=5)
    fscore = 0
    ftscore = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        #model = RandomForestClassifier(n_estimators=100, n_jobs=8,class_weight={0:1,1:3000})
        model = BalancedBaggingClassifier(n_estimators=100, n_jobs=8)
        model.fit(X_train, y_train)
        predict = model.predict_proba(X_test)
        score, t_score = eval(y_test, predict)
        pprint(
            sorted(zip(
                np.mean([
                    est.steps[1][1].feature_importances_
                    for est in model.estimators_
                ],
                        axis=0), v.feature_names_),
                   key=lambda x: x[0],
                   reverse=True))
        print('score : ', str(score))
        print('true_score : ', str(t_score))
        fscore += score
        ftscore += t_score
    print('\n')
    print('final score : ', str(fscore / 5))
    print('final true_score : ', str(ftscore / 5))
def test_bootstrap_samples():
    # Test that bootstrapping samples generate non-perfect base estimators.
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=0)

    base_estimator = DecisionTreeClassifier().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    # disable the resampling by passing an empty dictionary.
    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=False,
        n_estimators=10,
        sampling_strategy={},
        random_state=0).fit(X_train, y_train)

    assert (ensemble.score(X_train, y_train) ==
            base_estimator.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BalancedBaggingClassifier(
        base_estimator=DecisionTreeClassifier(),
        max_samples=1.0,
        bootstrap=True,
        random_state=0).fit(X_train, y_train)

    assert (ensemble.score(X_train, y_train) <
            base_estimator.score(X_train, y_train))
def balanced_bragging(X_train, y_train, X_test, y_test, X_train_res, y_train_res):
    bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    bagging.fit(X_train, y_train.values.ravel())
    y_train_bc = bagging.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_bc)
    without = 100 * cnf_matrix_tra[1, 1] / (cnf_matrix_tra[1, 0] + cnf_matrix_tra[1, 1])
    print("Imbalanced (bagging): {}%".format(without))
    print(cnf_matrix_tra[0, 0], cnf_matrix_tra[1, 1])

    bagging_oversampling = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    bagging_oversampling.fit(X_train_res, y_train_res.ravel())
    y_train_bc = bagging_oversampling.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_bc)
    with_oversampling = 100 * cnf_matrix_tra[1, 1] / (cnf_matrix_tra[1, 0] + cnf_matrix_tra[1, 1])
    print("With oversampling (bagging): {}%".format(with_oversampling))
    print(cnf_matrix_tra[0, 0], cnf_matrix_tra[1, 1])

    balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    balanced_bagging.fit(X_train, y_train.values.ravel())
    y_train_bbc = balanced_bagging.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_bbc)
    within = 100 * cnf_matrix_tra[1, 1] / (cnf_matrix_tra[1, 0] + cnf_matrix_tra[1, 1])
    print("Balanced (bagging): {}%".format(within))
    print(cnf_matrix_tra[0, 0], cnf_matrix_tra[1, 1])

    objects = ('Bagging', 'Bagging with SMOTE oversampling',
               'Bagging with random undersampling')
    y_pos = np.arange(len(objects))
    performance = [without, with_oversampling, within]
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Accuracy percentage')
    plt.title('Bagging accuracy')
    plt.show()
    return without, within
def test_base_estimator():
    # Check base_estimator and its default values.
    X, y = make_imbalance(iris.data,
                          iris.target,
                          sampling_strategy={
                              0: 20,
                              1: 25,
                              2: 50
                          },
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ensemble = BalancedBaggingClassifier(None, n_jobs=3,
                                         random_state=0).fit(X_train, y_train)

    assert isinstance(ensemble.base_estimator_.steps[-1][1],
                      DecisionTreeClassifier)

    ensemble = BalancedBaggingClassifier(DecisionTreeClassifier(),
                                         n_jobs=3,
                                         random_state=0).fit(X_train, y_train)

    assert isinstance(ensemble.base_estimator_.steps[-1][1],
                      DecisionTreeClassifier)

    ensemble = BalancedBaggingClassifier(Perceptron(max_iter=1000, tol=1e-3),
                                         n_jobs=3,
                                         random_state=0).fit(X_train, y_train)

    assert isinstance(ensemble.base_estimator_.steps[-1][1], Perceptron)
def test_probability():
    # Predict probabilities.
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=0)

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        ensemble = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(),
            random_state=0).fit(X_train, y_train)

        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
                                         axis=1),
                                  np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))

        # Degenerate case, where some classes are missing
        ensemble = BalancedBaggingClassifier(
            base_estimator=LogisticRegression(),
            random_state=0,
            max_samples=5).fit(X_train, y_train)

        assert_array_almost_equal(np.sum(ensemble.predict_proba(X_test),
                                         axis=1),
                                  np.ones(len(X_test)))

        assert_array_almost_equal(ensemble.predict_proba(X_test),
                                  np.exp(ensemble.predict_log_proba(X_test)))
Example #13
def test_warm_start_with_oob_score_fails():
    # Check using oob_score and warm_start simultaneously fails
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BalancedBaggingClassifier(n_estimators=5,
                                    warm_start=True,
                                    oob_score=True)
    with pytest.raises(ValueError):
        clf.fit(X, y)
Example #14
def train_knn_model(df_formatted, true_labels, iteration=0):
    classifier = BalancedBaggingClassifier(
        n_estimators=5,
        base_estimator=KNeighborsClassifier(n_neighbors=5),
        random_state=0,
        n_jobs=-1)
    classifier.fit(df_formatted, true_labels)
    save_model(classifier, iteration)
Example #15
def imblearn_(classifier, X_train, y_train, X_test, y_test):
    clf = BalancedBaggingClassifier(base_estimator=classifier,
                                    sampling_strategy='auto',
                                    random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    printStats(y_test, y_pred)
    return clf, y_pred
Example #16
def train_tree_model(X_train, y_train):
    classifier = BalancedBaggingClassifier(
        n_estimators=5,
        base_estimator=DecisionTreeClassifier(),
        random_state=0,
        n_jobs=-1)
    classifier.fit(X_train, y_train)
    save_model(classifier)
def test_bagging_with_pipeline():
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    estimator = BalancedBaggingClassifier(
        make_pipeline(SelectKBest(k=1),
                      DecisionTreeClassifier()),
        max_features=2)
    estimator.fit(X, y).predict(X)
Example #19
def train_nb_model(X_train, y_train, vectorize=False, iteration=0):
    # printCsv(print_df, "train")
    print("train_nb_model", iteration)
    start_time = time.time()

    tfidf = TfidfVectorizer(
        sublinear_tf=True,
        # min_df=5,
        norm='l2',
        encoding='latin-1',
        ngram_range=(1, 2),
        stop_words='english')
    features = tfidf.fit_transform(X_train).toarray()
    labels = y_train
    print(features.shape)

    # from sklearn.feature_selection import chi2
    # import numpy as np
    # N = 10
    # a = labels == False
    # features_chi2 = chi2(features, a)
    # indices = np.argsort(features_chi2[0])
    # feature_names = np.array(tfidf.get_feature_names())[indices]
    # unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    # bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    # printCsv(pd.DataFrame(feature_names), "tfidf")
    # print("# asd")
    # print("  . Most correlated unigrams:\n. {}".format('\n. '.join(
    #     unigrams[0:N])))
    # print("  . Most correlated unigrams:\n. {}".format('\n. '.join(
    #     unigrams[-N:])))
    #     print("  . Most correlated bigrams:\n. {}".format('\n. '.join(
    #     bigrams[0:N])))
    # print("  . Most correlated bigrams:\n. {}".format('\n. '.join(
    #     bigrams[-N:])))
    # print("Training")
    tfidf_train = tfidf.fit(X_train)
    save_model_2(tfidf_train, iteration)
    bow_features = tfidf_train.transform(X_train)

    text_clf = BalancedBaggingClassifier(n_estimators=5,
                                         base_estimator=LinearSVC(),
                                         random_state=0)
    #train
    text_clf = text_clf.fit(bow_features, y_train)
    save_model(text_clf, iteration)
    print("modle saved")

    if vectorize:
        text_clf = Pipeline([('vect',
                              TfidfVectorizer(sublinear_tf=True,
                                              norm='l2',
                                              ngram_range=(1, 2),
                                              stop_words='english')),
                             ('clf', LinearSVC())])
        text_clf = text_clf.fit(X_train, y_train)

        save_model(text_clf)
def test_oob_score_consistency():
    # Make sure OOB scores are identical when random_state, estimator, and
    # training data are fixed and fitting is done twice
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BalancedBaggingClassifier(KNeighborsClassifier(),
                                        max_samples=0.5,
                                        max_features=0.5, oob_score=True,
                                        random_state=1)
    assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_
class Classifier(BaseEstimator):
    def __init__(self):
        self.reg = BalancedBaggingClassifier(n_estimators=50, random_state=42)

    def fit(self, X, y):
        self.reg.fit(X, y)

    def predict(self, X):
        return self.reg.predict(X)
Example #22
    def __init__(self):
        # mimicking balanced random forest with the BalancedBaggingClassifier
        # and DecisionTreeClassifier combination
        self.bbc = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(max_features='auto'),
            sampling_strategy=determine_ratio,
            random_state=0,
            n_estimators=50,
            n_jobs=1)
def test_max_samples_consistency():
    # Make sure validated max_samples and original max_samples are identical
    # when valid integer max_samples supplied by user
    max_samples = 100
    X, y = make_hastie_10_2(n_samples=2*max_samples, random_state=1)
    bagging = BalancedBaggingClassifier(KNeighborsClassifier(),
                                        max_samples=max_samples,
                                        max_features=0.5, random_state=1)
    bagging.fit(X, y)
    assert bagging._max_samples == max_samples
Example #26
def buildModel(X, y):
    # X = np.reshape(X,(X.shape[0],X.shape[1] * X.shape[2]))
    print(X.shape, y.shape)
    scaler = StandardScaler()
    print(scaler.fit(X))
    scaled_train_x = scaler.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(scaled_train_x,
                                                        y,
                                                        random_state=19,
                                                        test_size=0.3)

    bag = BalancedBaggingClassifier(n_estimators=200, random_state=19)
    svm = SVC(class_weight='balanced',
              random_state=19,
              decision_function_shape='ovo')
    neural = MLPClassifier(max_iter=500,
                           random_state=19,
                           solver='lbfgs',
                           alpha=1e-5,
                           hidden_layer_sizes=(49, 8, 4))
    ada = AdaBoostClassifier(n_estimators=100, random_state=19)
    logistic = LogisticRegression(solver='lbfgs', max_iter=500)

    bag.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    neural.fit(X_train, y_train)
    ada.fit(X_train, y_train)
    logistic.fit(X_train, y_train)
    # joblib.dump(bag,'bag.pkl')
    # joblib.dump(scaler,'scaler.pkl')

    y_pred = bag.predict(X_test)
    y_pred2 = svm.predict(X_test)
    y_pred3 = neural.predict(X_test)
    y_pred4 = ada.predict(X_test)
    y_pred5 = logistic.predict(X_test)

    print(matthews_corrcoef(y_test, y_pred))
    print(matthews_corrcoef(y_test, y_pred2))
    print(matthews_corrcoef(y_test, y_pred3))
    print(matthews_corrcoef(y_test, y_pred4))
    print(matthews_corrcoef(y_test, y_pred5))

    print(confusion_matrix(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred2))
    print(confusion_matrix(y_test, y_pred3))
    print(confusion_matrix(y_test, y_pred4))
    print(confusion_matrix(y_test, y_pred5))

    print(classification_report_imbalanced(y_test, y_pred))
    print(classification_report_imbalanced(y_test, y_pred2))
    print(classification_report_imbalanced(y_test, y_pred3))
    print(classification_report_imbalanced(y_test, y_pred4))
    print(classification_report_imbalanced(y_test, y_pred5))
Example #27
def RFC():
    train_data, train_target = get_data('data.txt', 'target.txt')
    model_rfc = BalancedBaggingClassifier(
        n_estimators=100,
        base_estimator=DecisionTreeClassifier(),
        sampling_strategy='auto',
        replacement=False,
        random_state=0)
    model_rfc.fit(train_data, train_target)
    save_path_name = '../../model/' + 'rfc.m'
    joblib.dump(model_rfc, save_path_name)
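Loading the persisted model back is symmetric; a short, hypothetical sketch using the same path as above:

import joblib

model_rfc = joblib.load('../../model/rfc.m')
# predictions = model_rfc.predict(new_data)  # new_data is a placeholder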
Example #28
def cross_validation(x):
    with open('../data/conv_pred/train_data_' + x + '.pickle', 'rb') as f:
        data = pickle.load(f)
    print(data)
    v = DictVectorizer()
    X = v.fit_transform(data['X'])
    y = np.array(data['y'])

    zero = 0
    one = 0
    for i in y:
        if i == 0:
            zero += 1
        else:
            one += 1
    print(zero)
    print(one)

    cv = 5
    kf = KFold(n_splits=cv)
    fscore = 0
    ftscore = 0
    all_f_value = 0
    all_prec = 0
    for train_index, test_index in tqdm(kf.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        #model = RandomForestRe(n_estimators=100, n_jobs=8)
        model = BalancedBaggingClassifier(n_estimators=100, n_jobs=8)
        #model = xgb.XGBClassifier(n_estimators=500,max_delta_step=1,scale_pos_weight=zero/one)
        model.fit(X_train, y_train)
        predict = model.predict_proba(X_test)
        precision, recall, f_value, all_pre = eval(y_test, predict)
        all_prec += all_pre
        fscore += precision
        ftscore += recall
        all_f_value += f_value
    pprint(
        sorted(zip(
            np.mean([
                est.steps[1][1].feature_importances_
                for est in model.estimators_
            ],
                    axis=0), v.feature_names_),
               key=lambda x: x[0],
               reverse=True))
    print('\n')
    print('final precision : ', str(fscore / cv))
    print('final recall : ', str(ftscore / cv))
    print('final f-value : ', str(all_f_value / cv))
    print('final all_precision : ', str(all_prec / cv))
Example #29
class Classifier(BaseEstimator):
    def __init__(self):
        # mimicking balanced random forest with the BalancedBaggingClassifier
        # and DecisionTreeClassifier combination
        self.bbc = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(max_features='auto'),
            sampling_strategy=determine_ratio, random_state=0,
            n_estimators=50, n_jobs=1)

    def fit(self, X, y):
        self.bbc.fit(X, y)

    def predict_proba(self, X):
        return self.bbc.predict_proba(X)
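imbalanced-learn also provides a dedicated estimator for this "balanced random forest" pattern; a minimal equivalent sketch (the parameter values mirror the wrapper above and are illustrative only):

from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0, n_jobs=1)
# brf.fit(X, y)
# brf.predict_proba(X)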
def switch_algorithm(algr):
    switcher = {
        1: ('knn', KNeighborsClassifier()),
        2: ('lr', LogisticRegression(solver='liblinear')),
        3: ('dt', DecisionTreeClassifier()),
        4: ('xtr', ExtraTreesClassifier()),
        5: ('rf', RandomForestClassifier()),
        6: ('gbt', GradientBoostingClassifier()),
        7: ('mlp', MLPClassifier()),
        8: ('bnb', BernoulliNB()),
        9: ('gnb', GaussianNB()),
        10: ('polysvc', SVC()),
        11: ('sigmsvc', SVC()),
        12: ('rbfsvc', SVC()),
        13: ('lsvc', SVC()),
        14: ('lbsvc', LinearSVC()),
        15: ('bsvc', BalancedBaggingClassifier(SVC(kernel='linear', probability=True), sampling_strategy='not majority')),
        16: ('absvc', BalancedBaggingClassifier(SVC(kernel='linear', probability=True), sampling_strategy='all')),
        17: ('ccsvc', CalibratedClassifierCV()),
        18: ('bbnb', BalancedBaggingClassifier()),
        19: ('blsvc', BalancedBaggingClassifier()),
        20: ('bsvc',BalancedBaggingClassifier()),
        21: ('bsvcsig', BalancedBaggingClassifier()),
        22: ('xgbt', XGBClassifier(n_thread=-1)),
        23: ('bxgbt', BalancedBaggingClassifier(XGBClassifier(n_thread=-1))),
        24: ('bgbt', BalancedBaggingClassifier(GradientBoostingClassifier())),
        25: ('adb', AdaBoostClassifier(DecisionTreeClassifier(max_depth=1, class_weight='balanced'))),
        26: ('lgbm', lgb.LGBMClassifier(silent=True, class_weight='balanced')),
        27: ('catb', cb.CatBoostClassifier(silent=True))
    }
    # print(switcher.get(algr, "Invalid algorithm"))
    return switcher.get(algr, "Invalid algorithm")
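A hypothetical call site for switch_algorithm (not from the original code): the tuple it returns can be fed straight into cross-validation once the invalid case is handled.

from sklearn.model_selection import cross_val_score

choice = switch_algorithm(15)
if choice == "Invalid algorithm":
    print(choice)
else:
    name, estimator = choice
    # X and y are assumed to be defined by the surrounding script
    scores = cross_val_score(estimator, X, y, cv=5, scoring='balanced_accuracy')
    print(name, scores.mean())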
Example #31
def test_balanced_bagging_classifier_with_function_sampler(replace):
    # check that we can provide a FunctionSampler in BalancedBaggingClassifier
    X, y = make_classification(
        n_samples=1_000,
        n_features=10,
        n_classes=2,
        weights=[0.3, 0.7],
        random_state=0,
    )

    def roughly_balanced_bagging(X, y, replace=False):
        """Implementation of Roughly Balanced Bagging for binary problem."""
        # find the minority and majority classes
        class_counts = Counter(y)
        majority_class = max(class_counts, key=class_counts.get)
        minority_class = min(class_counts, key=class_counts.get)

        # compute the number of sample to draw from the majority class using
        # a negative binomial distribution
        n_minority_class = class_counts[minority_class]
        n_majority_resampled = np.random.negative_binomial(n=n_minority_class,
                                                           p=0.5)

        # draw randomly with or without replacement
        majority_indices = np.random.choice(
            np.flatnonzero(y == majority_class),
            size=n_majority_resampled,
            replace=replace,
        )
        minority_indices = np.random.choice(
            np.flatnonzero(y == minority_class),
            size=n_minority_class,
            replace=replace,
        )
        indices = np.hstack([majority_indices, minority_indices])

        return X[indices], y[indices]

    # Roughly Balanced Bagging
    rbb = BalancedBaggingClassifier(
        base_estimator=CountDecisionTreeClassifier(),
        n_estimators=2,
        sampler=FunctionSampler(func=roughly_balanced_bagging,
                                kw_args={"replace": replace}),
    )
    rbb.fit(X, y)

    for estimator in rbb.estimators_:
        class_counts = estimator[-1].class_counts_
        assert (class_counts[0] / class_counts[1]) > 0.8
def clf_wrapper(classifier, X_train, y_train, X_test, y_test):
    clf = BalancedBaggingClassifier(base_estimator=classifier,
                                    sampling_strategy='auto',
                                    replacement=False, 
                                    random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    cfm = confusion_matrix(y_test, y_pred)
    
    # Predictive Value
    PPV = cfm[0,0]/(cfm[0,0]+cfm[0,1])
    NPV = cfm[1,1]/(cfm[1,0]+cfm[1,1])
    ACR = (cfm[0,0]+cfm[1,1])/(cfm[0,0]+cfm[1,1]+cfm[1,0]+cfm[0,1])
    return (PPV+NPV+ACR)/3
Example #34
def classifier_imblearn_SVM_training(_X, _Y, _weight):
    X_train, X_test, Y_train, Y_test, w_train, w_test = train_test_split(
        _X, _Y, _weight, test_size=0.2, random_state=0xdeadbeef)
    bbc = BalancedBaggingClassifier(base_estimator=SVC(kernel="rbf",
                                                       gamma="auto"),
                                    n_estimators=10,
                                    sampling_strategy="auto",
                                    max_samples=80,
                                    replacement=False,
                                    random_state=0xdeadbeef)
    bbc.fit(X_train, Y_train)
    y_pred = bbc.predict(X_test)
    print("Result from bagging labeled SVM:")
    print("tn, fp, fn, tp =", confusion_matrix(Y_test, y_pred).ravel())
Example #35
def test_warm_start_smaller_n_estimators():
    # Test if warm start'ed second fit with smaller n_estimators raises error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    with pytest.raises(ValueError):
        clf.fit(X, y)
def test_balanced_bagging_classifier():
    # Check classification for various parameter settings.
    X, y = make_imbalance(iris.data,
                          iris.target,
                          sampling_strategy={
                              0: 20,
                              1: 25,
                              2: 50
                          },
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    grid = ParameterGrid({
        "max_samples": [0.5, 1.0],
        "max_features": [1, 2, 4],
        "bootstrap": [True, False],
        "bootstrap_features": [True, False]
    })

    for base_estimator in [
            None,
            DummyClassifier(),
            Perceptron(max_iter=1000, tol=1e-3),
            DecisionTreeClassifier(),
            KNeighborsClassifier(),
            SVC(gamma='scale')
    ]:
        for params in grid:
            BalancedBaggingClassifier(base_estimator=base_estimator,
                                      random_state=0,
                                      **params).fit(X_train,
                                                    y_train).predict(X_test)
Example #37
    def create_for_training(
            classifier_clsname,
            feature_extractors: typing.Sequence[FeatureExtractorMixin],
            classifier_params=None):

        ## instantiate classifier
        if classifier_clsname == '__svm__':
            classifier = SVC(gamma=0.1, C=2)
        elif classifier_clsname == '__bagging_svm__':
            classifier = BalancedBaggingClassifier(
                base_estimator=SVC(gamma=0.1, C=2),
                n_estimators=10,
                bootstrap=False,
                sampling_strategy='majority')
        elif '.' in classifier_clsname:
            module_name, cls_name = classifier_clsname.rsplit('.', 1)
            module = importlib.import_module(module_name)
            classifier = getattr(module, cls_name)()
        else:
            classifier = globals()[classifier_clsname]()

        if classifier_params is not None:
            classifier.set_params(**classifier_params)

        extractors = [(str(idx), extractor)
                      for idx, extractor in enumerate(feature_extractors)]

        return SKLearnClassifierBasedTypeFilter(classifier, extractors)
def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True,
                                    random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    # modify X to nonsense values, this should not change anything
    X_train += 1.

    assert_warns_message(UserWarning,
                         "Warm-start fitting without increasing n_estimators"
                         " does not", clf.fit, X_train, y_train)
    assert_array_equal(y_pred, clf.predict(X_test))
def test_single_estimator():
    # Check singleton ensembles.
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=0)

    clf1 = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=0).fit(X_train, y_train)

    clf2 = make_pipeline(RandomUnderSampler(
        random_state=clf1.estimators_[0].steps[0][1].random_state),
                         KNeighborsClassifier()).fit(X_train, y_train)

    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
def test_oob_score_removed_on_warm_start():
    X, y = make_hastie_10_2(n_samples=2000, random_state=1)

    clf = BalancedBaggingClassifier(n_estimators=50, oob_score=True)
    clf.fit(X, y)

    clf.set_params(warm_start=True, oob_score=False, n_estimators=100)
    clf.fit(X, y)

    assert_raises(AttributeError, getattr, clf, "oob_score_")
Example #41
def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)

    # remap the y outside of the BalancedBaggingclassifier
    # _, y = np.unique(y, return_inverse=True)
    bagging = BalancedBaggingClassifier(LogisticRegression(solver='lbfgs',
                                                           multi_class='auto'),
                                        max_samples=0.5,
                                        max_features=0.5, random_state=1,
                                        bootstrap=False)
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert len(estimators_samples) == len(estimators)
    assert len(estimators_samples[0]) == len(X) // 2
    assert estimators_samples[0].dtype.kind == 'i'

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.steps[-1][1].coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.steps[-1][1].coef_

    assert_allclose(orig_coefs, new_coefs)
def test_warm_start(random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = BalancedBaggingClassifier(n_estimators=n_estimators,
                                               random_state=random_state,
                                               warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert len(clf_ws) == n_estimators

    clf_no_ws = BalancedBaggingClassifier(n_estimators=10,
                                          random_state=random_state,
                                          warm_start=False)
    clf_no_ws.fit(X, y)

    assert (set([pipe.steps[-1][1].random_state for pipe in clf_ws]) ==
            set([pipe.steps[-1][1].random_state for pipe in clf_no_ws]))
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


ozone = fetch_datasets()['ozone_level']
X, y = ozone.data, ozone.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

bagging = BaggingClassifier(random_state=0)
balanced_bagging = BalancedBaggingClassifier(random_state=0)

print('Class distribution of the training set: {}'.format(Counter(y_train)))

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

print('Class distribution of the test set: {}'.format(Counter(y_test)))

print('Classification results using a bagging classifier on imbalanced data')
y_pred_bagging = bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_bagging))
cm_bagging = confusion_matrix(y_test, y_pred_bagging)
plt.figure()
plot_confusion_matrix(cm_bagging, classes=np.unique(ozone.target),
                      title='Confusion matrix using BaggingClassifier')
print('Decision tree classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_tree),
              geometric_mean_score(y_test, y_pred_tree)))
cm_tree = confusion_matrix(y_test, y_pred_tree)
fig, ax = plt.subplots()
plot_confusion_matrix(cm_tree, classes=np.unique(satimage.target), ax=ax,
                      title='Decision tree')

###############################################################################
# Classification using bagging classifier with and without sampling
###############################################################################
# Instead of using a single tree, we will check if an ensemble of decision
# trees can actually alleviate the issue induced by the class imbalance. First,
# we will use a bagging classifier and its counterpart, which internally uses a
# random under-sampler to balance each bootstrap sample.

bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0,
                                             n_jobs=-1)

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

y_pred_bc = bagging.predict(X_test)
y_pred_bbc = balanced_bagging.predict(X_test)

###############################################################################
# Balancing each bootstrap sample significantly increases the balanced
# accuracy and the geometric mean.

print('Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_bc),
              geometric_mean_score(y_test, y_pred_bc)))
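# The excerpt stops after the plain bagging scores; mirroring the same metrics
# for the balanced counterpart (y_pred_bbc, computed above) would look like:
print('Balanced Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_bbc),
              geometric_mean_score(y_test, y_pred_bbc)))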
def test_warm_start_equivalence():
    # warm started classifier with 5+5 estimators should be equivalent to
    # one classifier with 10 estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf_ws = BalancedBaggingClassifier(n_estimators=5, warm_start=True,
                                       random_state=3141)
    clf_ws.fit(X_train, y_train)
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X_train, y_train)
    y1 = clf_ws.predict(X_test)

    clf = BalancedBaggingClassifier(n_estimators=10, warm_start=False,
                                    random_state=3141)
    clf.fit(X_train, y_train)
    y2 = clf.predict(X_test)

    assert_array_almost_equal(y1, y2)