def test_balanced_random_forest_oob(imbalanced_dataset):
    """OOB score of a BalancedRandomForestClassifier should track test accuracy.

    Also checks that fitting with too few estimators for full OOB coverage
    emits a UserWarning.
    """
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=42,
                                                        stratify=y)
    est = BalancedRandomForestClassifier(
        oob_score=True,
        random_state=0,
        n_estimators=1000,
        min_samples_leaf=2,
    )

    est.fit(X_train, y_train)
    test_score = est.score(X_test, y_test)

    # The OOB estimate should be close to the held-out accuracy.
    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(oob_score=True,
                                         random_state=0,
                                         n_estimators=1,
                                         bootstrap=True)
    # BUG FIX: the original `with pytest.warns(...) and np.errstate(...)`
    # evaluated the `and` first, discarding the (truthy) pytest.warns context
    # and entering only np.errstate — so the warning was never asserted.
    # Enter both context managers.
    with pytest.warns(UserWarning), np.errstate(divide="ignore",
                                                invalid="ignore"):
        est.fit(X, y)
# --- Example #2 (0 votes) ---
def test_balanced_random_forest_oob(imbalanced_dataset):
    """OOB score of a BalancedRandomForestClassifier should track test accuracy.

    Trains on the first half of the data, scores on the second half, and also
    checks that fitting with a single estimator emits a UserWarning (not all
    samples get an out-of-bag prediction).
    """
    X, y = imbalanced_dataset
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0)

    n_samples = X.shape[0]
    est.fit(X[:n_samples // 2, :], y[:n_samples // 2])
    test_score = est.score(X[n_samples // 2:, :], y[n_samples // 2:])

    # The OOB estimate should be close to the held-out accuracy.
    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0,
                                         n_estimators=1, bootstrap=True)
    # BUG FIX: `pytest.warns(...) and np.errstate(...)` returned only the
    # np.errstate context manager (the truthy pytest.warns object was dropped
    # by `and`), so the warning check was silently skipped. Enter both.
    with pytest.warns(UserWarning), np.errstate(divide="ignore",
                                                invalid="ignore"):
        est.fit(X, y)
# Accumulators for per-model accuracy on the train/validation splits and the
# label-propagation setting used for each run.
# NOTE(review): `model_name` (L56) and `test_accuracy` (L61) are appended to
# but not initialized in this snippet — presumably defined earlier; confirm.
train_accuracy = []
validation_accuracy = []
label_prop = []
#
#
# NO Propagation labels
#
#

# load_known_data is defined elsewhere; assumed to return train/test/validation
# feature/label splits in this order — TODO confirm against its definition.
x_tr, y_tr, x_te, y_te, x_va, y_va = load_known_data()

# Balanced Random Forest: shallow trees (max_depth=2), no label propagation.
model_name.append("Balanced Random Forest")
label_prop.append("No Propagation")
rfb = BalancedRandomForestClassifier(max_depth=2)
rfb.fit(x_tr, y_tr)
train_accuracy.append(rfb.score(x_tr, y_tr))
test_accuracy.append(rfb.score(x_te, y_te))
validation_accuracy.append(rfb.score(x_va, y_va))

# Easy Ensemble (AdaBoost on balanced bootstrap samples), no label propagation.
model_name.append("Easy Ensemble")
label_prop.append("No Propagation")
clf = EasyEnsembleClassifier(random_state=0)
clf.fit(x_tr, y_tr)
# NOTE(review): this predict() result is discarded — likely leftover; verify.
clf.predict(x_tr)
train_accuracy.append(clf.score(x_tr, y_tr))
test_accuracy.append(clf.score(x_te, y_te))
validation_accuracy.append(clf.score(x_va, y_va))

#
#
# Propagation labels
# --- Example #4 (0 votes) ---
# Baseline KNN classifier for comparison against the balanced random forest.
cl = KNeighborsClassifier(n_neighbors=3)
cl.fit(X_train, y_train)

clf = BalancedRandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

# NOTE(review): despite the name `loo`, this is 10-fold KFold, not
# leave-one-out cross-validation.
loo = KFold(n_splits=10)
# X_resampled / y_resampled are produced elsewhere (presumably by an
# over/under-sampler) — TODO confirm.
scores = cross_val_score(clf,
                         X_resampled,
                         y_resampled,
                         scoring="accuracy",
                         cv=loo)
print(scores)
print(scores.mean())

# Held-out accuracy of both models before feature selection.
print("Before{} ".format(clf.score(X_test, y_test)))
print("BeforeNei{} ".format(cl.score(X_test, y_test)))
# Fisher score per feature (skfeature-style API), then rank features by it.
score = fisher_score.fisher_score(X_train, y_train)
print(len(score))

idx = fisher_score.feature_ranking(score)
print(idx)
# NOTE(review): `num_fea` is unused — the selection below hard-codes the top
# 12 features, not 6; confirm which was intended.
num_fea = 6
X1 = X_resampled.iloc[:, [
    idx[0], idx[1], idx[2], idx[3], idx[4], idx[5], idx[6], idx[7], idx[8],
    idx[9], idx[10], idx[11]
]]

#X1 = X_resampled.iloc[:, [idx[0], idx[1], idx[2], idx[3], idx[4],idx[5]]]
X1 = pd.DataFrame(X1)
print("Selected features {}".format(X1.columns.values))
# --- Example #5 (0 votes) ---
def BalancedRF_classifier(df, y_column, feature_columns, test_rate):
    """Train a balanced random forest on an imbalanced dataset.

    Splits ``df`` into train/test sets, fits a
    ``BalancedRandomForestClassifier``, prints accuracy and a confusion
    matrix, and plots the averaged per-tree feature importances.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data containing both features and target.
    y_column : str
        Name of the target column.
    feature_columns : list of str
        Names of the feature columns.
    test_rate : float
        Fraction of samples held out for testing.

    Returns
    -------
    tuple
        ``(model, importances, (X_train, X_test, Y_train, Y_test))`` where
        ``importances`` is a DataFrame sorted by descending importance.
    """
    # Build feature matrix and target vector.
    X = df.loc[:, feature_columns].values
    Y = df.loc[:, y_column].values

    # Train/test split (fixed seed for reproducibility).
    (X_train, X_test, Y_train, Y_test) = train_test_split(X,
                                                          Y,
                                                          test_size=test_rate,
                                                          random_state=123,
                                                          shuffle=True)

    # Balanced RF: each bootstrap sample is randomly undersampled so that
    # classes are balanced per tree.
    model = BalancedRandomForestClassifier(n_jobs=1,
                                           n_estimators=30,
                                           sampling_strategy='not minority')

    print(model.get_params())
    model.fit(X_train, Y_train)

    # Accuracy on test and training data (runtime strings intentionally
    # left in Japanese — they are user-facing output).
    print("正解率 : " + str(model.score(X_test, Y_test) * 100) + "%")
    print("訓練データの正解率 : " + str(model.score(X_train, Y_train) * 100) + "%")

    # Confusion matrix via the project helper print_cmx.
    print("confusion matrix")
    prediction = model.predict(X_test)
    # NOTE(review): set iteration order is not deterministic across runs;
    # sorted(set(Y)) would give stable label ordering — confirm print_cmx
    # requirements before changing.
    labels = list(set(Y))
    print_cmx(Y_test, prediction, labels)

    # Average feature importance across all trees in the ensemble.
    # (The original computed this twice and kept an unused copy in `i`;
    # the dead `importances = None` assignment is removed as well.)
    avg_i = np.array([e.feature_importances_
                      for e in model.estimators_]).mean(axis=0)

    importances = pd.DataFrame({
        'variable': feature_columns,
        'importance': avg_i
    }).sort_values('importance', ascending=False).reset_index(drop=True)
    display(importances)

    # Plot importances top-to-bottom, most important feature at the top.
    IMP = importances.copy()
    plt.figure(figsize=(5, 7))
    plt.plot(IMP.importance,
             sorted([i + 1 for i in range(IMP.shape[0])], reverse=True), 'o-')
    plt.yticks(sorted([i + 1 for i in range(IMP.shape[0])], reverse=True),
               IMP.variable)
    plt.xlabel('importance')
    plt.show()

    return model, importances, (X_train, X_test, Y_train, Y_test)
print("_____________________________________")
#%% BALANCED random forest classifier - Random undersampling of the majority class in reach bootstrap sample.
from imblearn.ensemble import BalancedRandomForestClassifier

print("_____________________________________ \n Balanced Random Forest")
# Fit a balanced RF on all features. X_train / y_train / loss_intensity are
# defined earlier in the file (not visible here) — y_train appears to be a
# DataFrame, hence .values.ravel().
clf_brf_all = BalancedRandomForestClassifier(n_estimators=1000,
                                             random_state=0,
                                             n_jobs=-1,
                                             max_depth=4,
                                             min_samples_split=0.05).fit(
                                                 X_train,
                                                 y_train.values.ravel())
# NOTE(review): the first f-string has no placeholders — plain strings would
# do; left unchanged since output text is runtime behavior.
print(f"All features results: \n",
      f"{list(loss_intensity.columns.values)[0]} - All training score is",
      clf_brf_all.score(X_train, y_train.values.ravel()))
print(f"{list(loss_intensity.columns.values)[0]} - All test score is",
      clf_brf_all.score(X_test, y_test.values.ravel()))
y_pred = clf_brf_all.predict(X_test)

# Select the (up to) 5 most important features according to a fresh
# balanced RF, using sklearn's SelectFromModel wrapper.
sel = SelectFromModel(BalancedRandomForestClassifier(n_estimators=1000,
                                                     random_state=0),
                      max_features=5)
sel.fit(X_train, y_train.values.ravel())
selected_feat = X_train.columns[(sel.get_support())]
print("\n Balanced Random Forest \n The selected features are",
      len(selected_feat), selected_feat.values)
# Reduce both splits to the selected feature subset (returns numpy arrays).
X_train_selected = sel.transform(X_train)
X_test_selected = sel.transform(X_test)