Example #1
def test_sample_kmeans_density_estimation(data, density_exponent,
                                          cluster_balance_threshold):
    X, y = data
    smote = KMeansSMOTE(random_state=42,
                        density_exponent=density_exponent,
                        cluster_balance_threshold=cluster_balance_threshold)
    smote.fit_resample(X, y)
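In the source suite these arguments come from pytest; the fixture and parametrization are not shown. A minimal sketch of what they could look like, assuming blob-shaped data in the spirit of the KMeansSMOTE docstring example (the fixture values and parameter pairs here are illustrative, not the library's own):

import numpy as np
import pytest
from sklearn.datasets import make_blobs
from imblearn.over_sampling import KMeansSMOTE

@pytest.fixture
def data():
    # Illustrative stand-in fixture: three separated blobs relabelled
    # into a binary problem with a clear minority class.
    X, y = make_blobs(n_samples=[100, 800, 100],
                      centers=[(-10, 0), (0, 0), (10, 0)],
                      random_state=0)
    return X, (y == 1).astype(int)

# Assumed values; any pair that leaves at least one usable minority
# cluster exercises the same code path.
@pytest.mark.parametrize(
    "density_exponent, cluster_balance_threshold",
    [("auto", "auto"), (2, 0.8)],
)
def test_sample_kmeans_density_estimation(data, density_exponent,
                                          cluster_balance_threshold):
    ...  # body as in Example #1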
Example #2
def test_sample_kmeans_not_enough_clusters():
    rng = np.random.RandomState(42)
    X = rng.randn(30, 2)
    y = np.array([1] * 20 + [0] * 10)

    smote = KMeansSMOTE(random_state=42, kmeans_estimator=30, k_neighbors=2)
    with pytest.raises(RuntimeError):
        smote.fit_resample(X, y)
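The RuntimeError is expected here: with 30 clusters for 30 samples every cluster holds a single point, so no cluster has enough minority neighbours to interpolate from. A hedged sketch of similar data resampling successfully once fewer clusters are requested and the minority class is spatially concentrated (the blob layout below is illustrative):

import numpy as np
from sklearn.datasets import make_blobs
from imblearn.over_sampling import KMeansSMOTE

# Minority samples concentrated in one blob, so at least one k-means
# cluster is minority-dominated and survives the balance filter.
X, y = make_blobs(n_samples=[20, 10], centers=[(0, 0), (5, 5)],
                  random_state=42)

smote = KMeansSMOTE(random_state=42, kmeans_estimator=2, k_neighbors=2)
X_res, y_res = smote.fit_resample(X, y)
print(np.bincount(y_res))  # both classes now have 20 samples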
Example #3
def test_sample_kmeans_custom(data, k_neighbors, kmeans_estimator):
    X, y = data
    kmeans_smote = KMeansSMOTE(random_state=42,
                               kmeans_estimator=kmeans_estimator,
                               k_neighbors=k_neighbors)
    X_resampled, y_resampled = kmeans_smote.fit_resample(X, y)
    assert X_resampled.shape == (24, 2)
    assert y_resampled.shape == (24, )

    assert kmeans_smote.nn_k_.n_neighbors == 3
    assert kmeans_smote.kmeans_estimator_.n_clusters == 3
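The assertions pin down the parametrization: imbalanced-learn builds its internal nn_k_ with one extra neighbour (the sample itself), so an integer k_neighbors=2 yields nn_k_.n_neighbors == 3, and both parameters also accept pre-built estimators. A sketch of a parametrization that satisfies both assertions (the exact values in the source suite are assumptions here):

import pytest
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.neighbors import NearestNeighbors

@pytest.mark.parametrize(
    "k_neighbors", [2, NearestNeighbors(n_neighbors=3)]
)
@pytest.mark.parametrize(
    "kmeans_estimator",
    [3, KMeans(n_clusters=3), MiniBatchKMeans(n_clusters=3)],
)
def test_sample_kmeans_custom(data, k_neighbors, kmeans_estimator):
    ...  # body as in Example #3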
Example #4
def test_kmeans_smote(data):
    X, y = data
    kmeans_smote = KMeansSMOTE(kmeans_estimator=1,
                               random_state=42,
                               cluster_balance_threshold=0.0,
                               k_neighbors=5)
    smote = SMOTE(random_state=42)

    X_res_1, y_res_1 = kmeans_smote.fit_resample(X, y)
    X_res_2, y_res_2 = smote.fit_resample(X, y)

    assert_allclose(X_res_1, X_res_2)
    assert_array_equal(y_res_1, y_res_2)

    assert kmeans_smote.nn_k_.n_neighbors == 6
    assert kmeans_smote.kmeans_estimator_.n_clusters == 1
    assert 'batch_size' in kmeans_smote.kmeans_estimator_.get_params()
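This test shows that with a single cluster and a zero balance threshold, KMeansSMOTE degenerates to plain SMOTE over the whole dataset, which is why the resampled outputs match to numerical precision. The final assertion passes because an integer (or default) kmeans_estimator is expanded into a MiniBatchKMeans, whose signature includes batch_size, unlike plain KMeans. A quick check of that distinction:

from sklearn.cluster import KMeans, MiniBatchKMeans

print('batch_size' in MiniBatchKMeans(n_clusters=1).get_params())  # True
print('batch_size' in KMeans(n_clusters=1).get_params())           # False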
Example #5
    X_train = imp.fit_transform(X_train)  # impute the training set
    X_test = imp.transform(X_test)  # impute the test set

    prep = StandardScaler()
    X_train = prep.fit_transform(X_train)
    X_test = prep.transform(X_test)

    ops_ada = ADASYN(random_state=10)
    ops_bsmote = BorderlineSMOTE(random_state=10)
    ops_ksmote = KMeansSMOTE(random_state=10)
    ops_rs = RandomOverSampler(random_state=10)
    ops_s = SMOTE(random_state=10)

    X_train_ada, y_train_ada = ops_ada.fit_resample(X_train, y_train)
    X_train_bsmote, y_train_bsmote = ops_bsmote.fit_resample(X_train, y_train)
    X_train_ksmote, y_train_ksmote = ops_ksmote.fit_resample(X_train, y_train)
    X_train_rs, y_train_rs = ops_rs.fit_resample(X_train, y_train)
    X_train_s, y_train_s = ops_s.fit_resample(X_train, y_train)

    dic_ = {
        'ADASYN': [X_train_ada, y_train_ada],
        'BorderlineSMOTE': [X_train_bsmote, y_train_bsmote],
        'KMeansSMOTE': [X_train_ksmote, y_train_ksmote],
        'RandomOverSampler': [X_train_rs, y_train_rs],
        'SMOTE': [X_train_s, y_train_s]
    }

    for t in dic_.keys():
        print('over sampler: %s \n' % t)
        X_ = dic_[t][0]
        y_ = dic_[t][1]
        X_t = X_test
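The loop body breaks off above. A hedged continuation, assuming the goal is to train a classifier on each resampled set and score it on the untouched test split (the LogisticRegression here is a stand-in; the original model is not shown):

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

for name, (X_, y_) in dic_.items():
    clf = LogisticRegression(max_iter=1000, random_state=10)  # hypothetical model
    clf.fit(X_, y_)  # train on the resampled training data
    print('over sampler: %s' % name)
    print(classification_report(y_test, clf.predict(X_test)))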
Example #6

    res = adaBoost.predict(features[test_index])
    bl_smote_scores['AB'] += metrics.f1_score(target[test_index], res)
    bl_smote_con_mat['AB'] += confusion_matrix(y_true=target[test_index],
                                               y_pred=res)

    # Gradient Boost Classifier
    gradBoost = GradientBoostingClassifier(random_state=0)
    gradBoost.fit(X_train, y_train)
    res = gradBoost.predict(features[test_index])
    bl_smote_scores['GB'] += metrics.f1_score(target[test_index], res)
    bl_smote_con_mat['GB'] += confusion_matrix(y_true=target[test_index],
                                               y_pred=res)

    # K-Means Smote
    km_smote = KMeansSMOTE(random_state=0)
    X_train, y_train = km_smote.fit_resample(features[train_index],
                                             target[train_index])
    # unique, counts = np.unique(y_train, return_counts=True)
    # print("Kmeans uni, count:",np.asarray((unique, counts)).T)

    # Logistic Regression
    logistic = LogisticRegression(random_state=0)
    logistic.fit(X_train, y_train)
    res = logistic.predict(features[test_index])
    km_scores['LR'] += metrics.f1_score(target[test_index], res)
    km_con_mat['LR'] += confusion_matrix(y_true=target[test_index], y_pred=res)

    # Ada Boost Classifier
    adaBoost = AdaBoostClassifier(random_state=0)
    adaBoost.fit(X_train, y_train)
    res = adaBoost.predict(features[test_index])
    km_scores['AB'] += metrics.f1_score(target[test_index], res)
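The per-fold pattern above, resampling only the training fold and scoring on the untouched test fold, is what prevents leakage, but it is easy to get wrong by hand. imbalanced-learn's own Pipeline automates it: the sampler runs during fit only. A minimal sketch, assuming features and target are the same full arrays used above:

from imblearn.over_sampling import KMeansSMOTE
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

pipe = Pipeline([
    ('smote', KMeansSMOTE(random_state=0)),
    ('clf', LogisticRegression(random_state=0)),
])
# The sampler is applied to each training fold only; every test fold
# is scored on untouched data, exactly as in the manual loop above.
scores = cross_val_score(pipe, features, target, cv=5, scoring='f1')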
Example #7

y = np.ravel(y)
print(y.shape)
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)
# forest.fit(X_train, y_train)
# print("Original set\n{}".format(classification_report(y_test, forest.predict(X_test))))
pca = PCA(n_components=2)
# Fit and transform X_train to visualise it in a 2D feature space
X_vis = pca.fit_transform(X_train)

# Apply KMeansSMOTE over-sampling
kmsmote = KMeansSMOTE(random_state=42)
X_resampled, y_resampled = kmsmote.fit_resample(X_train, y_train)
y_resampled = np.ravel(y_resampled)
forest.fit(X_resampled, y_resampled)
print(Counter(y_resampled))
print(y_resampled.shape)
X_res_vis = pca.transform(X_resampled)

print("KMeansSMOTE\n{}".format(
    classification_report(y_test, forest.predict(X_test))))

f, (ax1, ax2) = plt.subplots(1, 2)

c0 = ax1.scatter(X_vis[y_train == 0, 0],
                 X_vis[y_train == 0, 1],
                 label="Class #0",
                 alpha=0.5)
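The plotting block breaks off after the first scatter. A sketch of how such a two-panel before/after comparison is usually completed, reusing the snippet's variables (the styling follows the common imbalanced-learn gallery layout, not necessarily the original script):

c1 = ax1.scatter(X_vis[y_train == 1, 0],
                 X_vis[y_train == 1, 1],
                 label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0],
            X_res_vis[y_resampled == 0, 1],
            label="Class #0",
            alpha=0.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0],
            X_res_vis[y_resampled == 1, 1],
            label="Class #1",
            alpha=0.5)
ax2.set_title('KMeansSMOTE')
ax2.legend()
plt.show()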
Example #8

def kmeans_smote(x, y):
    print("----KMeans SMOTE----")
    sampler = KMeansSMOTE(random_state=42)
    X_res, y_res = sampler.fit_resample(x, y)
    return X_res, y_res
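A hedged usage sketch for the wrapper, on illustrative data (separated blobs, so KMeansSMOTE can find minority-dominated clusters):

from collections import Counter
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=[900, 100], centers=[(0, 0), (6, 6)],
                  random_state=42)
X_res, y_res = kmeans_smote(X, y)
print(Counter(y), '->', Counter(y_res))  # minority class is oversampled to 900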