def test_iht_wrong_estimator():
    ratio = 0.7
    est = 'rnd'
    iht = InstanceHardnessThreshold(estimator=est,
                                    ratio=ratio,
                                    random_state=RND_SEED)
    with raises(NotImplementedError):
        iht.fit_sample(X, Y)
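# The test snippets in this file rely on module-level fixtures (X, Y, RND_SEED,
# ESTIMATOR) that are defined outside the excerpts shown here. A minimal sketch,
# reconstructed from the ground-truth arrays used below; the exact seed and data
# values are assumptions:
import os

import numpy as np
from numpy.testing import assert_array_equal
from pytest import raises
from sklearn.ensemble import GradientBoostingClassifier

from imblearn.under_sampling import InstanceHardnessThreshold

RND_SEED = 0
X = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189],
              [-0.77740357, 0.74097941], [0.91542919, -0.65453327],
              [-0.03852113, 0.40910479], [-0.43877303, 1.07366684],
              [-0.85795321, 0.82980738], [-0.18430329, 0.52328473],
              [-0.30126957, -0.66268378], [-0.65571327, 0.42412021],
              [-0.28305528, 0.30284991], [0.20246714, -0.34727125],
              [1.06446472, -1.09279772], [0.30543283, -0.02589502],
              [-0.00717161, 0.00318087]])
Y = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0])
ESTIMATOR = GradientBoostingClassifier(random_state=RND_SEED)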
def test_iht_fit_sample_half():
    """Test the fit sample routine with a 0.7 ratio"""

    # Resample the data
    ratio = 0.7
    iht = InstanceHardnessThreshold(ESTIMATOR, ratio=ratio,
                                    random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251],
                     [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941],
                     [0.91542919, -0.65453327],
                     [-0.03852113, 0.40910479],
                     [-0.43877303, 1.07366684],
                     [-0.85795321, 0.82980738],
                     [-0.18430329, 0.52328473],
                     [-0.30126957, -0.66268378],
                     [-0.65571327, 0.42412021],
                     [-0.28305528, 0.30284991],
                     [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502],
                     [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    iht = InstanceHardnessThreshold(ESTIMATOR, return_indices=True,
                                    random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = iht.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251],
                     [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941],
                     [0.91542919, -0.65453327],
                     [-0.43877303, 1.07366684],
                     [-0.85795321, 0.82980738],
                     [-0.18430329, 0.52328473],
                     [-0.65571327, 0.42412021],
                     [-0.28305528, 0.30284991],
                     [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502],
                     [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
    idx_gt = np.array([0, 1, 2, 3, 5, 6, 7, 9, 10, 12, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_iht_fit_sample():
    """Test the fit sample routine"""

    # Resample the data
    iht = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_decision_tree():
    """Test the fit sample routine with decision-tree"""

    # Resample the data
    est = 'decision-tree'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x_dt.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y_dt.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_gradient_boosting():
    """Test the fit sample routine with gradient boosting"""

    # Resample the data
    est = 'gradient-boosting'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x_gb.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y_gb.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_linear_svm():
    """Test the fit sample routine with linear SVM"""

    # Resample the data
    est = 'linear-svm'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x_svm.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y_svm.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample():
    iht = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327],
                     [-0.65571327, 0.42412021], [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502], [-0.00717161, 0.00318087],
                     [-0.09322739, 1.28177189], [-0.77740357, 0.74097941],
                     [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
                     [-0.18430329, 0.52328473], [-0.28305528, 0.30284991]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_knn():
    est = 'knn'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251], [0.91542919, -0.65453327],
                     [-0.65571327, 0.42412021], [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502], [-0.00717161, 0.00318087],
                     [-0.09322739, 1.28177189], [-0.77740357, 0.74097941],
                     [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
                     [-0.30126957, -0.66268378], [0.20246714, -0.34727125]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_with_indices():
    """Test the fit sample routine with indices support"""

    # Resample the data
    iht = InstanceHardnessThreshold(ESTIMATOR, return_indices=True,
                                    random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = iht.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'iht_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'iht_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'iht_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_iht_fit_sample_linear_svm():
    # Resample the data
    est = 'linear-svm'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941], [0.91542919, -0.65453327],
                     [-0.03852113, 0.40910479], [-0.43877303, 1.07366684],
                     [-0.18430329, 0.52328473], [-0.65571327, 0.42412021],
                     [-0.28305528, 0.30284991], [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502], [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_class_obj():
    """Test the fit sample routine passing a ClassifierMixin object"""

    # Resample the data
    est = GradientBoostingClassifier(random_state=RND_SEED)
    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941], [0.91542919, -0.65453327],
                     [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
                     [-0.18430329, 0.52328473], [-0.65571327, 0.42412021],
                     [-0.28305528, 0.30284991], [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502], [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_with_indices():
    # Resample the data
    iht = InstanceHardnessThreshold(
        ESTIMATOR, return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = iht.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941], [0.91542919, -0.65453327],
                     [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
                     [-0.18430329, 0.52328473], [-0.65571327, 0.42412021],
                     [-0.28305528, 0.30284991], [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502], [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
    idx_gt = np.array([0, 1, 2, 3, 5, 6, 7, 9, 10, 12, 13, 14])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_iht_fit_sample_gradient_boosting():
    """Test the fit sample routine with gradient boosting"""

    # Resample the data
    est = 'gradient-boosting'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941], [0.91542919, -0.65453327],
                     [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
                     [-0.18430329, 0.52328473], [-0.65571327, 0.42412021],
                     [-0.28305528, 0.30284991], [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502], [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_linear_svm():
    """Test the fit sample routine with linear SVM"""

    # Resample the data
    est = 'linear-svm'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941], [0.91542919, -0.65453327],
                     [-0.03852113, 0.40910479], [-0.43877303, 1.07366684],
                     [-0.18430329, 0.52328473], [-0.65571327, 0.42412021],
                     [-0.28305528, 0.30284991], [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502], [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_knn():
    """Test the fit sample routine with knn"""

    # Resample the data
    est = 'knn'
    iht = InstanceHardnessThreshold(est, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941], [0.91542919, -0.65453327],
                     [-0.43877303, 1.07366684], [-0.85795321, 0.82980738],
                     [-0.30126957, -0.66268378], [-0.65571327, 0.42412021],
                     [0.20246714, -0.34727125], [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502], [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample():
    iht = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251],
                     [0.91542919, -0.65453327],
                     [-0.65571327, 0.42412021],
                     [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502],
                     [-0.00717161, 0.00318087],
                     [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941],
                     [-0.43877303, 1.07366684],
                     [-0.85795321, 0.82980738],
                     [-0.18430329, 0.52328473],
                     [-0.28305528, 0.30284991]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_half():
    """Test the fit sample routine with a 0.7 ratio"""

    # Resample the data
    ratio = 0.7
    iht = InstanceHardnessThreshold(
        ESTIMATOR, ratio=ratio, random_state=RND_SEED)
    X_resampled, y_resampled = iht.fit_sample(X, Y)

    X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189],
                     [-0.77740357, 0.74097941], [0.91542919, -0.65453327],
                     [-0.03852113, 0.40910479], [-0.43877303, 1.07366684],
                     [-0.85795321, 0.82980738], [-0.18430329, 0.52328473],
                     [-0.30126957, -0.66268378], [-0.65571327, 0.42412021],
                     [-0.28305528, 0.30284991], [1.06446472, -1.09279772],
                     [0.30543283, -0.02589502], [-0.00717161, 0.00318087]])
    y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_iht_fit_sample_wrong_class_obj():
    from sklearn.cluster import KMeans
    est = KMeans()
    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
    with raises(ValueError, match="Invalid parameter `estimator`"):
        iht.fit_sample(X, Y)
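# The plotting example below assumes an imbalanced toy dataset and a plotting setup
# roughly like the following; the make_classification parameters and colour choices
# are assumptions:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

from imblearn.under_sampling import InstanceHardnessThreshold

almost_black = '#262626'
palette = sns.color_palette()

X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)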
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Four subplots in a 2x2 grid; flatten the axes array for iteration
f, axs = plt.subplots(2, 2)

axs = [a for ax in axs for a in ax]
for ax, ratio in zip(axs, [0.0, 0.1, 0.3, 0.5]):
    if ratio == 0.0:
        ax.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                   alpha=0.5, edgecolor=almost_black, facecolor=palette[0],
                   linewidth=0.15)
        ax.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                   alpha=0.5, edgecolor=almost_black, facecolor=palette[2],
                   linewidth=0.15)
        ax.set_title('Original set')
    else:
        iht = InstanceHardnessThreshold(ratio=ratio)
        X_res, y_res = iht.fit_sample(X, y)
        X_res_vis = pca.transform(X_res)

        ax.scatter(X_res_vis[y_res == 0, 0], X_res_vis[y_res == 0, 1],
                   label="Class #0", alpha=.5, edgecolor=almost_black,
                   facecolor=palette[0], linewidth=0.15)
        ax.scatter(X_res_vis[y_res == 1, 0], X_res_vis[y_res == 1, 1],
                   label="Class #1", alpha=.5, edgecolor=almost_black,
                   facecolor=palette[2], linewidth=0.15)
        ax.set_title('Instance Hardness Threshold ({})'.format(ratio))

plt.show()
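# The snippet below assumes a pandas DataFrame `dataset` loaded elsewhere and an
# older scikit-learn/imblearn API (Imputer, fit_sample). The imports it relies on
# would look roughly like this:
from flask import Flask
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer, LabelEncoder

from imblearn.under_sampling import InstanceHardnessThreshold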
dataset.drop(dataset.columns[[26, 27]], axis=1, inplace=True)

values = dataset.values
X = values[:, 0:33]
y = values[:, 33]

labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

imputer = Imputer(strategy='median')
X = imputer.fit_transform(X)

iht = InstanceHardnessThreshold(random_state=12)
X = X.astype(int)
y = y.astype(int)
X, y = iht.fit_sample(X, y)
#print('Amount of each class after under-sampling: {0}'.format(Counter(y)))

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=12)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg.predict(X_test)

app = Flask(__name__)


@app.route('/')
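# The excerpt stops at the route decorator; a hypothetical view function to make
# the Flask app runnable (name and response text are assumptions):
def index():
    return 'Logistic regression model trained on IHT-resampled data is ready.'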
                   label="Class #0",
                   alpha=0.5,
                   edgecolor=almost_black,
                   facecolor=palette[0],
                   linewidth=0.15)
        ax.scatter(X_vis[y == 1, 0],
                   X_vis[y == 1, 1],
                   label="Class #1",
                   alpha=0.5,
                   edgecolor=almost_black,
                   facecolor=palette[2],
                   linewidth=0.15)
        ax.set_title('Original set')
    else:
        iht = InstanceHardnessThreshold(ratio=ratio)
        X_res, y_res = iht.fit_sample(X, y)
        X_res_vis = pca.transform(X_res)

        ax.scatter(X_res_vis[y_res == 0, 0],
                   X_res_vis[y_res == 0, 1],
                   label="Class #0",
                   alpha=.5,
                   edgecolor=almost_black,
                   facecolor=palette[0],
                   linewidth=0.15)
        ax.scatter(X_res_vis[y_res == 1, 0],
                   X_res_vis[y_res == 1, 1],
                   label="Class #1",
                   alpha=.5,
                   edgecolor=almost_black,
                   facecolor=palette[2],
'''
NeighbourhoodCleaningRule focuses on cleaning the data rather than condensing it:
it removes the union of the samples rejected by EditedNearestNeighbours and the
samples misclassified by a 3-NN classifier.
'''
from imblearn.under_sampling import NeighbourhoodCleaningRule
ncr = NeighbourhoodCleaningRule(random_state=0)
X_resampled, y_resampled = ncr.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

# InstanceHardnessThreshold is a somewhat special method: it fits a classifier on the
# data and removes the samples whose predicted probability falls below a threshold.
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import InstanceHardnessThreshold
iht = InstanceHardnessThreshold(random_state=0,
                                estimator=LogisticRegression())
X_resampled, y_resampled = iht.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
#[(0, 64), (1, 64), (2, 64)]


'''
Combining over-sampling and under-sampling
With SMOTE, interpolating between boundary samples and other samples can easily
generate noisy data, so the samples need to be cleaned after over-sampling. The
TomekLinks and EditedNearestNeighbours methods covered in section 3 do exactly
that, which gives two methods combining over- and under-sampling:
(i) SMOTETomek and (ii) SMOTEENN.
'''
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))
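# The note above also mentions SMOTETomek; a parallel sketch using the same API:
from imblearn.combine import SMOTETomek
smote_tomek = SMOTETomek(random_state=0)
X_resampled, y_resampled = smote_tomek.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))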
# hamloss=hamming_loss(y_test_ncr,pred10)
# print('ncr_hamming_loss',hamloss)
'''
ncr_score: [0.77144204 0.77155985 0.77129728 0.77165076 0.7719505 ]
ncr_accuracy_score: 0.7716271945328149
ncr_f1_score: 0.771627194532815
ncr_cohen_kappa_score: 0.0005376844273261572
ncr_hamming_loss 0.2283728054671851
'''

# InstanceHardnessThreshold is a somewhat special method: it fits a classifier on the
# data and removes the samples whose predicted probability falls below a threshold.
from imblearn.under_sampling import InstanceHardnessThreshold
from sklearn.linear_model import LogisticRegression
iht = InstanceHardnessThreshold(random_state=0,
                                estimator=LogisticRegression())
X_resampled_iht, y_resampled_iht = iht.fit_sample(train_set_1_1, label)
print(sorted(Counter(y_resampled_iht).items()))
x_train_iht, x_test_iht, y_train_iht, y_test_iht = train_test_split(
    X_resampled_iht, y_resampled_iht, random_state=1)
svm_clf.fit(x_train_iht, y_train_iht)
#joblib.dump(svm_clf,'../model/iht_sample_model.pkl')
 
# Evaluate the classifier trained on the IHT-resampled data
from sklearn.model_selection import cross_val_score
scores=cross_val_score(svm_clf,x_test_iht,y_test_iht,cv=5)
print('iht_score:',scores)
pred11 = svm_clf.predict(x_test_iht)
print('iht_accuracy_score:',metrics.accuracy_score(y_test_iht, pred11))
print('iht_f1_score:',metrics.f1_score(y_test_iht, pred11,average="micro"))
from sklearn.metrics import cohen_kappa_score  # Cohen's kappa: an evaluation metric computed from the confusion matrix
kappa = cohen_kappa_score(y_test_iht,pred11)
print('iht_cohen_kappa_score:',kappa)
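# Hamming loss can be reported as well, mirroring the NCR metrics quoted above:
from sklearn.metrics import hamming_loss
print('iht_hamming_loss:', hamming_loss(y_test_iht, pred11))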