Example #1
def test_fit_resample_auto():
    sampling_strategy = 'auto'
    bc = BalanceCascade(sampling_strategy=sampling_strategy,
                        random_state=RND_SEED,
                        return_indices=True)
    X_resampled, y_resampled, idx_under = bc.fit_resample(X, Y)
    X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
                      [0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234]],
                     [[0.28893132, -0.38761769], [0.83680821, 1.72827342],
                      [0.3084254, 0.33299982], [0.70472253, -0.73309052],
                      [-0.14374509, 0.27370049], [0.77481731, 0.60935141],
                      [-0.18410027, -0.45194484], [1.15514042, 0.0129463],
                      [0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
    idx_gt = np.array(
        [[10, 18, 8, 16, 6, 14, 5, 13, 0, 2, 3, 4, 11, 12, 17, 19],
         [9, 6, 7, 8, 16, 1, 14, 10, 0, 2, 3, 4, 11, 12, 17, 19]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
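Since return_indices=True is requested above, it may help to note that each row of idx_under indexes back into the original X and Y, so every resampled subset can be reconstructed by fancy indexing. The check below is a sketch, not part of the original test; it assumes the X/Y fixtures and the ground-truth arrays defined above are in scope.

# Sketch: the returned indices point into the original arrays, so
# X[idx] and Y[idx] reproduce each resampled subset.
for idx, X_sub, y_sub in zip(idx_gt, X_gt, y_gt):
    assert_array_equal(X[idx], X_sub)
    assert_array_equal(Y[idx], y_sub)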
def test_fit_sample_auto_early_stop():
    """Test the fit and sample routine with auto ratio with 1 subset."""

    # Define the ratio parameter
    ratio = 'auto'
    n_subset = 1

    # Create the sampling object
    bc = BalanceCascade(
        ratio=ratio,
        random_state=RND_SEED,
        return_indices=True,
        n_max_subset=n_subset)

    # Get the different subset
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)

    X_gt = np.array([[[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]])

    y_gt = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]])
    idx_gt = np.array(
        [[0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, 13]])
    # Check each array
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #3
def deep_ensemble_merged(smote=None):
    dt = DecisionTreeClassifier(max_features=0.2, random_state=KFOLD_SEED)
    ensembler = BalanceCascade(estimator=dt,
                               n_max_subset=10,
                               random_state=KFOLD_SEED)

    print("fitting sample")
    X_res, y_res = ensembler.fit_sample(features, labels_1d)
    print(X_res.shape, y_res.shape)

    print("training")

    # Merge sample batches
    Xs = None
    ys = None
    for i in range(len(X_res)):
        if Xs is None:
            Xs = np.array(X_res[i])
            ys = np.array(y_res[i])
            print(Xs.shape, ys.shape)
        else:
            Xs = np.concatenate((Xs, np.array(X_res[i])))
            ys = np.concatenate((ys, np.array(y_res[i])))

    print(Xs.shape, ys.shape)
    # sklearn's shuffle returns shuffled copies, so re-assign the result
    Xs, ys = shuffle(Xs, ys)

    # Generate more synthetic samples
    if smote is not None:
        Xs, ys = smote.fit_sample(Xs, ys)

    Xs, ys = shuffle(Xs, ys)
    ys = to_categorical(ys, 2)

    return Xs, ys
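The loop above concatenates the per-subset batches one at a time. An equivalent merge can be written with a single reshape, sketched below; it assumes every subset has the same length, which BalanceCascade does not guarantee (the last cascade subset can be shorter), so the loop-and-concatenate form above remains the safer general approach.

# Sketch of an equivalent merge, assuming equal-length subsets.
import numpy as np

X_arr = np.asarray(X_res)   # shape: (n_subsets, n_samples, n_features)
y_arr = np.asarray(y_res)   # shape: (n_subsets, n_samples)
Xs = X_arr.reshape(-1, X_arr.shape[-1])
ys = y_arr.reshape(-1)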
def test_give_classifier_wrong_obj():
    ratio = 'auto'
    classifier = 2
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
                        return_indices=True, estimator=classifier)
    with raises(ValueError, match="Invalid parameter `estimator`"):
        bc.fit_sample(X, Y)
Example #5
def Balance_classes(X_train, y_train, Sampling_Function):
    if Sampling_Function == 'RandomUnderSampler':
        us = RandomUnderSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'NearMiss1':
        us = NearMiss(ratio=0.5, random_state=1, version=1, size_ngh=3)
    elif Sampling_Function == 'NearMiss2':
        us = NearMiss(ratio=0.5, random_state=1, version=2, size_ngh=3)
    elif Sampling_Function == 'NearMiss3':
        us = NearMiss(ratio=0.5, random_state=1, version=3, ver3_samp_ngh=3)
    elif Sampling_Function == 'CondensedNearestNeighbour':
        us = CondensedNearestNeighbour(random_state=1)
    elif Sampling_Function == 'EditedNearestNeighbours':
        us = EditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'RepeatedEditedNearestNeighbours':
        us = RepeatedEditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'TomekLinks':
        us = TomekLinks(random_state=1)
    elif Sampling_Function == 'RandomOverSampler':
        us = RandomOverSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'SMOTE':
        us = SMOTE(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTETomek':
        us = SMOTETomek(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTEENN':
        us = SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5)
    elif Sampling_Function == 'EasyEnsemble':
        us = EasyEnsemble()
    elif Sampling_Function == 'BalanceCascade_rf':
        us = BalanceCascade(classifier='random-forest', random_state=1)
    elif Sampling_Function == 'BalanceCascade_svm':
        us = BalanceCascade(classifier='linear-svm', random_state=1)

    X_train_res, y_train_res = us.fit_sample(X_train, y_train)

    return X_train_res, y_train_res
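A minimal call-site sketch for this dispatcher follows. The synthetic dataset and the chosen strategy name are illustrative assumptions, and the sketch targets the same legacy imbalanced-learn API (ratio=, size_ngh=, fit_sample) used by the function above. Note that the EasyEnsemble and BalanceCascade branches return one batch per generated subset rather than a flat array, so callers typically index the first subset or loop over them, as other examples on this page do.

# Usage sketch with assumed data (not part of the original snippet).
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=1000, weights=[0.9, 0.1],
                                     random_state=1)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=1)
X_bal, y_bal = Balance_classes(X_tr, y_tr, 'RandomUnderSampler')
print(Counter(y_tr), '->', Counter(y_bal))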
def test_fit_sample_auto_early_stop():
    """Test the fit and sample routine with auto ratio with a static number
    of subsets."""

    # Define the ratio parameter
    ratio = 'auto'
    n_subset = 4

    # Create the sampling object
    bc = BalanceCascade(ratio=ratio,
                        random_state=RND_SEED,
                        return_indices=True,
                        n_max_subset=n_subset)

    # Get the different subset
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'bc_x_n_sub.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'bc_y_n_sub.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'bc_idx_n_sub.npy'))
    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
        assert_array_equal(idx_under[idx], idx_gt[idx])
Example #7
def test_fit_sample_auto_early_stop():
    """Test the fit and sample routine with auto ratio with 1 subset."""

    # Define the ratio parameter
    ratio = 'auto'
    n_subset = 1

    # Create the sampling object
    bc = BalanceCascade(ratio=ratio,
                        random_state=RND_SEED,
                        return_indices=True,
                        n_max_subset=n_subset)

    # Get the different subset
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)

    X_gt = np.array([[[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]])

    y_gt = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]])
    idx_gt = np.array(
        [[0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, 13]])
    # Check each array
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #8
def test_fit_sample_half():
    ratio = 0.8
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)
    X_resampled, y_resampled = bc.fit_sample(X, Y)
    X_gt = np.array([[[1.15514042, 0.0129463],
                      [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052],
                      [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342],
                      [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981],
                      [-1.11515198, -0.93689695],
                      [0.9281014, 0.53085498],
                      [0.3084254, 0.33299982],
                      [0.11622591, -0.0317206],
                      [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976],
                      [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207],
                      [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653],
                      [1.70580611, -0.11219234]]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #9
def ensemble_adaboost(feat, label):
    print(type(label))
    print(Counter(label))
    bm = BalanceCascade(random_state=19, estimator='adaboost')
    feat_res, label_res = bm.fit_sample(feat, label)
    print(label_res.shape)
    return feat_res, label_res
def test_fit_sample_half():
    ratio = {0: 8, 1: 10}
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)
    X_resampled, y_resampled = bc.fit_sample(X, Y)
    X_gt = np.array([[[-0.41635887, -0.38299653],
                      [0.53366841, -0.30312976],
                      [1.25192108, -0.22367336],
                      [1.70580611, -0.11219234],
                      [1.52091956, -0.49283504],
                      [0.11622591, -0.0317206],
                      [1.31301027, -0.92648734],
                      [0.88407872, 0.35454207],
                      [0.3084254, 0.33299982],
                      [0.08711622, 0.93259929],
                      [-0.28162401, -2.10400981],
                      [-0.14374509, 0.27370049],
                      [0.9281014, 0.53085498],
                      [-0.18410027, -0.45194484],
                      [0.77481731, 0.60935141],
                      [1.15514042, 0.0129463],
                      [-1.11515198, -0.93689695],
                      [0.70472253, -0.73309052]]])
    y_gt = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_fit_sample_auto_gradient_boosting():
    """Test the fit and sample routine with auto ratio with a gradient
    boosting."""

    # Define the ratio parameter
    ratio = 'auto'
    classifier = 'gradient-boosting'

    # Create the sampling object
    bc = BalanceCascade(ratio=ratio,
                        random_state=RND_SEED,
                        return_indices=True,
                        classifier=classifier)

    # Get the different subset
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'bc_x_gb.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'bc_y_gb.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'bc_idx_gb.npy'))
    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
        assert_array_equal(idx_under[idx], idx_gt[idx])
Example #12
def test_give_classifier_obj():
    ratio = 'auto'
    classifier = RandomForestClassifier(random_state=RND_SEED)
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
                        return_indices=False, estimator=classifier)
    X_resampled, y_resampled = bc.fit_sample(X, Y)
    X_gt = np.array([[[1.15514042, 0.0129463],
                      [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052],
                      [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342],
                      [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981],
                      [-1.11515198, -0.93689695],
                      [0.11622591, -0.0317206],
                      [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976],
                      [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207],
                      [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653],
                      [1.70580611, -0.11219234]]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_give_classifier_obj():
    ratio = 'auto'
    estimator = RandomForestClassifier(random_state=RND_SEED)
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
                        return_indices=False, estimator=estimator)
    X_resampled, y_resampled = bc.fit_sample(X, Y)
    X_gt = np.array([[[1.15514042, 0.0129463],
                      [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052],
                      [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342],
                      [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981],
                      [-1.11515198, -0.93689695],
                      [0.11622591, -0.0317206],
                      [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976],
                      [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207],
                      [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653],
                      [1.70580611, -0.11219234]]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #14
def test_fit_sample_auto_linear_svm():
    ratio = 'auto'
    classifier = 'linear-svm'
    bc = BalanceCascade(ratio=ratio,
                        random_state=RND_SEED,
                        return_indices=False,
                        classifier=classifier)
    X_resampled, y_resampled = bc.fit_sample(X, Y)
    X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
                      [0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234]],
                     [[1.15514042, 0.0129463], [0.9281014, 0.53085498],
                      [0.3084254, 0.33299982], [0.28893132, -0.38761769],
                      [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                      [0.70472253, -0.73309052], [0.77481731, 0.60935141],
                      [0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    bc = BalanceCascade(random_state=RND_SEED)
    bc.fit(X, Y)
    assert_raises(RuntimeError, bc.sample, np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
Example #16
def test_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    bc = BalanceCascade(random_state=RND_SEED)
    bc.fit(X, Y)
    assert_raises(RuntimeError, bc.sample, np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
def test_rf_wth_bootstrap():
    """Test the fit and sample routine with auto ratio with a random
    forest."""

    # Define the ratio parameter
    ratio = 'auto'
    classifier = RandomForestClassifier(random_state=RND_SEED)

    # Create the sampling object
    bc = BalanceCascade(
        ratio=ratio,
        random_state=RND_SEED,
        return_indices=True,
        estimator=classifier,
        bootstrap=False)

    # Get the different subset
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)

    X_gt = np.array(
        [
            np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]),
            np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.77481731, 0.60935141],
                      [0.3084254, 0.33299982], [0.28893132, -0.38761769],
                      [0.9281014, 0.53085498]])
        ],
        dtype=object)
    y_gt = np.array(
        [
            np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]),
            np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
        ],
        dtype=object)
    idx_gt = np.array(
        [
            np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5,
                      13]),
            np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 1, 7, 9, 15])
        ],
        dtype=object)

    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
        assert_array_equal(idx_under[idx], idx_gt[idx])
Example #18
def test_give_classifier_wrong_obj():
    sampling_strategy = 'auto'
    classifier = 2
    bc = BalanceCascade(sampling_strategy=sampling_strategy,
                        random_state=RND_SEED,
                        return_indices=True,
                        estimator=classifier)
    with raises(ValueError, match="Invalid parameter `estimator`"):
        bc.fit_resample(X, Y)
def test_rf_wth_bootstrap():
    # Define the ratio parameter
    ratio = 'auto'
    classifier = RandomForestClassifier(random_state=RND_SEED)

    # Create the sampling object
    bc = BalanceCascade(
        ratio=ratio,
        random_state=RND_SEED,
        return_indices=True,
        estimator=classifier,
        bootstrap=False)

    # Get the different subset
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)

    X_gt = np.array(
        [
            np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]),
            np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.77481731, 0.60935141],
                      [0.3084254, 0.33299982], [0.28893132, -0.38761769],
                      [0.9281014, 0.53085498]])
        ],
        dtype=object)
    y_gt = np.array(
        [
            np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]),
            np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
        ],
        dtype=object)
    idx_gt = np.array(
        [
            np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5,
                      13]),
            np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 1, 7, 9, 15])
        ],
        dtype=object)

    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
        assert_array_equal(idx_under[idx], idx_gt[idx])
Example #20
def test_multiclass_error():
    """ Test either if an error is raised when the target are not binary
    type. """

    # continuous case
    y = np.linspace(0, 1, 20)
    bc = BalanceCascade(random_state=RND_SEED)
    assert_warns(UserWarning, bc.fit, X, y)

    # multiclass case
    y = np.array([0] * 3 + [1] * 2 + [2] * 15)
    bc = BalanceCascade(random_state=RND_SEED)
    assert_warns(UserWarning, bc.fit, X, y)
def test_fit_sample_auto_early_stop_2():
    """Test the fit and sample routine with auto ratio with a 2 subsets."""

    # Define the ratio parameter
    ratio = 'auto'
    n_subset = 2

    # Create the sampling object
    bc = BalanceCascade(ratio=ratio,
                        random_state=RND_SEED,
                        return_indices=True,
                        n_max_subset=n_subset)

    # Get the different subset
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)

    X_gt = np.array([
        np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                  [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                  [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                  [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                  [1.15514042, 0.0129463], [0.08711622, 0.93259929],
                  [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                  [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                  [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]),
        np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                  [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                  [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                  [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                  [1.15514042, 0.0129463], [0.70472253, -0.73309052],
                  [-0.18410027, -0.45194484], [0.77481731, 0.60935141],
                  [0.3084254, 0.33299982], [0.28893132, -0.38761769],
                  [0.9281014, 0.53085498]])
    ],
                    dtype=object)
    y_gt = np.array([
        np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]),
        np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])
    ],
                    dtype=object)
    idx_gt = np.array([
        np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 11, 4, 10, 2, 8, 1, 7]),
        np.array([0, 2, 3, 4, 11, 12, 17, 19, 6, 4, 8, 0, 3, 5, 9])
    ],
                      dtype=object)

    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
        assert_array_equal(idx_under[idx], idx_gt[idx])
Example #22
def unbalanceProcess(params, X_train, y_train):
    pos_num = np.sum(y_train == 0)
    neg_num = y_train.shape[0] - pos_num
    ratio = {0: int(pos_num * 0.2),
             1: int(neg_num * 1)}
    y_train = y_train.astype("int")
    sm = BalanceCascade(sampling_strategy=ratio,  # replacement=True,
                        random_state=params['random-state'], n_max_subset=10,
                        estimator=LogisticRegression(solver='sag', max_iter=200, random_state=0))

    X_train_res, y_train_res = sm.fit_sample(X_train, y_train)
    # BalanceCascade returns one batch per subset; keep only the first subset
    X_train_res = X_train_res[0]
    y_train_res = y_train_res[0]

    return X_train_res, y_train_res
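The ratio dict built at the top of this function maps each class label to the number of samples to keep, which BalanceCascade then uses as its sampling_strategy. The same target counts can be computed with Counter; the snippet below is a sketch, equivalent to the pos_num/neg_num arithmetic above and assuming y_train is already integer-encoded.

# Sketch: same per-class target counts via Counter (assumed names).
from collections import Counter

counts = Counter(y_train)
ratio = {0: int(counts[0] * 0.2), 1: int(counts[1])}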
Example #23
def test_give_classifier_wrong_obj():
    ratio = 'auto'
    classifier = 2
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
                        return_indices=True, estimator=classifier)
    assert_raises_regex(ValueError, "Invalid parameter `estimator`",
                        bc.fit_sample, X, Y)
Example #24
def test_bc_fit():
    """Test the fitting method"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    bc.fit(X, Y)

    # Check if the data information have been computed
    assert_equal(bc.min_c_, 0)
    assert_equal(bc.maj_c_, 1)
    assert_equal(bc.stats_c_[0], 8)
    assert_equal(bc.stats_c_[1], 12)
def test_bc_fit():
    """Test the fitting method"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    bc.fit(X, Y)

    # Check if the data information have been computed
    assert_equal(bc.min_c_, 0)
    assert_equal(bc.maj_c_, 1)
    assert_equal(bc.stats_c_[0], 500)
    assert_equal(bc.stats_c_[1], 4500)
Example #26
def test_init_wrong_classifier():
    """Test either if an error is raised the classifier provided is unknown."""

    # Define an unknown classifier
    classifier = 'rnd'

    bc = BalanceCascade(classifier=classifier)
    assert_raises(NotImplementedError, bc.fit_sample, X, Y)
Example #27
def cross_validation_ensenble(name):
    with open('../data/conv_pred/train_data2_' + name + '.pickle', 'rb') as f:
        data = pickle.load(f)
    v = DictVectorizer()
    X = v.fit_transform(data['X'])
    y = np.array(data['y'])

    cv = 5
    kf = KFold(n_splits=cv)
    fscore = 0
    ftscore = 0
    all_f_value = 0
    for train_index, test_index in tqdm(kf.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        ensenble_num = 1

        bc = BalanceCascade(estimator=LogisticRegression(),
                            n_max_subset=ensenble_num)
        bc_x, bc_y = bc.fit_sample(X_train, y_train)
        models = []
        predicts = []
        final_result = []
        # Create one classifier per cascade subset
        for i in range(ensenble_num):
            models.append(
                xgb.XGBClassifier(n_estimators=500, max_delta_step=1))
        for i in range(ensenble_num):
            models[i].fit(bc_x[i], bc_y[i])
        for i in range(ensenble_num):
            predicts.append(models[i].predict_proba(X_test))
        for i in range(len(predicts[0])):
            result = [0, 0]
            for j in range(ensenble_num):
                result[0] += predicts[j][i][0] / ensenble_num
                result[1] += predicts[j][i][1] / ensenble_num
            final_result.append(result)
        precision, recall, f_value, _ = eval(y_test, final_result)
        fscore += precision
        ftscore += recall
        all_f_value += f_value
    # pprint(sorted(
    #     zip(np.mean([est.steps[1][1].feature_importances_ for est in model.estimators_], axis=0), v.feature_names_),
    #     key=lambda x: x[0], reverse=True))
    print('\n')
    print('final precision : ', str(fscore / cv))
    print('final recall : ', str(ftscore / cv))
    print('final f-value : ', str(all_f_value / cv))
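The nested loop that averages the per-model predict_proba outputs above can be written more compactly with NumPy; a sketch, assuming the models list and X_test from the fold loop are in scope:

# Compact equivalent of the probability-averaging loop above (a sketch).
import numpy as np

final_result = np.mean(
    [m.predict_proba(X_test) for m in models], axis=0).tolist()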
def test_fit_sample_auto():
    ratio = 'auto'
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
                        return_indices=True)
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)
    X_gt = np.array([[[1.15514042, 0.0129463],
                      [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052],
                      [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342],
                      [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981],
                      [-1.11515198, -0.93689695],
                      [0.11622591, -0.0317206],
                      [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976],
                      [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207],
                      [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653],
                      [1.70580611, -0.11219234]],
                     [[0.28893132, -0.38761769],
                      [0.83680821, 1.72827342],
                      [0.3084254, 0.33299982],
                      [0.70472253, -0.73309052],
                      [-0.14374509, 0.27370049],
                      [0.77481731, 0.60935141],
                      [-0.18410027, -0.45194484],
                      [1.15514042, 0.0129463],
                      [0.11622591, -0.0317206],
                      [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976],
                      [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207],
                      [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653],
                      [1.70580611, -0.11219234]]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
    idx_gt = np.array(
        [[10, 18, 8, 16, 6, 14, 5, 13, 0, 2, 3, 4, 11, 12, 17, 19],
         [9, 6, 7, 8, 16, 1, 14, 10, 0, 2, 3, 4, 11, 12, 17, 19]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #29
def test_bc_fit_invalid_ratio():
    """Test either if an error is raised when the balancing ratio to fit is
    smaller than the one of the data"""

    # Create the object
    ratio = 1. / 10000.
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    assert_raises(RuntimeError, bc.fit_sample, X, Y)
def test_bc_init():
    # Define a ratio
    ratio = 1.
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)

    assert_equal(bc.ratio, ratio)
    assert_equal(bc.bootstrap, True)
    assert_equal(bc.n_max_subset, None)
    assert_equal(bc.random_state, RND_SEED)
def test_fit_sample_half():
    """Test the fit and sample routine with 0.5 ratio."""

    # Define the ratio parameter
    ratio = 0.8

    # Create the sampling object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, bootstrap=False)

    # Get the different subset
    X_resampled, y_resampled = bc.fit_sample(X, Y)

    X_gt = np.array(
        [
            np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
                      [0.9281014, 0.53085498], [0.3084254, 0.33299982]]),
            np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                      [1.15514042, 0.0129463], [0.70472253, -0.73309052],
                      [-0.18410027, -0.45194484], [0.77481731, 0.60935141],
                      [0.28893132, -0.38761769]])
        ],
        dtype=object)

    y_gt = np.array(
        [
            np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
            np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
        ],
        dtype=object)
    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
Example #32
def test_sample_wt_fit():
    """Test either if an error is raised when sample is called before
    fitting"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)
    assert_raises(RuntimeError, bc.sample, X, Y)
Example #33
def test_fit_sample_half():
    """Test the fit and sample routine with 0.5 ratio."""

    # Define the ratio parameter
    ratio = 0.5

    # Create the sampling object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)

    # Get the different subset
    X_resampled, y_resampled = bc.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'bc_x_05.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'bc_y_05.npy'))
    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
Example #34
def smot(train_x, train_y, feature_columns):
    from imblearn.ensemble import BalanceCascade
    from sklearn.ensemble import RandomForestClassifier

    #sm = RandomOverSampler(ratio='majority')
    #from imblearn.ensemble import BalanceCascade

    sm = BalanceCascade(random_state=42, classifier=RandomForestClassifier())

    print('Class distribution of y: {}'.format(Counter(train_y)))
    X_res, y_res = sm.fit_sample(train_x, train_y)

    # BalanceCascade returns one batch per subset; keep the first subset so
    # the result can be rebuilt as a 2-D DataFrame/Series.
    train_y = pd.Series(y_res[0])
    print('Class distribution of y after resampling: {}'.format(
        Counter(train_y)))

    # Rebuild the DataFrame from the resampled features
    train_x = pd.DataFrame(X_res[0], columns=feature_columns)

    return train_x, train_y
Example #35
def test_fit_resample_auto_early_stop():
    sampling_strategy = 'auto'
    estimator = LinearSVC(random_state=RND_SEED)
    bc = BalanceCascade(sampling_strategy=sampling_strategy,
                        random_state=RND_SEED,
                        return_indices=False,
                        estimator=estimator,
                        n_max_subset=1)
    X_resampled, y_resampled = bc.fit_resample(X, Y)
    X_gt = np.array([[[1.15514042, 0.0129463], [0.08711622, 0.93259929],
                      [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                      [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                      [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
                      [0.11622591, -0.0317206], [1.25192108, -0.22367336],
                      [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                      [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                      [-0.41635887, -0.38299653], [1.70580611, -0.11219234]]])
    y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #36
def test_bc_init():
    """Test the initialisation of the object"""

    # Define a ratio
    ratio = 1.
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)

    assert_equal(bc.ratio, ratio)
    assert_equal(bc.bootstrap, True)
    assert_equal(bc.n_max_subset, None)
    assert_equal(bc.random_state, RND_SEED)
Example #37
def test_fit_sample_half():
    """Test the fit and sample routine with 0.5 ratio."""

    # Define the ratio parameter
    ratio = 0.8

    # Create the sampling object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, bootstrap=False)

    # Get the different subset
    X_resampled, y_resampled = bc.fit_sample(X, Y)

    X_gt = np.array([
        np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                  [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                  [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                  [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                  [1.15514042, 0.0129463], [0.08711622, 0.93259929],
                  [0.70472253, -0.73309052], [-0.14374509, 0.27370049],
                  [0.83680821, 1.72827342], [-0.18410027, -0.45194484],
                  [-0.28162401, -2.10400981], [-1.11515198, -0.93689695],
                  [0.9281014, 0.53085498], [0.3084254, 0.33299982]]),
        np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                  [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                  [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                  [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                  [1.15514042, 0.0129463], [0.70472253, -0.73309052],
                  [-0.18410027, -0.45194484], [0.77481731, 0.60935141],
                  [0.28893132, -0.38761769]])
    ],
                    dtype=object)

    y_gt = np.array([
        np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
        np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    ],
                    dtype=object)
    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
Example #38
def test_bc_fit_single_class():
    """Test either if an error when there is a single class"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED)
    # Resample the data
    # Create a wrong y
    y_single_class = np.zeros((X.shape[0], ))
    assert_warns(UserWarning, bc.fit, X, y_single_class)
def test_fit_sample_auto():
    """Test the fit and sample routine with auto ratio."""

    # Define the ratio parameter
    ratio = 'auto'

    # Create the sampling object
    bc = BalanceCascade(ratio=ratio, random_state=RND_SEED,
                        return_indices=True)

    # Get the different subset
    X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'bc_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'bc_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'bc_idx.npy'))
    # Check each array
    for idx in range(X_gt.size):
        assert_array_equal(X_resampled[idx], X_gt[idx])
        assert_array_equal(y_resampled[idx], y_gt[idx])
        assert_array_equal(idx_under[idx], idx_gt[idx])
Example #40
def test_bc_bad_ratio():
    """Test either if an error is raised with a wrong decimal value for
    the ratio"""

    # Define a negative ratio
    ratio = -1.0
    bc = BalanceCascade(ratio=ratio)
    assert_raises(ValueError, bc.fit, X, Y)

    # Define a ratio greater than 1
    ratio = 100.0
    bc = BalanceCascade(ratio=ratio)
    assert_raises(ValueError, bc.fit, X, Y)

    # Define ratio as an unknown string
    ratio = 'rnd'
    bc = BalanceCascade(ratio=ratio)
    assert_raises(ValueError, bc.fit, X, Y)

    # Define ratio as a list which is not supported
    ratio = [.5, .5]
    bc = BalanceCascade(ratio=ratio)
    assert_raises(ValueError, bc.fit, X, Y)
print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.3, 0.7],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Balance Cascade method
bc = BalanceCascade()
X_resampled, y_resampled = bc.fit_resample(X, y)
X_res_vis = []
for X_res in X_resampled:
    X_res_vis.append(pca.transform(X_res))

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5)
for iy, e in enumerate(X_res_vis):
    ax2.scatter(e[y_resampled[iy] == 1, 0], e[y_resampled[iy] == 1, 1],