Example #1
0
def test_classification_imbalanced_multiple_multiclass():
    """Check resampling with two large classes and one small minority class.

    Three sampler configurations are exercised on the same data: a
    ``min_samples`` just above the minority size, the defaults, and an
    explicit ``balanced_ratio`` of 3.
    """
    features = pd.DataFrame({"a": list(range(10000))})
    # classes 0 and 1 have 4900 rows each; class 2 has 200 rows (2% of the data)
    target = pd.Series([0] * 4900 + [1] * 4900 + [2] * 200)

    # min_samples=201 exceeds the 200-row minority: expect the data back unchanged
    sampler = BalancedClassificationSampler(min_samples=201)
    kept = sampler.fit_resample(features, target)
    sampled_X, sampled_y = features.loc[kept], target.loc[kept]
    pd.testing.assert_frame_equal(features, sampled_X)
    pd.testing.assert_series_equal(target, sampled_y)

    # default settings: both large classes are expected to shrink to 800 rows
    sampler = BalancedClassificationSampler()
    kept = sampler.fit_resample(features, target)
    sampled_X, sampled_y = features.loc[kept], target.loc[kept]
    assert len(sampled_X) == 1800
    assert all(sampled_y.value_counts().values == [800, 800, 200])
    assert sampled_y.value_counts()[2] == 200

    # balanced_ratio=3: both large classes are expected at 3x the minority (600 vs 200)
    sampler = BalancedClassificationSampler(balanced_ratio=3)
    kept = sampler.fit_resample(features, target)
    sampled_X, sampled_y = features.loc[kept], target.loc[kept]
    assert len(sampled_X) == 1400
    assert all(sampled_y.value_counts().values == [600, 600, 200])
    assert sampled_y.value_counts()[2] == 200
Example #2
0
def test_balanced_classification_errors():
    """Each invalid constructor argument must raise ValueError with the matching message."""
    # (constructor kwargs, regex the error message must match), checked in order
    invalid_cases = [
        ({"balanced_ratio": -1}, "balanced_ratio must be"),
        ({"min_samples": 0}, "min_sample must be"),
        ({"min_percentage": 0}, "min_percentage must be"),
        ({"min_percentage": 0.6}, "min_percentage must be"),
        ({"min_percentage": -1.3}, "min_percentage must be"),
    ]
    for bad_kwargs, message in invalid_cases:
        with pytest.raises(ValueError, match=message):
            BalancedClassificationSampler(**bad_kwargs)
Example #3
0
def test_data_splitters_data_type(data_type, balanced_splitter, data_splitter,
                                  make_data_type, X_y_binary):
    """Compare a balanced splitter against split-then-resample done by hand.

    The reference result is built from ``data_splitter`` plus a standalone
    sampler on raw arrays; ``balanced_splitter`` is then run on the same data
    converted via ``make_data_type`` and must yield splits of the same sizes.
    """
    X, y = X_y_binary
    # duplicate every row and label the copies 0 to make the data imbalanced
    X_imbalanced = np.append(X, X, 0)
    y_imbalanced = np.append(y, np.array([0] * len(y)), 0)
    reference_sampler = BalancedClassificationSampler(sampling_ratio=1,
                                                      min_samples=50,
                                                      random_seed=0)

    # reference: resample only the training portion of each split
    expected_splits = [
        [
            reference_sampler.fit_resample(X_imbalanced[train_idx],
                                           y_imbalanced[train_idx]),
            test_idx,
        ]
        for train_idx, test_idx in data_splitter.split(X_imbalanced, y_imbalanced)
    ]
    expected_indices = reference_sampler.fit_resample(X_imbalanced, y_imbalanced)

    X_typed = make_data_type(data_type, X_imbalanced)
    y_typed = make_data_type(data_type, y_imbalanced)
    for fold, (train, test) in enumerate(balanced_splitter.split(X_typed, y_typed)):
        expected_train, expected_test = expected_splits[fold]
        assert len(train) == len(expected_train)
        assert len(test) == len(expected_test)
        # some rows must have been dropped from the training portion
        assert len(train) + len(test) < len(X_typed)

    final_indices = balanced_splitter.transform_sample(X_typed, y_typed)
    assert len(final_indices) == len(expected_indices)
    assert len(final_indices) <= 50 * 2
    assert isinstance(final_indices, list)
Example #4
0
def test_classification_data_frame_dtypes():
    """Resample a frame mixing several column dtypes, with and without a missing value.

    The same 900/100 target is resampled twice: once on the frame as built,
    and once after a missing value is placed in the ``integers`` column. Both
    runs must keep 400 majority rows and all 100 minority rows.
    """
    n_rows = 1000
    # real timestamps: the previous `2012 / 1 / 2` literals were float divisions
    date_choices = [
        pd.Timestamp("2012-01-02"),
        pd.Timestamp("2012-02-01"),
        pd.Timestamp("2012-04-02"),
    ]
    X = pd.DataFrame({
        "integers": list(range(n_rows)),
        "strings": [f"string_{i % 3}" for i in range(n_rows)],
        "text": [
            f"this should be text data because {i} think it's a long string. Let's hope it behaves in that way"
            for i in range(n_rows)
        ],
        "float": [i / 10000 for i in range(n_rows)],
        "bool": [bool(i % 2) for i in range(n_rows)],
        "datetime": [random.choice(date_choices) for _ in range(n_rows)],
    })
    y = pd.Series([0] * 900 + [1] * 100)
    bcs = BalancedClassificationSampler()

    def _assert_downsampled():
        # resample the current X and check the expected 400/100 outcome
        indices = bcs.fit_resample(X, y)
        X2 = X.loc[indices]
        y2 = y.loc[indices]
        assert len(X2) == 500
        assert all(y2.value_counts().values == [400, 100])
        assert y2.value_counts()[1] == 100

    _assert_downsampled()

    # Put a missing value in row 0 by replacing the whole column on the frame.
    # Chained assignment (`X['integers'][0] = None`) is not guaranteed to write
    # back into X and does nothing under pandas Copy-on-Write.
    X["integers"] = X["integers"].mask(X.index == 0)
    _assert_downsampled()
Example #5
0
def test_classification_imbalanced_balanced_ratio(num_classes, balanced_ratio):
    """Check how ``balanced_ratio`` decides between a no-op and downsampling.

    The data is 750/250 for two classes, otherwise 600/200/200, so the
    majority is 3x each minority class.
    """
    features = pd.DataFrame({"a": list(range(1000))})
    if num_classes == 2:
        target = pd.Series([0] * 750 + [1] * 250)
    else:
        target = pd.Series([0] * 600 + [1] * 200 + [2] * 200)

    sampler = BalancedClassificationSampler(balanced_ratio=balanced_ratio)
    kept = sampler.fit_resample(features, target)
    sampled_X = features.loc[kept]
    sampled_y = target.loc[kept]

    if balanced_ratio >= 3:
        # already within the allowed ratio: expect the data back unchanged
        pd.testing.assert_frame_equal(features, sampled_X)
        pd.testing.assert_series_equal(target, sampled_y)
        return

    # majority is expected at balanced_ratio x minority; minority rows are all kept
    expected_total = {
        2: (250 * (balanced_ratio + 1)),
        3: (200 * (balanced_ratio + 2)),
    }[num_classes]
    minority_size = {2: 250, 3: 200}[num_classes]
    assert len(sampled_X) == expected_total
    assert len(sampled_y) == len(sampled_X)
    assert sampled_y.value_counts().values[0] == balanced_ratio * minority_size
def test_balance_ratio_value():
    """With ``sampling_ratio=0.1`` a 200/800 target must come back with all 1000 rows."""
    features = pd.DataFrame({"a": list(range(1000))})
    target = pd.Series([0] * 200 + [1] * 800)
    sampler = BalancedClassificationSampler(sampling_ratio=0.1)
    kept = sampler.fit_resample(features, target)
    # every row is retained, i.e. no resampling took place
    assert len(kept) == 1000
def test_data_splitters_dataset(dataset, balanced_splitter, data_splitter,
                                make_data_type, X_y_binary, X_y_multi):
    """Compare a balanced splitter against split-then-resample on binary or multiclass data.

    The reference result comes from ``data_splitter`` plus a standalone sampler
    on raw arrays; ``balanced_splitter`` is then run on the "ww" conversion of
    the same data and must yield splits of the same sizes.
    """
    is_binary = dataset == 'binary'
    X, y = X_y_binary if is_binary else X_y_multi
    # 0 for the binary fixture, 1 otherwise; used in the final size bound
    extra_classes = 0 if is_binary else 1

    # duplicate every row and label the copies 0 to make the data imbalanced
    X_imbalanced = np.append(X, X, 0)
    y_imbalanced = np.append(y, np.array([0] * len(y)), 0)
    reference_sampler = BalancedClassificationSampler(balanced_ratio=1,
                                                      min_samples=50,
                                                      random_seed=0)

    # reference: resample only the training portion of each split
    expected_splits = [
        [
            reference_sampler.fit_resample(X_imbalanced[train_idx],
                                           y_imbalanced[train_idx]),
            test_idx,
        ]
        for train_idx, test_idx in data_splitter.split(X_imbalanced, y_imbalanced)
    ]
    expected_indices = reference_sampler.fit_resample(X_imbalanced, y_imbalanced)

    # convert through the "ww" data type before using the balanced splitter
    X_typed = make_data_type("ww", X_imbalanced)
    y_typed = make_data_type("ww", y_imbalanced)
    for fold, (train, test) in enumerate(balanced_splitter.split(X_typed, y_typed)):
        expected_train, expected_test = expected_splits[fold]
        assert len(train) == len(expected_train)
        assert len(test) == len(expected_test)
        # some rows must have been dropped from the training portion
        assert len(train) + len(test) < len(X_typed)

    final_indices = balanced_splitter.transform_sample(X_typed, y_typed)
    assert len(final_indices) == len(expected_indices)
    assert len(final_indices) <= 50 * (extra_classes + 2)
    assert isinstance(final_indices, list)
Example #8
0
def test_classification_balanced_simple(num_classes):
    """Evenly cycled class labels must pass through the default sampler unchanged."""
    features = pd.DataFrame({"a": list(range(1000))})
    # labels cycle 0, 1, ..., num_classes - 1 across the rows
    target = pd.Series([row % num_classes for row in range(1000)])
    kept = BalancedClassificationSampler().fit_resample(features, target)
    sampled_X, sampled_y = features.loc[kept], target.loc[kept]
    pd.testing.assert_frame_equal(features, sampled_X)
    pd.testing.assert_series_equal(target, sampled_y)
Example #9
0
def test_balanced_classification_init(ratio, samples, percentage, seed):
    """Constructor arguments must be exposed unchanged as same-named attributes."""
    sampler = BalancedClassificationSampler(
        balanced_ratio=ratio,
        min_samples=samples,
        min_percentage=percentage,
        random_seed=seed,
    )
    assert sampler.balanced_ratio == ratio
    assert sampler.min_samples == samples
    assert sampler.min_percentage == percentage
    assert sampler.random_seed == seed
def test_sampler_ratio_dictionary_multiclass(sampling_ratio_dict, expected):
    """Per-class counts after resampling a 200/800/200 target must equal ``expected``."""
    features = pd.DataFrame({"a": list(range(1200))})
    target = pd.Series([0] * 200 + [1] * 800 + [2] * 200)
    sampler = BalancedClassificationSampler(
        sampling_ratio_dict=sampling_ratio_dict)
    kept = sampler.fit_resample(features, target)
    # the returned indices are applied positionally here
    observed_counts = target.iloc[kept].value_counts().to_dict()
    assert observed_counts == expected
Example #11
0
def test_classification_severely_imbalanced_multiclass_simple():
    """Two very rare classes: the default sampler must return the data unchanged."""
    features = pd.DataFrame({"a": list(range(1000))})
    # rows at multiples of 55 get class 1 (even row, 10 rows) or class 2
    # (odd row, 9 rows); every other row is class 0
    target = pd.Series(
        [(1 + row % 2) if row % 55 == 0 else 0 for row in range(1000)])
    kept = BalancedClassificationSampler().fit_resample(features, target)
    sampled_X, sampled_y = features.loc[kept], target.loc[kept]
    pd.testing.assert_frame_equal(features, sampled_X)
    pd.testing.assert_series_equal(target, sampled_y)
def test_dict_overrides_ratio():
    """When both are given, the resulting counts must follow ``sampling_ratio_dict``."""
    features = pd.DataFrame({"a": list(range(1000))})
    target = pd.Series([0] * 200 + [1] * 800)
    requested_counts = {0: 200, 1: 800}
    sampler = BalancedClassificationSampler(
        sampling_ratio=0.1, sampling_ratio_dict=requested_counts)
    kept = sampler.fit_resample(features, target)
    # the returned indices are applied positionally here
    observed_counts = target.iloc[kept].value_counts().to_dict()
    assert observed_counts == requested_counts
Example #13
0
def test_classification_severely_imbalanced_binary_simple():
    """One very rare class: the default sampler must return the data unchanged."""
    features = pd.DataFrame({"a": list(range(1000))})
    # only rows 0, 200, 400, 600 and 800 are class 0; the other 995 rows are class 1
    target = pd.Series([0 if row % 200 == 0 else 1 for row in range(1000)])
    kept = BalancedClassificationSampler().fit_resample(features, target)
    sampled_X, sampled_y = features.loc[kept], target.loc[kept]
    pd.testing.assert_frame_equal(features, sampled_X)
    pd.testing.assert_series_equal(target, sampled_y)
def test_classification_imbalanced_custom_indices(index):
    """Resampling must return labels usable with ``.loc`` on a custom index."""
    features = pd.DataFrame({"a": list(range(1000))}, index=index)
    target = pd.Series([0] * 900 + [1] * 100, index=index)
    kept = BalancedClassificationSampler().fit_resample(features, target)
    sampled_X = features.loc[kept]
    sampled_y = target.loc[kept]

    assert len(sampled_X) == 500
    assert all(sampled_y.value_counts(normalize=False).values == [400, 100])
    # features and target stay aligned label-for-label
    assert all(sampled_y.index.values == sampled_X.index.values)
    # the number of distinct sampled labels present in the original index
    # must equal the number of sampled rows
    shared_labels = set(sampled_y.index.values) & set(target.index.values)
    assert len(shared_labels) == len(sampled_y)
def test_classification_imbalanced_random_seed(random_seed, sampling_ratio):
    """Two samplers built with the same seed and ratio must select identical rows."""
    features = pd.DataFrame({"a": list(range(1000))})
    target = pd.Series([0] * 800 + [1] * 200)

    # build both samplers before fitting either one
    first, second = [
        BalancedClassificationSampler(sampling_ratio=sampling_ratio, random_seed=random_seed)
        for _ in range(2)
    ]
    kept_first = first.fit_resample(features, target)
    X1, y1 = features.loc[kept_first], target.loc[kept_first]
    kept_second = second.fit_resample(features, target)
    X2, y2 = features.loc[kept_second], target.loc[kept_second]

    if sampling_ratio <= 0.25:
        # 200/800 already satisfies the ratio: expect the data back unchanged
        pd.testing.assert_frame_equal(X1, features)
    else:
        expected_majority = int(200 / sampling_ratio)
        assert len(X2) == 200 + expected_majority
        assert y2.value_counts().values[0] == expected_majority
    pd.testing.assert_frame_equal(X1, X2)
    pd.testing.assert_series_equal(y1, y2)
Example #16
0
def test_classification_data_drop():
    """Only classes larger than the target size may lose rows.

    Class sizes are 90, 100, 120, 40 and 70. The expected total of 400 rows
    corresponds to class 2 being cut from 120 to 100 while every other class
    keeps all of its rows.
    """
    features = pd.DataFrame({"a": list(range(420))})
    target = pd.Series([0] * 90 + [1] * 100 + [2] * 120 + [3] * 40 + [4] * 70)
    sampler = BalancedClassificationSampler(balanced_ratio=1, min_percentage=0.01)
    kept = sampler.fit_resample(features, target)
    sampled_X = features.loc[kept]
    sampled_y = target.loc[kept]
    assert len(sampled_X) == 400
    # the most frequent class after resampling has 100 rows
    assert sampled_y.value_counts().values[0] == 100
Example #17
0
def test_classification_imbalanced_data_type(data_type, make_data_type):
    """A 900/100 target must be reduced to 500 rows for every input data type."""
    features = make_data_type(data_type, pd.DataFrame({"a": list(range(1000))}))
    target = make_data_type(data_type, pd.Series([0] * 900 + [1] * 100))

    kept = BalancedClassificationSampler().fit_resample(features, target)
    assert len(kept) == 500
    if data_type in ('pd', 'np'):
        # NOTE(review): this relies on `target` still exposing `.loc` for 'np';
        # confirm against the make_data_type fixture
        sampled_y = target.loc[kept]
        assert all(sampled_y.value_counts().values == [400, 100])
        assert sampled_y.value_counts()[1] == 100
def test_classification_imbalanced_severe_imbalance_binary(min_samples, min_percentage):
    """Check the severe-imbalance cut-off on an 850/150 binary target."""
    features = pd.DataFrame({"a": list(range(1000))})
    # the minority class is 15% of the rows
    target = pd.Series([0] * 850 + [1] * 150)
    sampler = BalancedClassificationSampler(sampling_ratio=0.5, min_samples=min_samples, min_percentage=min_percentage)
    kept = sampler.fit_resample(features, target)
    sampled_X = features.loc[kept]
    sampled_y = target.loc[kept]

    if min_samples >= 200 and min_percentage >= 0.2:
        # treated as severe imbalance: expect the data back unchanged
        pd.testing.assert_frame_equal(sampled_X, features)
    else:
        # majority is expected at 2x the minority, but never below min_samples
        expected_majority = max(min_samples, 2 * 150)
        assert len(sampled_X) == 150 + expected_majority
        assert sampled_y.value_counts().values[0] == expected_majority
Example #19
0
def test_classification_imbalanced_small_dataset(size):
    """Check resampling of small 80/20 datasets with default and lowered ``min_samples``.

    The minority count is computed once as an int and reused in the expected
    sizes, so ``len()`` is compared against exactly what was used to build
    ``y`` rather than against a float expression.
    """
    majority_size = int(0.8 * size)
    minority_size = int(0.2 * size)
    X = pd.DataFrame({"a": list(range(size))})
    y = pd.Series([0] * majority_size + [1] * minority_size)

    # default min_samples
    bcs = BalancedClassificationSampler(balanced_ratio=1)
    indices = bcs.fit_resample(X, y)
    X2 = X.loc[indices]
    if size == 100:
        # expect the data back unchanged at this size
        pd.testing.assert_frame_equal(X2, X)
    else:
        assert len(X2) == minority_size + 100

    # lowered min_samples
    bcs2 = BalancedClassificationSampler(balanced_ratio=1, min_samples=40)
    indices = bcs2.fit_resample(X, y)
    X2 = X.loc[indices]
    y2 = y.loc[indices]
    if size == 500:
        # 100 minority rows plus a majority reduced to 100
        assert len(X2) == 200
        assert y2.value_counts(normalize=True).values[0] == 0.5
    else:
        assert len(X2) == minority_size + 40
        assert y2.value_counts().values[0] == 40
def test_classification_imbalanced_normal_imbalance_binary(min_samples, sampling_ratio):
    """Check ratio-driven downsampling on an 850/150 binary target."""
    features = pd.DataFrame({"a": list(range(1000))})
    # the minority class is 15% of the rows
    target = pd.Series([0] * 850 + [1] * 150)
    sampler = BalancedClassificationSampler(sampling_ratio=sampling_ratio, min_samples=min_samples)
    kept = sampler.fit_resample(features, target)
    sampled_X = features.loc[kept]
    sampled_y = target.loc[kept]

    if sampling_ratio < 0.2:
        # the data already satisfies the ratio: expect it back unchanged
        pd.testing.assert_frame_equal(sampled_X, features)
    else:
        # majority size follows the ratio, but never drops below min_samples
        expected_majority = max(min_samples, int(150 / sampling_ratio))
        assert len(sampled_X) == 150 + expected_majority
        assert sampled_y.value_counts().values[0] == expected_majority
def test_classification_imbalanced_min_percentage(num_classes, min_percentage):
    """Check how ``min_percentage`` switches between downsampling and a no-op."""
    features = pd.DataFrame({"a": list(range(1000))})
    if num_classes == 2:
        target = pd.Series([0] * 950 + [1] * 50)
    else:
        target = pd.Series([0] * 820 + [1] * 90 + [2] * 90)
    sampler = BalancedClassificationSampler(sampling_ratio=1, min_percentage=min_percentage)
    kept = sampler.fit_resample(features, target)
    sampled_X = features.loc[kept]
    sampled_y = target.loc[kept]

    if min_percentage > 0.05:
        # treated as severe imbalance: expect the data back unchanged
        pd.testing.assert_frame_equal(sampled_X, features)
    else:
        # majority is expected at 100 rows; minority rows are all kept
        expected_total = {2: 150, 3: 280}[num_classes]
        assert len(sampled_X) == expected_total
        assert sampled_y.value_counts().values[0] == 100
def test_classification_imbalanced_normal_imbalance_multiclass(data_type, min_samples, sampling_ratio):
    """Check ratio-driven downsampling on an 800/100/100 target with int or string labels."""
    features = pd.DataFrame({"a": list(range(1000))})
    # each minority class is 10% of the rows; 'n' selects integer labels
    if data_type == 'n':
        labels = [0] * 800 + [1] * 100 + [2] * 100
    else:
        labels = ["class_1"] * 800 + ["class_2"] * 100 + ["class_3"] * 100
    target = pd.Series(labels)

    sampler = BalancedClassificationSampler(sampling_ratio=sampling_ratio, min_samples=min_samples)
    kept = sampler.fit_resample(features, target)
    sampled_X = features.loc[kept]
    sampled_y = target.loc[kept]

    if sampling_ratio < 0.2:
        # the data already satisfies the ratio: expect it back unchanged
        pd.testing.assert_frame_equal(sampled_X, features)
    else:
        # majority size follows the ratio, but never drops below min_samples
        expected_majority = max(min_samples, int(100 / sampling_ratio))
        assert len(sampled_X) == 200 + expected_majority
        assert sampled_y.value_counts().values[0] == expected_majority
def test_classification_imbalanced_min_samples(num_classes, min_samples):
    """Check that ``min_samples`` bounds the majority size when balancing 1:1."""
    features = pd.DataFrame({"a": list(range(1000))})
    if num_classes == 2:
        target = pd.Series([0] * 900 + [1] * 100)
    else:
        target = pd.Series([0] * 799 + [1] * 101 + [2] * 100)
    sampler = BalancedClassificationSampler(sampling_ratio=1, min_samples=min_samples)
    kept = sampler.fit_resample(features, target)
    sampled_X = features.loc[kept]
    sampled_y = target.loc[kept]

    if min_samples <= 100:
        # a 1:1 balance is possible without violating min_samples
        assert len(sampled_X) == {2: 200, 3: 300}[num_classes]
        assert sampled_y.value_counts().values[0] == 100
    elif num_classes == 2:
        # majority is held at min_samples; the 100 minority rows are all kept
        assert len(sampled_X) == min_samples + 100
        assert sampled_y.value_counts().values[0] == min_samples
    else:
        # majority is held at min_samples; the 101 + 100 minority rows are all kept
        assert len(sampled_X) == min_samples + 201
        assert sampled_y.value_counts().values[0] == min_samples
Example #24
0
def test_classification_imbalanced_severe_imbalance_multiclass(
        data_type, min_samples, min_percentage):
    """Check the severe-imbalance cut-off on an 800/100/100 target with int or string labels."""
    features = pd.DataFrame({"a": list(range(1000))})
    # each minority class is 10% of the rows; 'n' selects integer labels
    if data_type == 'n':
        labels = [0] * 800 + [1] * 100 + [2] * 100
    else:
        labels = ["class_1"] * 800 + ["class_2"] * 100 + ["class_3"] * 100
    target = pd.Series(labels)

    sampler = BalancedClassificationSampler(balanced_ratio=2,
                                            min_samples=min_samples,
                                            min_percentage=min_percentage)
    kept = sampler.fit_resample(features, target)
    sampled_X = features.loc[kept]
    sampled_y = target.loc[kept]

    if min_samples >= 200 and min_percentage >= 0.2:
        # treated as severe imbalance: expect the data back unchanged
        pd.testing.assert_frame_equal(sampled_X, features)
    else:
        # majority is expected at 2x a minority class, but never below min_samples
        expected_majority = max(min_samples, 2 * 100)
        assert len(sampled_X) == 200 + expected_majority
        assert sampled_y.value_counts().values[0] == expected_majority