Beispiel #1
0
def test_data_splitters_imbalanced_binary_tv():
    X = pd.DataFrame({
        "a": [i for i in range(1000)],
        "b": [i % 5 for i in range(1000)]
    })
    # make y a 9:1 class ratio
    y = pd.Series([0] * 100 + [1] * 900)
    splitter = BalancedClassificationDataTVSplit()

    for i, (train_indices, test_indices) in enumerate(splitter.split(X, y)):
        assert len(test_indices) == 250  # test_size defaults to 0.25
        # remaining data will still preserve 9:1 ratio, which we want to get to 4:1
        # we don't know the exact number since we don't stratify split
        assert len(train_indices) < 500
        # we can only test the balance of the train since the split isn't stratified
        y_balanced_train = y.iloc[train_indices]
        y_train_counts = y_balanced_train.value_counts(normalize=True)
        assert max(y_train_counts.values) == 0.8
Beispiel #2
0
def test_data_splitters_imbalanced_multiclass_tv():
    X = pd.DataFrame({
        "a": [i for i in range(1500)],
        "b": [i % 5 for i in range(1500)]
    })
    # make y a 8:1:1 class ratio
    y = pd.Series([0] * 150 + [1] * 1200 + [2] * 150)
    splitter = BalancedClassificationDataTVSplit()

    for i, (train_indices, test_indices) in enumerate(splitter.split(X, y)):
        assert len(test_indices) == 375  # test_size defaults to 0.25
        # we don't know the exact number since we don't stratify split
        assert len(train_indices) < 1000
        # we can only test the balance of the train since the split isn't stratified
        y_balanced_train = y.iloc[train_indices]
        y_train_counts = y_balanced_train.value_counts(normalize=True)
        # assert the values are around 2/3 for the majority class
        assert max(y_train_counts.values) < 7 / 10
        assert max(y_train_counts.values) > 6 / 10