Example #1
def test_fit_returns_none():
    train_df = pd.DataFrame({'A': [0., 1.], 'B': [0., 0.]})

    variance_filter = VarianceFilter()
    return_value = variance_filter.fit(train_df)

    assert return_value is None
Example #2
def test_fit_transform_continuous():
    train_df = pd.DataFrame({'A': [0., 1.], 'B': [0., 0.]})

    variance_filter = VarianceFilter()
    train_df = variance_filter.fit_transform(train_df)

    assert train_df.equals(pd.DataFrame({'A': [0., 1.]}))
Example #3
def test_fit_sets_correct_columns_to_drop():
    train_df = pd.DataFrame({'A': [0., 1.], 'B': [0., 0.]})

    variance_filter = VarianceFilter()
    variance_filter.fit(train_df)

    assert variance_filter.columns_to_drop == ['B']
Example #4
def test_transform():
    test_df = pd.DataFrame({'A': [0., 0.], 'B': [0., 1.]})

    variance_filter = VarianceFilter()
    variance_filter.columns_to_drop = ['B']
    test_df = variance_filter.transform(test_df)

    assert test_df.equals(pd.DataFrame({'A': [0., 0.]}))
Example #5
def test_sparse_data():
    # Create a sparse DataFrame where column 'A' has zero variance
    sdf = pd.DataFrame({'A': [0] * 20,
                        'B': [0] * 10 + [1] * 10}).astype(pd.SparseDtype(int, fill_value=0))

    variance_filter = VarianceFilter()
    variance_filter.fit(sdf)

    assert variance_filter.columns_to_drop == ['A']
Example #6
def test_remove_min_variance_for_single_valued_variables():
    "Make sure it does not crash for variables with only one value"
    train_df = pd.DataFrame({'A': ['a'] * 100})

    variance_filter = VarianceFilter()
    train_df = variance_filter.fit_transform(train_df)

    # Column 'A' has only one value, so it is dropped and an empty
    # (100 rows, 0 columns) frame remains
    assert np.array_equal(train_df.values, np.empty((100, 0)))
Example #7
def test_remove_min_variance_for_categorical():
    train_df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'a', 'a']})
    test_df = pd.DataFrame({'A': ['a', 'a', 'b'], 'B': ['a', 'b', 'c']})

    variance_filter = VarianceFilter(unique_cut=50)
    train_df = variance_filter.fit_transform(train_df)
    test_df = variance_filter.transform(test_df)

    # Make sure column 'B' is dropped for both train and test set
    # Also, column 'A' must not be dropped for the test set even though its
    # variance in the test set is below the threshold
    assert train_df.equals(pd.DataFrame({'A': ['a', 'b', 'c']}))
    assert test_df.equals(pd.DataFrame({'A': ['a', 'a', 'b']}))
Example #8
def test_sample_ratio():
    train_df = pd.DataFrame({'A': [0, 0, 1]})

    # Set seed to consider a sample of [0, 1]
    variance_filter_1 = VarianceFilter(sample_ratio=0.7, seed=1)
    variance_filter_1.fit(train_df)
    # Set seed to consider a sample of [1, 1]
    variance_filter_2 = VarianceFilter(sample_ratio=0.7, seed=3)
    variance_filter_2.fit(train_df)

    assert variance_filter_1.columns_to_drop == []
    assert variance_filter_2.columns_to_drop == ['A']
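The examples above exercise a VarianceFilter that exposes fit, transform and fit_transform methods, a columns_to_drop attribute, and unique_cut, sample_ratio and seed parameters. The sketch below reconstructs a minimal filter from the behaviour the tests assert; the default unique_cut value, the sampling scheme and every other implementation detail are assumptions, not the project's actual code.

import pandas as pd


class VarianceFilter:
    """Minimal sketch of a zero-variance column filter (assumed behaviour)."""

    def __init__(self, unique_cut=10, sample_ratio=None, seed=None):
        # unique_cut: minimum percentage of distinct values a categorical
        # column needs in order to be kept (the default of 10 is an assumption).
        self.unique_cut = unique_cut
        self.sample_ratio = sample_ratio
        self.seed = seed
        self.columns_to_drop = []

    def fit(self, df, y=None):
        # Optionally fit on a row sample; the exact sampling scheme used by
        # the real project is unknown, so this one is an assumption.
        if self.sample_ratio is not None:
            df = df.sample(frac=self.sample_ratio, replace=True,
                           random_state=self.seed)

        self.columns_to_drop = []
        for column in df.columns:
            series = df[column]
            if pd.api.types.is_numeric_dtype(series):
                # Numeric (including sparse numeric): drop zero-variance columns.
                if series.var() == 0:
                    self.columns_to_drop.append(column)
            else:
                # Categorical: drop columns whose share of distinct values is
                # below unique_cut percent.
                unique_pct = series.nunique() / len(series) * 100
                if unique_pct < self.unique_cut:
                    self.columns_to_drop.append(column)
        # fit() deliberately returns None (see Example #1).

    def transform(self, df):
        # Drop only the columns identified during fit(); the variance of the
        # frame being transformed is ignored (see Example #7).
        return df.drop(columns=self.columns_to_drop)

    def fit_transform(self, df, y=None):
        self.fit(df)
        return self.transform(df)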
def _build_test_pipeline():
    pipeline = Pipeline([
        ('na_filter', NaFilter(max_na_ratio=0.5)),
        ('variance_filter', VarianceFilter())
    ])
    return pipeline
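A possible end-to-end check for _build_test_pipeline is sketched below. It assumes NaFilter drops columns whose share of missing values exceeds max_na_ratio; that behaviour is inferred from the name and parameter only, not from the source.

import numpy as np
import pandas as pd


def test_pipeline_drops_na_and_constant_columns():
    # Hypothetical check; assumes NaFilter drops columns whose missing-value
    # ratio exceeds max_na_ratio.
    df = pd.DataFrame({
        'mostly_nan': [np.nan, np.nan, np.nan, 1.],  # 75% missing
        'constant': [1., 1., 1., 1.],                # zero variance
        'useful': [0., 1., 2., 3.],
    })

    pipeline = _build_test_pipeline()
    df = pipeline.fit_transform(df)

    assert list(df.columns) == ['useful']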