Ejemplo n.º 1
0
def test_fit_returns_none():
    """``NaFilter.fit`` is expected to return ``None``."""
    frame = ks.DataFrame({'A': [0, np.nan, np.nan], 'B': [0, 0, np.nan]})

    result = NaFilter().fit(frame)

    assert result is None
Ejemplo n.º 2
0
def test_fit_sets_correct_columns_to_drop():
    """After fitting, only column ``A`` should be marked for removal.

    ``A`` contains two NaNs out of three values and ``B`` one out of three;
    the filter is configured with ``max_na_ratio=0.5``.
    """
    data = {'A': [0, np.nan, np.nan], 'B': [0, 0, np.nan]}
    fitted_filter = NaFilter(max_na_ratio=0.5)

    fitted_filter.fit(ks.DataFrame(data))

    expected_columns = ['A']
    assert fitted_filter.columns_to_drop == expected_columns
Ejemplo n.º 3
0
def test_transform():
    """``transform`` should yield a frame containing only column ``B``.

    ``columns_to_drop`` is assigned by hand (``fit`` is not called), and the
    outcome is compared to the expected frame through ``repr``.
    """
    input_df = ks.DataFrame({'A': [0, np.nan, np.nan], 'B': [0, 0, np.nan]})

    na_filter = NaFilter(max_na_ratio=0.5)
    na_filter.columns_to_drop = ['A']
    transformed = na_filter.transform(input_df)

    actual_repr = repr(transformed)
    expected_repr = repr(ks.DataFrame({'B': [0, 0, np.nan]}))
    assert actual_repr == expected_repr
Ejemplo n.º 4
0
def test_sparse_data():
    """``fit`` should pick the right columns when given sparse pandas data.

    Column ``A`` holds 20 NaNs in 30 rows and ``B`` holds 10 NaNs in 30 rows;
    with ``max_na_ratio=0.5`` only ``A`` is expected in ``columns_to_drop``.
    """
    dense_df = pd.DataFrame({
        'A': [0] * 10 + [np.nan] * 20,
        'B': [0] * 20 + [np.nan] * 10,
    })
    # DataFrame.to_sparse() was deprecated in pandas 0.25 and removed in 1.0.
    # Casting to a sparse extension dtype with NaN as the fill value is the
    # documented replacement and keeps NaN as the "missing" marker.
    sdf = dense_df.astype(pd.SparseDtype(float, np.nan))

    na_filter = NaFilter(max_na_ratio=0.5)
    na_filter.fit(sdf)

    assert na_filter.columns_to_drop == ['A']
Ejemplo n.º 5
0
def test_sample_ratio():
    """Fitting on a 50% sample should give seed-dependent results.

    The same single-column frame is fitted twice with ``sample_ratio=0.5``
    and different seeds; each seed is paired with the ``columns_to_drop``
    value expected for it.
    """
    train_df = pd.DataFrame({'A': [0, np.nan, np.nan]})

    cases = (
        # Seed chosen to get values [0, NaN] resulting in an na_ratio of 0.5
        (1, []),
        # Seed chosen to get values [NaN, NaN] resulting in an na_ratio of 1.0
        (2, ['A']),
    )
    fitted = []
    for seed, expected in cases:
        sampler = NaFilter(max_na_ratio=0.5, sample_ratio=0.5, seed=seed)
        sampler.fit(train_df)
        fitted.append((sampler, expected))

    for sampler, expected in fitted:
        assert sampler.columns_to_drop == expected
def _build_test_pipeline():
    """Return a two-step ``Pipeline``: an NA filter, then a variance filter.

    The steps are named ``'na_filter'`` (``NaFilter(max_na_ratio=0.5)``) and
    ``'variance_filter'`` (``VarianceFilter()`` with its defaults).
    """
    steps = [
        ('na_filter', NaFilter(max_na_ratio=0.5)),
        ('variance_filter', VarianceFilter()),
    ]
    return Pipeline(steps)