Example #1
0
def test_duplicate_columns():
    """``clean`` and ``detect_types`` both reject duplicated column names."""
    frame = pd.DataFrame([[0, 1]], columns=['a', 'a'])
    # Each entry point has to raise the error on its own.
    for func in (clean, detect_types):
        with pytest.raises(ValueError, match="Duplicate Columns"):
            func(frame)
Example #2
0
def test_dirty_float_target_regression():
    """A dirty-float target makes both ``clean`` and ``plot`` warn."""
    titanic_data = load_titanic()
    data = pd.DataFrame({'one': np.repeat(np.arange(50), 2)})
    data['target'] = make_dirty_float()

    expected_warning = ("Discarding dirty_float targets that "
                        "cannot be converted to float.")
    for func in (clean, plot):
        with pytest.warns(UserWarning, match=expected_warning):
            func(data, target_col="target")

    # plotting must also work when the target is not a dirty float
    plot(titanic_data, 'survived')
Example #3
0
def test_dirty_float_single_warning():
    """Cleaning a single dirty-float column records exactly one warning."""
    with warnings.catch_warnings(record=True) as caught:
        # do not let the warning filters swallow repeated warnings
        warnings.simplefilter('always')

        random_state = np.random.RandomState(0)
        float_strings = [
            "{:2.2f}".format(value)
            for value in random_state.uniform(size=100)
        ]

        column = pd.Series(float_strings)
        # every 20th entry becomes a tuple instead of a float string
        column[::20] = [("missing", "but weird")] * 5

        clean(pd.DataFrame({'dirty3': column}))

        assert len(caught) == 1
Example #4
0
def test_titanic_feature_names():
    """Check the feature names EasyPreprocessor reports on titanic data."""
    csv_path = os.path.join(os.path.dirname(__file__),
                            '../datasets/titanic.csv')
    titanic = pd.read_csv(csv_path)

    ep = EasyPreprocessor()
    cleaned_features = clean(titanic.drop('survived', axis=1))
    ep.fit(cleaned_features)

    # grouped by originating column; the order is part of the expectation
    expected_names = [
        'sibsp', 'parch',
        'age_dabl_continuous', 'fare_dabl_continuous', 'body_dabl_continuous',
        'pclass_1', 'pclass_2', 'pclass_3',
        'sex_female', 'sex_male',
        'sibsp_0', 'sibsp_1', 'sibsp_2', 'sibsp_3', 'sibsp_4', 'sibsp_5',
        'sibsp_8',
        'parch_0', 'parch_1', 'parch_2', 'parch_3', 'parch_4', 'parch_5',
        'parch_6', 'parch_9',
        'embarked_?', 'embarked_C', 'embarked_Q', 'embarked_S',
        'boat_1', 'boat_10', 'boat_11', 'boat_12', 'boat_13', 'boat_13 15',
        'boat_13 15 B', 'boat_14', 'boat_15', 'boat_15 16', 'boat_16',
        'boat_2', 'boat_3', 'boat_4', 'boat_5', 'boat_5 7', 'boat_5 9',
        'boat_6', 'boat_7', 'boat_8', 'boat_8 10', 'boat_9', 'boat_?',
        'boat_A', 'boat_B', 'boat_C', 'boat_C D', 'boat_D',
        'age_?_0.0', 'age_?_1.0', 'body_?_0.0', 'body_?_1.0',
    ]
    assert ep.get_feature_names() == expected_names

    # without clean
    raw_features = titanic.drop('survived', axis=1)
    X = ep.fit_transform(raw_features)
    # FIXME can't do that yet
    # assert ep.get_feature_names() == expected_names_no_clean

    # the transformed output must not contain any NaN
    assert not np.isnan(X).any()
Example #5
0
def test_continuous_castable():
    """Ints mixed with numeric strings are continuous and clean to float64."""
    frame = pd.DataFrame({'a': [1, 2, 3, '1', 2, 3, '1.1']})

    detected = detect_types(frame)
    assert detected.continuous['a']

    cleaned = clean(frame)
    column_dtype = cleaned.dtypes['a']
    assert column_dtype == np.float64
Example #6
0
def test_duplicate_index():
    """A non-unique index is an error for ``detect_types``.

    ``clean`` is expected to warn instead and to return a frame whose
    index is unique.
    """
    X = X_cat.copy()
    # ``np.int`` was removed in NumPy 1.24; it was only an alias of the
    # builtin ``int``, so the resulting dtype is unchanged.
    X.index = np.ones(len(X), int)
    assert not X.index.is_unique
    with pytest.raises(ValueError):
        detect_types(X)
    with pytest.warns(UserWarning):
        X_clean = clean(X)
    assert X_clean.index.is_unique
Example #7
0
def test_easy_preprocessor_transform():
    """Smoke test: EasyPreprocessor in a pipeline fits and predicts."""
    titanic_clean = clean(load_titanic())
    X = titanic_clean.drop("survived", axis=1)
    y = titanic_clean.survived
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, stratify=y, random_state=42)

    model = make_pipeline(EasyPreprocessor(), LogisticRegression(C=0.1))
    model.fit(X_train, y_train)
    # predictions are only checked for not raising
    for split in (X_train, X_val):
        model.predict(split)
Example #8
0
def test_digits_type_hints():
    """Hinting every digits feature as continuous keeps all 65 columns."""
    data_bunch = load_digits()

    # fall back to generated names when the bunch carries none
    if hasattr(data_bunch, 'feature_names'):
        feature_names = data_bunch.feature_names
    else:
        n_features = data_bunch.data.shape[1]
        feature_names = ['x%d' % i for i in range(n_features)]

    data = data_df_from_bunch(data_bunch)
    hints = dict.fromkeys(feature_names, 'continuous')
    data_clean = clean(data, type_hints=hints)
    assert data_clean.shape[1] == 65
Example #9
0
def test_detect_string_floats():
    """Detect float columns that are stored as strings or partly corrupted.

    Builds one clean column of float strings plus several columns with
    non-float entries mixed in, then checks the ``continuous``,
    ``dirty_float`` and ``free_string`` flags reported by ``detect_types``
    and the per-element result of ``_float_matching``.
    """
    # test if we can find floats that are encoded as strings
    # sometimes they have weird missing values
    rng = np.random.RandomState(0)
    cont_clean = ["{:2.2f}".format(x) for x in rng.uniform(size=100)]
    dirty = pd.Series(cont_clean)
    # not strings, but actually numbers!
    # Cast to object up front: writing strings into a float64 Series relies
    # on an implicit upcast that pandas deprecates (PDEP-6). The resulting
    # values are the same as with the implicit upcast.
    dirty2 = pd.Series(rng.uniform(size=100)).astype(object)
    # FIXME this wouldn't work with using straight floats
    dirty3 = pd.Series(cont_clean)
    too_dirty = pd.Series(rng.uniform(size=100)).astype(object)

    # FIXME hardcoded frequency of tolerated missing
    # FIXME add test with integers
    # FIXME whitespace?
    dirty[::12] = "missing"
    dirty2[::12] = "missing"
    dirty3[::20] = [("missing", "but weird")] * 5
    too_dirty[::2] = rng.choice(list(string.ascii_letters), size=50)
    # only dirty:
    res = detect_types(pd.DataFrame(dirty))
    assert len(res) == 1
    assert res.dirty_float[0]

    # dirty and clean and weird stuff
    X = pd.DataFrame({
        'cont': cont_clean,
        'dirty': dirty,
        'dirty2': dirty2,
        'dirty3': dirty3,
        'too_dirty': too_dirty
    })
    res = detect_types(X)
    assert len(res) == 5
    # ``not`` instead of ``~``: on a plain Python bool ``~`` yields -1 or -2,
    # both truthy, which would make these assertions impossible to fail.
    assert res.continuous['cont']
    assert not res.continuous['dirty']
    assert not res.continuous['dirty2']
    assert not res.continuous['dirty3']
    assert not res.dirty_float['cont']
    assert res.dirty_float['dirty']
    assert res.dirty_float['dirty2']
    assert res.dirty_float['dirty3']
    assert not res.dirty_float['too_dirty']
    assert not res.free_string['dirty3']
    assert res.free_string['too_dirty']

    assert _float_matching(X.cont).all()
    # rows of ``dirty``/``dirty2`` that were not overwritten with "missing"
    is_float = X.dirty != 'missing'
    assert (_float_matching(X.dirty) == is_float).all()
    assert (_float_matching(X.dirty2) == is_float).all()
    # in ``dirty3`` the non-float entries are tuples, not strings
    assert (_float_matching(X.dirty3) == (X.dirty3.map(type) == str)).all()
    # cleaning the mixed frame is only checked for not raising
    res = clean(X)
Example #10
0
def test_titanic_feature_names():
    """Check the feature names EasyPreprocessor reports on cleaned titanic."""
    csv_path = os.path.join(os.path.dirname(__file__),
                            '../datasets/titanic.csv')
    titanic = pd.read_csv(csv_path)

    ep = EasyPreprocessor()
    cleaned_features = clean(titanic.drop('survived', axis=1))
    ep.fit(cleaned_features)

    # grouped by originating column; the order is part of the expectation
    expected_names = [
        'age_dabl_continuous', 'body_dabl_continuous', 'fare_dabl_continuous',
        'age_?_0.0', 'age_?_1.0', 'body_?_0.0', 'body_?_1.0',
        'pclass_1', 'pclass_2', 'pclass_3',
        'sex_female', 'sex_male',
        'embarked_?', 'embarked_C', 'embarked_Q', 'embarked_S',
        'boat_1', 'boat_10', 'boat_11', 'boat_12', 'boat_13', 'boat_13 15',
        'boat_13 15 B', 'boat_14', 'boat_15', 'boat_15 16', 'boat_16',
        'boat_2', 'boat_3', 'boat_4', 'boat_5', 'boat_5 7', 'boat_5 9',
        'boat_6', 'boat_7', 'boat_8', 'boat_8 10', 'boat_9', 'boat_?',
        'boat_A', 'boat_B', 'boat_C', 'boat_C D', 'boat_D',
    ]
    assert ep.get_feature_names() == expected_names
Example #11
0
def test_type_hints(type_hints):
    """Type hints are honoured by both ``detect_types`` and ``clean``.

    Parameters
    ----------
    type_hints : dict
        Maps a column name to its hinted type. Supplied from outside this
        function (presumably a pytest parametrization -- not visible here).
    """
    X = pd.DataFrame({'a': [0, 1, 0, 1, 0],
                      'b': [0.1, 0.2, 0.3, 0.1, 0.1],
                      'c': ['a', 'b', 'a', 'b', 'a']})
    types = detect_types(X, type_hints=type_hints)
    X_clean = clean(X, type_hints=type_hints)

    # dropped a column:
    assert X_clean.shape[1] == 2

    for k, v in type_hints.items():
        # detect_types respects hints
        assert types.T.idxmax()[k] == v
        # conversion successful
        if v == 'continuous':
            # ``np.float`` was removed in NumPy 1.24; it aliased the builtin
            # ``float``, which compares equal to the float64 dtype.
            assert X_clean[k].dtype == np.float64
        elif v == 'categorical':
            assert X_clean[k].dtype == 'category'
Example #12
0
def test_plots_smoke(continuous_features, categorical_features, task):
    """Smoke test for ``plot`` on synthetic continuous/categorical data.

    Parameters
    ----------
    continuous_features : int
        Number of continuous feature columns to generate.
    categorical_features : int
        Number of categorical feature columns to generate.
    task : str
        ``"classification"`` turns the target into a binned categorical;
        any other value leaves it continuous.
    """
    # simple smoke test
    # should be parametrized
    n_samples = 100
    X_cont, y_cont = make_regression(n_samples=n_samples,
                                     n_features=continuous_features,
                                     n_informative=min(continuous_features, 2))
    X_cat, y_cat = make_regression(n_samples=n_samples,
                                   n_features=categorical_features,
                                   n_informative=min(categorical_features, 2))
    if X_cat.shape[1] > 0:
        # discretize so the columns can be turned into categories below
        X_cat = KBinsDiscretizer(encode='ordinal').fit_transform(X_cat)
    cont_columns = ["asdf_%d_cont" % i for i in range(continuous_features)]
    df_cont = pd.DataFrame(X_cont, columns=cont_columns)
    if categorical_features > 0:
        cat_columns = ["asdf_%d_cat" % i for i in range(categorical_features)]
        df_cat = pd.DataFrame(X_cat, columns=cat_columns).astype('int')
        df_cat = df_cat.astype("category")
        X_df = pd.concat([df_cont, df_cat], axis=1)
    else:
        X_df = df_cont
    assert (X_df.shape[1] == continuous_features + categorical_features)
    X_clean = clean(X_df.copy())
    y = y_cont + y_cat
    if X_df.shape[1] == 0:
        # no features at all: use a random target instead
        y = np.random.uniform(size=n_samples)
    if task == "classification":
        # bin the continuous target into classes at fixed percentiles
        y = np.digitize(y, np.percentile(y, [5, 10, 60, 85]))
    X_clean['target'] = y
    if task == "classification":
        X_clean['target'] = X_clean['target'].astype('category')
    types = detect_types(X_clean)
    column_types = types.T.idxmax()
    # ``column_types`` is labelled by column name, so select by position
    # explicitly with ``.iloc``; a bare integer key such as ``[-1]`` relies
    # on a positional fallback that pandas deprecates.
    assert np.all(column_types.iloc[:continuous_features] == 'continuous')
    assert np.all(
        column_types.iloc[continuous_features:-1] == 'categorical')
    if task == "classification":
        assert column_types.iloc[-1] == 'categorical'
    else:
        assert column_types.iloc[-1] == 'continuous'

    plot(X_clean, target_col='target')
    plt.close("all")
Example #13
0
def test_titanic_detection():
    """Check detected column types on titanic, raw, cleaned and with NaN."""
    csv_path = os.path.join(os.path.dirname(__file__),
                            '../datasets/titanic.csv')

    titanic = pd.read_csv(csv_path)
    detected = detect_types(titanic).T.idxmax()
    true_types = [
        'categorical', 'categorical', 'free_string', 'categorical',
        'dirty_float', 'low_card_int', 'low_card_int', 'free_string',
        'dirty_float', 'free_string', 'categorical', 'categorical',
        'dirty_float', 'free_string',
    ]
    assert (detected == true_types).all()

    # types returned by clean must match a fresh detection on its output
    titanic_clean, clean_types = clean(titanic, return_types=True)
    assert (clean_types == detect_types(titanic_clean)).all(axis=None)

    # with '?' parsed as NaN, the dirty floats are expected to be continuous
    titanic_nan = pd.read_csv(csv_path, na_values='?')
    detected_nan = detect_types(titanic_nan).T.idxmax()
    true_types_clean = ['continuous' if t == 'dirty_float' else t
                        for t in true_types]
    assert (detected_nan == true_types_clean).all()
Example #14
0
def test_convert_cat_to_string():
    """A column mixing ints and strings cleans to four categories."""
    frame = pd.DataFrame({'a': [1, 2, 3, '1', 2, 3, 'a']})
    cleaned = clean(frame)
    n_categories = len(cleaned.a.cat.categories)
    assert n_categories == 4
Example #15
0
def test_digits_type_hints():
    """Hinting x0..x63 as continuous keeps all 65 digits columns."""
    data = data_df_from_bunch(load_digits())
    hints = dict.fromkeys(
        ("x{}".format(i) for i in range(64)), 'continuous')
    data_clean = clean(data, type_hints=hints)
    assert data_clean.shape[1] == 65
    assert data_clean.shape[1] == 65