Ejemplo n.º 1
0
def test_type_hints(type_hints):
    """Check that ``detect_types`` and ``clean`` honour user-supplied hints.

    Parameters
    ----------
    type_hints : dict
        Mapping of column name to the type name that should be forced for
        that column.  The values are supplied from outside this function
        (presumably a pytest parametrization -- confirm at the decorator).
    """
    X = pd.DataFrame({'a': [0, 1, 0, 1, 0],
                      'b': [0.1, 0.2, 0.3, 0.1, 0.1],
                      'c': ['a', 'b', 'a', 'b', 'a']})
    types = detect_types(X, type_hints=type_hints)
    X_clean = clean(X, type_hints=type_hints)

    # dropped a column:
    assert X_clean.shape[1] == 2

    # For every column, the name of the type with the largest entry.
    detected = types.T.idxmax()
    for k, v in type_hints.items():
        # detect_types respects hints
        assert detected[k] == v
        # conversion successful
        if v == 'continuous':
            # ``np.float`` was only an alias of the builtin ``float`` and has
            # been removed from NumPy, so compare against ``float`` directly.
            assert X_clean[k].dtype == float
        elif v == 'categorical':
            assert X_clean[k].dtype == 'category'
Ejemplo n.º 2
0
def test_plots_smoke(continuous_features, categorical_features, task):
    """Smoke-test ``plot`` on synthetic data with mixed column types.

    Builds ``continuous_features`` float columns and ``categorical_features``
    binned columns of ``category`` dtype, appends a ``'target'`` column,
    checks the types reported by ``detect_types`` and finally calls ``plot``.

    Parameters
    ----------
    continuous_features : int
        Number of continuous columns to generate.
    categorical_features : int
        Number of categorical columns to generate.
    task : str
        When equal to ``"classification"`` the target is discretized at its
        5/10/60/85 percentiles and stored as a categorical column; otherwise
        the target is expected to be detected as continuous.
    """
    # simple smoke test
    # should be parametrized
    n_samples = 100
    X_cont, y_cont = make_regression(n_samples=n_samples,
                                     n_features=continuous_features,
                                     n_informative=min(continuous_features, 2))
    X_cat, y_cat = make_regression(n_samples=n_samples,
                                   n_features=categorical_features,
                                   n_informative=min(categorical_features, 2))
    if X_cat.shape[1] > 0:
        # Bin the second regression dataset to obtain ordinal codes.
        X_cat = KBinsDiscretizer(encode='ordinal').fit_transform(X_cat)
    cont_columns = ["asdf_%d_cont" % i for i in range(continuous_features)]
    df_cont = pd.DataFrame(X_cont, columns=cont_columns)
    if categorical_features > 0:
        cat_columns = ["asdf_%d_cat" % i for i in range(categorical_features)]
        df_cat = pd.DataFrame(X_cat, columns=cat_columns).astype('int')
        df_cat = df_cat.astype("category")
        X_df = pd.concat([df_cont, df_cat], axis=1)
    else:
        X_df = df_cont
    assert (X_df.shape[1] == continuous_features + categorical_features)
    X_clean = clean(X_df.copy())
    y = y_cont + y_cat
    if X_df.shape[1] == 0:
        # No features at all: fall back to a random target.
        y = np.random.uniform(size=n_samples)
    if task == "classification":
        y = np.digitize(y, np.percentile(y, [5, 10, 60, 85]))
    X_clean['target'] = y
    if task == "classification":
        X_clean['target'] = X_clean['target'].astype('category')
    types = detect_types(X_clean)
    column_types = types.T.idxmax()
    # ``column_types`` is labelled by column name, so use ``.iloc`` for every
    # positional access: integer keys on a non-integer index are deprecated
    # in pandas and will be treated as labels in the future.
    assert np.all(column_types.iloc[:continuous_features] == 'continuous')
    assert np.all(
        column_types.iloc[continuous_features:-1] == 'categorical')
    # The target was appended last.
    if task == "classification":
        assert column_types.iloc[-1] == 'categorical'
    else:
        assert column_types.iloc[-1] == 'continuous'

    plot(X_clean, target_col='target')
    # Release the figures created by ``plot``.
    plt.close("all")
Ejemplo n.º 3
0
def test_detect_low_cardinality_int():
    """Integer columns are classified according to their cardinality.

    Five integer columns with an increasing number of distinct values are
    generated and the type picked by ``detect_types`` for each of them is
    compared against the expected label.
    """
    n_rows = 1000

    # Random draws happen in the same order as the columns are listed.
    binary = np.random.randint(0, 2, size=n_rows)
    four_levels = np.random.randint(0, 4, size=n_rows)
    uniform_twenty = np.random.randint(0, 20, size=n_rows)
    binomial_twenty = np.random.binomial(20, .3, size=n_rows)
    # 500 distinct values, each one appearing twice.
    many_values = np.repeat(np.arange(500), 2)

    frame = pd.DataFrame({
        'binary_int': binary,
        'categorical_int': four_levels,
        'low_card_int_uniform': uniform_twenty,
        'low_card_int_binomial': binomial_twenty,
        'cont_int': many_values,
    })

    # Name of the type with the largest entry for every column.
    detected = detect_types(frame).T.idxmax()

    expected = [
        ('binary_int', 'categorical'),
        ('categorical_int', 'categorical'),
        ('low_card_int_uniform', 'low_card_int'),
        ('low_card_int_binomial', 'low_card_int'),
        ('cont_int', 'continuous'),
    ]
    for column, kind in expected:
        assert detected[column] == kind
Ejemplo n.º 4
0
def test_continuous_castable():
    """A column mixing integers and a numeric string is flagged continuous."""
    # One entry is the string '1' rather than a number.
    mixed_values = [1, 2, 3, '1', 2, 3]
    frame = pd.DataFrame({'a': mixed_values})
    detected = detect_types(frame)
    assert detected.continuous['a']
Ejemplo n.º 5
0
def test_detect_types():
    """Check the type ``detect_types`` assigns to a wide range of columns.

    Three inputs are exercised: a hand-built frame with one column per case
    of interest, the ``X_cat`` frame defined outside this function, and the
    iris data set.
    """
    def random_str(length=7):
        """Return a random string of ``length`` ASCII letters."""
        return "".join(
            random.choice(string.ascii_letters) for _ in range(length))

    # Constant except for the first two entries.
    near_constant_float = np.repeat(np.pi, repeats=100)
    near_constant_float[:2] = 0

    # ``np.float`` and ``np.NaN`` have been removed from NumPy; the builtin
    # ``float`` and ``np.nan`` are the exact objects they used to alias.
    df_all = pd.DataFrame({
        'categorical_string': ['a', 'b'] * 50,
        'binary_int':
        np.random.randint(0, 2, size=100),
        'categorical_int':
        np.random.randint(0, 4, size=100),
        'low_card_float_int':
        np.random.randint(0, 4, size=100).astype(float),
        'low_card_float':
        np.random.randint(0, 4, size=100).astype(float) + 0.1,
        'binary_float':
        np.random.randint(0, 2, size=100).astype(float),
        'cont_int':
        np.repeat(np.arange(50), 2),
        'unique_string': [random_str() for _ in range(100)],
        'continuous_float':
        np.random.normal(size=100),
        'constant_nan':
        np.repeat(np.nan, repeats=100),
        'constant_string': ['every_day'] * 100,
        'constant_float':
        np.repeat(np.pi, repeats=100),
        'near_constant_float':
        near_constant_float,
        'index_0_based':
        np.arange(100),
        'index_1_based':
        np.arange(1, 101),
        'index_shuffled':
        np.random.permutation(100)
    })
    with pytest.warns(UserWarning, match="Discarding near-constant"):
        res = detect_types(df_all)
    # Name of the type with the largest entry for every column.
    types = res.T.idxmax()
    assert types['categorical_string'] == 'categorical'
    assert types['binary_int'] == 'categorical'
    assert types['categorical_int'] == 'categorical'
    # NOTE: this frame has no 'low_card_int_binomial' column, so that case
    # is not asserted here.
    assert types['low_card_float_int'] == 'categorical'
    # low card floats if they are not ints are continuous ?
    assert types['low_card_float'] == 'continuous'
    assert types['binary_float'] == 'categorical'
    assert types['cont_int'] == 'continuous'
    assert types['unique_string'] == 'free_string'
    assert types['continuous_float'] == 'continuous'
    assert types['constant_nan'] == 'useless'
    assert types['constant_string'] == 'useless'
    assert types['constant_float'] == 'useless'
    assert types['near_constant_float'] == 'useless'
    assert types['index_0_based'] == 'useless'
    assert types['index_1_based'] == 'useless'
    # Not detecting a shuffled index right now :-/
    assert types['index_shuffled'] == 'continuous'

    # ``X_cat`` is not defined in this function; it has to be provided by
    # the enclosing module.
    res = detect_types(X_cat)
    assert len(res) == 3
    assert res.categorical.all()
    # ``not`` instead of ``~``: bitwise inversion of a plain Python bool is
    # always truthy (``~True == -2``), which would make this check vacuous.
    assert not res.continuous.any()

    iris = load_iris()
    res_iris = detect_types(pd.DataFrame(iris.data))
    # Exactly one type per column, and all four columns are continuous.
    assert (res_iris.sum(axis=1) == 1).all()
    assert res_iris.continuous.sum() == 4