def test_easy_preprocessor_cat_cols():
    """Check how EasyPreprocessor encodes categorical columns.

    A frame with three categorical columns (using both empty strings and
    NaN as missing-value markers) and one continuous column is transformed.
    The test then verifies the width of the output and the category sets
    found by the OneHotEncoder inside the fitted preprocessor.
    """
    # Create dataframe with 1 numerical and 3 categorical features.
    # These are the categories expected to survive preprocessing.
    cat_1_unique = ['a', 'b', 'c']
    cat_2_unique = ['D', 'E', 'F', 'G']
    cat_3_unique = [-2.0, -1.0]
    # Use ``np.nan``: the ``np.NaN`` alias was removed in NumPy 2.0 and
    # raises AttributeError there.
    data = pd.DataFrame({
        'cat_1': ['a', 'b', 'a', '', '', 'c', 'a', 'c', 'a', 'a'],
        'cat_2': ['D', 'D', 'E', np.nan, 'D', np.nan, 'E', 'F', 'F', 'G'],
        'cat_3': [-2., -1., -1., -2., -1., '', -1., -2., '', np.nan],
        'num':
        np.sin(range(10)),  # Valid continuous feature.
    })

    # Preprocess data, i.e. replace empty strings with NaNs, impute NaNs, and
    # encode categorical variables with OneHotEncoder.
    ep = EasyPreprocessor()
    data_t = ep.fit_transform(data)

    cat_all = [cat_1_unique, cat_2_unique, cat_3_unique]
    n_unique_cats = sum(len(cats) for cats in cat_all)
    # The number of features after preprocessing must be equal to
    # the number of unique categories within the dataframe + the number of
    # remaining valid features.
    assert data_t.shape[1] == n_unique_cats + 1

    cat_pipe = ep.ct_.named_transformers_['categorical']
    ohe = cat_pipe.named_steps['onehotencoder']
    # The category sets detected by OneHotEncoder inside EasyPreprocessor
    # must match the ones specified in cat_all.
    assert len(ohe.categories_) == len(cat_all)
    for cat_list, ohe_cat_list in zip(cat_all, ohe.categories_):
        assert set(cat_list) == set(ohe_cat_list)
# Example #2
# 0
def test_titanic_feature_names():
    """Pin the feature names EasyPreprocessor reports for cleaned titanic.

    Afterwards the preprocessor is refit on the un-cleaned frame and the
    transformed output is checked to contain no NaN.
    """
    here = os.path.dirname(__file__)
    csv_file = os.path.join(here, '../datasets/titanic.csv')
    passengers = pd.read_csv(csv_file)

    preprocessor = EasyPreprocessor()
    cleaned = clean(passengers.drop('survived', axis=1))
    preprocessor.fit(cleaned)

    # Expected names, grouped by prefix; flattened below in this order.
    name_groups = [
        # un-suffixed and *_dabl_continuous names
        ['sibsp', 'parch', 'age_dabl_continuous', 'fare_dabl_continuous',
         'body_dabl_continuous'],
        # pclass_*
        ['pclass_1', 'pclass_2', 'pclass_3'],
        # sex_*
        ['sex_female', 'sex_male'],
        # sibsp_*
        ['sibsp_0', 'sibsp_1', 'sibsp_2', 'sibsp_3', 'sibsp_4', 'sibsp_5',
         'sibsp_8'],
        # parch_*
        ['parch_0', 'parch_1', 'parch_2', 'parch_3', 'parch_4', 'parch_5',
         'parch_6', 'parch_9'],
        # embarked_*
        ['embarked_?', 'embarked_C', 'embarked_Q', 'embarked_S'],
        # boat_*
        ['boat_1', 'boat_10', 'boat_11', 'boat_12', 'boat_13', 'boat_13 15',
         'boat_13 15 B', 'boat_14', 'boat_15', 'boat_15 16', 'boat_16',
         'boat_2', 'boat_3', 'boat_4', 'boat_5', 'boat_5 7', 'boat_5 9',
         'boat_6', 'boat_7', 'boat_8', 'boat_8 10', 'boat_9', 'boat_?',
         'boat_A', 'boat_B', 'boat_C', 'boat_C D', 'boat_D'],
        # age_?_* and body_?_*
        ['age_?_0.0', 'age_?_1.0', 'body_?_0.0', 'body_?_1.0'],
    ]
    expected_names = [name for group in name_groups for name in group]
    assert preprocessor.get_feature_names() == expected_names

    # Same data, but skipping clean() this time.
    raw_features = passengers.drop('survived', axis=1)
    X = preprocessor.fit_transform(raw_features)
    # FIXME can't do that yet
    # assert preprocessor.get_feature_names() == expected_names_no_clean

    has_nan = np.isnan(X).any()
    assert not has_nan
def test_simple_preprocessor():
    """Exercise EasyPreprocessor on ``X_cat`` and on the iris data."""
    sp = EasyPreprocessor()
    # X_cat is defined outside this block (presumably a small categorical
    # fixture -- confirm at its definition). The transformed result must
    # have shape (3, 6).
    trans = sp.fit_transform(X_cat)
    assert trans.shape == (3, 6)

    # Fit a fresh preprocessor on iris.data; in the visible code no assertion
    # follows, so this only checks that fitting does not raise.
    iris = load_iris()
    sp = EasyPreprocessor()
    sp.fit(iris.data)