Exemple #1
0
def test_titanic_feature_names():
    """Check the feature names EasyPreprocessor reports for the titanic data.

    The preprocessor is first fitted on the cleaned frame and its feature
    names are compared with a fixed list. It is then re-fitted on the raw
    frame, where only the absence of NaN in the transformed output is
    checked.
    """
    here = os.path.dirname(__file__)
    csv_file = os.path.join(here, '../datasets/titanic.csv')
    titanic = pd.read_csv(csv_file)

    ep = EasyPreprocessor()
    cleaned = clean(titanic.drop('survived', axis=1))
    ep.fit(cleaned)

    # Grouped by originating column; the order is part of the expectation.
    expected_names = [
        'sibsp', 'parch',
        'age_dabl_continuous', 'fare_dabl_continuous', 'body_dabl_continuous',
        'pclass_1', 'pclass_2', 'pclass_3',
        'sex_female', 'sex_male',
        'sibsp_0', 'sibsp_1', 'sibsp_2', 'sibsp_3', 'sibsp_4', 'sibsp_5',
        'sibsp_8',
        'parch_0', 'parch_1', 'parch_2', 'parch_3', 'parch_4', 'parch_5',
        'parch_6', 'parch_9',
        'embarked_?', 'embarked_C', 'embarked_Q', 'embarked_S',
        'boat_1', 'boat_10', 'boat_11', 'boat_12', 'boat_13', 'boat_13 15',
        'boat_13 15 B', 'boat_14', 'boat_15', 'boat_15 16', 'boat_16',
        'boat_2', 'boat_3', 'boat_4', 'boat_5', 'boat_5 7', 'boat_5 9',
        'boat_6', 'boat_7', 'boat_8', 'boat_8 10', 'boat_9', 'boat_?',
        'boat_A', 'boat_B', 'boat_C', 'boat_C D', 'boat_D',
        'age_?_0.0', 'age_?_1.0', 'body_?_0.0', 'body_?_1.0',
    ]
    try:
        assert ep.get_feature_names() == expected_names
    except AssertionError:
        # OHE uses int in newer versions: the four trailing indicator names
        # (indices 57-60) are then expected without the '.0' suffix.
        expected_names[57:61] = ['age_?_0', 'age_?_1', 'body_?_0', 'body_?_1']
        assert ep.get_feature_names() == expected_names

    # without clean
    raw_features = titanic.drop('survived', axis=1)
    X = ep.fit_transform(raw_features)
    # FIXME can't do that yet
    # assert ep.get_feature_names() == expected_names_no_clean

    has_nan = np.isnan(X).any()
    assert not has_nan
Exemple #2
0
def test_simple_preprocessor_imputed_features():
    """Regression test for issue 211.

    Fits EasyPreprocessor on a single column hinted as categorical that
    contains one missing value, and checks the reported feature names: one
    per observed category plus the two ``A_imputed_*`` indicator names.
    """
    # Issue: 211

    # np.nan instead of the np.NaN alias: the alias was removed in NumPy 2.0
    # and would raise AttributeError before the preprocessor is exercised.
    # NOTE(review): dtype=int is requested although the data holds a NaN;
    # newer pandas versions may reject this cast - confirm against the
    # pandas versions this test is expected to run on.
    data = pd.DataFrame({'A': [0, 1, 2, 1, np.nan]}, dtype=int)
    types = detect_types(data, type_hints={'A': 'categorical'})

    ep = EasyPreprocessor(types=types)
    ep.fit(data)

    expected_names = ['A_0', 'A_1', 'A_2', 'A_imputed_False', 'A_imputed_True']
    assert ep.get_feature_names() == expected_names
def test_titanic_feature_names():
    """Check EasyPreprocessor feature names on the cleaned titanic data.

    The preprocessor is fitted on the cleaned frame (without the
    ``survived`` column) and the names it reports must match the fixed,
    ordered list below exactly.
    """
    here = os.path.dirname(__file__)
    csv_file = os.path.join(here, '../datasets/titanic.csv')
    titanic = pd.read_csv(csv_file)

    ep = EasyPreprocessor()
    cleaned = clean(titanic.drop('survived', axis=1))
    ep.fit(cleaned)

    # Grouped by originating column; the order is part of the expectation.
    expected_names = [
        'age_dabl_continuous', 'body_dabl_continuous', 'fare_dabl_continuous',
        'age_?_0.0', 'age_?_1.0', 'body_?_0.0', 'body_?_1.0',
        'pclass_1', 'pclass_2', 'pclass_3',
        'sex_female', 'sex_male',
        'embarked_?', 'embarked_C', 'embarked_Q', 'embarked_S',
        'boat_1', 'boat_10', 'boat_11', 'boat_12', 'boat_13', 'boat_13 15',
        'boat_13 15 B', 'boat_14', 'boat_15', 'boat_15 16', 'boat_16',
        'boat_2', 'boat_3', 'boat_4', 'boat_5', 'boat_5 7', 'boat_5 9',
        'boat_6', 'boat_7', 'boat_8', 'boat_8 10', 'boat_9', 'boat_?',
        'boat_A', 'boat_B', 'boat_C', 'boat_C D', 'boat_D',
    ]
    actual_names = ep.get_feature_names()
    assert actual_names == expected_names