def test_titanic_feature_names():
    """Check the feature names EasyPreprocessor reports on titanic.

    The preprocessor is fit on the output of ``clean`` and its
    ``get_feature_names()`` is compared with a fixed list. It is then refit
    on the frame without ``clean``; for that run only the absence of NaN in
    the transformed output is asserted.
    """
    # NOTE(review): this test name is defined a second time further down in
    # the visible source. If both definitions live in the same module, the
    # later one rebinds the name and this one never runs -- confirm.
    test_dir = os.path.dirname(__file__)
    csv_file = os.path.join(test_dir, '../datasets/titanic.csv')
    titanic = pd.read_csv(csv_file)

    preprocessor = EasyPreprocessor()
    cleaned = clean(titanic.drop('survived', axis=1))
    preprocessor.fit(cleaned)

    # The expected names are grouped by the column they derive from; the
    # concatenation order below is the order being asserted.
    bare_names = ['sibsp', 'parch']
    continuous_names = [
        'age_dabl_continuous',
        'fare_dabl_continuous',
        'body_dabl_continuous',
    ]
    pclass_names = ['pclass_1', 'pclass_2', 'pclass_3']
    sex_names = ['sex_female', 'sex_male']
    sibsp_names = [
        'sibsp_0', 'sibsp_1', 'sibsp_2', 'sibsp_3',
        'sibsp_4', 'sibsp_5', 'sibsp_8',
    ]
    parch_names = [
        'parch_0', 'parch_1', 'parch_2', 'parch_3',
        'parch_4', 'parch_5', 'parch_6', 'parch_9',
    ]
    embarked_names = ['embarked_?', 'embarked_C', 'embarked_Q', 'embarked_S']
    boat_names = [
        'boat_1', 'boat_10', 'boat_11', 'boat_12',
        'boat_13', 'boat_13 15', 'boat_13 15 B', 'boat_14',
        'boat_15', 'boat_15 16', 'boat_16', 'boat_2',
        'boat_3', 'boat_4', 'boat_5', 'boat_5 7',
        'boat_5 9', 'boat_6', 'boat_7', 'boat_8',
        'boat_8 10', 'boat_9', 'boat_?', 'boat_A',
        'boat_B', 'boat_C', 'boat_C D', 'boat_D',
    ]
    question_mark_names = [
        'age_?_0.0', 'age_?_1.0', 'body_?_0.0', 'body_?_1.0',
    ]
    expected_names = (
        bare_names
        + continuous_names
        + pclass_names
        + sex_names
        + sibsp_names
        + parch_names
        + embarked_names
        + boat_names
        + question_mark_names
    )
    assert preprocessor.get_feature_names() == expected_names

    # without clean
    uncleaned = titanic.drop('survived', axis=1)
    X = preprocessor.fit_transform(uncleaned)
    # FIXME can't do that yet
    # assert ep.get_feature_names() == expected_names_no_clean

    assert not np.isnan(X).any()
def test_simple_preprocessor():
    """Fit-transform ``X_cat`` and check the output shape, then fit on iris."""
    # NOTE(review): this test name is defined a second time further down in
    # the visible source. If both definitions live in the same module, the
    # later one rebinds the name and this one never runs -- confirm.
    transformed = EasyPreprocessor().fit_transform(X_cat)
    assert transformed.shape == (3, 6)

    # Fitting on the iris data only has to complete; nothing is asserted.
    iris_data = load_iris().data
    EasyPreprocessor().fit(iris_data)
def test_simple_preprocessor():
    """Fit then transform ``X_cat`` and check the shape, then fit on iris."""
    # NOTE(review): this test name is also defined earlier in the visible
    # source. If both definitions live in the same module, only this later
    # one is bound to the name -- confirm.
    cat_preprocessor = EasyPreprocessor()
    cat_preprocessor.fit(X_cat)
    transformed = cat_preprocessor.transform(X_cat)
    assert transformed.shape == (3, 7)  # FIXME should be 6?

    # Fitting on the iris data only has to complete; nothing is asserted.
    iris_data = load_iris().data
    EasyPreprocessor().fit(iris_data)
def test_simple_preprocessor_imputed_features():
    """Check feature names for a categorical column holding a missing value.

    Column ``A`` is hinted as categorical via ``detect_types``. After
    fitting, ``get_feature_names()`` must list one name per observed value
    followed by the two ``A_imputed_*`` names.
    """
    # Issue: 211
    # ``np.nan`` is used instead of the ``np.NaN`` alias, which NumPy 2.0
    # removed; ``np.nan`` is available on every NumPy version.
    # NOTE(review): ``dtype=int`` is requested for data that contains NaN;
    # confirm how the supported pandas versions handle that cast.
    data = pd.DataFrame({'A': [0, 1, 2, 1, np.nan]}, dtype=int)
    types = detect_types(data, type_hints={'A': 'categorical'})

    ep = EasyPreprocessor(types=types)
    ep.fit(data)

    expected_names = [
        'A_0',
        'A_1',
        'A_2',
        'A_imputed_False',
        'A_imputed_True',
    ]
    assert ep.get_feature_names() == expected_names
def test_simple_preprocessor_dirty_float():
    """Fit and transform the ``make_dirty_float`` data and check the output.

    The transformed result must have shape ``(100, 3)``; the totals of
    columns 1 and 2 are compared with fixed counts. The fitted preprocessor
    must afterwards also transform a frame of clean string values.
    """
    dirty_frame = pd.DataFrame(make_dirty_float())

    preprocessor = EasyPreprocessor()
    preprocessor.fit(dirty_frame)
    transformed = preprocessor.transform(dirty_frame)
    assert transformed.shape == (100, 3)

    # Summing over axis 0 yields one total per output column.
    column_totals = transformed.sum(axis=0)
    # count of "garbage"
    assert column_totals[1] == 1
    # count of "missing"
    assert column_totals[2] == 9

    # make sure we can transform a clean column
    clean_frame = pd.DataFrame(['0', '1', '2'], columns=['a_column'])
    preprocessor.transform(clean_frame)
def test_titanic_feature_names():
    """Check the feature names EasyPreprocessor reports on cleaned titanic.

    The preprocessor is fit on the output of ``clean`` and its
    ``get_feature_names()`` is compared with a fixed list.
    """
    # NOTE(review): this test name is also defined earlier in the visible
    # source. If both definitions live in the same module, only this later
    # one is bound to the name -- confirm.
    test_dir = os.path.dirname(__file__)
    csv_file = os.path.join(test_dir, '../datasets/titanic.csv')
    titanic = pd.read_csv(csv_file)

    preprocessor = EasyPreprocessor()
    cleaned = clean(titanic.drop('survived', axis=1))
    preprocessor.fit(cleaned)

    # The expected names are grouped by the column they derive from; the
    # concatenation order below is the order being asserted.
    continuous_names = [
        'age_dabl_continuous',
        'body_dabl_continuous',
        'fare_dabl_continuous',
    ]
    question_mark_names = [
        'age_?_0.0', 'age_?_1.0', 'body_?_0.0', 'body_?_1.0',
    ]
    pclass_names = ['pclass_1', 'pclass_2', 'pclass_3']
    sex_names = ['sex_female', 'sex_male']
    embarked_names = ['embarked_?', 'embarked_C', 'embarked_Q', 'embarked_S']
    boat_names = [
        'boat_1', 'boat_10', 'boat_11', 'boat_12',
        'boat_13', 'boat_13 15', 'boat_13 15 B', 'boat_14',
        'boat_15', 'boat_15 16', 'boat_16', 'boat_2',
        'boat_3', 'boat_4', 'boat_5', 'boat_5 7',
        'boat_5 9', 'boat_6', 'boat_7', 'boat_8',
        'boat_8 10', 'boat_9', 'boat_?', 'boat_A',
        'boat_B', 'boat_C', 'boat_C D', 'boat_D',
    ]
    expected_names = (
        continuous_names
        + question_mark_names
        + pclass_names
        + sex_names
        + embarked_names
        + boat_names
    )
    assert preprocessor.get_feature_names() == expected_names