# Imports assumed for this excerpt; the exact dabl module paths may differ
# in the original test module.
import itertools
import os
import string
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytest

from sklearn.datasets import load_digits, make_regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer

from dabl import EasyPreprocessor, clean, detect_types, plot
from dabl.datasets import load_titanic
from dabl.preprocessing import _float_matching
from dabl.utils import data_df_from_bunch


def test_duplicate_columns():
    X = pd.DataFrame([[0, 1]], columns=['a', 'a'])
    with pytest.raises(ValueError, match="Duplicate Columns"):
        clean(X)
    with pytest.raises(ValueError, match="Duplicate Columns"):
        detect_types(X)

def test_dirty_float_target_regression():
    titanic_data = load_titanic()
    # make_dirty_float is a helper defined elsewhere in this test module.
    data = pd.DataFrame({'one': np.repeat(np.arange(50), 2)})
    dirty = make_dirty_float()
    data['target'] = dirty
    with pytest.warns(UserWarning, match="Discarding dirty_float targets that "
                      "cannot be converted to float."):
        clean(data, target_col="target")
    with pytest.warns(UserWarning, match="Discarding dirty_float targets that "
                      "cannot be converted to float."):
        plot(data, target_col="target")
    # check that plotting works for non-dirty_float targets
    plot(titanic_data, 'survived')

def test_dirty_float_single_warning():
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always')
        rng = np.random.RandomState(0)
        cont_clean = ["{:2.2f}".format(x) for x in rng.uniform(size=100)]
        dirty3 = pd.Series(cont_clean)
        # every 20th entry is a non-string object that can't be parsed as float
        dirty3[::20] = [("missing", "but weird")] * 5
        X = pd.DataFrame({'dirty3': dirty3})
        clean(X)
        assert len(w) == 1

def test_titanic_feature_names():
    path = os.path.dirname(__file__)
    titanic = pd.read_csv(os.path.join(path, '../datasets/titanic.csv'))
    ep = EasyPreprocessor()
    ep.fit(clean(titanic.drop('survived', axis=1)))
    expected_names = [
        'sibsp', 'parch', 'age_dabl_continuous', 'fare_dabl_continuous',
        'body_dabl_continuous', 'pclass_1', 'pclass_2', 'pclass_3',
        'sex_female', 'sex_male', 'sibsp_0', 'sibsp_1', 'sibsp_2', 'sibsp_3',
        'sibsp_4', 'sibsp_5', 'sibsp_8', 'parch_0', 'parch_1', 'parch_2',
        'parch_3', 'parch_4', 'parch_5', 'parch_6', 'parch_9', 'embarked_?',
        'embarked_C', 'embarked_Q', 'embarked_S', 'boat_1', 'boat_10',
        'boat_11', 'boat_12', 'boat_13', 'boat_13 15', 'boat_13 15 B',
        'boat_14', 'boat_15', 'boat_15 16', 'boat_16', 'boat_2', 'boat_3',
        'boat_4', 'boat_5', 'boat_5 7', 'boat_5 9', 'boat_6', 'boat_7',
        'boat_8', 'boat_8 10', 'boat_9', 'boat_?', 'boat_A', 'boat_B',
        'boat_C', 'boat_C D', 'boat_D', 'age_?_0.0', 'age_?_1.0',
        'body_?_0.0', 'body_?_1.0']
    assert ep.get_feature_names() == expected_names
    # without clean
    X = ep.fit_transform(titanic.drop('survived', axis=1))
    # FIXME can't do that yet
    # assert ep.get_feature_names() == expected_names_no_clean
    assert not np.isnan(X).any()

def test_continuous_castable():
    X = pd.DataFrame({'a': [1, 2, 3, '1', 2, 3, '1.1']})
    types = detect_types(X)
    assert types.continuous['a']
    X_new = clean(X)
    assert X_new.dtypes['a'] == np.float64

def test_duplicate_index():
    # X_cat is a module-level fixture DataFrame defined elsewhere in this file.
    X = X_cat.copy()
    X.index = np.ones(len(X), dtype=int)  # np.int is removed in NumPy >= 1.24
    assert not X.index.is_unique
    with pytest.raises(ValueError):
        detect_types(X)
    with pytest.warns(UserWarning):
        X_clean = clean(X)
    assert X_clean.index.is_unique

def test_easy_preprocessor_transform():
    titanic = load_titanic()
    titanic_clean = clean(titanic)
    X, y = titanic_clean.drop("survived", axis=1), titanic_clean.survived
    X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y,
                                                      random_state=42)
    pipe = make_pipeline(EasyPreprocessor(), LogisticRegression(C=0.1))
    pipe.fit(X_train, y_train)
    pipe.predict(X_train)
    pipe.predict(X_val)

def test_digits_type_hints():
    data_bunch = load_digits()
    try:
        feature_names = data_bunch.feature_names
    except AttributeError:
        # older scikit-learn versions don't provide feature_names for digits
        feature_names = ['x%d' % i for i in range(data_bunch.data.shape[1])]
    data = data_df_from_bunch(data_bunch)
    data_clean = clean(data, type_hints={
        feature: 'continuous' for feature in feature_names})
    assert data_clean.shape[1] == 65

def test_detect_string_floats():
    # test if we can find floats that are encoded as strings
    # sometimes they have weird missing values
    rng = np.random.RandomState(0)
    cont_clean = ["{:2.2f}".format(x) for x in rng.uniform(size=100)]
    dirty = pd.Series(cont_clean)
    # not strings, but actually numbers!
    dirty2 = pd.Series(rng.uniform(size=100))
    # FIXME this wouldn't work with using straight floats
    dirty3 = pd.Series(cont_clean)
    too_dirty = pd.Series(rng.uniform(size=100))
    # FIXME hardcoded frequency of tolerated missing
    # FIXME add test with integers
    # FIXME whitespace?
    dirty[::12] = "missing"
    dirty2[::12] = "missing"
    dirty3[::20] = [("missing", "but weird")] * 5
    too_dirty[::2] = rng.choice(list(string.ascii_letters), size=50)

    # only dirty:
    res = detect_types(pd.DataFrame(dirty))
    assert len(res) == 1
    assert res.dirty_float[0]

    # dirty and clean and weird stuff
    X = pd.DataFrame({'cont': cont_clean, 'dirty': dirty, 'dirty2': dirty2,
                      'dirty3': dirty3, 'too_dirty': too_dirty})
    res = detect_types(X)
    assert len(res) == 5
    assert res.continuous['cont']
    assert ~res.continuous['dirty']
    assert ~res.continuous['dirty2']
    assert ~res.continuous['dirty3']
    assert ~res.dirty_float['cont']
    assert res.dirty_float['dirty']
    assert res.dirty_float['dirty2']
    assert res.dirty_float['dirty3']
    assert ~res.dirty_float['too_dirty']
    assert ~res.free_string['dirty3']
    assert res.free_string['too_dirty']

    assert _float_matching(X.cont).all()
    is_float = X.dirty != 'missing'
    assert (_float_matching(X.dirty) == is_float).all()
    assert (_float_matching(X.dirty2) == is_float).all()
    assert (_float_matching(X.dirty3) == (X.dirty3.map(type) == str)).all()
    res = clean(X)

def test_titanic_feature_names_alternate_layout():
    # variant of test_titanic_feature_names above with a different expected
    # feature layout (continuous features first, no sibsp/parch features);
    # given a distinct name so it doesn't shadow that test
    path = os.path.dirname(__file__)
    titanic = pd.read_csv(os.path.join(path, '../datasets/titanic.csv'))
    ep = EasyPreprocessor()
    ep.fit(clean(titanic.drop('survived', axis=1)))
    expected_names = [
        'age_dabl_continuous', 'body_dabl_continuous', 'fare_dabl_continuous',
        'age_?_0.0', 'age_?_1.0', 'body_?_0.0', 'body_?_1.0',
        'pclass_1', 'pclass_2', 'pclass_3', 'sex_female', 'sex_male',
        'embarked_?', 'embarked_C', 'embarked_Q', 'embarked_S',
        'boat_1', 'boat_10', 'boat_11', 'boat_12', 'boat_13', 'boat_13 15',
        'boat_13 15 B', 'boat_14', 'boat_15', 'boat_15 16', 'boat_16',
        'boat_2', 'boat_3', 'boat_4', 'boat_5', 'boat_5 7', 'boat_5 9',
        'boat_6', 'boat_7', 'boat_8', 'boat_8 10', 'boat_9', 'boat_?',
        'boat_A', 'boat_B', 'boat_C', 'boat_C D', 'boat_D']
    assert ep.get_feature_names() == expected_names

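# The original parametrization for test_type_hints is not part of this
# excerpt. The decorator below is a plausible stand-in: each dict must drop
# exactly one column (here via a 'useless' hint) to satisfy the shape
# assertion; the specific hint combinations are assumptions, not the
# original values.
@pytest.mark.parametrize(
    "type_hints",
    [{'a': 'continuous', 'b': 'categorical', 'c': 'useless'},
     {'a': 'categorical', 'b': 'continuous', 'c': 'useless'}])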
def test_type_hints(type_hints):
    X = pd.DataFrame({'a': [0, 1, 0, 1, 0],
                      'b': [0.1, 0.2, 0.3, 0.1, 0.1],
                      'c': ['a', 'b', 'a', 'b', 'a']})
    types = detect_types(X, type_hints=type_hints)
    X_clean = clean(X, type_hints=type_hints)
    # dropped a column:
    assert X_clean.shape[1] == 2
    for k, v in type_hints.items():
        # detect_types respects hints
        assert types.T.idxmax()[k] == v
        # conversion successful
        if v == 'continuous':
            # np.float is removed in NumPy >= 1.24; compare to np.float64
            assert X_clean[k].dtype == np.float64
        elif v == 'categorical':
            assert X_clean[k].dtype == 'category'

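# As above, the original parameter grid for test_plots_smoke is missing from
# this excerpt; the grid below is an assumed stand-in that also covers the
# zero-feature edge cases the test body handles.
@pytest.mark.parametrize(
    "continuous_features, categorical_features, task",
    itertools.product([0, 1, 3], [0, 1, 3],
                      ['regression', 'classification']))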
def test_plots_smoke(continuous_features, categorical_features, task):
    # simple smoke test
    # should be parametrized
    n_samples = 100
    X_cont, y_cont = make_regression(
        n_samples=n_samples, n_features=continuous_features,
        n_informative=min(continuous_features, 2))
    X_cat, y_cat = make_regression(
        n_samples=n_samples, n_features=categorical_features,
        n_informative=min(categorical_features, 2))
    if X_cat.shape[1] > 0:
        X_cat = KBinsDiscretizer(encode='ordinal').fit_transform(X_cat)
    cont_columns = ["asdf_%d_cont" % i for i in range(continuous_features)]
    df_cont = pd.DataFrame(X_cont, columns=cont_columns)
    if categorical_features > 0:
        cat_columns = ["asdf_%d_cat" % i for i in range(categorical_features)]
        df_cat = pd.DataFrame(X_cat, columns=cat_columns).astype('int')
        df_cat = df_cat.astype("category")
        X_df = pd.concat([df_cont, df_cat], axis=1)
    else:
        X_df = df_cont
    assert X_df.shape[1] == continuous_features + categorical_features
    X_clean = clean(X_df.copy())
    y = y_cont + y_cat
    if X_df.shape[1] == 0:
        y = np.random.uniform(size=n_samples)
    if task == "classification":
        y = np.digitize(y, np.percentile(y, [5, 10, 60, 85]))
    X_clean['target'] = y
    if task == "classification":
        X_clean['target'] = X_clean['target'].astype('category')
    types = detect_types(X_clean)
    column_types = types.T.idxmax()
    assert np.all(column_types[:continuous_features] == 'continuous')
    assert np.all(column_types[continuous_features:-1] == 'categorical')
    if task == "classification":
        assert column_types[-1] == 'categorical'
    else:
        assert column_types[-1] == 'continuous'
    plot(X_clean, target_col='target')
    plt.close("all")

def test_titanic_detection():
    path = os.path.dirname(__file__)
    titanic = pd.read_csv(os.path.join(path, '../datasets/titanic.csv'))
    types_table = detect_types(titanic)
    types = types_table.T.idxmax()
    true_types = ['categorical', 'categorical', 'free_string', 'categorical',
                  'dirty_float', 'low_card_int', 'low_card_int',
                  'free_string', 'dirty_float', 'free_string', 'categorical',
                  'categorical', 'dirty_float', 'free_string']
    assert (types == true_types).all()
    titanic_clean, clean_types = clean(titanic, return_types=True)
    assert (clean_types == detect_types(titanic_clean)).all(axis=None)

    titanic_nan = pd.read_csv(os.path.join(path, '../datasets/titanic.csv'),
                              na_values='?')
    types_table = detect_types(titanic_nan)
    types = types_table.T.idxmax()
    true_types_clean = [t if t != 'dirty_float' else 'continuous'
                        for t in true_types]
    assert (types == true_types_clean).all()

def test_convert_cat_to_string():
    X = pd.DataFrame({'a': [1, 2, 3, '1', 2, 3, 'a']})
    X_clean = clean(X)
    assert len(X_clean.a.cat.categories) == 4

def test_digits_type_hints_hardcoded_names():
    # variant of test_digits_type_hints above using hard-coded column names;
    # given a distinct name so it doesn't shadow that test
    data = data_df_from_bunch(load_digits())
    data_clean = clean(
        data, type_hints={"x{}".format(i): 'continuous' for i in range(64)})
    assert data_clean.shape[1] == 65