def test_type_hints(type_hints):
    """Check that ``detect_types`` and ``clean`` honor user-supplied hints.

    Parameters
    ----------
    type_hints : dict
        Mapping of column name to the type name that column should be
        treated as (iterated below via ``.items()``).
    """
    X = pd.DataFrame({'a': [0, 1, 0, 1, 0],
                      'b': [0.1, 0.2, 0.3, 0.1, 0.1],
                      'c': ['a', 'b', 'a', 'b', 'a']})
    types = detect_types(X, type_hints=type_hints)
    X_clean = clean(X, type_hints=type_hints)

    # dropped a column: three columns go in, two are expected to come out
    assert X_clean.shape[1] == 2

    for k, v in type_hints.items():
        # detect_types respects hints
        assert types.T.idxmax()[k] == v
        # conversion successful
        if v == 'continuous':
            # ``np.float`` was only an alias for the builtin ``float`` and
            # has been removed from NumPy; compare against ``float`` directly.
            assert X_clean[k].dtype == float
        elif v == 'categorical':
            assert X_clean[k].dtype == 'category'
def test_plots_smoke(continuous_features, categorical_features, task):
    """Smoke-test ``plot`` on synthetic data mixing column types.

    Parameters
    ----------
    continuous_features : int
        Number of continuous feature columns to generate.
    categorical_features : int
        Number of discretized (categorical) feature columns to generate.
    task : str
        ``"classification"`` bins the target into categories; any other
        value leaves the target continuous.
    """
    # simple smoke test
    # should be parametrized
    n_samples = 100
    X_cont, y_cont = make_regression(
        n_samples=n_samples, n_features=continuous_features,
        n_informative=min(continuous_features, 2))
    X_cat, y_cat = make_regression(
        n_samples=n_samples, n_features=categorical_features,
        n_informative=min(categorical_features, 2))
    if X_cat.shape[1] > 0:
        X_cat = KBinsDiscretizer(encode='ordinal').fit_transform(X_cat)

    cont_columns = ["asdf_%d_cont" % i for i in range(continuous_features)]
    df_cont = pd.DataFrame(X_cont, columns=cont_columns)
    if categorical_features > 0:
        cat_columns = ["asdf_%d_cat" % i for i in range(categorical_features)]
        df_cat = pd.DataFrame(X_cat, columns=cat_columns).astype('int')
        df_cat = df_cat.astype("category")
        X_df = pd.concat([df_cont, df_cat], axis=1)
    else:
        X_df = df_cont
    assert X_df.shape[1] == continuous_features + categorical_features

    X_clean = clean(X_df.copy())
    y = y_cont + y_cat
    if X_df.shape[1] == 0:
        # no features at all: fall back to a random target
        y = np.random.uniform(size=n_samples)
    if task == "classification":
        y = np.digitize(y, np.percentile(y, [5, 10, 60, 85]))
    X_clean['target'] = y
    if task == "classification":
        X_clean['target'] = X_clean['target'].astype('category')

    types = detect_types(X_clean)
    column_types = types.T.idxmax()
    # ``column_types`` is indexed by column name, so positional access must
    # go through ``.iloc``; plain integer keys on a label index are
    # deprecated in pandas and no longer fall back to positions.
    assert np.all(column_types.iloc[:continuous_features] == 'continuous')
    assert np.all(column_types.iloc[continuous_features:-1] == 'categorical')
    if task == "classification":
        assert column_types.iloc[-1] == 'categorical'
    else:
        assert column_types.iloc[-1] == 'continuous'

    try:
        plot(X_clean, target_col='target')
    finally:
        # close figures even if plotting raised, so they do not pile up
        plt.close("all")
def test_detect_low_cardinality_int():
    """Integer columns are classified according to their cardinality."""
    n_rows = 1000
    # Columns are generated in a fixed order so the sequence of random
    # draws is the same on every run of this function body.
    columns = {}
    columns['binary_int'] = np.random.randint(0, 2, size=n_rows)
    columns['categorical_int'] = np.random.randint(0, 4, size=n_rows)
    columns['low_card_int_uniform'] = np.random.randint(0, 20, size=n_rows)
    columns['low_card_int_binomial'] = np.random.binomial(20, .3, size=n_rows)
    columns['cont_int'] = np.repeat(np.arange(500), 2)
    frame = pd.DataFrame(columns)

    detected = detect_types(frame).T.idxmax()

    expected_types = [
        ('binary_int', 'categorical'),
        ('categorical_int', 'categorical'),
        ('low_card_int_uniform', 'low_card_int'),
        ('low_card_int_binomial', 'low_card_int'),
        ('cont_int', 'continuous'),
    ]
    for column, expected in expected_types:
        assert detected[column] == expected
def test_continuous_castable():
    """A column mixing ints with a numeric string is flagged continuous."""
    mixed_values = [1, 2, 3, '1', 2, 3]
    frame = pd.DataFrame({'a': mixed_values})
    detected = detect_types(frame)
    continuous_flags = detected.continuous
    assert continuous_flags['a']
def test_detect_types():
    """Check ``detect_types`` on a frame with one column per kind of data.

    Also checks the result for ``X_cat`` and for the iris data, where every
    column must be assigned exactly one type.
    """
    def random_str(length=7):
        # Build a random string of ASCII letters of the requested length.
        return "".join(
            [random.choice(string.ascii_letters) for _ in range(length)])

    # constant except for the first two entries
    near_constant_float = np.repeat(np.pi, repeats=100)
    near_constant_float[:2] = 0

    # ``np.float`` / ``np.NaN`` aliases were removed from NumPy; the builtin
    # ``float`` and ``np.nan`` are the equivalent spellings.
    df_all = pd.DataFrame({
        'categorical_string': ['a', 'b'] * 50,
        'binary_int': np.random.randint(0, 2, size=100),
        'categorical_int': np.random.randint(0, 4, size=100),
        'low_card_float_int':
            np.random.randint(0, 4, size=100).astype(float),
        'low_card_float':
            np.random.randint(0, 4, size=100).astype(float) + 0.1,
        'binary_float': np.random.randint(0, 2, size=100).astype(float),
        'cont_int': np.repeat(np.arange(50), 2),
        'unique_string': [random_str() for _ in range(100)],
        'continuous_float': np.random.normal(size=100),
        'constant_nan': np.repeat(np.nan, repeats=100),
        'constant_string': ['every_day'] * 100,
        'constant_float': np.repeat(np.pi, repeats=100),
        'near_constant_float': near_constant_float,
        'index_0_based': np.arange(100),
        'index_1_based': np.arange(1, 101),
        'index_shuffled': np.random.permutation(100),
    })

    with pytest.warns(UserWarning, match="Discarding near-constant"):
        res = detect_types(df_all)
    types = res.T.idxmax()

    assert types['categorical_string'] == 'categorical'
    assert types['binary_int'] == 'categorical'
    assert types['categorical_int'] == 'categorical'
    assert types['low_card_float_int'] == 'categorical'
    # low card floats if they are not ints are continuous ?
    assert types['low_card_float'] == 'continuous'
    assert types['binary_float'] == 'categorical'
    assert types['cont_int'] == 'continuous'
    assert types['unique_string'] == 'free_string'
    assert types['continuous_float'] == 'continuous'
    assert types['constant_nan'] == 'useless'
    assert types['constant_string'] == 'useless'
    assert types['constant_float'] == 'useless'
    assert types['near_constant_float'] == 'useless'
    assert types['index_0_based'] == 'useless'
    assert types['index_1_based'] == 'useless'
    # Not detecting a shuffled index right now :-/
    assert types['index_shuffled'] == 'continuous'

    # NOTE(review): X_cat is not defined in this function; presumably a
    # module-level frame with three categorical columns - confirm.
    res = detect_types(X_cat)
    assert len(res) == 3
    assert res.categorical.all()
    # ``not`` rather than ``~``: bitwise inversion of a plain Python bool is
    # always truthy, which would make this assertion vacuous.
    assert not res.continuous.any()

    iris = load_iris()
    res_iris = detect_types(pd.DataFrame(iris.data))
    # every column is assigned exactly one type
    assert (res_iris.sum(axis=1) == 1).all()
    assert res_iris.continuous.sum() == 4