def test_TargetEncoderRegressor(cv, noise_level):
    """Check the frame produced by TargetEncoderRegressor on one categorical column.

    The encoder is exercised twice: once through ``fit`` followed by
    ``transform``, and once through ``fit_transform``. In both cases the
    result must expose the columns ``float_col``, ``int_col``, ``text_col``
    and ``cat_col__target_mean``, contain no null encoded value, keep the
    input index, and report ``cat_col`` as the only input column.

    Args:
        cv: passed unchanged as the ``cv`` argument of the encoder.
        noise_level: passed unchanged as the ``noise_level`` argument of the
            encoder.

    NOTE(review): both arguments are presumably supplied by a parametrize
    decorator that is not visible in this chunk -- confirm.
    """
    encoded_name = "cat_col__target_mean"
    expected_columns = ["float_col", "int_col", "text_col", encoded_name]

    def check_output(fitted_encoder, encoded):
        # Assertions shared by the fit/transform and the fit_transform paths.
        assert fitted_encoder.get_feature_names() == expected_columns
        assert list(encoded.columns) == expected_columns
        assert encoded[encoded_name].isnull().sum() == 0
        assert (encoded.index == df.index).all()
        assert fitted_encoder._columns_informations["input_columns"] == ["cat_col"]

    df = get_sample_df(100)
    # Categorical feature: the first three characters of the text column.
    df["cat_col"] = df["text_col"].apply(lambda text: text[:3])

    # Seed the global NumPy generator so the random target is reproducible.
    np.random.seed(123)
    y = np.random.randn(100)

    # --- fit, then transform ---
    fitted = TargetEncoderRegressor(noise_level=noise_level, cv=cv)
    fitted.fit(df, y)
    transformed = fitted.transform(df)
    check_output(fitted, transformed)

    # Every row of a given category must receive the same encoded value, i.e.
    # the per-category standard deviation of the encoding is exactly 0.
    # This check is applied to the ``transform`` output only.
    per_row = pd.DataFrame(
        {
            "cat_col": df["cat_col"],
            "cat_col__target_mean": transformed[encoded_name],
        }
    )
    spread = per_row.groupby("cat_col")["cat_col__target_mean"].std()
    assert spread.max() == 0

    # --- fit_transform in a single call, on a fresh encoder ---
    one_shot = TargetEncoderRegressor(noise_level=noise_level, cv=cv)
    one_shot_result = one_shot.fit_transform(df, y)
    check_output(one_shot, one_shot_result)
def test_TargetEncoderRegressor_is_picklable():
    """Check that a fitted TargetEncoderRegressor survives a pickle round trip.

    The encoder is fitted with ``cv=2``, serialized with ``pickle.dumps`` and
    restored with ``pickle.loads``. The restored object must have the same
    class as the original, and its ``transform`` output on the training frame
    must have the same shape and the same values as the original's.
    """
    df = get_sample_df(100)
    # Categorical feature: the first three characters of the text column.
    df["cat_col"] = df["text_col"].apply(lambda s: s[0:3])

    # Seed the global NumPy generator so the random target is reproducible.
    np.random.seed(123)
    y = np.random.randn(100)

    encoder = TargetEncoderRegressor(cv=2)
    encoder.fit(df, y)

    # The payload is produced a line above, so ``pickle.loads`` never sees
    # untrusted data here.
    pickled_encoder = pickle.dumps(encoder)
    unpickled_encoder = pickle.loads(pickled_encoder)

    # Exact class match wanted (not a subclass): compare the classes by
    # identity rather than with ``==``.
    assert type(unpickled_encoder) is type(encoder)

    X1 = encoder.transform(df)
    X2 = unpickled_encoder.transform(df)

    assert X1.shape == X2.shape
    # Element-wise equality reduced over both axes.
    assert (X1 == X2).all().all()