def test_TargetEncoderRegressor(cv, noise_level):
    """Exercise TargetEncoderRegressor through fit/transform and fit_transform.

    A 100-row sample frame gets an extra ``cat_col`` column made of the first
    three characters of ``text_col``. The encoder is then run twice with the
    given ``cv`` and ``noise_level``: once as ``fit`` followed by
    ``transform``, once as ``fit_transform``. Both results must expose the
    same column names, contain no nulls in the encoded column, keep the input
    index, and report ``cat_col`` as the only input column.
    """
    expected_columns = ["float_col", "int_col", "text_col", "cat_col__target_mean"]

    sample_df = get_sample_df(100)
    sample_df["cat_col"] = sample_df["text_col"].apply(lambda text: text[0:3])

    np.random.seed(123)
    target = np.random.randn(100)

    def check_encoding(fitted_encoder, encoded):
        # Checks shared by the fit/transform and the fit_transform paths.
        assert fitted_encoder.get_feature_names() == expected_columns
        assert list(encoded.columns) == expected_columns
        assert encoded["cat_col__target_mean"].isnull().sum() == 0
        assert (encoded.index == sample_df.index).all()
        assert fitted_encoder._columns_informations["input_columns"] == ["cat_col"]

    # Path 1: fit, then transform the same data.
    encoder = TargetEncoderRegressor(noise_level=noise_level, cv=cv)
    encoder.fit(sample_df, target)
    encoded = encoder.transform(sample_df)
    check_encoding(encoder, encoded)

    # After a plain transform, every row of a category must carry the same
    # encoded value. This is only asserted on this path, not on fit_transform.
    per_category = pd.DataFrame(
        {
            "cat_col": sample_df["cat_col"],
            "cat_col__target_mean": encoded["cat_col__target_mean"],
        }
    )
    spread = per_category.groupby("cat_col")["cat_col__target_mean"].std()
    assert spread.max() == 0

    # Path 2: fit_transform in a single call on a fresh encoder.
    encoder = TargetEncoderRegressor(noise_level=noise_level, cv=cv)
    encoded = encoder.fit_transform(sample_df, target)
    check_encoding(encoder, encoded)
def test_TargetEncoderRegressor_is_picklable():
    """Check that a fitted TargetEncoderRegressor survives a pickle round trip.

    The encoder is fitted with ``cv=2`` on a 100-row sample frame, serialized
    with ``pickle.dumps`` and restored with ``pickle.loads``. The restored
    object must have exactly the same type as the original and must produce
    the same transformed frame (same shape, element-wise equal values).
    """
    df = get_sample_df(100)
    df["cat_col"] = df["text_col"].apply(lambda s: s[0:3])

    np.random.seed(123)
    y = np.random.randn(100)

    encoder = TargetEncoderRegressor(cv=2)
    encoder.fit(df, y)

    pickled_encoder = pickle.dumps(encoder)
    unpickled_encoder = pickle.loads(pickled_encoder)

    # Exact type match is intended here, so compare type objects by identity
    # rather than with ``==``.
    assert type(unpickled_encoder) is type(encoder)

    X1 = encoder.transform(df)
    X2 = unpickled_encoder.transform(df)

    assert X1.shape == X2.shape
    # First .all() reduces per column, the second reduces across columns.
    assert (X1 == X2).all().all()
def test_TargetEncoderRegressor_columns_to_encode_object():
    """Compare ``columns_to_encode="--object--"`` with the default setting.

    The input is 1000 rows of ten random numeric columns plus one string
    column derived from ``col_0``. With ``"--object--"`` the encoded frame
    must contain no object-dtype column. With the default constructor the
    string column must still be present and equal to the input column.
    """
    np.random.seed(123)
    numeric_values = np.random.randn(1000, 10)
    column_names = ["col_%d" % i for i in range(10)]
    features = pd.DataFrame(numeric_values, columns=column_names)
    features["object_column"] = ["string_%2.4f" % value for value in features["col_0"]]
    target = np.random.randn(1000)

    # with --object--
    object_encoder = TargetEncoderRegressor(columns_to_encode="--object--")
    encoded = object_encoder.fit_transform(features, target)
    is_object_dtype = encoded.dtypes == "object"
    assert not is_object_dtype.any()

    # with default behavior
    default_encoder = TargetEncoderRegressor()
    encoded = default_encoder.fit_transform(features, target)
    assert "object_column" in encoded
    unchanged = encoded["object_column"] == features["object_column"]
    assert unchanged.all()