def test_global_model_shrinkage(shrinkage_data):
    """Compare a global-model estimator against one with an explicit top level.

    One estimator is grouped on Planet/Country/City with
    ``use_global_model=False``; the other is grouped on Country/City with
    ``use_global_model=True``. Both use ``min_n_obs`` shrinkage and must return
    equal predictions.
    """
    df, _ = shrinkage_data
    y = df["Target"]
    X = df.drop(columns="Target")

    shared_kwargs = {"shrinkage": "min_n_obs", "min_n_obs": 2}
    explicit_top_level = GroupedPredictor(
        DummyRegressor(),
        ["Planet", "Country", "City"],
        use_global_model=False,
        **shared_kwargs,
    )
    implicit_global = GroupedPredictor(
        DummyRegressor(),
        ["Country", "City"],
        use_global_model=True,
        **shared_kwargs,
    )

    explicit_top_level.fit(X, y)

    # Planet has to go for the global-model variant, otherwise it would be
    # treated as a value column.
    X_without_planet = X.drop(columns="Planet")
    implicit_global.fit(X_without_planet, y)

    pd.testing.assert_series_equal(
        implicit_global.predict(X_without_planet),
        explicit_top_level.predict(X),
    )
def test_chickweight_raise_error_group_col_missing():
    """Expect ``ValueError`` when the ``diet`` group column is absent at predict time."""
    df = load_chicken(as_frame=True)
    mod = GroupedPredictor(estimator=LinearRegression(), groups="diet")
    mod.fit(df[["time", "diet"]], df["weight"])
    with pytest.raises(ValueError) as e:
        # The prediction frame deliberately leaves out "diet".
        mod.predict(df[["time", "chick"]])
        # NOTE(review): this assert follows the call that is expected to raise,
        # so it is never reached here. To really check the message, move it
        # after the `with` block and inspect `e.value` -- confirm the actual
        # message text first.
        assert "not in columns" in str(e)
def test_chickweight_can_do_fallback():
    """Check that rows with a ``diet`` value not fitted on still get a prediction.

    After fitting, only diets 1-4 have an estimator. Predicting for diets 5
    and 6 at the same ``time`` must yield two identical values.
    """
    chickens = load_chicken(as_frame=True)
    features = chickens[["time", "diet"]]
    target = chickens["weight"]

    mod = GroupedPredictor(estimator=LinearRegression(), groups="diet")
    mod.fit(features, target)
    assert set(mod.estimators_.keys()) == {1, 2, 3, 4}

    # Neither diet 5 nor diet 6 has its own estimator.
    to_predict = pd.DataFrame({"time": [21, 21], "diet": [5, 6]})
    assert mod.predict(to_predict).shape == (2,)
    assert mod.predict(to_predict)[0] == mod.predict(to_predict)[1]
def test_chickweight_raise_error_value_col_missing():
    """Expect ``ValueError`` when only the group column is passed to ``predict``."""
    chickens = load_chicken(as_frame=True)
    mod = GroupedPredictor(estimator=LinearRegression(), groups="diet")
    mod.fit(chickens[["time", "diet"]], chickens["weight"])

    # This test used to predict on ["diet", "chick"], but that case is no
    # longer valid because value columns are not checked anymore. A frame that
    # holds nothing but the group column is used instead.
    groups_only = chickens[["diet"]]
    with pytest.raises(ValueError):
        mod.predict(groups_only)
def test_fallback_can_raise_error():
    """Expect ``ValueError`` for diets 5 and 6 when no global model or shrinkage is used."""
    df = load_chicken(as_frame=True)
    mod = GroupedPredictor(
        estimator=LinearRegression(),
        groups="diet",
        use_global_model=False,
        shrinkage=None,
    )
    mod.fit(df[["time", "diet"]], df["weight"])
    # Diet values 5 and 6 are presumably absent from the training data.
    to_predict = pd.DataFrame({"time": [21, 21], "diet": [5, 6]})
    with pytest.raises(ValueError) as e:
        mod.predict(to_predict)
        # NOTE(review): this assert follows the call that is expected to raise,
        # so it is never reached here. To really check the message, move it
        # after the `with` block and inspect `e.value` -- confirm the actual
        # message text first.
        assert "found a group" in str(e)
def test_shrinkage_single_group(shrinkage_data):
    """Check constant shrinkage with a single group column plus a global model.

    With ``alpha=0.1`` the expected prediction for every row is
    ``0.1 * means["Earth"] + 0.9 * means[<country>]``.
    """
    df, means = shrinkage_data
    X, y = df.drop(columns="Target"), df["Target"]

    shrink_est = GroupedPredictor(
        DummyRegressor(),
        "Country",
        shrinkage="constant",
        use_global_model=True,
        alpha=0.1,
    )

    shrinkage_factors = np.array([0.1, 0.9])

    # Drop planet and city because otherwise they are seen as value columns
    shrink_est.fit(X[["Country"]], y)

    expected_prediction = [
        np.array([means["Earth"], means["NL"]]) @ shrinkage_factors,
        np.array([means["Earth"], means["NL"]]) @ shrinkage_factors,
        np.array([means["Earth"], means["BE"]]) @ shrinkage_factors,
        np.array([means["Earth"], means["BE"]]) @ shrinkage_factors,
    ]

    # The expected values come out of floating-point dot products, so compare
    # with a tolerance instead of exact equality (as test_relative_shrinkage
    # does). The sequence comparison still checks the length.
    assert shrink_est.predict(X[["Country"]]).tolist() == pytest.approx(
        expected_prediction
    )
def test_custom_shrinkage(shrinkage_data):
    """Check that a user-supplied shrinkage callable is honoured.

    The callable weighs every level equally, so each expected prediction is
    the plain average of the planet, country and city entries of ``means``.
    """
    df, means = shrinkage_data
    X, y = df.drop(columns="Target"), df["Target"]

    def shrinkage_func(group_sizes):
        """Return equal weights (1/n each) for the n levels passed in."""
        n = len(group_sizes)
        return np.repeat(1 / n, n)

    shrink_est = GroupedPredictor(
        DummyRegressor(),
        ["Planet", "Country", "City"],
        shrinkage=shrinkage_func,
        use_global_model=False,
    )

    shrinkage_factors = np.array([1, 1, 1]) / 3

    shrink_est.fit(X, y)

    expected_prediction = [
        np.array([means["Earth"], means["NL"], means["Amsterdam"]]) @ shrinkage_factors,
        np.array([means["Earth"], means["NL"], means["Rotterdam"]]) @ shrinkage_factors,
        np.array([means["Earth"], means["BE"], means["Antwerp"]]) @ shrinkage_factors,
        np.array([means["Earth"], means["BE"], means["Brussels"]]) @ shrinkage_factors,
    ]

    # Thirds are not exactly representable and the estimator may sum in a
    # different order, so compare with a tolerance rather than bit-for-bit.
    # The sequence comparison still checks the length.
    assert shrink_est.predict(X).tolist() == pytest.approx(expected_prediction)
def test_constant_shrinkage(shrinkage_data):
    """Check constant shrinkage over the Planet/Country/City hierarchy.

    With ``alpha=0.1`` the expected weights for planet, country and city are
    0.01, 0.09 and 0.9 respectively.
    """
    df, means = shrinkage_data
    X, y = df.drop(columns="Target"), df["Target"]

    shrink_est = GroupedPredictor(
        DummyRegressor(),
        ["Planet", "Country", "City"],
        shrinkage="constant",
        use_global_model=False,
        alpha=0.1,
    )

    shrinkage_factors = np.array([0.01, 0.09, 0.9])

    shrink_est.fit(X, y)

    expected_prediction = [
        np.array([means["Earth"], means["NL"], means["Amsterdam"]]) @ shrinkage_factors,
        np.array([means["Earth"], means["NL"], means["Rotterdam"]]) @ shrinkage_factors,
        np.array([means["Earth"], means["BE"], means["Antwerp"]]) @ shrinkage_factors,
        np.array([means["Earth"], means["BE"], means["Brussels"]]) @ shrinkage_factors,
    ]

    # The weights are decimal fractions combined through a dot product, so
    # compare with a tolerance instead of exact float equality. The sequence
    # comparison still checks the length.
    assert shrink_est.predict(X).tolist() == pytest.approx(expected_prediction)
def test_missing_check():
    """Fit with a NaN in ``chick``: must work with ``check_X=False``, is expected to raise otherwise."""
    df = load_chicken(as_frame=True)

    X, y = df.drop(columns='weight'), df['weight']
    # create missing value
    X.loc[0, 'chick'] = np.nan
    # The wrapped pipeline imputes before regressing.
    model = make_pipeline(SimpleImputer(), LinearRegression())

    # Should not raise error, check is disabled
    m = GroupedPredictor(model, groups=['diet'], check_X=False).fit(X, y)
    m.predict(X)

    # Should raise error, check is still enabled
    with pytest.raises(ValueError) as e:
        GroupedPredictor(model, groups=['diet']).fit(X, y)
        # NOTE(review): this assert follows the call that is expected to raise,
        # so it is never reached here. To really check the message, move it
        # after the `with` block and inspect `e.value` -- confirm the actual
        # message text first.
        assert "contains NaN" in str(e)
def test_relative_shrinkage(shrinkage_data):
    """Check relative shrinkage over the Planet/Country/City hierarchy.

    The expected weights for planet, country and city are 4/7, 2/7 and 1/7.
    """
    df, means = shrinkage_data
    X, y = df.drop(columns="Target"), df["Target"]

    shrink_est = GroupedPredictor(
        DummyRegressor(),
        ["Planet", "Country", "City"],
        shrinkage="relative",
        use_global_model=False,
    )

    shrinkage_factors = np.array([4, 2, 1]) / 7

    shrink_est.fit(X, y)

    expected_prediction = [
        np.array([means["Earth"], means["NL"], means["Amsterdam"]]) @ shrinkage_factors,
        np.array([means["Earth"], means["NL"], means["Rotterdam"]]) @ shrinkage_factors,
        np.array([means["Earth"], means["BE"], means["Antwerp"]]) @ shrinkage_factors,
        np.array([means["Earth"], means["BE"], means["Brussels"]]) @ shrinkage_factors,
    ]

    # Compare the whole sequence at once: a zip() over both lists would stop
    # at the shorter one, so a prediction vector of the wrong length would go
    # unnoticed. pytest.approx on a list also checks the length.
    assert shrink_est.predict(X).tolist() == pytest.approx(expected_prediction)
def test_predict_missing_group_column(shrinkage_data):
    """Expect ``ValueError`` when ``Country`` is dropped from the prediction frame."""
    df, means = shrinkage_data

    X, y = df.drop(columns="Target"), df["Target"]

    shrink_est = GroupedPredictor(
        DummyRegressor(),
        ["Planet", "Country", "City"],
        shrinkage="constant",
        use_global_model=False,
        alpha=0.1,
    )

    shrink_est.fit(X, y)

    with pytest.raises(ValueError) as e:
        # "Country" is one of the group columns the estimator was built with.
        shrink_est.predict(X.drop(columns=["Country"]))
        # NOTE(review): this assert follows the call that is expected to raise,
        # so it is never reached here. To really check the message, move it
        # after the `with` block and inspect `e.value` -- confirm the actual
        # message text first.
        assert "group columns" in str(e)
def test_unseen_groups_shrinkage(shrinkage_data):
    """Expect ``ValueError`` when predicting for a Country/City pair not used in fitting."""
    df, means = shrinkage_data

    X, y = df.drop(columns="Target"), df["Target"]

    shrink_est = GroupedPredictor(
        DummyRegressor(), ["Planet", "Country", "City"], shrinkage="constant", alpha=0.1
    )

    shrink_est.fit(X, y)

    # "DE"/"Hamburg" do not appear among the names the other tests in this
    # file look up -- presumably they are absent from the fixture data.
    unseen_group = pd.DataFrame(
        {"Planet": ["Earth"], "Country": ["DE"], "City": ["Hamburg"]}
    )

    with pytest.raises(ValueError) as e:
        # The single unseen row is repeated four times.
        shrink_est.predict(X=pd.concat([unseen_group] * 4, axis=0))
        # NOTE(review): this assert follows the call that is expected to raise,
        # so it is never reached here. To really check the message, move it
        # after the `with` block and inspect `e.value` -- confirm the actual
        # message text first.
        assert "found a group" in str(e)
def test_predict_missing_value_column(shrinkage_data):
    """Expect ``ValueError`` when the only value column is dropped before ``predict``."""
    df, means = shrinkage_data

    value_column = "predictor"

    X, y = df.drop(columns="Target"), df["Target"]
    # Add one random numeric column; its values are not checked by this test.
    X = X.assign(**{value_column: np.random.normal(size=X.shape[0])})

    shrink_est = GroupedPredictor(
        LinearRegression(),
        ["Planet", "Country", "City"],
        shrinkage="constant",
        use_global_model=False,
        alpha=0.1,
    )

    shrink_est.fit(X, y)

    with pytest.raises(ValueError) as e:
        shrink_est.predict(X.drop(columns=[value_column]))
        # NOTE(review): this assert follows the call that is expected to raise,
        # so it is never reached here. To really check the message, move it
        # after the `with` block and inspect `e.value` -- confirm the actual
        # message text first.
        assert "columns to use" in str(e)
def test_min_n_obs_shrinkage(shrinkage_data):
    """Check ``min_n_obs`` shrinkage over the Planet/Country/City hierarchy.

    With ``min_n_obs=2`` the expected predictions are the country-level
    entries of ``means`` (NL, NL, BE, BE).
    """
    df, means = shrinkage_data
    X, y = df.drop(columns="Target"), df["Target"]

    shrink_est = GroupedPredictor(
        DummyRegressor(),
        ["Planet", "Country", "City"],
        shrinkage="min_n_obs",
        use_global_model=False,
        min_n_obs=2,
    )

    shrink_est.fit(X, y)

    expected_prediction = [means["NL"], means["NL"], means["BE"], means["BE"]]

    # Fixture and estimator may compute their means differently, so compare
    # with a tolerance (as test_relative_shrinkage does) rather than relying
    # on bit-identical floats. The sequence comparison still checks the length.
    assert shrink_est.predict(X).tolist() == pytest.approx(expected_prediction)