def test_target_imputer_woodwork_custom_overrides_returned_by_components(
        y_pd, has_nan, impute_strategy):
    """Check the logical type of the imputed target for several overrides.

    For each overridden logical type that woodwork accepts for the target,
    the imputer output must be a ``ww.DataColumn``. Its logical type must
    equal the override when the "most_frequent" strategy was used or when no
    NaN was injected; otherwise it must be ``Double``.
    """
    target = y_pd.copy()
    if has_nan:
        # Overwrite the entry labelled len(y_pd) - 1 with a missing value.
        target[len(y_pd) - 1] = np.nan

    for logical_type in (Integer, Double, Categorical, Boolean):
        try:
            y = ww.DataColumn(target.copy(), logical_type=logical_type)
        except TypeError:
            # This override cannot be applied to the data; skip it.
            continue

        # These logical types are always imputed with "most_frequent",
        # regardless of the parametrized strategy.
        strategy = (
            "most_frequent"
            if logical_type in (Categorical, NaturalLanguage)
            else impute_strategy
        )

        imputer = TargetImputer(impute_strategy=strategy)
        imputer.fit(None, y)
        _, y_t = imputer.transform(None, y)
        assert isinstance(y_t, ww.DataColumn)

        keeps_override = strategy == "most_frequent" or not has_nan
        expected_type = logical_type if keeps_override else Double
        assert y_t.logical_type == expected_type
def test_target_imputer_all_bool_return_original(data_type, make_data_type):
    """A bool target with no missing values comes back with the same values.

    The expected output series uses the nullable 'boolean' dtype.
    """
    values = [True, True, False, True, True]
    y = make_data_type(data_type, pd.Series(values, dtype=bool))
    y_expected = pd.Series(values, dtype='boolean')

    imputer = TargetImputer()
    imputer.fit(None, y)
    _, y_t = imputer.transform(None, y)

    assert_series_equal(y_expected, y_t.to_series())
def test_target_imputer_no_y(X_y_binary):
    """With no X and no y, fit_transform and fit/transform return (None, None)."""
    # The fixture is unpacked but its contents are not used by this test.
    _X, _y = X_y_binary

    assert TargetImputer().fit_transform(None, None) == (None, None)

    # A fresh instance for the separate fit -> transform path.
    fitted = TargetImputer()
    fitted.fit(None, None)
    assert fitted.transform(None, None) == (None, None)
def test_target_imputer_boolean_dtype(data_type, make_data_type):
    """Missing entries in a nullable-boolean target are imputed as True.

    The input holds three non-missing values, two of which are True. The
    test expects the default-configured imputer to fill both gaps with True.
    """
    raw = pd.Series([True, np.nan, False, np.nan, True], dtype='boolean')
    y_expected = pd.Series([True, True, False, True, True], dtype='boolean')
    y = make_data_type(data_type, raw)

    imputer = TargetImputer()
    imputer.fit(None, y)
    _, y_t = imputer.transform(None, y)

    assert_series_equal(y_expected, y_t.to_series())
def test_target_imputer_fit_transform_all_nan_empty():
    """An all-NaN target makes transform and fit_transform raise RuntimeError."""
    y = pd.Series([np.nan, np.nan])
    empty_msg = "Transformed data is empty"

    # Path 1: fit succeeds, the subsequent transform raises.
    fitted = TargetImputer()
    fitted.fit(None, y)
    with pytest.raises(RuntimeError, match=empty_msg):
        fitted.transform(None, y)

    # Path 2: fit_transform on a fresh instance raises the same error.
    fresh = TargetImputer()
    with pytest.raises(RuntimeError, match=empty_msg):
        fresh.fit_transform(None, y)
def test_target_imputer_col_with_non_numeric_with_numeric_strategy():
    """The mean and median strategies raise ValueError on a string target.

    For each strategy, both ``fit_transform`` and ``fit`` are called on the
    same imputer instance and must raise with the strategy-specific message.
    """
    y = pd.Series([np.nan, "a", "b"])
    cases = (
        ('mean', "Cannot use mean strategy with non-numeric data"),
        ('median', "Cannot use median strategy with non-numeric data"),
    )

    for strategy, message in cases:
        imputer = TargetImputer(impute_strategy=strategy)
        with pytest.raises(ValueError, match=message):
            imputer.fit_transform(None, y)
        with pytest.raises(ValueError, match=message):
            imputer.fit(None, y)
def test_target_imputer_does_not_reset_index():
    """The imputed target keeps the input's index labels (1..9).

    The input is built with one NaN, then row 0 is dropped so the index no
    longer starts at zero. After mean imputation the output must carry that
    same index, with the NaN replaced by 5.0.
    """
    shifted_index = list(range(1, 10))

    y = pd.Series(np.arange(10))
    y[5] = np.nan
    assert y.index.tolist() == list(range(10))

    # Drop the first row so the index no longer starts at zero.
    y.drop(0, inplace=True)
    expected_input = pd.Series(
        [1, 2, 3, 4, np.nan, 6, 7, 8, 9], dtype=float, index=shifted_index)
    pd.testing.assert_series_equal(expected_input, y)

    imputer = TargetImputer(impute_strategy="mean")
    imputer.fit(None, y=y)
    _, y_t = imputer.transform(None, y)

    # The mean of the remaining values (1-4, 6-9) is 5.0.
    expected_output = pd.Series(
        [1.0, 2, 3, 4, 5, 6, 7, 8, 9], dtype=float, index=shifted_index)
    pd.testing.assert_series_equal(expected_output, y_t.to_series())