def test_target_imputer_numpy_input():
    """Mean imputation accepts a numpy target and does not modify it in place."""
    target = np.array([np.nan, 0, 2])
    expected = np.array([1, 0, 2])

    mean_imputer = TargetImputer(impute_strategy='mean')
    _, imputed = mean_imputer.fit_transform(None, target)

    assert np.allclose(expected, imputed.to_series())
    # The caller's array must still hold its original NaN afterwards.
    np.testing.assert_almost_equal(target, np.array([np.nan, 0, 2]))
def test_target_imputer_woodwork_custom_overrides_returned_by_components(
        y_pd, has_nan, impute_strategy):
    """Check the logical type of the imputed target for each overridden Woodwork type."""
    target = y_pd.copy()
    if has_nan:
        target[len(y_pd) - 1] = np.nan

    for logical_type in (Integer, Double, Categorical, Boolean):
        try:
            y = ww.DataColumn(target.copy(), logical_type=logical_type)
        except TypeError:
            # The data cannot be given this logical type; nothing to check.
            continue

        # Categorical / natural-language targets are always imputed with the mode.
        strategy = (
            "most_frequent"
            if logical_type in [Categorical, NaturalLanguage]
            else impute_strategy
        )
        imputer = TargetImputer(impute_strategy=strategy)
        imputer.fit(None, y)
        _, y_t = imputer.transform(None, y)

        assert isinstance(y_t, ww.DataColumn)
        # The override is expected to survive unless a non-mode strategy
        # actually had NaNs to fill, in which case Double is expected.
        keeps_override = strategy == "most_frequent" or not has_nan
        expected_type = logical_type if keeps_override else Double
        assert y_t.logical_type == expected_type
def test_target_imputer_all_bool_return_original(data_type, make_data_type):
    """An all-boolean target with no missing values comes back with the same values."""
    values = [True, True, False, True, True]
    y = make_data_type(data_type, pd.Series(values, dtype=bool))
    # The expected output uses the nullable 'boolean' dtype.
    expected = pd.Series(values, dtype='boolean')

    default_imputer = TargetImputer()
    default_imputer.fit(None, y)
    _, imputed = default_imputer.transform(None, y)

    assert_series_equal(expected, imputed.to_series())
def test_target_imputer_boolean_dtype(data_type, make_data_type):
    """Missing entries in a nullable-boolean target are filled by the default imputer."""
    raw_target = pd.Series([True, np.nan, False, np.nan, True], dtype='boolean')
    expected = pd.Series([True, True, False, True, True], dtype='boolean')
    y = make_data_type(data_type, raw_target)

    default_imputer = TargetImputer()
    default_imputer.fit(None, y)
    _, imputed = default_imputer.transform(None, y)

    assert_series_equal(expected, imputed.to_series())
def test_target_imputer_with_X():
    """Median imputation fills the target while X keeps its NaN."""
    features = pd.DataFrame({"some col": [1, 3, np.nan]})
    target = pd.Series([np.nan, 1, 3])
    expected_target = pd.Series([2, 1, 3])
    expected_features = pd.DataFrame({"some col": [1, 3, np.nan]})

    median_imputer = TargetImputer(impute_strategy='median')
    features_t, target_t = median_imputer.fit_transform(features, target)

    assert_series_equal(expected_target, target_t.to_series(), check_dtype=False)
    # X is expected to come back unchanged, NaN included.
    assert_frame_equal(expected_features, features_t.to_dataframe(), check_dtype=False)
def test_target_imputer_fit_transform_all_nan_empty():
    """An all-NaN target raises RuntimeError from both transform and fit_transform."""
    all_nan_target = pd.Series([np.nan, np.nan])

    # Separate fit followed by transform.
    fitted_imputer = TargetImputer()
    fitted_imputer.fit(None, all_nan_target)
    with pytest.raises(RuntimeError, match="Transformed data is empty"):
        fitted_imputer.transform(None, all_nan_target)

    # Combined fit_transform on a fresh imputer.
    fresh_imputer = TargetImputer()
    with pytest.raises(RuntimeError, match="Transformed data is empty"):
        fresh_imputer.fit_transform(None, all_nan_target)
def test_target_imputer_no_y(X_y_binary):
    """With no X and no y, both fit_transform and fit+transform return (None, None)."""
    # The fixture is unpacked but its contents are not used by this test.
    _X, _y = X_y_binary
    nothing = (None, None)

    one_step = TargetImputer()
    assert one_step.fit_transform(None, None) == nothing

    two_step = TargetImputer()
    two_step.fit(None, None)
    assert two_step.transform(None, None) == nothing
def test_make_component_list_from_actions():
    """Data check actions are translated, in order, into the matching components."""
    # No actions -> no components.
    assert _make_component_list_from_actions([]) == []

    # A single drop-column action.
    drop_only = [
        DataCheckAction(DataCheckActionCode.DROP_COL, {"columns": ['some col']})
    ]
    drop_only_components = _make_component_list_from_actions(drop_only)
    assert drop_only_components == [DropColumns(columns=['some col'])]

    # A drop-column action followed by a target-imputation action.
    drop_action = DataCheckAction(DataCheckActionCode.DROP_COL,
                                  metadata={"columns": ['some col']})
    impute_target_action = DataCheckAction(
        DataCheckActionCode.IMPUTE_COL,
        metadata={
            "column": None,
            "is_target": True,
            "impute_strategy": "most_frequent"
        })
    mixed_components = _make_component_list_from_actions(
        [drop_action, impute_target_action])
    assert mixed_components == [
        DropColumns(columns=['some col']),
        TargetImputer(impute_strategy="most_frequent")
    ]
def test_target_imputer_does_not_reset_index():
    """The imputed target keeps the input's index instead of renumbering from zero."""
    target = pd.Series(np.arange(10))
    target[5] = np.nan
    assert target.index.tolist() == list(range(10))

    # Drop the first row so the index starts at 1 rather than 0.
    target.drop(0, inplace=True)
    shifted_index = list(range(1, 10))
    pd.testing.assert_series_equal(
        pd.Series([1, 2, 3, 4, np.nan, 6, 7, 8, 9], dtype=float, index=shifted_index),
        target)

    mean_imputer = TargetImputer(impute_strategy="mean")
    mean_imputer.fit(None, y=target)
    _, imputed = mean_imputer.transform(None, target)

    pd.testing.assert_series_equal(
        pd.Series([1.0, 2, 3, 4, 5, 6, 7, 8, 9], dtype=float, index=shifted_index),
        imputed.to_series())
def test_target_imputer_most_frequent():
    """The most_frequent strategy is checked on a string target and a numeric target."""
    # String target: expected output is compared as a categorical series.
    text_target = pd.Series([np.nan, "a", "b"])
    text_expected = pd.Series(["a", "a", "b"]).astype("category")
    text_imputer = TargetImputer(impute_strategy='most_frequent')
    _, text_imputed = text_imputer.fit_transform(None, text_target)
    assert_series_equal(text_expected, text_imputed.to_series(), check_dtype=False)

    # Numeric target: the repeated value 1 fills the gap.
    numeric_target = pd.Series([np.nan, 1, 1, 2])
    numeric_expected = pd.Series([1, 1, 1, 2])
    numeric_imputer = TargetImputer(impute_strategy='most_frequent')
    _, numeric_imputed = numeric_imputer.fit_transform(None, numeric_target)
    assert_series_equal(numeric_expected, numeric_imputed.to_series(), check_dtype=False)
def _make_component_list_from_actions(actions): """ Creates a list of components from the input DataCheckAction list Arguments: actions (list(DataCheckAction)): List of DataCheckAction objects used to create list of components Returns: List of components used to address the input actions """ components = [] for action in actions: if action.action_code == DataCheckActionCode.DROP_COL: components.append(DropColumns(columns=action.metadata["columns"])) if action.action_code == DataCheckActionCode.IMPUTE_COL: metadata = action.metadata if metadata["is_target"]: components.append(TargetImputer(impute_strategy=metadata["impute_strategy"])) return components
def test_target_imputer_col_with_non_numeric_with_numeric_strategy():
    """Mean and median strategies raise ValueError on a non-numeric target."""
    y = pd.Series([np.nan, "a", "b"])

    # (strategy, expected error message) pairs, checked in this order.
    cases = [
        ('mean', "Cannot use mean strategy with non-numeric data"),
        ('median', "Cannot use median strategy with non-numeric data"),
    ]
    for strategy, message in cases:
        imputer = TargetImputer(impute_strategy=strategy)
        # Both entry points must raise: fit_transform first, then plain fit.
        with pytest.raises(ValueError, match=message):
            imputer.fit_transform(None, y)
        with pytest.raises(ValueError, match=message):
            imputer.fit(None, y)
def test_target_imputer_constant(fill_value, y, y_expected):
    """The constant strategy with the given fill_value turns y into y_expected.

    NOTE(review): the arguments are presumably supplied by a pytest
    parametrization that is not visible in this chunk.
    """
    constant_imputer = TargetImputer(impute_strategy='constant', fill_value=fill_value)
    _, imputed = constant_imputer.fit_transform(None, y)
    # Only values and index are compared; dtype differences are tolerated.
    assert_series_equal(y_expected, imputed.to_series(), check_dtype=False)
def test_target_imputer_mean():
    """A missing value is replaced by the mean of the observed target values."""
    target = pd.Series([np.nan, 2, 0])
    expected = pd.Series([1, 2, 0])  # mean of 2 and 0 fills the NaN

    mean_imputer = TargetImputer(impute_strategy='mean')
    _, imputed = mean_imputer.fit_transform(None, target)

    assert_series_equal(expected, imputed.to_series(), check_dtype=False)
def test_target_imputer_with_none_non_numeric(y, y_expected):
    """The default imputer turns the given y into y_expected.

    NOTE(review): y / y_expected are presumably supplied by a pytest
    parametrization that is not visible in this chunk.
    """
    default_imputer = TargetImputer()
    _, imputed = default_imputer.fit_transform(None, y)
    # Only values and index are compared; dtype differences are tolerated.
    assert_series_equal(y_expected, imputed.to_series(), check_dtype=False)
def test_target_imputer_with_none(y, y_expected):
    """The mean-strategy imputer turns the given y into y_expected.

    NOTE(review): y / y_expected are presumably supplied by a pytest
    parametrization that is not visible in this chunk.
    """
    mean_imputer = TargetImputer(impute_strategy="mean")
    _, imputed = mean_imputer.fit_transform(None, y)
    # Only values and index are compared; dtype differences are tolerated.
    assert_series_equal(y_expected, imputed.to_series(), check_dtype=False)