Esempio n. 1
0
def test_target_imputer_numpy_input():
    """Mean-impute a NumPy target and verify the input array is not modified."""
    raw_target = np.array([np.nan, 0, 2])
    expected = np.array([1, 0, 2])
    mean_imputer = TargetImputer(impute_strategy='mean')
    _, imputed = mean_imputer.fit_transform(None, raw_target)
    assert np.allclose(expected, imputed.to_series())
    # The array handed to the imputer must still contain its original NaN.
    np.testing.assert_almost_equal(raw_target, np.array([np.nan, 0, 2]))
Esempio n. 2
0
def test_target_imputer_woodwork_custom_overrides_returned_by_components(
        y_pd, has_nan, impute_strategy):
    """Check which Woodwork logical type the imputer returns for each override.

    For every logical type that can be applied to the target, the imputed
    output must be a ``ww.DataColumn``. Its logical type is expected to match
    the override when the "most_frequent" strategy is used or when no NaN was
    injected; otherwise it is expected to be ``Double``.
    """
    y_to_use = y_pd.copy()
    if has_nan:
        y_to_use[len(y_pd) - 1] = np.nan

    for logical_type in [Integer, Double, Categorical, Boolean]:
        try:
            y = ww.DataColumn(y_to_use.copy(), logical_type=logical_type)
        except TypeError:
            # This logical type cannot be applied to the data; skip it.
            continue

        # Categorical-like targets are always imputed with "most_frequent".
        strategy = ("most_frequent"
                    if logical_type in [Categorical, NaturalLanguage]
                    else impute_strategy)

        imputer = TargetImputer(impute_strategy=strategy)
        imputer.fit(None, y)
        _, y_t = imputer.transform(None, y)
        assert isinstance(y_t, ww.DataColumn)

        type_is_kept = strategy == "most_frequent" or not has_nan
        expected_type = logical_type if type_is_kept else Double
        assert y_t.logical_type == expected_type
Esempio n. 3
0
def test_target_imputer_all_bool_return_original(data_type, make_data_type):
    """A fully populated bool target keeps its values and comes back as 'boolean' dtype."""
    bool_target = make_data_type(
        data_type,
        pd.Series([True, True, False, True, True], dtype=bool))
    expected = pd.Series([True, True, False, True, True], dtype='boolean')

    default_imputer = TargetImputer()
    default_imputer.fit(None, bool_target)
    _, imputed = default_imputer.transform(None, bool_target)

    assert_series_equal(expected, imputed.to_series())
Esempio n. 4
0
def test_target_imputer_boolean_dtype(data_type, make_data_type):
    """Missing entries of a nullable-boolean target are filled with True by the default imputer."""
    expected = pd.Series([True, True, False, True, True], dtype='boolean')
    nullable_target = pd.Series([True, np.nan, False, np.nan, True],
                                dtype='boolean')
    nullable_target = make_data_type(data_type, nullable_target)

    default_imputer = TargetImputer()
    default_imputer.fit(None, nullable_target)
    _, imputed = default_imputer.transform(None, nullable_target)

    assert_series_equal(expected, imputed.to_series())
Esempio n. 5
0
def test_target_imputer_with_X():
    """Median-impute the target while the feature frame, NaN included, passes through unchanged."""
    features = pd.DataFrame({"some col": [1, 3, np.nan]})
    target = pd.Series([np.nan, 1, 3])
    expected_features = pd.DataFrame({"some col": [1, 3, np.nan]})
    expected_target = pd.Series([2, 1, 3])

    median_imputer = TargetImputer(impute_strategy='median')
    features_t, target_t = median_imputer.fit_transform(features, target)

    assert_series_equal(expected_target, target_t.to_series(),
                        check_dtype=False)
    assert_frame_equal(expected_features, features_t.to_dataframe(),
                       check_dtype=False)
Esempio n. 6
0
def test_target_imputer_fit_transform_all_nan_empty():
    """An all-NaN target raises RuntimeError from both transform and fit_transform."""
    all_nan_target = pd.Series([np.nan, np.nan])

    # Path 1: fit succeeds, the subsequent transform raises.
    fitted_imputer = TargetImputer()
    fitted_imputer.fit(None, all_nan_target)
    with pytest.raises(RuntimeError, match="Transformed data is empty"):
        fitted_imputer.transform(None, all_nan_target)

    # Path 2: a fresh imputer raises from the combined fit_transform call.
    fresh_imputer = TargetImputer()
    with pytest.raises(RuntimeError, match="Transformed data is empty"):
        fresh_imputer.fit_transform(None, all_nan_target)
Esempio n. 7
0
def test_target_imputer_no_y(X_y_binary):
    """Check that a missing target is passed through untouched.

    With both X and y set to None, ``fit_transform`` and a separate ``fit``
    followed by ``transform`` are each expected to return ``(None, None)``.

    Arguments:
        X_y_binary: Kept in the signature for backward compatibility
            (presumably a pytest fixture); its data is not needed here, so it
            is no longer unpacked into unused locals.
    """
    imputer = TargetImputer()
    assert imputer.fit_transform(None, None) == (None, None)

    # Use a fresh instance so the fit/transform path is checked independently.
    imputer = TargetImputer()
    imputer.fit(None, None)
    assert imputer.transform(None, None) == (None, None)
def test_make_component_list_from_actions():
    """Check the components built for empty, drop-only, and drop-plus-impute action lists."""
    # No actions produce no components.
    assert _make_component_list_from_actions([]) == []

    # A single DROP_COL action maps to a single DropColumns component.
    drop_only = [
        DataCheckAction(DataCheckActionCode.DROP_COL,
                        {"columns": ['some col']})
    ]
    built = _make_component_list_from_actions(drop_only)
    assert built == [DropColumns(columns=['some col'])]

    # DROP_COL followed by a target IMPUTE_COL keeps the input order.
    drop_action = DataCheckAction(DataCheckActionCode.DROP_COL,
                                  metadata={"columns": ['some col']})
    impute_target_action = DataCheckAction(
        DataCheckActionCode.IMPUTE_COL,
        metadata={
            "column": None,
            "is_target": True,
            "impute_strategy": "most_frequent"
        })
    built = _make_component_list_from_actions(
        [drop_action, impute_target_action])
    assert built == [
        DropColumns(columns=['some col']),
        TargetImputer(impute_strategy="most_frequent")
    ]
Esempio n. 9
0
def test_target_imputer_does_not_reset_index():
    """Mean imputation must keep the target's non-default index (1..9) intact."""
    target = pd.Series(np.arange(10))
    target[5] = np.nan
    assert target.index.tolist() == list(range(10))

    # Remove the first row so the index no longer starts at zero.
    target = target.drop(0)
    shifted_index = list(range(1, 10))
    before_imputation = pd.Series([1, 2, 3, 4, np.nan, 6, 7, 8, 9],
                                  dtype=float,
                                  index=shifted_index)
    pd.testing.assert_series_equal(before_imputation, target)

    mean_imputer = TargetImputer(impute_strategy="mean")
    mean_imputer.fit(None, y=target)
    _, imputed = mean_imputer.transform(None, target)

    after_imputation = pd.Series([1.0, 2, 3, 4, 5, 6, 7, 8, 9],
                                 dtype=float,
                                 index=shifted_index)
    pd.testing.assert_series_equal(after_imputation, imputed.to_series())
Esempio n. 10
0
def test_target_imputer_most_frequent():
    """The 'most_frequent' strategy fills NaNs for both string and numeric targets."""
    # String target: the missing entry is expected to become "a".
    text_target = pd.Series([np.nan, "a", "b"])
    text_expected = pd.Series(["a", "a", "b"]).astype("category")
    text_imputer = TargetImputer(impute_strategy='most_frequent')
    _, text_imputed = text_imputer.fit_transform(None, text_target)
    assert_series_equal(text_expected, text_imputed.to_series(),
                        check_dtype=False)

    # Numeric target: 1 occurs most often and replaces the NaN.
    numeric_target = pd.Series([np.nan, 1, 1, 2])
    numeric_expected = pd.Series([1, 1, 1, 2])
    numeric_imputer = TargetImputer(impute_strategy='most_frequent')
    _, numeric_imputed = numeric_imputer.fit_transform(None, numeric_target)
    assert_series_equal(numeric_expected, numeric_imputed.to_series(),
                        check_dtype=False)
Esempio n. 11
0
def _make_component_list_from_actions(actions):
    """
    Creates a list of components from the input DataCheckAction list

    Arguments:
        actions (list(DataCheckAction)): List of DataCheckAction objects used to create list of components

    Returns:
        List of components used to address the input actions
    """
    components = []
    for action in actions:
        if action.action_code == DataCheckActionCode.DROP_COL:
            components.append(DropColumns(columns=action.metadata["columns"]))
        if action.action_code == DataCheckActionCode.IMPUTE_COL:
            metadata = action.metadata
            if metadata["is_target"]:
                components.append(TargetImputer(impute_strategy=metadata["impute_strategy"]))
    return components
Esempio n. 12
0
def test_target_imputer_col_with_non_numeric_with_numeric_strategy():
    """Numeric strategies ('mean', 'median') must reject a string target with ValueError."""
    y = pd.Series([np.nan, "a", "b"])
    cases = (
        ('mean', "Cannot use mean strategy with non-numeric data"),
        ('median', "Cannot use median strategy with non-numeric data"),
    )
    for strategy, message in cases:
        imputer = TargetImputer(impute_strategy=strategy)
        # Both entry points are expected to raise: fit_transform first, then fit.
        with pytest.raises(ValueError, match=message):
            imputer.fit_transform(None, y)
        with pytest.raises(ValueError, match=message):
            imputer.fit(None, y)
Esempio n. 13
0
def test_target_imputer_constant(fill_value, y, y_expected):
    """The 'constant' strategy with ``fill_value`` must turn ``y`` into ``y_expected``.

    The three arguments are presumably supplied by a parametrization that is
    not visible in this snippet. Dtypes are not compared.
    """
    constant_imputer = TargetImputer(impute_strategy='constant',
                                     fill_value=fill_value)
    _, imputed = constant_imputer.fit_transform(None, y)
    assert_series_equal(y_expected, imputed.to_series(), check_dtype=False)
Esempio n. 14
0
def test_target_imputer_mean():
    """The 'mean' strategy replaces the NaN in [NaN, 2, 0] with 1."""
    target = pd.Series([np.nan, 2, 0])
    expected = pd.Series([1, 2, 0])
    mean_imputer = TargetImputer(impute_strategy='mean')
    _, imputed = mean_imputer.fit_transform(None, target)
    assert_series_equal(expected, imputed.to_series(), check_dtype=False)
Esempio n. 15
0
def test_target_imputer_with_none_non_numeric(y, y_expected):
    """A default-configured imputer must turn ``y`` into ``y_expected``.

    Both arguments are presumably supplied by a parametrization that is not
    visible in this snippet. Dtypes are not compared.
    """
    default_imputer = TargetImputer()
    _, imputed = default_imputer.fit_transform(None, y)
    assert_series_equal(y_expected, imputed.to_series(), check_dtype=False)
Esempio n. 16
0
def test_target_imputer_with_none(y, y_expected):
    """A mean-strategy imputer must turn ``y`` into ``y_expected``.

    Both arguments are presumably supplied by a parametrization that is not
    visible in this snippet. Dtypes are not compared.
    """
    mean_imputer = TargetImputer(impute_strategy="mean")
    _, imputed = mean_imputer.fit_transform(None, y)
    assert_series_equal(y_expected, imputed.to_series(), check_dtype=False)