# Module-level imports assumed from the surrounding file. The `test_df` import
# path is an assumption: it is the shared fixture builder used by every check
# below, returning a dataframe X and a target y.
import numpy as np
import pandas as pd
import pytest

from sklearn.base import clone
from sklearn.exceptions import NotFittedError
from sklearn.pipeline import Pipeline

from feature_engine.discretisation import (
    EqualFrequencyDiscretiser,
    EqualWidthDiscretiser,
)
from feature_engine.encoding import MeanEncoder

from tests.estimator_checks.dataframe_for_checks import test_df


def check_raises_error_when_input_not_a_df(estimator):
    """
    Checks that the transformer raises a TypeError when the user passes an
    input other than a pandas dataframe or a numpy array to the fit() or
    transform() methods.

    The functionality is provided by `is_dataframe`.
    """
    # non-permitted inputs.
    _not_a_df = [
        "not_a_df",
        [1, 2, 3, "some_data"],
        pd.Series([-2, 1.5, 8.94], name="not_a_df"),
    ]

    # permitted input
    X, y = test_df(categorical=True, datetime=True)

    transformer = clone(estimator)

    for not_df in _not_a_df:
        # test fitting not a df
        with pytest.raises(TypeError):
            transformer.fit(not_df)

        transformer.fit(X, y)
        # test transforming not a df
        with pytest.raises(TypeError):
            transformer.transform(not_df)
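

# Usage sketch: checks like the one above are meant to be wired up through
# pytest parametrization. `MeanEncoder` is used purely as an illustrative
# estimator here; any feature_engine-style transformer would do.
@pytest.mark.parametrize("_estimator", [MeanEncoder()])
def test_input_validation_sketch(_estimator):
    check_raises_error_when_input_not_a_df(_estimator)
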
def test_raises_error_when_input_not_a_df(estimator):
    # non-permitted inputs.
    _not_a_df = [
        "not_a_df",
        [1, 2, 3, "some_data"],
        pd.Series([-2, 1.5, 8.94], name="not_a_df"),
    ]

    transformer = clone(estimator)

    # Error in fit param:
    for not_df in _not_a_df:
        # test fitting not a df
        with pytest.raises(TypeError):
            transformer.fit(not_df)

    # error in transform param:
    X, y = test_df(categorical=True, datetime=True)

    if transformer.__class__.__name__ == "TargetMeanRegressor":
        # y needs to be continuous
        y = X["var_1"]
        X.drop(["var_1"], axis=1, inplace=True)

    transformer.fit(X, y)
    for not_df in _not_a_df:
        if hasattr(transformer, "_predict"):
            with pytest.raises(TypeError):
                transformer._predict(not_df)
        else:
            with pytest.raises(TypeError):
                transformer.predict(not_df)
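

# Sketch of the kind of validation the two checks above exercise. This is an
# assumption about `is_dataframe`'s behaviour, not feature_engine's actual
# implementation: accept dataframes, promote 2d numpy arrays, reject the rest.
def _is_dataframe_sketch(X):
    if isinstance(X, pd.DataFrame):
        return X.copy()
    if isinstance(X, np.ndarray) and X.ndim == 2:
        return pd.DataFrame(X)
    raise TypeError(f"X must be a pandas dataframe or numpy array, got {type(X)}.")
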
def test_raises_error_when_df_has_nan(df_enc, df_na, estimator):

    transformer = clone(estimator)

    X, y = test_df(categorical=True)
    X_na = X.copy()

    X_na.loc[0, "var_1"] = np.nan

    if transformer.__class__.__name__ != "TargetMeanRegressor":
        # Raise error when dataset contains na, fit method
        with pytest.raises(ValueError):
            transformer.fit(X_na, y)

        transformer.fit(X, y)
        if hasattr(transformer, "_predict"):
            with pytest.raises(ValueError):
                transformer._predict(X_na)
        else:
            with pytest.raises(ValueError):
                transformer.predict(X_na)
            with pytest.raises(ValueError):
                transformer.predict_proba(X_na)
            with pytest.raises(ValueError):
                transformer.predict_log_proba(X_na)

    else:
        y = X["var_10"]
        with pytest.raises(ValueError):
            transformer.fit(X_na, y)

        transformer.fit(X, y)
        with pytest.raises(ValueError):
            transformer.predict(X_na)
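

# Sketch of the NaN guard that makes the test above pass. This is an
# assumption about the validation step, mirrored from the ValueError the
# test expects rather than taken from the library's code.
def _check_contains_na_sketch(X, variables):
    if X[variables].isnull().any().any():
        raise ValueError("Some of the variables in the dataset contain NaN.")
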
def test_attributes_upon_fitting(_strategy, _bins, estimator):
    transformer = clone(estimator)
    transformer.set_params(bins=_bins, strategy=_strategy)

    X, y = test_df(categorical=True)

    if transformer.__class__.__name__ == "TargetMeanRegressor":
        # y needs to be continuous
        y = X["var_10"]

    transformer.fit(X, y)

    assert transformer.bins == _bins
    assert transformer.strategy == _strategy

    if _strategy == "equal_width":
        assert (type(transformer._pipeline.named_steps["discretiser"]) ==
                EqualWidthDiscretiser)
    else:
        assert (type(transformer._pipeline.named_steps["discretiser"]) ==
                EqualFrequencyDiscretiser)

    assert type(
        transformer._pipeline.named_steps["encoder_num"]) == MeanEncoder

    assert type(
        transformer._pipeline.named_steps["encoder_cat"]) == MeanEncoder
def check_error_if_y_not_passed(estimator):
    """
    Checks that the transformer raises a TypeError when y is not passed to
    fit(). The functionality is provided by Python itself, by making the
    parameter mandatory.

    For this test to run, we need to add the tag 'requires_y' to the transformer.
    """
    X, y = test_df()
    estimator = clone(estimator)
    with pytest.raises(TypeError):
        estimator.fit(X)
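

# Why no explicit validation is needed: declaring `y` without a default makes
# Python itself raise the TypeError. A minimal sketch:
class _RequiresYSketch:
    def fit(self, X, y):  # y is mandatory: no default value
        return self


def _demo_requires_y():
    with pytest.raises(TypeError):
        _RequiresYSketch().fit([[1], [2]])
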
def check_raises_non_fitted_error(estimator):
    """
    Checks that the transformer raises a NotFittedError when transform() is
    called before fit().

    The functionality is provided by sklearn's `check_is_fitted` function.
    """
    X, y = test_df()
    transformer = clone(estimator)
    # Test when fit is not called prior to transform.
    with pytest.raises(NotFittedError):
        transformer.transform(X)
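

# The NotFittedError comes from sklearn's `check_is_fitted`, which is
# typically called at the top of transform(). A minimal sketch of the pattern
# (assumed, not feature_engine's actual code):
from sklearn.utils.validation import check_is_fitted


class _FitBeforeTransformSketch:
    def fit(self, X, y=None):
        self.fitted_ = True  # trailing underscore marks a fitted attribute
        return self

    def transform(self, X):
        check_is_fitted(self)  # raises NotFittedError if fit() was not called
        return X
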
def check_feature_names_in(estimator):
    """Checks that transformers learn the variable names of the train set used
    during fit, as well as the number of variables.

    Should be applied to all transformers.
    """
    # the estimator learns the parameters from the train set
    X, y = test_df(categorical=True, datetime=True)
    varnames = list(X.columns)
    estimator = clone(estimator)
    estimator.fit(X, y)
    assert estimator.feature_names_in_ == varnames
    assert estimator.n_features_in_ == len(varnames)
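

# Sketch of how these attributes are typically recorded during fit, following
# the sklearn convention the check relies on (an assumption about the
# implementation):
def _record_input_features_sketch(transformer, X):
    transformer.feature_names_in_ = list(X.columns)
    transformer.n_features_in_ = X.shape[1]
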
def test_feature_names_in(estimator):

    transformer = clone(estimator)
    X, y = test_df(categorical=True)

    if transformer.__class__.__name__ == "TargetMeanRegressor":
        # y needs to be continuous
        y = X["var_10"]

    varnames = list(X.columns)

    transformer.fit(X, y)

    assert transformer.feature_names_in_ == varnames
    assert transformer.n_features_in_ == len(varnames)


def check_confirm_variables(estimator):
    """
    Only for transformers with a parameter `confirm_variables` in init.

    At the moment, this test applies to variable selection transformers. When
    the parameter is True, the transformer checks that the variables entered
    by the user are present in the dataframe before doing the selection.
    """
    X, y = test_df()
    Xs = X.drop(labels=["var_10", "var_11"], axis=1)

    # original variables in X
    all_vars = ["var_" + str(i) for i in range(12)]

    estimator = clone(estimator)

    sel = estimator.set_params(
        variables=all_vars,
        confirm_variables=False,
    )
    sel.fit(X, y)
    assert sel.variables_ == all_vars

    sel = estimator.set_params(
        variables=all_vars,
        confirm_variables=True,
    )
    sel.fit(Xs, y)
    assert sel.variables_ == ["var_" + str(i) for i in range(10)]

    sel = estimator.set_params(
        variables=all_vars,
        confirm_variables=False,
    )
    with pytest.raises(KeyError):
        sel.fit(Xs, y)

    # When variables is None.
    sel = estimator.set_params(
        variables=None,
        confirm_variables=True,
    )
    sel.fit(X, y)
    assert sel.variables_ == all_vars

    sel.fit(Xs, y)
    assert sel.variables_ == ["var_" + str(i) for i in range(10)]
def check_drop_original_variables(estimator):
    """
    Only for transformers with a parameter `drop_original` in init.

    Checks the correct functionality of `drop_original`. If True, the original
    variables, that is, those stored in the attribute `variables_`, are
    dropped from the transformed dataframe (after transform()). If False, the
    original variables are retained in the transformed dataframe.
    """
    # Test df
    X, y = test_df(categorical=True, datetime=True)

    # when drop_original is true
    estimator = clone(estimator)
    estimator.set_params(drop_original=True)
    X_tr = estimator.fit_transform(X, y)

    if hasattr(estimator, "variables_"):
        vars = estimator.variables_
    elif hasattr(estimator, "reference"):
        vars = estimator.variables + estimator.reference
    else:
        vars = estimator.variables

    # Check that original variables are not in transformed dataframe
    assert set(vars).isdisjoint(set(X_tr.columns))
    # Check that remaining variables are in transformed dataframe
    remaining = [f for f in estimator.feature_names_in_ if f not in vars]
    assert all([f in X_tr.columns for f in remaining])

    # when drop_original is False
    estimator = clone(estimator)
    estimator.set_params(drop_original=False)
    X_tr = estimator.fit_transform(X, y)

    if hasattr(estimator, "variables_"):
        vars = estimator.variables_
    else:
        vars = estimator.variables

    # Check that original variables are in transformed dataframe
    assert all(f in X_tr.columns for f in vars)
    # Check that remaining variables are in transformed dataframe
    remaining = [f for f in estimator.feature_names_in_ if f not in vars]
    assert all([f in X_tr.columns for f in remaining])
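

# Sketch of what `drop_original` does at the end of transform(), mirroring
# the assertions above (an assumption about the implementation):
def _drop_original_sketch(X_tr, variables, drop_original):
    if drop_original:
        return X_tr.drop(columns=list(variables))
    return X_tr
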
def check_raises_error_if_only_1_variable(estimator):
    """For feature selection transformers.

    Checks that the transformer raises a ValueError when it has fewer than
    2 variables to select from during the search procedure.
    """
    X, y = test_df()
    estimator = clone(estimator)
    sel = estimator.set_params(
        variables=["var_1"],
        confirm_variables=False,
    )

    msg = ("The selector needs at least 2 or more variables to select from. "
           "Got only 1 variable: ['var_1'].")
    with pytest.raises(ValueError) as record:
        sel.fit(X, y)

    assert str(record.value) == msg
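

# Sketch of the guard being tested, mirroring the expected error message
# (the function name is an assumption):
def _check_enough_variables_sketch(variables):
    if len(variables) < 2:
        raise ValueError(
            "The selector needs at least 2 or more variables to select from. "
            f"Got only 1 variable: {variables}."
        )
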
def check_get_feature_names_out(estimator):
    """
    Check that the method get_feature_names_out() returns the variable names of
    the transformed dataframe. In most transformers that would be the same as
    the variable names in the train set used in fit(). The value is stored in
    `feature_names_in_`.

    Transformers that return additional variables need specific tests based on
    their functionality, so they are skipped from this check.
    """
    _skip_test = [
        "OneHotEncoder",
        "AddMissingIndicator",
        "LagFeatures",
        "WindowFeatures",
        "ExpandingWindowFeatures",
        "MathFeatures",
        "CyclicalFeatures",
        "RelativeFeatures",
        "DatetimeFeatures",
    ]

    if estimator.__class__.__name__ not in _skip_test:

        # train set
        X, y = test_df(categorical=True, datetime=True)

        # train transformer
        estimator = clone(estimator)
        estimator.fit(X, y)

        # train pipeline with transformer
        pipe = Pipeline([("transformer", clone(estimator))])
        pipe.fit(X, y)

        # feature names in train set
        feature_names = list(X.columns)

        # selection transformers
        if (hasattr(estimator, "confirm_variables")
                or estimator.__class__.__name__ == "DropFeatures"):
            feature_names = [
                f for f in feature_names
                if f not in estimator.features_to_drop_
            ]

            # take a few as input features (selectors ignore this parameter)
            input_features = feature_names[0:3]

            # test transformer
            assert estimator.get_feature_names_out() == feature_names
            assert estimator.get_feature_names_out(
                input_features) == feature_names
            assert estimator.transform(X).shape[1] == len(feature_names)

            # test transformer within pipeline
            assert pipe.get_feature_names_out() == feature_names
            assert pipe.get_feature_names_out(input_features) == feature_names

        elif estimator.__class__.__name__ == "MatchVariables":
            # take a few as input features (these transformers ignore this parameter)
            input_features = feature_names[0:3]

            # test transformer
            assert estimator.get_feature_names_out() == feature_names
            assert estimator.get_feature_names_out(
                input_features) == feature_names
            assert estimator.transform(X).shape[1] == len(feature_names)

            # test transformer within pipeline
            assert pipe.get_feature_names_out() == feature_names
            assert pipe.get_feature_names_out(input_features) == feature_names

        else:
            input_features = estimator.variables_

            # test transformer
            assert estimator.get_feature_names_out() == feature_names
            assert estimator.get_feature_names_out(
                input_features) == input_features
            assert estimator.transform(X).shape[1] == len(feature_names)

            # test transformer within pipeline
            assert pipe.get_feature_names_out() == feature_names
            assert pipe.get_feature_names_out(input_features) == input_features
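

# Sketch of get_feature_names_out() for a selection transformer, matching the
# branch above where `input_features` is ignored (an assumption about the
# implementation):
def _selector_feature_names_out_sketch(selector, input_features=None):
    # `input_features` is ignored: output names are the fit-time names minus
    # the dropped features
    return [
        f for f in selector.feature_names_in_
        if f not in selector.features_to_drop_
    ]
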
def check_takes_cv_constructor(estimator):
    """
    Only for transformers with a parameter `cv` in init.

    For those transformers that implement cross-validation, checks that all
    sklearn cross-validation constructors can be used with the transformer.

    This check also corroborates that the expected attributes are indeed
    learned during fit().
    """
    from sklearn.model_selection import KFold, StratifiedKFold

    X, y = test_df(categorical=True)

    estimator = clone(estimator)

    cv_constructor_ls = [KFold(n_splits=3), StratifiedKFold(n_splits=3), None]

    for cv_constructor in cv_constructor_ls:

        sel = estimator.set_params(cv=cv_constructor)
        sel.fit(X, y)
        Xtransformed = sel.transform(X)

        # test fit attrs
        if hasattr(sel, "initial_model_performance_"):
            assert isinstance(sel.initial_model_performance_, (int, float))

        if hasattr(sel, "features_to_drop_"):
            assert isinstance(sel.features_to_drop_, list)
            assert all([x for x in sel.features_to_drop_ if x in X.columns])
            assert len(sel.features_to_drop_) < X.shape[1]

            assert not Xtransformed.empty
            assert all([
                x for x in Xtransformed.columns
                if x not in sel.features_to_drop_
            ])

        if hasattr(sel, "performance_drifts_"):
            assert isinstance(sel.performance_drifts_, dict)
            assert all(
                [x for x in X.columns if x in sel.performance_drifts_.keys()])
            assert all([
                isinstance(sel.performance_drifts_[var], (int, float))
                for var in sel.performance_drifts_.keys()
            ])

        if hasattr(sel, "feature_performance_"):
            assert isinstance(sel.feature_performance_, dict)
            assert all(
                [x for x in X.columns if x in sel.feature_performance_.keys()])
            assert all([
                isinstance(sel.feature_performance_[var], (int, float))
                for var in sel.feature_performance_.keys()
            ])

        if hasattr(sel, "scores_dict_"):
            assert isinstance(sel.scores_dict_, dict)
            assert all([x for x in X.columns if x in sel.scores_dict_.keys()])
            assert all([
                isinstance(sel.scores_dict_[var], (int, float))
                for var in sel.scores_dict_.keys()
            ])
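

# Sketch of why all three `cv` values work: they are forwarded to sklearn's
# cross-validation machinery, which accepts splitter objects, integers and
# None alike (an assumption about the transformers' internals, and about the
# fixture returning a numeric X with a discrete y):
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score


def _demo_cv_pass_through():
    X, y = test_df()
    for cv in (KFold(n_splits=3), StratifiedKFold(n_splits=3), None):
        cross_val_score(LogisticRegression(), X, y, cv=cv)
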
def test_variable_selection(estimator):

    transformer = clone(estimator)

    X, y = test_df(categorical=True, datetime=True)

    if transformer.__class__.__name__ == "TargetMeanRegressor":
        # y needs to be continuous
        y = X["var_10"]
        X.drop(["var_10"], axis=1, inplace=True)

    # cast one variable as category
    X[["cat_var2"]] = X[["cat_var2"]].astype("category")

    # cast datetime as object
    X[["date1"]] = X[["date1"]].astype("O")

    # Case 1: numerical variable as string
    transformer.set_params(variables="var_1")
    assert transformer.variables == "var_1"

    transformer.fit(X, y)
    assert transformer.variables == "var_1"
    assert transformer.variables_categorical_ == []
    assert transformer.variables_numerical_ == ["var_1"]

    # Case 2: numerical variable as list
    transformer.set_params(variables=["var_1"])
    assert transformer.variables == ["var_1"]

    transformer.fit(X, y)
    assert transformer.variables == ["var_1"]
    assert transformer.variables_categorical_ == []
    assert transformer.variables_numerical_ == ["var_1"]

    # Case 3: categorical variable as string
    transformer.set_params(variables="cat_var1")
    assert transformer.variables == "cat_var1"

    transformer.fit(X, y)
    assert transformer.variables == "cat_var1"
    assert transformer.variables_categorical_ == ["cat_var1"]
    assert transformer.variables_numerical_ == []

    # Case 4: categorical variable as list
    transformer.set_params(variables=["cat_var1"])
    assert transformer.variables == ["cat_var1"]

    transformer.fit(X, y)
    assert transformer.variables == ["cat_var1"]
    assert transformer.variables_categorical_ == ["cat_var1"]
    assert transformer.variables_numerical_ == []

    # Case 5: numerical and categorical variables
    transformer.set_params(
        variables=["var_1", "var_2", "cat_var1", "cat_var2", "date1"])
    assert transformer.variables == [
        "var_1", "var_2", "cat_var1", "cat_var2", "date1"
    ]

    transformer.fit(X, y)
    assert transformer.variables == [
        "var_1", "var_2", "cat_var1", "cat_var2", "date1"
    ]
    assert transformer.variables_categorical_ == [
        "cat_var1", "cat_var2", "date1"
    ]
    assert transformer.variables_numerical_ == ["var_1", "var_2"]

    # Case 6: automatically select variables
    X_c = X[["var_1", "var_2", "cat_var1", "cat_var2", "date1",
             "date2"]].copy()

    transformer.set_params(variables=None)
    assert transformer.variables is None

    transformer.fit(X_c, y)
    assert transformer.variables is None
    assert transformer.variables_categorical_ == ["cat_var1", "cat_var2"]
    assert transformer.variables_numerical_ == ["var_1", "var_2"]

    transformer.set_params(variables=["var_1", "cat_var1", "date2"])
    with pytest.raises(TypeError):
        transformer.fit(X, y)

    # Case 7: user passes an empty list
    transformer.set_params(variables=[])
    with pytest.raises(ValueError):
        transformer.fit(X, y)
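

# Sketch of the variable-type split asserted throughout this test: numerical
# dtypes on one side, everything else (including object-cast datetimes, as in
# Case 5) on the other. An assumption about the implementation:
def _split_variables_sketch(X, variables):
    cats = [v for v in variables if not pd.api.types.is_numeric_dtype(X[v])]
    nums = [v for v in variables if pd.api.types.is_numeric_dtype(X[v])]
    return cats, nums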