def test_classification(df_normal_dist):
    """Check DecisionTreeDiscretiser when trained against a binary target.

    A reproducible binary target of 100 observations is drawn from a
    binomial distribution, the discretiser is fitted with a 3-fold
    cross-validated search over ``max_depth``, and the stored parameters,
    the learned attributes and the transformed values are verified.

    Parameters
    ----------
    df_normal_dist : pandas dataframe
        Fixture defined outside this snippet. The assertions below expect it
        to have shape (100, 1) with a single column named "var".
    """
    discretiser = DecisionTreeDiscretiser(
        cv=3,
        scoring="roc_auc",
        variables=None,
        param_grid={"max_depth": [1, 2, 3, 4]},
        regression=False,
        random_state=0,
    )

    # seed the global generator so the synthetic target is reproducible
    np.random.seed(0)
    target = pd.Series(np.random.binomial(1, 0.7, 100))
    transformed = discretiser.fit_transform(df_normal_dist, target)
    expected_bins = [1.0, 0.71, 0.93, 0.0]

    # init params
    assert discretiser.cv == 3
    assert discretiser.variables == ["var"]
    assert discretiser.scoring == "roc_auc"
    assert discretiser.regression is False
    # fit params
    assert discretiser.input_shape_ == (100, 1)
    # transform params: every rounded output value must be an expected one
    observed_bins = np.round(transformed["var"].unique(), 2)
    assert all(value in expected_bins for value in observed_bins)
    assert discretiser.scores_dict_ == {"var": 0.717391304347826}
Ejemplo n.º 2
0
def test_error_when_regression_is_false_and_target_is_continuous(
        df_discretise):
    """Fitting a classifier-mode discretiser on a continuous target must fail.

    Parameters
    ----------
    df_discretise : pandas dataframe
        Fixture defined outside this snippet; the columns "var_A" and "var_B"
        are used as predictors.
    """
    np.random.seed(42)
    mu, sigma = 0, 3
    # continuous target, one value per row of the fixture
    y = np.random.normal(mu, sigma, len(df_discretise))

    # Build the transformer outside the `raises` block: only the call to
    # `fit` is expected to fail, and a ValueError raised by the constructor
    # must not be able to make this test pass.
    transformer = DecisionTreeDiscretiser(regression=False)
    with pytest.raises(ValueError):
        transformer.fit(df_discretise[["var_A", "var_B"]], y)
def test_regression(df_normal_dist):
    """Check DecisionTreeDiscretiser when trained against a continuous target.

    A reproducible target of 100 observations is drawn from a normal
    distribution (mean 0, standard deviation 0.1), the discretiser is fitted
    with a 3-fold cross-validated search over ``max_depth``, and the stored
    parameters, the learned attributes and the transformed values are
    verified.

    Parameters
    ----------
    df_normal_dist : pandas dataframe
        Fixture defined outside this snippet. The assertions below expect it
        to contain a single feature named "var".
    """

    transformer = DecisionTreeDiscretiser(
        cv=3,
        scoring="neg_mean_squared_error",
        variables=None,
        param_grid={"max_depth": [1, 2, 3, 4]},
        regression=True,
        random_state=0,
    )
    # seed the global generator so the synthetic target is reproducible
    np.random.seed(0)
    y = pd.Series(np.random.normal(0, 0.1, 100))
    X = transformer.fit_transform(df_normal_dist, y)
    # values (rounded to 2 decimals) the transformed variable may take
    X_t = [
        0.19,
        0.04,
        0.11,
        0.23,
        -0.09,
        -0.02,
        0.01,
        0.15,
        0.07,
        -0.26,
        0.09,
        -0.07,
        -0.16,
        -0.2,
        -0.04,
        -0.12,
    ]

    # init params
    assert transformer.cv == 3
    assert transformer.variables is None
    assert transformer.scoring == "neg_mean_squared_error"
    assert transformer.regression is True
    # fit params
    assert transformer.variables_ == ["var"]
    assert transformer.n_features_in_ == 1
    assert np.round(transformer.scores_dict_["var"],
                    3) == np.round(-4.4373314584616444e-05, 3)
    # transform params
    # Every rounded output value must be one of the expected values. The
    # previous form, `all(x for x in ... if x not in X_t)`, only tested the
    # truthiness of the unexpected values, so it passed for any non-zero
    # unexpected value and verified nothing.
    assert all(x in X_t for x in np.round(X["var"].unique(), 2))
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Fit the ordinal encoder + decision tree pipeline.

        The input is validated, then a two-step pipeline (an OrdinalEncoder
        followed by a DecisionTreeDiscretiser, both restricted to
        ``self.variables``) is built, stored in ``self.encoder_`` and fitted
        on X and y. The shape of X is recorded in ``self.input_shape_``.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            categorical variables.

        y : pandas series.
            The target variable. Required to train the decision tree and for
            ordered ordinal encoding.

        Raises
        ------
        TypeError
            - If the input is not a Pandas DataFrame.
            - If any user provided variable is not categorical
        ValueError
            - If there are no categorical variables in the df or the df is empty
            - If the variable(s) contain null values

        NOTE(review): the errors listed above are carried over from the
        previous documentation; they would originate in
        ``_check_fit_input_and_variables`` or in the pipeline steps, none of
        which is visible here - confirm.

        Returns
        -------
        self
        """

        # validate the dataframe (helper defined elsewhere in the class)
        X = self._check_fit_input_and_variables(X)

        # the categories are first replaced by numbers, then a decision tree
        # is trained on each encoded variable
        steps = [
            (
                "categorical_encoder",
                OrdinalEncoder(
                    encoding_method=self.encoding_method,
                    variables=self.variables,
                ),
            ),
            (
                "tree_discretiser",
                DecisionTreeDiscretiser(
                    cv=self.cv,
                    scoring=self.scoring,
                    variables=self.variables,
                    param_grid=self.param_grid,
                    regression=self.regression,
                    random_state=self.random_state,
                ),
            ),
        ]

        # store the pipeline before fitting it, then train it on the data
        self.encoder_ = Pipeline(steps)
        self.encoder_.fit(X, y)

        self.input_shape_ = X.shape

        return self
Ejemplo n.º 5
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the numbers that will replace the categories of each variable.

        After validating the input, an OrdinalEncoder and a
        DecisionTreeDiscretiser are chained in a pipeline that is stored in
        ``self.encoder_`` and fitted on X and y. The shape of X is kept in
        ``self.input_shape_``.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the categorical variables.

        y : pandas series.
            The target variable. Required to train the decision tree and for
            ordered ordinal encoding.

        Returns
        -------
        self
        """
        # validate the dataframe (helper defined elsewhere in the class)
        X = self._check_fit_input_and_variables(X)

        # step 1: replace categories by ordinal numbers
        ordinal_step = OrdinalEncoder(
            encoding_method=self.encoding_method,
            variables=self.variables,
        )

        # step 2: train a decision tree on each encoded variable
        tree_step = DecisionTreeDiscretiser(
            cv=self.cv,
            scoring=self.scoring,
            variables=self.variables,
            param_grid=self.param_grid,
            regression=self.regression,
            random_state=self.random_state,
        )

        # chain both steps; the pipeline is stored before it is fitted
        self.encoder_ = Pipeline(
            [("categorical_encoder", ordinal_step),
             ("tree_discretiser", tree_step)]
        )
        self.encoder_.fit(X, y)

        self.input_shape_ = X.shape
        return self
Ejemplo n.º 6
0
import numpy as np
import pytest
from sklearn.utils.estimator_checks import check_estimator

from feature_engine.discretisation import (
    ArbitraryDiscretiser,
    DecisionTreeDiscretiser,
    EqualFrequencyDiscretiser,
    EqualWidthDiscretiser,
)
from tests.estimator_checks.estimator_checks import check_feature_engine_estimator

# Estimator instances shared by the parametrized checks in this module.
_estimators = [
    DecisionTreeDiscretiser(regression=False),
    EqualFrequencyDiscretiser(),
    EqualWidthDiscretiser(),
    # `np.inf` rather than the `np.Inf` alias, which NumPy 2.0 removed.
    ArbitraryDiscretiser(binning_dict={"0": [-np.inf, 0, np.inf]}),
]


@pytest.mark.parametrize("estimator", _estimators)
def test_check_estimator_from_sklearn(estimator):
    """Run scikit-learn's `check_estimator` on each discretiser instance."""
    outcome = check_estimator(estimator)
    return outcome


@pytest.mark.parametrize("estimator", _estimators)
def test_check_estimator_from_feature_engine(estimator):
    """Run feature_engine's own estimator checks on each discretiser."""
    if isinstance(estimator, ArbitraryDiscretiser):
        # NOTE(review): presumably the feature_engine checks use data with a
        # column named "var_1", hence the different binning key - confirm.
        # `set_params` mutates the instance shared through `_estimators`.
        # `np.inf` replaces the `np.Inf` alias, which NumPy 2.0 removed.
        estimator.set_params(binning_dict={"var_1": [-np.inf, 0, np.inf]})
    return check_feature_engine_estimator(estimator)
Ejemplo n.º 7
0
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Fit a decision tree per variable.

        The input and target are checked, the target type is verified against
        the ``regression`` setting, and a two-step pipeline (an OrdinalEncoder
        followed by a DecisionTreeDiscretiser, both restricted to
        ``self.variables_``) is stored in ``self.encoder_`` and fitted.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            categorical variables.

        y : pandas series.
            The target variable. Required to train the decision tree and for
            ordered ordinal encoding.

        Raises
        ------
        ValueError
            If ``regression`` is True and the target is binary.

        Returns
        -------
        self
        """
        X, y = check_X_y(X, y)

        # the kind of model and the kind of target have to agree
        if self.regression is not True:
            check_classification_targets(y)
        elif type_of_target(y) == "binary":
            raise ValueError(
                "Trying to fit a regression to a binary target is not "
                "allowed by this transformer. Check the target values "
                "or set regression to False.")

        # helpers defined elsewhere in the class hierarchy
        self._fit(X)
        self._get_feature_names_in(X)

        # use the default depth grid when no (or an empty) grid was supplied
        param_grid = self.param_grid or {"max_depth": [1, 2, 3, 4]}

        steps = [
            # step 1: replace categories by ordinal numbers
            (
                "categorical_encoder",
                OrdinalEncoder(
                    encoding_method=self.encoding_method,
                    variables=self.variables_,
                    ignore_format=self.ignore_format,
                    errors="raise",
                ),
            ),
            # step 2: train a decision tree on each encoded variable
            (
                "tree_discretiser",
                DecisionTreeDiscretiser(
                    cv=self.cv,
                    scoring=self.scoring,
                    variables=self.variables_,
                    param_grid=param_grid,
                    regression=self.regression,
                    random_state=self.random_state,
                ),
            ),
        ]

        # store the pipeline before fitting it, then train it on the data
        self.encoder_ = Pipeline(steps)
        self.encoder_.fit(X, y)

        return self
def test_error_if_y_not_passed(df_normal_dist):
    """Calling `fit` without a target must raise a TypeError.

    Parameters
    ----------
    df_normal_dist : pandas dataframe
        Fixture defined outside this snippet, used as the training data.
    """
    # test case 3: raises error if target is not passed
    # The transformer is built outside the `raises` block so that only the
    # call to `fit` can satisfy the expectation.
    transformer = DecisionTreeDiscretiser()
    with pytest.raises(TypeError):
        transformer.fit(df_normal_dist)
def test_error_when_regression_is_not_bool():
    """A non-boolean `regression` argument must be rejected at construction."""
    not_a_bool = "other"
    with pytest.raises(ValueError):
        DecisionTreeDiscretiser(regression=not_a_bool)
def test_error_when_cv_is_string():
    """A string passed as `cv` must be rejected at construction."""
    not_a_cv = "other"
    with pytest.raises(ValueError):
        DecisionTreeDiscretiser(cv=not_a_cv)
Ejemplo n.º 11
0
# Split the full dataset into train and test partitions.
# The message below is Portuguese for "Splitting into train and test sets...".
# `df_full`, `features` and `target` are defined earlier in the script (not
# visible here). NOTE(review): `model_selection` is presumably
# sklearn.model_selection - confirm against the imports.
print("Separando em base de treino e teste...")
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df_full[features], df_full[target], random_state=42, test_size=0.1)

print("ok.")

# %%

# Build the individual pipeline components.
# The message below is Portuguese for "Fitting model in our pipeline...".
print("Ajustando modelo em nosso pipeline...")
# Imputer configured with the arbitrary number -999 for the `features` columns.
arbitrary_imputer = ArbitraryNumberImputer(arbitrary_number=-999,
                                           variables=features)

# Decision tree discretiser in classification mode (regression=False), using
# 3-fold cross-validation scored by ROC-AUC over the `features` columns.
disc = DecisionTreeDiscretiser(cv=3,
                               scoring='roc_auc',
                               variables=features,
                               regression=False,
                               random_state=42)

# PCA configured to keep 120 components.
pca = decomposition.PCA(n_components=120, random_state=42)

# NOTE(review): presumably the hyperparameters for the XGBoost classifier
# created right after this block; their use is not visible here - confirm.
best_pars = {
    'subsample': 0.7,
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.2
}

clf_xgb = xgb.XGBClassifier(nthread=8,
                            eval_metric='auc',
                            random_state=42,
Ejemplo n.º 12
0
def test_error_when_regression_is_true_and_target_is_binary(df_discretise):
    """Fitting a regression-mode discretiser on a binary target must fail.

    Parameters
    ----------
    df_discretise : pandas dataframe
        Fixture defined outside this snippet; the columns "var_A" and "var_B"
        are used as predictors and "target" as the target.
    """
    # Build the transformer outside the `raises` block: only the call to
    # `fit` is expected to fail, and a ValueError raised by the constructor
    # must not be able to make this test pass.
    transformer = DecisionTreeDiscretiser(regression=True)
    with pytest.raises(ValueError):
        transformer.fit(df_discretise[["var_A", "var_B"]],
                        df_discretise["target"])
import numpy as np
import pytest
from sklearn.utils.estimator_checks import check_estimator

from feature_engine.discretisation import (
    ArbitraryDiscretiser,
    DecisionTreeDiscretiser,
    EqualFrequencyDiscretiser,
    EqualWidthDiscretiser,
)


@pytest.mark.parametrize(
    "Estimator",
    [
        DecisionTreeDiscretiser(),
        EqualFrequencyDiscretiser(),
        EqualWidthDiscretiser(),
        # `np.inf` rather than the `np.Inf` alias, which NumPy 2.0 removed.
        ArbitraryDiscretiser(binning_dict={"0": [-np.inf, 0, np.inf]}),
    ],
)
def test_all_transformers(Estimator):
    """Run scikit-learn's `check_estimator` on each discretiser instance."""
    return check_estimator(Estimator)