Example #1
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This method does not learn any parameter.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: None
            y is not needed in this imputation. You can pass None or y.
        """

        # check input dataframe
        X = check_X(X)

        # find or check for numerical variables
        # create the imputer dictionary
        if self.imputer_dict:
            self.variables_ = _find_or_check_numerical_variables(
                X,
                self.imputer_dict.keys()  # type: ignore
            )
            self.imputer_dict_ = self.imputer_dict
        else:
            self.variables_ = _find_or_check_numerical_variables(
                X, self.variables)
            self.imputer_dict_ = {
                var: self.arbitrary_number
                for var in self.variables_
            }

        self._get_feature_names_in(X)

        return self
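A minimal usage sketch for the snippet above, assuming it comes from feature_engine's ArbitraryNumberImputer (the class name is not shown in the example, so that pairing is an assumption):

    import pandas as pd
    from feature_engine.imputation import ArbitraryNumberImputer  # assumed class

    # fit() builds imputer_dict_, mapping each numerical variable to the
    # arbitrary number that will replace its missing values.
    X = pd.DataFrame({"age": [20.0, None, 40.0], "income": [100.0, 200.0, None]})
    imputer = ArbitraryNumberImputer(arbitrary_number=-1)
    imputer.fit(X)
    print(imputer.imputer_dict_)  # {'age': -1, 'income': -1}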
Example #2
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn any parameter. Performs dataframe checks.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y: pandas Series, or np.array. Default=None.
            It is not needed in this transformer. You can pass y or None.

        Raises
        ------
        TypeError
           - If the input is not a Pandas DataFrame
           - If any user provided variables are not numerical
        ValueError
           If any of the reference variables contain null values and the
           mathematical operation is 'div'.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # check variables to combine are numerical
        self.variables_to_combine = _find_or_check_numerical_variables(
            X, self.variables_to_combine)

        # check reference_variables are numerical
        self.reference_variables = _find_or_check_numerical_variables(
            X, self.reference_variables)

        # check if dataset contains na
        if self.missing_values == "raise":
            _check_contains_na(X, self.reference_variables)
            _check_contains_na(X, self.variables_to_combine)

            _check_contains_inf(X, self.reference_variables)
            _check_contains_inf(X, self.variables_to_combine)

        # cannot divide by 0, as it would result in an error
        if "div" in self.operations:
            if X[self.reference_variables].isin([0]).any().any():
                raise ValueError(
                    "Some of the reference variables contain 0 as values. Check and "
                    "remove those before using this transformer with div.")

        self.n_features_in_ = X.shape[1]

        return self
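A standalone sketch of the zero check used above; the pandas idiom is real, the data is made up:

    import pandas as pd

    # isin([0]) flags zero-valued cells; the first any() collapses rows per
    # column, the second any() collapses the columns to a single boolean.
    X = pd.DataFrame({"a": [1, 0, 3], "b": [4, 5, 6]})
    print(X[["a", "b"]].isin([0]).any().any())  # True: column "a" contains a 0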
Example #3
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This method does not learn any parameter. It checks the dataframe and
        finds the numerical variables, or checks that the variables entered by
        the user are numerical.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y : None
            y is not needed in this imputation. You can pass None or y.

        Raises
        ------
        TypeError
            - If the input is not a Pandas DataFrame
            - If any of the user provided variables are not numerical
        ValueError
            If there are no numerical variables in the df or the df is empty

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        if self.imputer_dict:
            self.variables = _find_or_check_numerical_variables(
                X,
                self.imputer_dict.keys()  # type: ignore
            )
        else:
            self.variables = _find_or_check_numerical_variables(
                X, self.variables)

        # create the imputer dictionary
        if self.imputer_dict:
            self.imputer_dict_ = self.imputer_dict
        else:
            self.imputer_dict_ = {
                var: self.arbitrary_number
                for var in self.variables
            }

        self.input_shape_ = X.shape

        return self
Example #4
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the mean or median values.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas series or None, default=None
            y is not needed in this imputation. You can pass None or y.
        """

        # check input dataframe
        X = check_X(X)

        # find or check for numerical variables
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        # find imputation parameters: mean or median
        if self.imputation_method == "mean":
            self.imputer_dict_ = X[self.variables_].mean().to_dict()

        elif self.imputation_method == "median":
            self.imputer_dict_ = X[self.variables_].median().to_dict()

        self._get_feature_names_in(X)

        return self
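A standalone sketch of the parameter-learning step above, on toy data:

    import pandas as pd

    # The learned dictionary maps each variable to its training mean
    # (or median, when imputation_method="median").
    X = pd.DataFrame({"age": [20.0, 30.0, 40.0], "marks": [0.5, 0.7, 0.9]})
    print(X[["age", "marks"]].mean().to_dict())  # {'age': 30.0, 'marks': 0.7}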
Example #5
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Common set-up of creation transformers.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y: pandas Series, or np.array. Defaults to None.
            It is not needed in this transformer. You can pass y or None.
        """

        # check input dataframe
        X = check_X(X)

        # check variables are numerical
        self.variables: List[Union[str,
                                   int]] = _find_or_check_numerical_variables(
                                       X, self.variables)

        # check if dataset contains na
        if self.missing_values == "raise":
            _check_contains_na(X, self.variables)
            _check_contains_inf(X, self.variables)

        # save input features
        self.feature_names_in_ = X.columns.tolist()

        # save train set shape
        self.n_features_in_ = X.shape[1]

        return X
Example #6
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn parameters.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas Series, default=None
            y is not needed in this transformer. You can pass None or y.
        """
        # check input dataframe
        X = check_X(X)

        # We need the dataframes to have unique values in the index and no missing data.
        # Otherwise, when we merge the new features we will duplicate rows.
        self._check_index(X)

        # find or check for numerical variables
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        # check if dataset contains na
        if self.missing_values == "raise":
            self._check_na_and_inf(X)

        self._get_feature_names_in(X)

        return self
Example #7
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn any parameter.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y: pandas Series, or np.array. Default=None.
            It is not needed in this transformer. You can pass y or None.
        """

        # check input dataframe
        X = check_X(X)

        # check variables to combine are numerical
        self.variables_to_combine = _find_or_check_numerical_variables(
            X, self.variables_to_combine)

        # check reference_variables are numerical
        self.reference_variables = _find_or_check_numerical_variables(
            X, self.reference_variables)

        # check if dataset contains na
        if self.missing_values == "raise":
            _check_contains_na(X, self.reference_variables)
            _check_contains_na(X, self.variables_to_combine)

            _check_contains_inf(X, self.reference_variables)
            _check_contains_inf(X, self.variables_to_combine)

        # cannot divide by 0, as it would result in an error
        if "div" in self.operations:
            if X[self.reference_variables].isin([0]).any().any():
                raise ValueError(
                    "Some of the reference variables contain 0 as values. Check and "
                    "remove those before using this transformer with div.")

        # save input features
        self.feature_names_in_ = X.columns.tolist()

        # save train set shape
        self.n_features_in_ = X.shape[1]

        return self
Example #8
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn parameters.

        Perform dataframe checks. Creates dictionary of operation to new feature
        name pairs.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y: pandas Series, or np.array. Defaults to None.
            It is not needed in this transformer. You can pass y or None.
        """

        # check input dataframe
        X = check_X(X)

        # check variables to combine are numerical
        self.variables_to_combine = _find_or_check_numerical_variables(
            X, self.variables_to_combine)

        # check if dataset contains na
        if self.missing_values == "raise":
            _check_contains_na(X, self.variables_to_combine)
            _check_contains_inf(X, self.variables_to_combine)

        if self.math_operations is None:
            self.math_operations_ = [
                "sum", "prod", "mean", "std", "max", "min"
            ]
        else:
            self.math_operations_ = self.math_operations

        # dictionary of new_variable_name to operation pairs
        if self.new_variables_names:
            self.combination_dict_ = dict(
                zip(self.new_variables_names, self.math_operations_))
        else:
            if all(isinstance(var, str) for var in self.variables_to_combine):
                vars_ls = self.variables_to_combine
            else:
                vars_ls = [str(var) for var in self.variables_to_combine]

            self.combination_dict_ = {
                f"{operation}({'-'.join(vars_ls)})": operation  # type: ignore
                for operation in self.math_operations_
            }

        # save input features
        self.feature_names_in_ = X.columns.tolist()

        # save train set shape
        self.n_features_in_ = X.shape[1]

        return self
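A standalone sketch of the naming scheme built above: without user-supplied names, each new column is called "<operation>(<var1>-<var2>-...)":

    variables_to_combine = ["age", "marks"]
    math_operations_ = ["sum", "mean"]

    # Same dict comprehension as in the snippet, on toy values.
    combination_dict_ = {
        f"{operation}({'-'.join(variables_to_combine)})": operation
        for operation in math_operations_
    }
    print(combination_dict_)  # {'sum(age-marks)': 'sum', 'mean(age-marks)': 'mean'}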
Example #9
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the values at the end of the variable distribution.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y : pandas Series, default=None
            y is not needed in this imputation. You can pass None or y.

        Raises
        ------
        TypeError
            - If the input is not a Pandas DataFrame
            - If any of the user provided variables are not numerical
        ValueError
            If there are no numerical variables in the df or the df is empty

        Returns
        -------
        self
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables = _find_or_check_numerical_variables(X, self.variables)

        # estimate imputation values
        if self.imputation_method == "max":
            self.imputer_dict_ = (X[self.variables].max() * self.fold).to_dict()

        elif self.imputation_method == "gaussian":
            if self.tail == "right":
                self.imputer_dict_ = (
                    X[self.variables].mean() + self.fold * X[self.variables].std()
                ).to_dict()
            elif self.tail == "left":
                self.imputer_dict_ = (
                    X[self.variables].mean() - self.fold * X[self.variables].std()
                ).to_dict()

        elif self.imputation_method == "iqr":
            IQR = X[self.variables].quantile(0.75) - X[self.variables].quantile(0.25)
            if self.tail == "right":
                self.imputer_dict_ = (
                    X[self.variables].quantile(0.75) + (IQR * self.fold)
                ).to_dict()
            elif self.tail == "left":
                self.imputer_dict_ = (
                    X[self.variables].quantile(0.25) - (IQR * self.fold)
                ).to_dict()

        self.input_shape_ = X.shape

        return self
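A numeric illustration of the "iqr" branch above: the right-tail imputation value is Q3 + fold * IQR, computed per variable:

    import pandas as pd

    X = pd.DataFrame({"age": [20.0, 25.0, 30.0, 35.0, 40.0]})
    fold = 3
    q1, q3 = X["age"].quantile(0.25), X["age"].quantile(0.75)
    print(q3 + fold * (q3 - q1))  # 65.0 -> the value used to impute "age"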
Example #10
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Fits the Scikit-learn transformer to the selected variables.

        If you enter None in the variables parameter, all variables will be
        automatically transformed by the OneHotEncoder, OrdinalEncoder or
        SimpleImputer. For the rest of the transformers, only the numerical variables
        will be selected and transformed.

        If you enter a list in the variables attribute, the SklearnTransformerWrapper
        will check that those variables exist in the dataframe and are of type
        numeric for all transformers except the OneHotEncoder, OrdinalEncoder or
        SimpleImputer, which also accept categorical variables.

        Parameters
        ----------
        X: Pandas DataFrame
            The dataset to fit the transformer
        y: pandas Series, default=None
            The target variable.

        Raises
        ------
         TypeError
            If the input is not a Pandas DataFrame

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        self.transformer_ = clone(self.transformer)

        if (self.transformer_.__class__.__name__ == "OneHotEncoder"
                and self.transformer_.sparse):
            raise AttributeError(
                "The SklearnTransformerWrapper can only wrap the OneHotEncoder if you "
                "set its sparse attribute to False")

        if self.transformer_.__class__.__name__ in [
                "OneHotEncoder",
                "OrdinalEncoder",
                "SimpleImputer",
        ]:
            self.variables_ = _find_all_variables(X, self.variables)

        else:
            self.variables_ = _find_or_check_numerical_variables(
                X, self.variables)

        self.transformer_.fit(X[self.variables_], y)

        self.n_features_in_ = X.shape[1]

        return self
Example #11
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Select features.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
           The input dataframe

        y : array-like of shape (n_samples)
           Target variable. Required to train the estimator.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find numerical variables or check variables entered by user
        self.variables = _find_or_check_numerical_variables(X, self.variables)

        self.feature_performance_ = {}

        # train a model for every feature and store the performance
        for feature in self.variables:
            model = cross_validate(
                self.estimator,
                X[feature].to_frame(),
                y,
                cv=self.cv,
                return_estimator=False,
                scoring=self.scoring,
            )

            self.feature_performance_[feature] = model["test_score"].mean()

        # select features
        if not self.threshold:
            threshold = pd.Series(self.feature_performance_).mean()
        else:
            threshold = self.threshold

        self.features_to_drop_ = [
            f for f in self.feature_performance_.keys()
            if self.feature_performance_[f] < threshold
        ]

        # check we are not dropping all the columns in the df
        if len(self.features_to_drop_) == len(X.columns):
            warnings.warn(
                "All features will be dropped, try changing the threshold.")

        self.input_shape_ = X.shape

        return self
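A standalone sketch of the per-feature loop above: one cross-validated model per feature, keeping the mean test score as that feature's performance. The dataset, estimator and feature subset are placeholders:

    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import cross_validate

    X, y = load_diabetes(return_X_y=True, as_frame=True)

    feature_performance = {}
    for feature in ["bmi", "bp"]:  # arbitrary subset for the sketch
        cv_results = cross_validate(
            LinearRegression(), X[feature].to_frame(), y, cv=3, scoring="r2"
        )
        feature_performance[feature] = cv_results["test_score"].mean()
    print(feature_performance)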
Example #12
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn any parameter.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y: pandas Series, default=None
            y is not needed in this transformer. You can pass y or None.

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame

        Returns
        -------
        self
        """
        X = _is_dataframe(X)

        # find variables to be capped
        if self.min_capping_dict is None and self.max_capping_dict:
            self.variables_ = [x for x in self.max_capping_dict.keys()]
        elif self.max_capping_dict is None and self.min_capping_dict:
            self.variables_ = [x for x in self.min_capping_dict.keys()]
        elif self.min_capping_dict and self.max_capping_dict:
            tmp = self.min_capping_dict.copy()
            tmp.update(self.max_capping_dict)
            self.variables_ = [x for x in tmp.keys()]

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        # find or check for numerical variables
        self.variables_ = _find_or_check_numerical_variables(
            X, self.variables_)

        if self.max_capping_dict is not None:
            self.right_tail_caps_ = self.max_capping_dict
        else:
            self.right_tail_caps_ = {}

        if self.min_capping_dict is not None:
            self.left_tail_caps_ = self.min_capping_dict
        else:
            self.left_tail_caps_ = {}

        self.n_features_in_ = X.shape[1]

        return self
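A standalone sketch of the variable-collection step above: the capped variables are the union of the keys of the two capping dictionaries:

    min_capping_dict = {"age": 10}
    max_capping_dict = {"age": 90, "income": 1_000_000}

    # Same merge as in the snippet: copy the min dict, update with the max dict.
    tmp = min_capping_dict.copy()
    tmp.update(max_capping_dict)
    print(list(tmp.keys()))  # ['age', 'income']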
Example #13
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Find the important features.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
           The input dataframe

        y : array-like of shape (n_samples)
           Target variable. Required to train the estimator.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find numerical variables or check variables entered by user
        self.variables = _find_or_check_numerical_variables(X, self.variables)

        # list to collect selected features
        self.selected_features_ = []

        self.feature_performance_ = {}

        # train a model for every feature
        for feature in self.variables:
            model = cross_validate(
                self.estimator,
                X[feature].to_frame(),
                y,
                cv=self.cv,
                return_estimator=False,
                scoring=self.scoring,
            )

            if model["test_score"].mean() > self.threshold:
                self.selected_features_.append(feature)

            self.feature_performance_[feature] = model["test_score"].mean()

        # check we are not dropping all the columns in the df
        if len(self.selected_features_) == 0:
            raise ValueError(
                "No features were selected, try changing the threshold.")

        self.input_shape_ = X.shape

        return self
Example #14
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        The `fit` method allows Scikit-learn transformers to learn the required
        parameters from the training data set.

        If transformer is OneHotEncoder, OrdinalEncoder or SimpleImputer,
        all variables indicated in the ``variables`` parameter will be transformed.
        When the variables parameter is None, the SklearnWrapper will automatically
        select and transform all features in the dataset, numerical or otherwise.

        For all other Scikit-learn transformers only numerical variables will be
        transformed. The SklearnWrapper will check that the variables indicated in the
        variables parameter are numerical, or alternatively, if variables is None, it
        will automatically select the numerical variables in the data set.

        Parameters
        ----------
        X : Pandas DataFrame
            The dataset to fit the transformer
        y : pandas Series, default=None
            This parameter exists only for compatibility with sklearn.pipeline.Pipeline.

        Raises
        ------
         TypeError
            If the input is not a Pandas DataFrame

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        if isinstance(self.transformer,
                      (OneHotEncoder, OrdinalEncoder, SimpleImputer)):
            self.variables = _find_all_variables(X, self.variables)

        else:
            self.variables = _find_or_check_numerical_variables(
                X, self.variables)

        self.transformer.fit(X[self.variables])

        self.input_shape_ = X.shape

        return self
Example #15
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Fits the Scikit-learn transformer to the selected variables.

        Parameters
        ----------
        X: Pandas DataFrame
            The dataset to fit the transformer.

        y: pandas Series, default=None
            The target variable.
        """

        # check input dataframe
        X = check_X(X)

        self.transformer_ = clone(self.transformer)

        if self.transformer_.__class__.__name__ in [
                "OneHotEncoder",
                "OrdinalEncoder",
                "SimpleImputer",
                "FunctionTransformer",
        ]:
            self.variables_ = _find_all_variables(X, self.variables)

        else:
            self.variables_ = _find_or_check_numerical_variables(
                X, self.variables)

        self.transformer_.fit(X[self.variables_], y)

        if self.transformer_.__class__.__name__ in _SELECTORS:
            # Find features to drop.
            selected = X[self.variables_].columns[
                self.transformer_.get_support()]
            self.features_to_drop_ = [
                f for f in self.variables_ if f not in selected
            ]

        # save input features
        self.feature_names_in_ = X.columns.tolist()

        self.n_features_in_ = X.shape[1]

        return self
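A standalone sketch of the selector branch above: get_support() marks the retained columns and everything else becomes features_to_drop_. SelectKBest stands in for whichever selector is wrapped:

    from sklearn.datasets import load_iris
    from sklearn.feature_selection import SelectKBest, f_classif

    X, y = load_iris(return_X_y=True, as_frame=True)
    selector = SelectKBest(f_classif, k=2).fit(X, y)

    # Columns flagged by get_support() are kept; the rest are dropped.
    selected = X.columns[selector.get_support()]
    features_to_drop_ = [f for f in X.columns if f not in selected]
    print(list(selected), features_to_drop_)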
Example #16
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn any parameter.

        Checks the dataframe and variables: the user-entered variables must be
        present in the train set and be of numerical type.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset. Can be the entire dataframe, not just the
            variables to be transformed.

        y: None
            y is not needed in this transformer. You can pass y or None.

        Raises
        ------
        TypeError
            - If the input is not a Pandas DataFrame
            - If any of the user provided variables are not numerical
        ValueError
            - If there are no numerical variables in the df or the df is empty
            - If the variable(s) contain null values

        Returns
        -------
        self
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        variables = [x for x in self.binning_dict.keys()]
        self.variables_ = _find_or_check_numerical_variables(X, variables)

        # check if dataset contains na or inf
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

        # for consistency with the rest of the discretisers, we add this attribute
        self.binner_dict_ = self.binning_dict

        self.n_features_in_ = X.shape[1]

        return self
Example #17
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the values at the end of the variable distribution.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas Series, default=None
            y is not needed in this imputation. You can pass None or y.
        """
        # check input dataframe
        X = check_X(X)

        # find or check for numerical variables
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        # estimate imputation values
        if self.imputation_method == "max":
            self.imputer_dict_ = (X[self.variables_].max() *
                                  self.fold).to_dict()

        elif self.imputation_method == "gaussian":
            if self.tail == "right":
                self.imputer_dict_ = (
                    X[self.variables_].mean() +
                    self.fold * X[self.variables_].std()).to_dict()
            elif self.tail == "left":
                self.imputer_dict_ = (
                    X[self.variables_].mean() -
                    self.fold * X[self.variables_].std()).to_dict()

        elif self.imputation_method == "iqr":
            IQR = X[self.variables_].quantile(0.75) - X[
                self.variables_].quantile(0.25)
            if self.tail == "right":
                self.imputer_dict_ = (X[self.variables_].quantile(0.75) +
                                      (IQR * self.fold)).to_dict()
            elif self.tail == "left":
                self.imputer_dict_ = (X[self.variables_].quantile(0.25) -
                                      (IQR * self.fold)).to_dict()

        self._get_feature_names_in(X)

        return self
Ejemplo n.º 18
0
    def fit(self,
            X: pd.DataFrame,
            y: Optional[pd.Series] = None) -> pd.DataFrame:
        """
        Checks that input is a dataframe, finds numerical variables, or alternatively
        checks that variables entered by the user are of type numerical.

        Parameters
        ----------
        X : Pandas DataFrame

        y : Pandas Series, np.array. Default = None
            Parameter is necessary for compatibility with sklearn Pipeline.

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame or a numpy array
            If any of the user provided variables are not numerical
        ValueError
            If there are no numerical variables in the df or the df is empty
            If the variable(s) contain null values

        Returns
        -------
        X : Pandas DataFrame
            The same dataframe entered as parameter
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables_: List[Union[str,
                                    int]] = _find_or_check_numerical_variables(
                                        X, self.variables)

        # check if dataset contains na or inf
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

        return X
Example #19
def test_find_or_check_numerical_variables(df_vartypes, df_numeric_columns):
    vars_num = ["Age", "Marks"]
    var_num = "Age"
    vars_mix = ["Age", "Marks", "Name"]
    vars_none = None

    assert _find_or_check_numerical_variables(df_vartypes, vars_num) == vars_num
    assert _find_or_check_numerical_variables(df_vartypes, var_num) == ["Age"]
    assert _find_or_check_numerical_variables(df_vartypes, vars_none) == vars_num

    with pytest.raises(TypeError):
        assert _find_or_check_numerical_variables(df_vartypes, vars_mix)

    with pytest.raises(ValueError):
        assert _find_or_check_numerical_variables(df_vartypes[["Name", "City"]], None)

    assert _find_or_check_numerical_variables(df_numeric_columns, [2, 3]) == [2, 3]
    assert _find_or_check_numerical_variables(df_numeric_columns, 2) == [2]
Example #20
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn any parameter.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y: pandas Series, or np.array. Default=None.
            It is not needed in this transformer. You can pass y or None.
        """
        # Common checks and attributes
        X = super().fit(X, y)

        # check variables are numerical
        self.reference = _find_or_check_numerical_variables(X, self.reference)

        return self
Example #21
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the mean or median values.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas series or None, default=None
            y is not needed in this imputation. You can pass None or y.

        Raises
        ------
        TypeError
            - If the input is not a Pandas DataFrame
            - If any of the user provided variables are not numerical
        ValueError
            If there are no numerical variables in the df or the df is empty

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        # find imputation parameters: mean or median
        if self.imputation_method == "mean":
            self.imputer_dict_ = X[self.variables_].mean().to_dict()

        elif self.imputation_method == "median":
            self.imputer_dict_ = X[self.variables_].median().to_dict()

        self.n_features_in_ = X.shape[1]

        return self
Example #22
    def _select_variables_from_dict(self, X: pd.DataFrame,
                                    user_dict_: Dict) -> pd.DataFrame:
        """
        Checks that input is a dataframe, checks that variables in the dictionary
        entered by the user are of type numerical.

        Parameters
        ----------
        X : Pandas DataFrame

        user_dict_ : Dictionary. Default = None
            Any dictionary allowed by the transformer and entered by user.

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame or a numpy array
            If any of the variables in the dictionary are not numerical
        ValueError
            If there are no numerical variables in the df or the df is empty
            If the variable(s) contain null values

        Returns
        -------
        X : Pandas DataFrame
            The same dataframe entered as parameter
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        variables = [x for x in user_dict_.keys()]
        self.variables_ = _find_or_check_numerical_variables(X, variables)

        # check if dataset contains na or inf
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

        return X
Example #23
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Find features with high PSI values.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y : pandas series. Default = None
            y is not needed in this transformer. You can pass y or None.
        """
        # check input dataframe
        X = check_X(X)

        # If required, exclude variables that are not in the input dataframe
        self._confirm_variables(X)

        # find numerical variables or check those entered are present in the dataframe
        self.variables_ = _find_or_check_numerical_variables(
            X, self.variables_)

        # Remove the split_col from the variables list. It might be added if the
        # variables are not defined at initialization.
        if self.split_col in self.variables_:
            self.variables_.remove(self.split_col)

        if self.missing_values == "raise":
            # check if dataset contains na or inf
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        # Split the dataframe into basis and test.
        basis_df, test_df = self._split_dataframe(X)

        # Check the shape of the returned dataframes for PSI calculations.
        # The number of observations must be at least equal to the
        # number of bins.
        if min(basis_df.shape[0], test_df.shape[0]) < self.bins:
            raise ValueError(
                "The number of rows in the basis and test datasets that will be used "
                f"in the PSI calculations must be at least equal to {self.bins}. "
                "After splitting the original dataset based on the given cut_off or "
                f"split_frac we have {basis_df.shape[0]} samples in the basis set, "
                f"and {test_df.shape[0]} samples in the test set. "
                "Please adjust the value of the cut_off or split_frac.")

        # Switch basis and test dataframes if required.
        if self.switch:
            test_df, basis_df = basis_df, test_df

        # set up the discretizer
        if self.strategy == "equal_width":
            bucketer = EqualWidthDiscretiser(bins=self.bins)
        else:
            bucketer = EqualFrequencyDiscretiser(q=self.bins)

        # Compute the PSI by looping over the features
        self.psi_values_ = {}
        self.features_to_drop_ = []

        for feature in self.variables_:
            # Discretize the features.

            basis_discrete = bucketer.fit_transform(basis_df[[feature]].dropna())
            test_discrete = bucketer.transform(test_df[[feature]].dropna())

            # Determine percentage of observations per bin
            basis_distrib, test_distrib = self._observation_frequency_per_bin(
                basis_discrete, test_discrete)

            # Calculate the PSI value
            self.psi_values_[feature] = np.sum(
                (test_distrib - basis_distrib) *
                np.log(test_distrib / basis_distrib))
            # Assess if feature should be dropped
            if self.psi_values_[feature] > self.threshold:
                self.features_to_drop_.append(feature)

        # save input features
        self._get_feature_names_in(X)

        return self
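A numeric illustration of the PSI formula used above, with made-up bin frequencies:

    import numpy as np

    # PSI = sum over bins of (test_i - basis_i) * ln(test_i / basis_i)
    basis_distrib = np.array([0.25, 0.25, 0.25, 0.25])
    test_distrib = np.array([0.10, 0.20, 0.30, 0.40])
    psi = np.sum((test_distrib - basis_distrib) * np.log(test_distrib / basis_distrib))
    print(round(psi, 4))  # 0.2282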
Example #24
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Find the important features.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
           The input dataframe
        y : array-like of shape (n_samples)
           Target variable. Required to train the estimator.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # reset the index
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)

        # find numerical variables or check variables entered by user
        self.variables = _find_or_check_numerical_variables(X, self.variables)

        # train model with all features and cross-validation
        model = cross_validate(
            self.estimator,
            X,
            y,
            cv=self.cv,
            return_estimator=True,
            scoring=self.scoring,
        )

        # store initial model performance
        self.initial_model_performance_ = model["test_score"].mean()

        # get performance metric
        scorer = get_scorer(self.scoring)

        # dict to collect features and their performance_drift after shuffling
        self.performance_drifts_ = {}

        # list to collect selected features
        self.selected_features_ = []

        # shuffle features and save feature performance drift into a dict
        for feature in self.variables:

            X_shuffled = X.copy()

            # shuffle individual feature
            X_shuffled[feature] = (
                X_shuffled[feature].sample(frac=1).reset_index(drop=True)
            )

            # determine the performance with the shuffled feature
            performance = np.mean(
                [scorer(m, X_shuffled, y) for m in model["estimator"]]
            )

            # determine drift in performance
            # Note, sklearn negates the log and error scores, so there is no
            # need to manually do the inversion.
            # https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules
            performance_drift = self.initial_model_performance_ - performance

            # Save feature and performance drift
            self.performance_drifts_[feature] = performance_drift

        # select features
        for feature in self.performance_drifts_.keys():

            if self.performance_drifts_[feature] > self.threshold:

                self.selected_features_.append(feature)

        self.input_shape_ = X.shape

        return self
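A standalone sketch of the shuffling step above: one column is permuted while every other column keeps its alignment with y:

    import pandas as pd

    X = pd.DataFrame({"a": [1, 2, 3, 4], "b": [10, 20, 30, 40]})
    X_shuffled = X.copy()

    # sample(frac=1) permutes the column; reset_index re-aligns it by position.
    X_shuffled["a"] = (
        X_shuffled["a"].sample(frac=1, random_state=0).reset_index(drop=True)
    )
    print(X_shuffled)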
Example #25
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Find the correlated features.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y : pandas series. Default = None
            y is not needed in this transformer. You can pass y or None.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find all numerical variables or check those entered are in the dataframe
        self.variables = _find_or_check_numerical_variables(X, self.variables)

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables)

        # set to collect features that are correlated
        self.correlated_features_ = set()

        # create tuples of correlated feature groups
        self.correlated_feature_sets_ = []

        # the correlation matrix
        self.correlated_matrix_ = X[self.variables].corr(method=self.method)

        # create set of examined features, helps to determine feature combinations
        # to evaluate below
        _examined_features = set()

        # for each feature in the dataset (columns of the correlation matrix)
        for feature in self.correlated_matrix_.columns:

            if feature not in _examined_features:

                # append so we can exclude when we create the combinations
                _examined_features.add(feature)

                # here we collect potentially correlated features
                # we need this for the correlated groups sets
                _temp_set = set([feature])

                # features that have not been examined, are not currently examined and
                # were not found correlated
                _features_to_compare = [
                    f for f in self.correlated_matrix_.columns
                    if f not in _examined_features
                ]

                # create combinations:
                for f2 in _features_to_compare:

                    # if the correlation is higher than the threshold
                    # we are interested in absolute correlation coefficient value
                    if abs(self.correlated_matrix_.loc[
                            f2, feature]) > self.threshold:

                        # add feature (f2) to our correlated set
                        self.correlated_features_.add(f2)
                        _temp_set.add(f2)
                        _examined_features.add(f2)

                # if there are correlated features
                if len(_temp_set) > 1:
                    self.correlated_feature_sets_.append(_temp_set)

        self.input_shape_ = X.shape

        return self
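A toy illustration of the grouping rule above: "b" is an exact multiple of "a", so their absolute correlation exceeds any reasonable threshold and both land in one correlated set:

    import pandas as pd

    X = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [4, 1, 3, 2]})
    corr = X.corr(method="pearson")
    print(abs(corr.loc["b", "a"]) > 0.8)  # True -> {"a", "b"} form a group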
Example #26
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn parameters.

        Perform dataframe checks. Creates dictionary of operation to new feature
        name pairs.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y : pandas Series, or np.array. Defaults to None.
            It is not needed in this transformer. You can pass y or None.

        Raises
        ------
        TypeError
           - If the input is not a Pandas DataFrame
           - If any user provided variables in variables_to_combine are not numerical
        ValueError
           If the variable(s) contain null values

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # check variables to combine are numerical
        self.variables_to_combine = _find_or_check_numerical_variables(
            X, self.variables_to_combine
        )

        # check if dataset contains na
        _check_contains_na(X, self.variables_to_combine)

        if self.math_operations is None:
            self.math_operations_ = ["sum", "prod", "mean", "std", "max", "min"]
        else:
            self.math_operations_ = self.math_operations

        # dictionary of new_variable_name to operation pairs
        if self.new_variables_names:
            self.combination_dict_ = dict(
                zip(self.new_variables_names, self.math_operations_)
            )
        else:
            if all(isinstance(var, str) for var in self.variables_to_combine):
                vars_ls = self.variables_to_combine
            else:
                vars_ls = [str(var) for var in self.variables_to_combine]

            self.combination_dict_ = {
                f"{operation}({'-'.join(vars_ls)})": operation  # type: ignore
                for operation in self.math_operations_
            }

        self.input_shape_ = X.shape

        return self
Example #27
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Find the important features. Note that the selector trains various models at
        each round of selection, so it might take a while.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
           The input dataframe
        y : array-like of shape (n_samples)
           Target variable. Required to train the estimator.


        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find numerical variables or check variables entered by user
        self.variables = _find_or_check_numerical_variables(X, self.variables)

        # train model with all features and cross-validation
        model = cross_validate(
            self.estimator,
            X[self.variables],
            y,
            cv=self.cv,
            scoring=self.scoring,
            return_estimator=True,
        )

        # store initial model performance
        self.initial_model_performance_ = model["test_score"].mean()

        # Initialize a dataframe that will contain the list of the feature/coeff
        # importance for each cross validation fold
        feature_importances_cv = pd.DataFrame()

        # Populate the feature_importances_cv dataframe with columns containing
        # the feature importance values for each model returned by the cross
        # validation.
        # There are as many columns as folds.
        for m in model["estimator"]:

            feature_importances_cv[m] = get_feature_importances(m)

        # Add the variables as index to feature_importances_cv
        feature_importances_cv.index = self.variables

        # Aggregate the feature importance returned in each fold
        self.feature_importances_ = feature_importances_cv.mean(axis=1)

        # Sort the feature importance values
        self.feature_importances_.sort_values(ascending=True, inplace=True)

        # to collect selected features
        _selected_features = []

        # temporary copy where we will remove features recursively
        X_tmp = X[self.variables].copy()

        # we need to update the performance as we remove features
        baseline_model_performance = self.initial_model_performance_

        # dict to collect features and their performance_drift after shuffling
        self.performance_drifts_ = {}

        # evaluate every feature, starting from the least important
        # remember that feature_importances_ is ordered already
        for feature in list(self.feature_importances_.index):

            # remove feature and train new model
            model_tmp = cross_validate(
                self.estimator,
                X_tmp.drop(columns=feature),
                y,
                cv=self.cv,
                scoring=self.scoring,
                return_estimator=False,
            )

            # assign new model performance
            model_tmp_performance = model_tmp["test_score"].mean()

            # Calculate performance drift
            performance_drift = baseline_model_performance - model_tmp_performance

            # Save feature and performance drift
            self.performance_drifts_[feature] = performance_drift

            if performance_drift > self.threshold:

                _selected_features.append(feature)

            else:
                # remove feature and adjust initial performance
                X_tmp = X_tmp.drop(columns=feature)

                baseline_model = cross_validate(
                    self.estimator,
                    X_tmp,
                    y,
                    cv=self.cv,
                    return_estimator=False,
                    scoring=self.scoring,
                )

                # store initial model performance
                baseline_model_performance = baseline_model["test_score"].mean()

        self.features_to_drop_ = [
            f for f in self.variables if f not in _selected_features
        ]

        self.input_shape_ = X.shape

        return self
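A standalone sketch of the importance-aggregation step above. get_feature_importances is a library helper not shown in the snippet; absolute Lasso coefficients stand in for it here:

    import pandas as pd
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import Lasso
    from sklearn.model_selection import cross_validate

    X, y = load_diabetes(return_X_y=True, as_frame=True)
    model = cross_validate(Lasso(alpha=0.1), X, y, cv=3, return_estimator=True)

    # One column of importances per fold, averaged across folds, then sorted
    # ascending so elimination starts from the least important feature.
    feature_importances_cv = pd.DataFrame(
        {fold: abs(m.coef_) for fold, m in enumerate(model["estimator"])}
    )
    feature_importances_cv.index = X.columns
    print(feature_importances_cv.mean(axis=1).sort_values().head())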
Example #28
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Find the correlated feature groups. Determine which feature should be selected
        from each group.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas series. Default = None
            y is needed if selection_method == 'model_performance'.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find all numerical variables or check those entered are in the dataframe
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        if self.selection_method == "model_performance" and y is None:
            raise ValueError("y is needed to fit the transformer")

        # FIND CORRELATED FEATURES
        # ========================
        # create tuples of correlated feature groups
        self.correlated_feature_sets_ = []

        # the correlation matrix
        _correlated_matrix = X[self.variables_].corr(method=self.method)

        # create set of examined features, helps to determine feature combinations
        # to evaluate below
        _examined_features = set()

        # for each feature in the dataset (columns of the correlation matrix)
        for feature in _correlated_matrix.columns:

            if feature not in _examined_features:

                # append so we can exclude when we create the combinations
                _examined_features.add(feature)

                # here we collect potentially correlated features
                # we need this for the correlated groups sets
                _temp_set = set([feature])

                # features that have not been examined, are not currently examined and
                # were not found correlated
                _features_to_compare = [
                    f for f in _correlated_matrix.columns
                    if f not in _examined_features
                ]

                # create combinations:
                for f2 in _features_to_compare:

                    # if the correlation is higher than the threshold
                    # we are interested in absolute correlation coefficient value
                    if abs(_correlated_matrix.loc[f2,
                                                  feature]) > self.threshold:
                        # add feature (f2) to our correlated set
                        _temp_set.add(f2)
                        _examined_features.add(f2)

                # if there are correlated features
                if len(_temp_set) > 1:
                    self.correlated_feature_sets_.append(_temp_set)

        # SELECT 1 FEATURE FROM EACH GROUP
        # ================================

        # list to collect selected features
        # we start it with all features that were either not examined, i.e., categorical
        # variables, or not found correlated
        _selected_features = [
            f for f in X.columns
            if f not in set().union(*self.correlated_feature_sets_)
        ]

        # select the feature with least missing values
        if self.selection_method == "missing_values":
            for feature_group in self.correlated_feature_sets_:
                f = X[feature_group].isnull().sum().sort_values(
                    ascending=True).index[0]
                _selected_features.append(f)

        # select the feature with most unique values
        elif self.selection_method == "cardinality":
            for feature_group in self.correlated_feature_sets_:
                f = X[feature_group].nunique().sort_values(
                    ascending=False).index[0]
                _selected_features.append(f)

        # select the feature with biggest variance
        elif self.selection_method == "variance":
            for feature_group in self.correlated_feature_sets_:
                f = X[feature_group].std().sort_values(
                    ascending=False).index[0]
                _selected_features.append(f)

        # select best performing feature according to estimator
        else:
            for feature_group in self.correlated_feature_sets_:

                # feature_group = list(feature_group)
                temp_perf = []

                # train a model for every feature
                for feature in feature_group:
                    model = cross_validate(
                        self.estimator,
                        X[feature].to_frame(),
                        y,
                        cv=self.cv,
                        return_estimator=False,
                        scoring=self.scoring,
                    )

                    temp_perf.append(model["test_score"].mean())

                # select best performing feature from group
                f = list(feature_group)[temp_perf.index(max(temp_perf))]
                _selected_features.append(f)

        self.features_to_drop_ = [
            f for f in self.variables_ if f not in _selected_features
        ]

        self.n_features_in_ = X.shape[1]

        return self
Example #29
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the values that should be used to replace outliers.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y : pandas Series, default=None
            y is not needed in this transformer. You can pass y or None.
        """

        # check input dataframe
        X = check_X(X)

        # find or check for numerical variables
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        self.right_tail_caps_ = {}
        self.left_tail_caps_ = {}

        # estimate the end values
        if self.tail in ["right", "both"]:
            if self.capping_method == "gaussian":
                self.right_tail_caps_ = (
                    X[self.variables_].mean() +
                    self.fold * X[self.variables_].std()).to_dict()

            elif self.capping_method == "iqr":
                IQR = X[self.variables_].quantile(0.75) - X[
                    self.variables_].quantile(0.25)
                self.right_tail_caps_ = (X[self.variables_].quantile(0.75) +
                                         (IQR * self.fold)).to_dict()

            elif self.capping_method == "quantiles":
                self.right_tail_caps_ = (
                    X[self.variables_].quantile(1 - self.fold).to_dict())

        if self.tail in ["left", "both"]:
            if self.capping_method == "gaussian":
                self.left_tail_caps_ = (
                    X[self.variables_].mean() -
                    self.fold * X[self.variables_].std()).to_dict()

            elif self.capping_method == "iqr":
                IQR = X[self.variables_].quantile(0.75) - X[
                    self.variables_].quantile(0.25)
                self.left_tail_caps_ = (X[self.variables_].quantile(0.25) -
                                        (IQR * self.fold)).to_dict()

            elif self.capping_method == "quantiles":
                self.left_tail_caps_ = X[self.variables_].quantile(
                    self.fold).to_dict()

        self.feature_names_in_ = X.columns.to_list()
        self.n_features_in_ = X.shape[1]

        return self
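A numeric illustration of the "quantiles" branch above with fold=0.05: the left cap is the 5th percentile and the right cap the 95th:

    import pandas as pd

    X = pd.DataFrame({"age": range(1, 101)})
    fold = 0.05
    print(X["age"].quantile(fold), X["age"].quantile(1 - fold))  # 5.95 95.05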
Example #30
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Find the important features. Note that the selector trains various models at
        each round of selection, so it might take a while.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
           The input dataframe

        y : array-like of shape (n_samples)
           Target variable. Required to train the estimator.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find numerical variables or check variables entered by user
        self.variables = _find_or_check_numerical_variables(X, self.variables)

        # train model with all features and cross-validation
        model = cross_validate(
            self.estimator,
            X[self.variables],
            y,
            cv=self.cv,
            scoring=self.scoring,
            return_estimator=True,
        )

        # store initial model performance
        self.initial_model_performance_ = model["test_score"].mean()

        # Initialize a dataframe that will contain the list of the feature/coeff
        # importance for each cross validation fold
        feature_importances_cv = pd.DataFrame()

        # Populate the feature_importances_cv dataframe with columns containing
        # the feature importance values for each model returned by the cross
        # validation.
        # There are as many columns as folds.
        for m in model["estimator"]:

            feature_importances_cv[m] = get_feature_importances(m)

        # Add the variables as index to feature_importances_cv
        feature_importances_cv.index = self.variables

        # Aggregate the feature importance returned in each fold
        self.feature_importances_ = feature_importances_cv.mean(axis=1)

        # Sort the feature importance values decreasingly
        self.feature_importances_.sort_values(ascending=False, inplace=True)

        # Extract most important feature from the ordered list of features
        first_most_important_feature = list(self.feature_importances_.index)[0]

        # Run baseline model using only the most important feature
        baseline_model = cross_validate(
            self.estimator,
            X[first_most_important_feature].to_frame(),
            y,
            cv=self.cv,
            scoring=self.scoring,
            return_estimator=True,
        )

        # Save baseline model performance
        baseline_model_performance = baseline_model["test_score"].mean()

        # list to collect selected features
        # It is initialized with the most important feature
        _selected_features = [first_most_important_feature]

        # dict to collect features and their performance_drift
        # It is initialized with the performance drift of
        # the most important feature
        self.performance_drifts_ = {first_most_important_feature: 0}

        # loop over the ordered list of features by feature importance starting
        # from the second element in the list.
        for feature in list(self.feature_importances_.index)[1:]:

            # Add feature and train new model
            model_tmp = cross_validate(
                self.estimator,
                X[_selected_features + [feature]],
                y,
                cv=self.cv,
                scoring=self.scoring,
                return_estimator=True,
            )

            # assign new model performance
            model_tmp_performance = model_tmp["test_score"].mean()

            # Calculate performance drift
            performance_drift = model_tmp_performance - baseline_model_performance

            # Save feature and performance drift
            self.performance_drifts_[feature] = performance_drift

            # If adding the feature improves performance beyond the threshold
            if performance_drift > self.threshold:

                # add feature to the list of selected features
                _selected_features.append(feature)

                # Update new baseline model performance
                baseline_model_performance = model_tmp_performance

        self.features_to_drop_ = [
            f for f in self.variables if f not in _selected_features
        ]

        self.input_shape_ = X.shape

        return self
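A standalone sketch of the selection rule above, with made-up scores: a feature is kept only when adding it lifts performance by more than the threshold, and the baseline moves each time a feature is kept:

    threshold = 0.01
    baseline = 0.80  # performance of the most important feature alone
    candidate_scores = {"f2": 0.83, "f3": 0.805}  # hypothetical CV scores

    selected = ["f1"]  # initialized with the most important feature
    for feature, score in candidate_scores.items():
        if score - baseline > threshold:
            selected.append(feature)
            baseline = score  # move the baseline when a feature is kept
    print(selected)  # ['f1', 'f2']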