def test_check_X_converts_numpy_to_pandas():
    """check_X accepts a 2D numpy array and rejects 1D and 3D arrays."""
    one_dim = np.array([1, 2, 3, 4])
    two_dim = np.array([[1, 2], [3, 4]])
    three_dim = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])

    # A 2D array is expected to come back as a dataframe whose columns are
    # named "0" and "1".
    expected = pd.DataFrame(two_dim, columns=["0", "1"])
    assert_frame_equal(expected, check_X(two_dim))

    # Arrays with any other number of dimensions must raise.
    for bad_input in (three_dim, one_dim):
        with pytest.raises(ValueError):
            check_X(bad_input)
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Run the set-up steps shared by the creation transformers.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y: pandas Series, or np.array. Defaults to None.
            It is not needed in this transformer. You can pass y or None.

        Returns
        -------
        X: pandas dataframe
            The dataframe returned by `check_X`.
        """
        X = check_X(X)

        # Resolve (or validate) the numerical variables and store them back on
        # the instance.
        numerical_vars: List[Union[str, int]] = _find_or_check_numerical_variables(
            X, self.variables
        )
        self.variables = numerical_vars

        # The NA / inf checks only run when the user asked to raise on them.
        if self.missing_values == "raise":
            for run_check in (_check_contains_na, _check_contains_inf):
                run_check(X, self.variables)

        # Remember the training columns and how many there were.
        self.feature_names_in_ = X.columns.tolist()
        self.n_features_in_ = X.shape[1]

        return X
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Remove the features stored in `features_to_drop_` from the dataframe.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features].
            The input dataframe.

        Returns
        -------
        X_new: pandas dataframe of shape = [n_samples, n_selected_features]
            Pandas dataframe with the selected features.
        """
        # The transformer must have been fitted first.
        check_is_fitted(self)

        # Validate the input and compare it against the training set size.
        X = check_X(X)
        _check_X_matches_training_df(X, self.n_features_in_)

        # Put the columns in training order, then remove the flagged ones.
        reordered = X[self.feature_names_in_]
        return reordered.drop(columns=self.features_to_drop_)
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the categories used to represent each categorical variable.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset. Can be the entire dataframe, not just the
            variables to be transformed.

        y: pandas Series, default = None
            y is not needed in this encoder. You can pass y or None.

        Returns
        -------
        self
        """
        X = check_X(X)
        self._check_or_select_variables(X)

        # Only look for missing data when the user asked to raise on it.
        if self.missing_values == "raise":
            _check_contains_na(X, self.variables_)

        self._get_feature_names_in(X)

        # For each variable, keep the categories pandas derives from the column.
        self.category_dict_ = {}
        for variable in self.variables_:
            as_categorical = pd.Categorical(X[variable])
            self.category_dict_[variable] = as_categorical.categories

        return self
Exemple #5
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Set up the imputation values. No parameter is learned from the data.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: None
            y is not needed in this imputation. You can pass None or y.

        Returns
        -------
        self
        """
        X = check_X(X)

        if not self.imputer_dict:
            # No dictionary given: every selected variable gets the same
            # arbitrary number.
            self.variables_ = _find_or_check_numerical_variables(X, self.variables)
            self.imputer_dict_ = dict.fromkeys(
                self.variables_, self.arbitrary_number
            )
        else:
            # The dictionary keys name the variables; its values are used as is.
            self.variables_ = _find_or_check_numerical_variables(
                X, self.imputer_dict.keys()  # type: ignore
            )
            self.imputer_dict_ = self.imputer_dict

        self._get_feature_names_in(X)

        return self
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Validate the training data. No parameter is learned.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas Series, default=None
            y is not needed in this transformer. You can pass None or y.

        Returns
        -------
        self
        """
        X = check_X(X)

        # The index needs unique, non-missing values; otherwise merging the new
        # features back would duplicate rows.
        self._check_index(X)

        # Resolve (or validate) the numerical variables.
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        # NA / inf checks only run when the user asked to raise on them.
        if self.missing_values == "raise":
            self._check_na_and_inf(X)

        self._get_feature_names_in(X)

        return self
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Select the variables for which missing indicators will be created.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas Series, default=None
            y is not needed in this imputation. You can pass None or y.

        Returns
        -------
        self
        """
        X = check_X(X)

        # Start from all the candidate variables.
        candidates = _find_all_variables(X, self.variables)
        self.variables_ = candidates

        # Optionally keep only those that show missing data in the train set.
        if self.missing_only is True:
            self.variables_ = [
                name for name in candidates if X[name].isnull().any()
            ]

        self._get_feature_names_in(X)

        return self
    def _transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Run the checks shared by the transform methods:

        - The transformer was fitted
        - The input is a dataframe
        - The input has the same number of columns as the train set
        - The columns are put in the order seen during fit

        Parameters
        ----------
        X: Pandas DataFrame

        Returns
        -------
        X: Pandas DataFrame
            The checked dataframe with its columns in training order.
        """
        check_is_fitted(self)

        checked = check_X(X)
        _check_X_matches_training_df(checked, self.n_features_in_)

        # Select the columns in the order recorded during fit.
        return checked[self.feature_names_in_]
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the counts or frequencies which will be used to replace the categories.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset. Can be the entire dataframe, not just the
            variables to be transformed.

        y: pandas Series, default = None
            y is not needed in this encoder. You can pass y or None.

        Returns
        -------
        self
        """
        X = check_X(X)
        self._fit(X)
        self._get_feature_names_in(X)

        self.encoder_dict_ = {}

        # With errors="encode" the mappings are defaultdicts whose missing keys
        # resolve to 0. `int` is used as the factory instead of a lambda so the
        # learned mappings (and therefore the fitted encoder) remain picklable.
        dct_init = defaultdict(int) if self.errors == "encode" else {}

        # Learn the encoding maps. `into` is passed by keyword so the call stays
        # valid on pandas versions where the argument is keyword-only.
        for var in self.variables_:
            if self.encoding_method == "count":
                self.encoder_dict_[var] = X[var].value_counts().to_dict(
                    into=dct_init
                )

            elif self.encoding_method == "frequency":
                self.encoder_dict_[var] = (
                    X[var].value_counts(normalize=True).to_dict(into=dct_init)
                )

        self._check_encoding_dictionary()

        return self
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Select the variables whose missing data will decide if a row is dropped.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training data set.

        y: pandas Series, default=None
            y is not needed in this imputation. You can pass None or y.

        Returns
        -------
        self
        """
        X = check_X(X)

        # Start from all the candidate variables.
        candidates = _find_all_variables(X, self.variables)
        self.variables_ = candidates

        # `missing_only` is honoured only when no threshold was given.
        restrict_to_missing = self.threshold is None and self.missing_only is True
        if restrict_to_missing:
            self.variables_ = [
                name for name in candidates if X[name].isnull().any()
            ]

        self._get_feature_names_in(X)

        return self
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the mean or median of each variable.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas series or None, default=None
            y is not needed in this imputation. You can pass None or y.

        Returns
        -------
        self
        """
        X = check_X(X)

        # Resolve (or validate) the numerical variables.
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        # "mean" and "median" are both dataframe methods of the same name, so
        # the statistic is looked up by name. Other values leave
        # `imputer_dict_` untouched.
        if self.imputation_method in ("mean", "median"):
            statistic = getattr(X[self.variables_], self.imputation_method)
            self.imputer_dict_ = statistic().to_dict()

        self._get_feature_names_in(X)

        return self
Exemple #12
0
    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Validate the features to drop. No parameter is learned.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The input dataframe
        y : pandas Series, default = None
            y is not needed for this transformer. You can pass y or None.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the number of features to drop equals the number of columns.
        """
        X = check_X(X)

        # Indexing with the requested features lets pandas complain if any of
        # them is not a column of the dataframe; the result itself is unused.
        _ = X[self.features_to_drop]

        self.features_to_drop_ = self.features_to_drop

        # Refuse to remove every column of the dataframe.
        n_to_drop = len(self.features_to_drop)
        if n_to_drop == len(X.columns):
            raise ValueError(
                "The resulting dataframe will have no columns after dropping all "
                "existing variables"
            )

        self._get_feature_names_in(X)

        return self
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Run the input and transformer checks shared by the transform methods.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X_new: Pandas dataframe
            The checked dataframe with its columns in training order.
        """
        check_is_fitted(self)

        checked = check_X(X)

        # The input must have as many columns as the dataframe used in fit.
        _check_X_matches_training_df(checked, self.n_features_in_)

        # NA / inf checks only run when the user asked to raise on them.
        if self.missing_values == "raise":
            for run_check in (_check_contains_na, _check_contains_inf):
                run_check(checked, self.variables)

        # Select the columns in the order recorded during fit.
        return checked[self.feature_names_in_]
    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Find constant and quasi-constant features.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The input dataframe.
        y: None
            y is not needed for this transformer. You can pass y or None.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If every column of the dataframe would be dropped.
        """
        X = check_X(X)

        # Let the transformer adjust the variable list to the input first, then
        # resolve the final set of variables to examine.
        self._confirm_variables(X)
        self.variables_ = _find_all_variables(X, self.variables_)

        if self.missing_values == "raise":
            _check_contains_na(X, self.variables_)

        # With "include", missing data is turned into a category of its own.
        if self.missing_values == "include":
            X[self.variables_] = X[self.variables_].fillna("missing_values")

        if self.tol == 1:
            # Strictly constant features: a single distinct value.
            self.features_to_drop_ = [
                name for name in self.variables_ if X[name].nunique() == 1
            ]
        else:
            # Constant and quasi-constant features: the share of rows taken by
            # the most frequent value reaches the tolerance.
            n_rows = float(len(X))
            self.features_to_drop_ = []

            for name in self.variables_:
                shares = X[name].value_counts() / n_rows
                predominant = shares.sort_values(ascending=False).values[0]

                if predominant >= self.tol:
                    self.features_to_drop_.append(name)

        # Refuse to remove every column of the dataframe.
        if len(self.features_to_drop_) == len(X.columns):
            raise ValueError(
                "The resulting dataframe will have no columns after dropping all "
                "constant or quasi-constant features. Try changing the tol value."
            )

        self._get_feature_names_in(X)

        return self
Exemple #15
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Validate the input and build the mapping from new feature name to
        mathematical operation. No parameter is learned from the data.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y: pandas Series, or np.array. Defaults to None.
            It is not needed in this transformer. You can pass y or None.

        Returns
        -------
        self
        """
        X = check_X(X)

        # The variables to combine must be numerical.
        self.variables_to_combine = _find_or_check_numerical_variables(
            X, self.variables_to_combine
        )

        # NA / inf checks only run when the user asked to raise on them.
        if self.missing_values == "raise":
            for run_check in (_check_contains_na, _check_contains_inf):
                run_check(X, self.variables_to_combine)

        # Fall back to the full list of operations when none was given.
        if self.math_operations is not None:
            self.math_operations_ = self.math_operations
        else:
            self.math_operations_ = ["sum", "prod", "mean", "std", "max", "min"]

        if self.new_variables_names:
            # User-provided names are paired with the operations positionally.
            self.combination_dict_ = dict(
                zip(self.new_variables_names, self.math_operations_)
            )
        else:
            # Default names look like "sum(var1-var2)"; variable names are
            # stringified so that non-string labels can be joined too.
            joined_names = "-".join(str(var) for var in self.variables_to_combine)
            self.combination_dict_ = {
                f"{operation}({joined_names})": operation
                for operation in self.math_operations_
            }

        # Remember the training columns and how many there were.
        self.feature_names_in_ = X.columns.tolist()
        self.n_features_in_ = X.shape[1]

        return self
Exemple #16
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Find the datetime variables, or check that the ones selected by the user
        can be treated as datetime. No parameter is learned from the data.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y: pandas Series, default=None
            It is not needed in this transformer. You can pass y or None.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If `variables` is "index" and the index does not pass the datetime
            checks.
        """
        X = check_X(X)

        if self.variables != "index":
            # Regular case: work with dataframe columns.
            self.variables_ = _find_or_check_datetime_variables(X, self.variables)

            if self.missing_values == "raise":
                _check_contains_na(X, self.variables_)

        else:
            # Special case: the features are extracted from the index.
            index = X.index

            # The index is rejected when it is not datetime and it is either
            # numeric or fails the categorical-datetime check.
            if not is_datetime(index) and (
                is_numeric(index) or not _is_categorical_and_is_datetime(index)
            ):
                raise TypeError("The dataframe index is not datetime.")

            if self.missing_values == "raise":
                self._check_index_contains_na(index)

            self.variables_ = None

        # None selects the default features, any string selects all supported
        # features, anything else is taken as given.
        requested = self.features_to_extract
        if requested is None:
            self.features_to_extract_ = FEATURES_DEFAULT
        elif isinstance(requested, str):
            self.features_to_extract_ = FEATURES_SUPPORTED
        else:
            self.features_to_extract_ = requested

        # Remember the training columns and how many there were.
        self.feature_names_in_ = X.columns.tolist()
        self.n_features_in_ = X.shape[1]

        return self
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Cap the variable values. Optionally, add outlier indicators.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to be transformed.

        Returns
        -------
        X_new: pandas dataframe of shape = [n_samples, n_features + n_ind]
            The dataframe with the capped variables and indicators.
            The number of output variables depends on the values for 'tail' and
            'add_indicators': if passing 'add_indicators=False', will be equal
            to 'n_features', otherwise, will have an additional indicator column
            per processed feature for each tail.
        """
        # Without indicators the parent transformation is the whole job.
        if not self.add_indicators:
            return super().transform(X)

        checked = check_X(X)
        capped = super().transform(checked)

        # Compare values before and after the parent transformation, only on
        # the processed variables.
        before = checked[self.variables_]
        after = capped[self.variables_]

        # A value that went up was capped on the left tail; one that went down
        # was capped on the right tail.
        if self.tail in ["left", "both"]:
            X_left = after > before
            X_left.columns = [str(col) + "_left" for col in self.variables_]
        if self.tail in ["right", "both"]:
            X_right = after < before
            X_right.columns = [str(col) + "_right" for col in self.variables_]

        if self.tail == "left":
            indicators = X_left.astype(np.float64)
        elif self.tail == "right":
            indicators = X_right.astype(np.float64)
        else:
            both = pd.concat([X_left, X_right], axis=1).astype(np.float64)
            # Interleave the columns: var1_left, var1_right, var2_left, ...
            interleaved = [
                name
                for pair in zip(X_left.columns.values, X_right.columns.values)
                for name in pair
            ]
            indicators = both[interleaved]

        return pd.concat([capped, indicators], axis=1)
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the frequent categories for each variable.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just selected
            variables

        y: None
            y is not required. You can pass y or None.

        Returns
        -------
        self
        """
        X = check_X(X)
        self._fit(X)
        self._get_feature_names_in(X)

        self.encoder_dict_ = {}
        n_rows = float(len(X))

        for var in self.variables_:
            if len(X[var].unique()) > self.n_categories:
                # Enough distinct categories: keep those whose share of rows
                # reaches the tolerance.
                shares = pd.Series(X[var].value_counts() / n_rows)
                frequent = shares[shares >= self.tol].index

                # Optionally cap how many frequent categories are retained.
                self.encoder_dict_[var] = (
                    frequent[:self.max_n_categories]
                    if self.max_n_categories
                    else frequent
                )

            else:
                # Too few distinct categories: warn and treat all of them as
                # frequent.
                warnings.warn(
                    "The number of unique categories for variable {} is less than that "
                    "indicated in n_categories. Thus, all categories will be "
                    "considered frequent".format(var))
                self.encoder_dict_[var] = X[var].unique()

        self._check_encoding_dictionary()

        return self
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Validate the input data. No parameter is learned.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y: pandas Series, or np.array. Default=None.
            It is not needed in this transformer. You can pass y or None.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If "div" is among the operations and a reference variable contains 0.
        """
        X = check_X(X)

        # Both groups of variables must be numerical.
        self.variables_to_combine = _find_or_check_numerical_variables(
            X, self.variables_to_combine
        )
        self.reference_variables = _find_or_check_numerical_variables(
            X, self.reference_variables
        )

        # NA checks first, then inf checks, each on the reference variables
        # followed by the variables to combine.
        if self.missing_values == "raise":
            for run_check in (_check_contains_na, _check_contains_inf):
                run_check(X, self.reference_variables)
                run_check(X, self.variables_to_combine)

        # Division by a reference value of 0 is not allowed.
        if (
            "div" in self.operations
            and X[self.reference_variables].isin([0]).any().any()
        ):
            raise ValueError(
                "Some of the reference variables contain 0 as values. Check and "
                "remove those before using this transformer with div.")

        # Remember the training columns and how many there were.
        self.feature_names_in_ = X.columns.tolist()
        self.n_features_in_ = X.shape[1]

        return self
Exemple #20
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the values at the end of the variable distribution.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas Series, default=None
            y is not needed in this imputation. You can pass None or y.

        Returns
        -------
        self
        """
        X = check_X(X)

        # Resolve (or validate) the numerical variables.
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        method = self.imputation_method

        if method == "max":
            # Maximum value scaled by `fold`.
            self.imputer_dict_ = (X[self.variables_].max() * self.fold).to_dict()

        elif method == "gaussian":
            # Mean plus / minus `fold` standard deviations.
            if self.tail == "right":
                centre = X[self.variables_].mean()
                spread = self.fold * X[self.variables_].std()
                self.imputer_dict_ = (centre + spread).to_dict()
            elif self.tail == "left":
                centre = X[self.variables_].mean()
                spread = self.fold * X[self.variables_].std()
                self.imputer_dict_ = (centre - spread).to_dict()

        elif method == "iqr":
            # Quartile plus / minus `fold` inter-quartile ranges.
            upper_quartile = X[self.variables_].quantile(0.75)
            lower_quartile = X[self.variables_].quantile(0.25)
            iqr = upper_quartile - lower_quartile
            if self.tail == "right":
                self.imputer_dict_ = (upper_quartile + (iqr * self.fold)).to_dict()
            elif self.tail == "left":
                self.imputer_dict_ = (lower_quartile - (iqr * self.fold)).to_dict()

        self._get_feature_names_in(X)

        return self
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Set up the capping values from the user dictionaries. No parameter is
        learned from the data.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y: pandas Series, default=None
            y is not needed in this transformer. You can pass y or None.

        Returns
        -------
        self
        """
        X = check_X(X)

        lower_caps = self.min_capping_dict
        upper_caps = self.max_capping_dict

        # The variables to cap are the keys of whichever dictionaries were
        # given; with both, the minimum-capping keys come first.
        if lower_caps is None and upper_caps:
            self.variables_ = list(upper_caps.keys())
        elif upper_caps is None and lower_caps:
            self.variables_ = list(lower_caps.keys())
        elif lower_caps and upper_caps:
            merged = lower_caps.copy()
            merged.update(upper_caps)
            self.variables_ = list(merged.keys())

        # NA / inf checks only run when the user asked to raise on them.
        if self.missing_values == "raise":
            for run_check in (_check_contains_na, _check_contains_inf):
                run_check(X, self.variables_)

        # The variables to cap must be numerical.
        self.variables_ = _find_or_check_numerical_variables(X, self.variables_)

        # A missing dictionary means no capping on that tail.
        self.right_tail_caps_ = {} if upper_caps is None else upper_caps
        self.left_tail_caps_ = {} if lower_caps is None else lower_caps

        # Remember the training columns and how many there were.
        self.feature_names_in_ = X.columns.to_list()
        self.n_features_in_ = X.shape[1]

        return self
Exemple #22
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Fits the Scikit-learn transformer to the selected variables.

        Parameters
        ----------
        X: Pandas DataFrame
            The dataset to fit the transformer.

        y: pandas Series, default=None
            The target variable. It is forwarded as is to the `fit` method of
            the wrapped transformer.

        Returns
        -------
        self
        """
        # check input dataframe
        X = check_X(X)

        # Fit a clone so the transformer passed by the user is left untouched.
        self.transformer_ = clone(self.transformer)
        transformer_name = self.transformer_.__class__.__name__

        # These transformers are allowed to work on variables of any type; all
        # the others are restricted to numerical variables.
        if transformer_name in (
            "OneHotEncoder",
            "OrdinalEncoder",
            "SimpleImputer",
            "FunctionTransformer",
        ):
            self.variables_ = _find_all_variables(X, self.variables)
        else:
            self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        self.transformer_.fit(X[self.variables_], y)

        if transformer_name in _SELECTORS:
            # Features to drop are the evaluated variables that the selector's
            # support mask did not retain.
            selected = X[self.variables_].columns[self.transformer_.get_support()]
            self.features_to_drop_ = [
                f for f in self.variables_ if f not in selected
            ]

        # Remember the training columns and how many there were.
        self.feature_names_in_ = X.columns.tolist()
        self.n_features_in_ = X.shape[1]

        return self
Exemple #23
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the integers that will replace the categories of each variable.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to be encoded.

        y: pandas series, default=None
            The Target. Can be None if `encoding_method='arbitrary'`.
            Otherwise, y needs to be passed when fitting the transformer.

        Returns
        -------
        self
        """
        # The target is only validated when the ordering depends on it.
        if self.encoding_method == "ordered":
            X, y = check_X_y(X, y)
        else:
            X = check_X(X)

        self._fit(X)
        self._get_feature_names_in(X)

        # For the ordered encoding, put the target next to the predictors under
        # the fixed column name "target".
        if self.encoding_method == "ordered":
            with_target = pd.concat([X, y], axis=1)
            with_target.columns = list(X.columns) + ["target"]

        self.encoder_dict_ = {}

        for var in self.variables_:
            if self.encoding_method == "ordered":
                # Categories sorted by ascending mean of the target.
                mean_target = with_target.groupby([var])["target"].mean()
                categories = mean_target.sort_values(ascending=True).index

            elif self.encoding_method == "arbitrary":
                # Categories in order of appearance.
                categories = X[var].unique()

            # Each category is replaced by its position in the sequence.
            self.encoder_dict_[var] = {
                category: position for position, category in enumerate(categories)
            }

        self._check_encoding_dictionary()

        return self
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Drops variables that were not seen in the train set and adds variables that
        were in the train set but not in the data to transform. In other words, it
        returns a dataframe with matching columns.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X_new: Pandas dataframe, shape = [n_samples, n_features]
             The dataframe with variables that match those observed in the train set.
        """
        check_is_fitted(self)

        X = check_X(X)

        train_columns = set(self.feature_names_in_)
        input_columns = set(X.columns)

        # Build both lists in column order (de-duplicated) rather than from
        # unordered set differences, so the verbose messages are reproducible.
        _columns_to_drop = list(
            dict.fromkeys(col for col in X.columns if col not in train_columns)
        )
        _columns_to_add = list(
            dict.fromkeys(
                col for col in self.feature_names_in_ if col not in input_columns
            )
        )

        if self.missing_values == "raise":
            # Only the training columns actually present in X can be inspected;
            # the absent ones are added further down.
            # NOTE(review): assumes `_check_contains_na` looks the given labels
            # up in X and would fail on absent ones - confirm against the helper.
            _columns_to_check = [
                col for col in self.feature_names_in_ if col in input_columns
            ]
            if _columns_to_check:
                _check_contains_na(X, _columns_to_check)

        if self.verbose:
            if len(_columns_to_add) > 0:
                print(
                    "The following variables are added to the DataFrame: "
                    f"{_columns_to_add}"
                )
            if len(_columns_to_drop) > 0:
                print(
                    "The following variables are dropped from the DataFrame: "
                    f"{_columns_to_drop}"
                )

        X = X.drop(_columns_to_drop, axis=1)

        # reindex adds the missing training columns, filled with `fill_value`,
        # and puts every column in training order.
        X = X.reindex(columns=self.feature_names_in_, fill_value=self.fill_value)

        return X
Exemple #25
0
    def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Convert the transformed variables back to the original values. Only
        implemented for the following Scikit-learn transformers:

        PowerTransformer, QuantileTransformer, OrdinalEncoder,
        MaxAbsScaler, MinMaxScaler, StandardScaler, RobustScaler.

        If you would like this method implemented for additional transformers,
        please check if they have the inverse_transform method in Scikit-learn and then
        raise an issue in our repo.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features].
            The transformed dataframe.

        Returns
        -------
        X_tr: pandas dataframe of shape = [n_samples, n_features].
            The dataframe with the original values.

        Raises
        ------
        NotImplementedError
            If the wrapped transformer is not among the supported ones, or it
            has no callable `inverse_transform`.
        """
        check_is_fitted(self)

        X = check_X(X)

        # Guard 1: the wrapped transformer must be one of the supported classes.
        if self.transformer_.__class__.__name__ not in _INVERSE_TRANSFORM:
            raise NotImplementedError(
                "The method `inverse_transform` is not implemented for this "
                "transformer. Supported transformers are {}.".format(
                    ", ".join(_INVERSE_TRANSFORM)))

        # Guard 2: for safety, the method must actually exist and be callable.
        invert = getattr(self.transformer_, "inverse_transform", None)
        if not callable(invert):
            raise NotImplementedError(
                "This Scikit-learn transformer does not have the method "
                "`inverse_transform` implemented.")

        # Only the fitted variables are converted back; other columns are kept.
        X[self.variables_] = invert(X[self.variables_])
        return X
Exemple #26
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Makes a copy of the train set. Only stores a copy of the variables to impute.
        This copy is then used to randomly extract the values to fill the missing data
        during transform.

        Parameters
        ----------

        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: None
            y is not needed in this imputation. You can pass None or y.

        Returns
        -------
        self
            The fitted instance.

        Raises
        ------
        ValueError
            If `seed` is "observation" and at least one of the variables given
            in `random_state` is not a column of X.
        """

        # check input dataframe
        X = check_X(X)

        # find variables to impute
        self.variables_ = _find_all_variables(X, self.variables)

        # take a copy of the selected variables
        self.X_ = X[self.variables_].copy()

        # check the variables assigned to the random state
        if self.seed == "observation":
            self.random_state = _check_input_parameter_variables(
                self.random_state)
            # normalise a single variable name to a one-element list
            if isinstance(self.random_state, (int, str)):
                self.random_state = [self.random_state]
            # Test membership of each variable explicitly. Testing the
            # truthiness of the missing names themselves would let falsy
            # column labels (e.g. the integer 0 or "") slip through unnoticed.
            if self.random_state and any(
                    var not in X.columns for var in self.random_state):
                raise ValueError(
                    "There are variables assigned as random state which are not part "
                    "of the training dataframe.")

        # NOTE(review): presumably stores the input feature names / count on
        # the instance -- confirm against `_get_feature_names_in`.
        self._get_feature_names_in(X)

        return self
Exemple #27
0
    def _check_transform_input_and_state(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Validate the transformer state and the input before transforming.

        Verifies that the transformer was fitted, passes the input through
        `check_X`, compares it against the number of features seen during fit
        and, when `missing_values` is "raise", checks the selected variables
        for NA and infinite values.

        Parameters
        ----------
        X: Pandas DataFrame

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame
        ValueError
            If the dataframe is not of same size as that used in fit()

        Returns
        -------
        X: Pandas DataFrame
            The validated dataframe, with its columns ordered as in
            `feature_names_in_`.
        """
        # the transformer must have been fitted first
        check_is_fitted(self)

        # validate the input through the shared helper
        frame = check_X(X)

        # the number of columns has to agree with the one seen during fit
        _check_X_matches_training_df(frame, self.n_features_in_)

        # NA / inf are only rejected when the user asked for it
        if self.missing_values == "raise":
            for validate in (_check_contains_na, _check_contains_inf):
                validate(frame, self.variables_)

        # return the columns in the order of the training set
        return frame[self.feature_names_in_]
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Run the validations shared by the transformers before the features
        are created.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X: pandas dataframe of shape = [n_samples, n_features]
            The validated data, with columns ordered as in the train set and,
            if `sort_index` is True, sorted by index.
        """
        # fit must have been called before transform
        check_is_fitted(self)

        # validate the input through the shared helper
        frame = check_X(X)

        # the number of columns must match the dataframe used in fit
        _check_X_matches_training_df(frame, self.n_features_in_)

        # The index needs unique values and no missing data; otherwise merging
        # the created features back would duplicate rows.
        self._check_index(frame)

        # NA / inf are only rejected when the user asked for it
        if self.missing_values == "raise":
            self._check_na_and_inf(frame)

        # put the variables in the order of the train set
        ordered = frame[self.feature_names_in_]

        # only an explicit `True` triggers sorting
        if self.sort_index is True:
            ordered.sort_index(inplace=True)

        return ordered
Exemple #29
0
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Create new variables by aggregating the selected variables row-wise.

        Each entry of `combination_dict_` maps the name of a new variable to
        the operation applied across `variables_to_combine`.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X_new: Pandas dataframe, shape = [n_samples, n_features + n_operations]
            The dataframe with the original variables plus the new variables.
            The combined variables are removed when `drop_original` is set.
        """
        # fit must have been called before transform
        check_is_fitted(self)

        # validate the input through the shared helper
        data = check_X(X)

        # the number of columns must match the dataframe used in fit
        _check_X_matches_training_df(data, self.n_features_in_)

        # NA / inf are only rejected when the user asked for it
        if self.missing_values == "raise":
            _check_contains_na(data, self.variables_to_combine)
            _check_contains_inf(data, self.variables_to_combine)

        # one new column per configured operation, aggregated across columns
        for feature_name, func in self.combination_dict_.items():
            combined = data[self.variables_to_combine].agg(func, axis=1)
            data[feature_name] = combined

        # optionally discard the variables that were combined
        if self.drop_original:
            data.drop(columns=self.variables_to_combine, inplace=True)

        return data
Exemple #30
0
    def _transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Replace original values by the average of the target mean value per bin
        or category in each one of the variables.

        The input is validated and then handed to the fitted internal pipeline.
        NA values (numerical and categorical variables) and infinite values
        (numerical variables) are always rejected.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        X_new: pandas dataframe of shape = [n_samples, n_features]
            The transformed data with the discrete variables.
        """
        # fit must have been called before transform
        check_is_fitted(self)

        # validate the input through the shared helper
        data = check_X(X)

        # the number of columns must match the dataframe used in fit
        _check_X_matches_training_df(data, self.n_features_in_)

        # missing values: numerical variables first, then categorical ones
        _check_contains_na(data, self.variables_numerical_)
        _check_contains_na(data, self.variables_categorical_)

        # infinite values can only occur in numerical variables
        _check_contains_inf(data, self.variables_numerical_)

        # align the column order with the train set and run the pipeline
        ordered = data[self.feature_names_in_]
        return self._pipeline.transform(ordered)