def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Common input and transformer checks.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X_new: Pandas dataframe
            The dataframe with the original variables plus the new variables.
        """

        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = check_X(X)

        # Check if input data contains same number of columns as dataframe used to fit.
        _check_X_matches_training_df(X, self.n_features_in_)

        # check if dataset contains na
        if self.missing_values == "raise":
            _check_contains_na(X, self.variables)
            _check_contains_inf(X, self.variables)

        # reorder variables to match train set
        X = X[self.feature_names_in_]

        return X
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Common set-up of creation transformers.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y: pandas Series, or np.array. Defaults to None.
            It is not needed in this transformer. You can pass y or None.
        """

        # check input dataframe
        X = check_X(X)

        # check variables are numerical
        self.variables: List[Union[str,
                                   int]] = _find_or_check_numerical_variables(
                                       X, self.variables)

        # check if dataset contains na
        if self.missing_values == "raise":
            _check_contains_na(X, self.variables)
            _check_contains_inf(X, self.variables)

        # save input features
        self.feature_names_in_ = X.columns.tolist()

        # save train set shape
        self.n_features_in_ = X.shape[1]

        return X
Example #3
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn parameters.

        Perform dataframe checks. Creates dictionary of operation to new feature
        name pairs.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y: pandas Series, or np.array. Defaults to None.
            It is not needed in this transformer. You can pass y or None.
        """

        # check input dataframe
        X = check_X(X)

        # check variables to combine are numerical
        self.variables_to_combine = _find_or_check_numerical_variables(
            X, self.variables_to_combine)

        # check if dataset contains na
        if self.missing_values == "raise":
            _check_contains_na(X, self.variables_to_combine)
            _check_contains_inf(X, self.variables_to_combine)

        if self.math_operations is None:
            self.math_operations_ = [
                "sum", "prod", "mean", "std", "max", "min"
            ]
        else:
            self.math_operations_ = self.math_operations

        # dictionary of new_variable_name to operation pairs
        if self.new_variables_names:
            self.combination_dict_ = dict(
                zip(self.new_variables_names, self.math_operations_))
        else:
            if all(isinstance(var, str) for var in self.variables_to_combine):
                vars_ls = self.variables_to_combine
            else:
                vars_ls = [str(var) for var in self.variables_to_combine]

            self.combination_dict_ = {
                f"{operation}({'-'.join(vars_ls)})": operation  # type: ignore
                for operation in self.math_operations_
            }

        # save input features
        self.feature_names_in_ = X.columns.tolist()

        # save train set shape
        self.n_features_in_ = X.shape[1]

        return self
Example #4
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn any parameter. Performs dataframe checks.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y: pandas Series, or np.array. Default=None.
            It is not needed in this transformer. You can pass y or None.

        Raises
        ------
        TypeError
           - If the input is not a Pandas DataFrame
           - If any user provided variables are not numerical
        ValueError
           If any of the reference variables contain null values and the
           mathematical operation is 'div'.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # check variables to combine are numerical
        self.variables_to_combine = _find_or_check_numerical_variables(
            X, self.variables_to_combine)

        # check reference_variables are numerical
        self.reference_variables = _find_or_check_numerical_variables(
            X, self.reference_variables)

        # check if dataset contains na
        if self.missing_values == "raise":
            _check_contains_na(X, self.reference_variables)
            _check_contains_na(X, self.variables_to_combine)

            _check_contains_inf(X, self.reference_variables)
            _check_contains_inf(X, self.variables_to_combine)

        # cannot divide by 0, as will result in error
        if "div" in self.operations:
            if X[self.reference_variables].isin([0]).any().any():
                raise ValueError(
                    "Some of the reference variables contain 0 as values. Check and "
                    "remove those before using this transformer with div.")

        self.n_features_in_ = X.shape[1]

        return self
Example #5
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn any parameter.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y: pandas Series, default=None
            y is not needed in this transformer. You can pass y or None.

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame

        Returns
        -------
        self
        """
        X = _is_dataframe(X)

        # find variables to be capped
        if self.min_capping_dict is None and self.max_capping_dict:
            self.variables_ = [x for x in self.max_capping_dict.keys()]
        elif self.max_capping_dict is None and self.min_capping_dict:
            self.variables_ = [x for x in self.min_capping_dict.keys()]
        elif self.min_capping_dict and self.max_capping_dict:
            tmp = self.min_capping_dict.copy()
            tmp.update(self.max_capping_dict)
            self.variables_ = [x for x in tmp.keys()]

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        # find or check for numerical variables
        self.variables_ = _find_or_check_numerical_variables(
            X, self.variables_)

        if self.max_capping_dict is not None:
            self.right_tail_caps_ = self.max_capping_dict
        else:
            self.right_tail_caps_ = {}

        if self.min_capping_dict is not None:
            self.left_tail_caps_ = self.min_capping_dict
        else:
            self.left_tail_caps_ = {}

        self.n_features_in_ = X.shape[1]

        return self
    def _check_na_and_inf(self, X: pd.DataFrame):
        """
        Checks that the dataframe does not contain NaN or Infinite values.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The dataset for training or transformation.
        """
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

        return self
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn any parameter.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y: pandas Series, or np.array. Default=None.
            It is not needed in this transformer. You can pass y or None.
        """

        # check input dataframe
        X = check_X(X)

        # check variables to combine are numerical
        self.variables_to_combine = _find_or_check_numerical_variables(
            X, self.variables_to_combine)

        # check reference_variables are numerical
        self.reference_variables = _find_or_check_numerical_variables(
            X, self.reference_variables)

        # check if dataset contains na
        if self.missing_values == "raise":
            _check_contains_na(X, self.reference_variables)
            _check_contains_na(X, self.variables_to_combine)

            _check_contains_inf(X, self.reference_variables)
            _check_contains_inf(X, self.variables_to_combine)

        # cannot divide by 0, as will result in error
        if "div" in self.operations:
            if X[self.reference_variables].isin([0]).any().any():
                raise ValueError(
                    "Some of the reference variables contain 0 as values. Check and "
                    "remove those before using this transformer with div.")

        # save input features
        self.feature_names_in_ = X.columns.tolist()

        # save train set shape
        self.n_features_in_ = X.shape[1]

        return self
Example #8
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn any parameter.

        Check dataframe and variables. Checks that the user entered variables are in
        the train set and cast as numerical.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset. Can be the entire dataframe, not just the
            variables to be transformed.

        y: None
            y is not needed in this transformer. You can pass y or None.

        Raises
        ------
        TypeError
            - If the input is not a Pandas DataFrame
            - If any of the user provided variables are not numerical
        ValueError
            - If there are no numerical variables in the df or the df is empty
            - If the variable(s) contain null values

        Returns
        -------
        self
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        variables = [x for x in self.binning_dict.keys()]
        self.variables_ = _find_or_check_numerical_variables(X, variables)

        # check if dataset contains na or inf
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

        # for consistency wit the rest of the discretisers, we add this attribute
        self.binner_dict_ = self.binning_dict

        self.n_features_in_ = X.shape[1]

        return self
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Combine the variables with the mathematical operations.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Raises
        ------
        TypeError
           If the input is not a Pandas DataFrame
        ValueError
           - If the variable(s) contain null values when missing_values = raise
           - If the dataframe is not of the same size as that used in fit()

        Returns
        -------
        X: Pandas dataframe, shape = [n_samples, n_features + n_operations]
            The dataframe with the original variables plus the new variables.
        """

        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # Check if input data contains same number of columns as dataframe used to fit.
        _check_input_matches_training_df(X, self.n_features_in_)

        # check if dataset contains na
        if self.missing_values == "raise":
            _check_contains_na(X, self.variables_to_combine)
            _check_contains_inf(X, self.variables_to_combine)

        # combine mathematically
        for new_variable_name, operation in self.combination_dict_.items():
            X[new_variable_name] = X[self.variables_to_combine].agg(operation,
                                                                    axis=1)

        return X
    def fit(self,
            X: pd.DataFrame,
            y: Optional[pd.Series] = None) -> pd.DataFrame:
        """
        Checks that input is a dataframe, finds numerical variables, or alternatively
        checks that variables entered by the user are of type numerical.

        Parameters
        ----------
        X : Pandas DataFrame

        y : Pandas Series, np.array. Default = None
            Parameter is necessary for compatibility with sklearn Pipeline.

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame or a numpy array
            If any of the user provided variables are not numerical
        ValueError
            If there are no numerical variables in the df or the df is empty
            If the variable(s) contain null values

        Returns
        -------
        X : Pandas DataFrame
            The same dataframe entered as parameter
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables_: List[Union[str,
                                    int]] = _find_or_check_numerical_variables(
                                        X, self.variables)

        # check if dataset contains na or inf
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

        return X
Example #11
0
    def _check_transform_input_and_state(self,
                                         X: pd.DataFrame) -> pd.DataFrame:
        """Checks that the input is a dataframe and of the same size than the one used
        in the fit method. Checks absence of NA.

        Parameters
        ----------
        X: Pandas DataFrame

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame
        ValueError
            If the dataframe is not of same size as that used in fit()

        Returns
        -------
        X: Pandas DataFrame
            The same dataframe entered by the user.
        """
        # check if class was fitted
        check_is_fitted(self)

        # check that input is a dataframe
        X = check_X(X)

        # Check that the dataframe contains the same number of columns
        # than the dataframe used to fit the imputer.
        _check_X_matches_training_df(X, self.n_features_in_)

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        # reorder to match training set
        X = X[self.feature_names_in_]

        return X
Example #12
0
    def _select_variables_from_dict(self, X: pd.DataFrame,
                                    user_dict_: Dict) -> pd.DataFrame:
        """
        Checks that input is a dataframe, checks that variables in the dictionary
        entered by the user are of type numerical.

        Parameters
        ----------
        X : Pandas DataFrame

        user_dict_ : Dictionary. Default = None
            Any dictionary allowed by the transformer and entered by user.

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame or a numpy array
            If any of the variables in the dictionary are not numerical
        ValueError
            If there are no numerical variables in the df or the df is empty
            If the variable(s) contain null values

        Returns
        -------
        X : Pandas DataFrame
            The same dataframe entered as parameter
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        variables = [x for x in user_dict_.keys()]
        self.variables_ = _find_or_check_numerical_variables(X, variables)

        # check if dataset contains na or inf
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

        return X
Example #13
0
    def _transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
         Replace original values by the average of the target mean value per bin or
         category in each one of the variables.

         Parameters
         ----------
         X : pandas dataframe of shape = [n_samples, n_features]
             The input samples.

         Return
         -------
        X_new: pandas dataframe of shape = [n_samples, n_features]
            The transformed data with the discrete variables.
        """
        # check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = check_X(X)

        # Check input data contains same number of columns as df used to fit
        _check_X_matches_training_df(X, self.n_features_in_)

        # check for missing values
        _check_contains_na(X, self.variables_numerical_)
        _check_contains_na(X, self.variables_categorical_)

        # check inf
        _check_contains_inf(X, self.variables_numerical_)

        # reorder dataframe to match train set
        X = X[self.feature_names_in_]

        # transform dataframe
        X_tr = self._pipeline.transform(X)

        return X_tr
Example #14
0
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Combine the variables with the mathematical operations.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X_new: Pandas dataframe, shape = [n_samples, n_features + n_operations]
            The dataframe with the original variables plus the new variables.
        """

        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = check_X(X)

        # Check if input data contains same number of columns as dataframe used to fit.
        _check_X_matches_training_df(X, self.n_features_in_)

        # check if dataset contains na
        if self.missing_values == "raise":
            _check_contains_na(X, self.variables_to_combine)
            _check_contains_inf(X, self.variables_to_combine)

        # combine mathematically
        for new_variable_name, operation in self.combination_dict_.items():
            X[new_variable_name] = X[self.variables_to_combine].agg(operation,
                                                                    axis=1)

        if self.drop_original:
            X.drop(columns=self.variables_to_combine, inplace=True)

        return X
Example #15
0
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Checks that the input is a dataframe and of the same size than the one used
        in the fit method. Checks absence of NA and Inf.

        Parameters
        ----------
        X : Pandas DataFrame

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame
        ValueError
            - If the variable(s) contain null values
            - If the df has different number of features than the df used in fit()

        Returns
        -------
        X : Pandas DataFrame.
            The same dataframe entered by the user.
        """

        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # Check if input data contains same number of columns as dataframe used to fit.
        _check_input_matches_training_df(X, self.n_features_in_)

        # check if dataset contains na or inf
        _check_contains_na(X, self.variables_)
        _check_contains_inf(X, self.variables_)

        return X
Example #16
0
    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Find the correlated features.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y : pandas series. Default = None
            y is not needed in this transformer. You can pass y or None.
        """

        # check input dataframe
        X = check_X(X)

        # If required exclude variables that are not in the input dataframe
        self._confirm_variables(X)

        # find all numerical variables or check those entered are in the dataframe
        self.variables_ = _find_or_check_numerical_variables(
            X, self.variables_)

        # check that there are more than 1 variable to select from
        self._check_variable_number()

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        # set to collect features that are correlated
        self.features_to_drop_ = set()

        # create tuples of correlated feature groups
        self.correlated_feature_sets_ = []

        # the correlation matrix
        _correlated_matrix = X[self.variables_].corr(method=self.method)

        # create set of examined features, helps to determine feature combinations
        # to evaluate below
        _examined_features = set()

        # for each feature in the dataset (columns of the correlation matrix)
        for feature in _correlated_matrix.columns:

            if feature not in _examined_features:

                # append so we can exclude when we create the combinations
                _examined_features.add(feature)

                # here we collect potentially correlated features
                # we need this for the correlated groups sets
                _temp_set = set([feature])

                # features that have not been examined, are not currently examined and
                # were not found correlated
                _features_to_compare = [
                    f for f in _correlated_matrix.columns
                    if f not in _examined_features
                ]

                # create combinations:
                for f2 in _features_to_compare:

                    # if the correlation is higher than the threshold
                    # we are interested in absolute correlation coefficient value
                    if abs(_correlated_matrix.loc[f2,
                                                  feature]) > self.threshold:

                        # add feature (f2) to our correlated set
                        self.features_to_drop_.add(f2)
                        _temp_set.add(f2)
                        _examined_features.add(f2)

                # if there are correlated features
                if len(_temp_set) > 1:
                    self.correlated_feature_sets_.append(_temp_set)

        # save input features
        self._get_feature_names_in(X)

        return self
    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Find the correlated feature groups. Determine which feature should be selected
        from each group.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas series. Default = None
            y is needed if selection_method == 'model_performance'.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find all numerical variables or check those entered are in the dataframe
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        if self.selection_method == "model_performance" and y is None:
            raise ValueError("y is needed to fit the transformer")

        # FIND CORRELATED FEATURES
        # ========================
        # create tuples of correlated feature groups
        self.correlated_feature_sets_ = []

        # the correlation matrix
        _correlated_matrix = X[self.variables_].corr(method=self.method)

        # create set of examined features, helps to determine feature combinations
        # to evaluate below
        _examined_features = set()

        # for each feature in the dataset (columns of the correlation matrix)
        for feature in _correlated_matrix.columns:

            if feature not in _examined_features:

                # append so we can exclude when we create the combinations
                _examined_features.add(feature)

                # here we collect potentially correlated features
                # we need this for the correlated groups sets
                _temp_set = set([feature])

                # features that have not been examined, are not currently examined and
                # were not found correlated
                _features_to_compare = [
                    f for f in _correlated_matrix.columns
                    if f not in _examined_features
                ]

                # create combinations:
                for f2 in _features_to_compare:

                    # if the correlation is higher than the threshold
                    # we are interested in absolute correlation coefficient value
                    if abs(_correlated_matrix.loc[f2,
                                                  feature]) > self.threshold:
                        # add feature (f2) to our correlated set
                        _temp_set.add(f2)
                        _examined_features.add(f2)

                # if there are correlated features
                if len(_temp_set) > 1:
                    self.correlated_feature_sets_.append(_temp_set)

        # SELECT 1 FEATURE FROM EACH GROUP
        # ================================

        # list to collect selected features
        # we start it with all features that were either not examined, i.e., categorical
        # variables, or not found correlated
        _selected_features = [
            f for f in X.columns
            if f not in set().union(*self.correlated_feature_sets_)
        ]

        # select the feature with least missing values
        if self.selection_method == "missing_values":
            for feature_group in self.correlated_feature_sets_:
                f = X[feature_group].isnull().sum().sort_values(
                    ascending=True).index[0]
                _selected_features.append(f)

        # select the feature with most unique values
        elif self.selection_method == "cardinality":
            for feature_group in self.correlated_feature_sets_:
                f = X[feature_group].nunique().sort_values(
                    ascending=False).index[0]
                _selected_features.append(f)

        # select the feature with biggest variance
        elif self.selection_method == "variance":
            for feature_group in self.correlated_feature_sets_:
                f = X[feature_group].std().sort_values(
                    ascending=False).index[0]
                _selected_features.append(f)

        # select best performing feature according to estimator
        else:
            for feature_group in self.correlated_feature_sets_:

                # feature_group = list(feature_group)
                temp_perf = []

                # train a model for every feature
                for feature in feature_group:
                    model = cross_validate(
                        self.estimator,
                        X[feature].to_frame(),
                        y,
                        cv=self.cv,
                        return_estimator=False,
                        scoring=self.scoring,
                    )

                    temp_perf.append(model["test_score"].mean())

                # select best performing feature from group
                f = list(feature_group)[temp_perf.index(max(temp_perf))]
                _selected_features.append(f)

        self.features_to_drop_ = [
            f for f in self.variables_ if f not in _selected_features
        ]

        self.n_features_in_ = X.shape[1]

        return self
def test_contains_inf(df_na):
    df_na.fillna(np.inf, inplace=True)
    with pytest.raises(ValueError):
        assert _check_contains_inf(df_na, ["Age", "Marks"])
Example #19
0
    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Find features with high PSI values.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y : pandas series. Default = None
            y is not needed in this transformer. You can pass y or None.
        """
        # check input dataframe
        X = check_X(X)

        # If required exclude variables that are not in the input dataframe
        self._confirm_variables(X)

        # find numerical variables or check those entered are present in the dataframe
        self.variables_ = _find_or_check_numerical_variables(
            X, self.variables_)

        # Remove the split_col from the variables list. It might be added if the
        # variables are not defined at initialization.
        if self.split_col in self.variables_:
            self.variables_.remove(self.split_col)

        if self.missing_values == "raise":
            # check if dataset contains na or inf
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        # Split the dataframe into basis and test.
        basis_df, test_df = self._split_dataframe(X)

        # Check the shape of the returned dataframes for PSI calculations.
        # The number of observations must be at least equal to the
        # number of bins.
        if min(basis_df.shape[0], test_df.shape[0]) < self.bins:
            raise ValueError(
                "The number of rows in the basis and test datasets that will be used "
                f"in the PSI calculations must be at least larger than {self.bins}. "
                "After slitting the original dataset based on the given cut_off or"
                f"split_frac we have {basis_df.shape[0]} samples in the basis set, "
                f"and {test_df.shape[0]} samples in the test set. "
                "Please adjust the value of the cut_off or split_frac.")

        # Switch basis and test dataframes if required.
        if self.switch:
            test_df, basis_df = basis_df, test_df

        # set up the discretizer
        if self.strategy == "equal_width":
            bucketer = EqualWidthDiscretiser(bins=self.bins)
        else:
            bucketer = EqualFrequencyDiscretiser(q=self.bins)

        # Compute the PSI by looping over the features
        self.psi_values_ = {}
        self.features_to_drop_ = []

        for feature in self.variables_:
            # Discretize the features.

            basis_discrete = bucketer.fit_transform(basis_df[[feature
                                                              ]].dropna())
            test_discrete = bucketer.transform(test_df[[feature]].dropna())

            # Determine percentage of observations per bin
            basis_distrib, test_distrib = self._observation_frequency_per_bin(
                basis_discrete, test_discrete)

            # Calculate the PSI value
            self.psi_values_[feature] = np.sum(
                (test_distrib - basis_distrib) *
                np.log(test_distrib / basis_distrib))
            # Assess if feature should be dropped
            if self.psi_values_[feature] > self.threshold:
                self.features_to_drop_.append(feature)

        # save input features
        self._get_feature_names_in(X)

        return self
Example #20
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the values that should be used to replace outliers.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y : pandas Series, default=None
            y is not needed in this transformer. You can pass y or None.
        """

        # check input dataframe
        X = check_X(X)

        # find or check for numerical variables
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        self.right_tail_caps_ = {}
        self.left_tail_caps_ = {}

        # estimate the end values
        if self.tail in ["right", "both"]:
            if self.capping_method == "gaussian":
                self.right_tail_caps_ = (
                    X[self.variables_].mean() +
                    self.fold * X[self.variables_].std()).to_dict()

            elif self.capping_method == "iqr":
                IQR = X[self.variables_].quantile(0.75) - X[
                    self.variables_].quantile(0.25)
                self.right_tail_caps_ = (X[self.variables_].quantile(0.75) +
                                         (IQR * self.fold)).to_dict()

            elif self.capping_method == "quantiles":
                self.right_tail_caps_ = (
                    X[self.variables_].quantile(1 - self.fold).to_dict())

        if self.tail in ["left", "both"]:
            if self.capping_method == "gaussian":
                self.left_tail_caps_ = (
                    X[self.variables_].mean() -
                    self.fold * X[self.variables_].std()).to_dict()

            elif self.capping_method == "iqr":
                IQR = X[self.variables_].quantile(0.75) - X[
                    self.variables_].quantile(0.25)
                self.left_tail_caps_ = (X[self.variables_].quantile(0.25) -
                                        (IQR * self.fold)).to_dict()

            elif self.capping_method == "quantiles":
                self.left_tail_caps_ = X[self.variables_].quantile(
                    self.fold).to_dict()

        self.feature_names_in_ = X.columns.to_list()
        self.n_features_in_ = X.shape[1]

        return self
Example #21
0
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Combine the variables with the mathematical operations.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X: Pandas dataframe, shape = [n_samples, n_features + n_operations]
            The dataframe with the operations results added as columns.
        """

        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # Check if input data contains same number of columns as dataframe used to fit.
        _check_input_matches_training_df(X, self.n_features_in_)

        # check if dataset contains na
        if self.missing_values == "raise":
            _check_contains_na(X, self.reference_variables)
            _check_contains_na(X, self.variables_to_combine)

            _check_contains_inf(X, self.reference_variables)
            _check_contains_inf(X, self.variables_to_combine)

        # cannot divide by 0, as will result in error
        if "div" in self.operations:
            if X[self.reference_variables].isin([0]).any().any():
                raise ValueError(
                    "Some of the reference variables contain 0 as values. Check and "
                    "remove those before using this transformer.")

        original_col_names = [var for var in X.columns]
        # Add new features and values into de data frame.
        if "sub" in self.operations:
            for reference in self.reference_variables:
                varname = [
                    str(var) + "_sub_" + str(reference)
                    for var in self.variables_to_combine
                ]
                X[varname] = X[self.variables_to_combine].sub(X[reference],
                                                              axis=0)
        if "div" in self.operations:
            for reference in self.reference_variables:
                varname = [
                    str(var) + "_div_" + str(reference)
                    for var in self.variables_to_combine
                ]
                X[varname] = X[self.variables_to_combine].div(X[reference],
                                                              axis=0)
        if "add" in self.operations:
            for reference in self.reference_variables:
                varname = [
                    str(var) + "_add_" + str(reference)
                    for var in self.variables_to_combine
                ]
                X[varname] = X[self.variables_to_combine].add(X[reference],
                                                              axis=0)
        if "mul" in self.operations:
            for reference in self.reference_variables:
                varname = [
                    str(var) + "_mul_" + str(reference)
                    for var in self.variables_to_combine
                ]
                X[varname] = X[self.variables_to_combine].mul(X[reference],
                                                              axis=0)

        # replace created variable names with user ones.
        if self.new_variables_names:
            X.columns = original_col_names + self.new_variables_names

        return X
Example #22
0
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Learn the mean target value per category or bin.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y : pandas series of shape = [n_samples,]
            The target variable.
        """
        # check if 'X' is a dataframe
        X, y = check_X_y(X, y)

        # find categorical and numerical variables
        (
            self.variables_categorical_,
            self.variables_numerical_,
        ) = _find_categorical_and_numerical_variables(X, self.variables)

        # check for missing values
        _check_contains_na(X, self.variables_numerical_)
        _check_contains_na(X, self.variables_categorical_)

        # check inf
        _check_contains_inf(X, self.variables_numerical_)

        # Create pipelines
        if self.variables_categorical_ and self.variables_numerical_:
            self._pipeline = self._make_combined_pipeline()

        elif self.variables_categorical_:
            self._pipeline = self._make_categorical_pipeline()

        else:
            self._pipeline = self._make_numerical_pipeline()

        # Train pipeline
        self._pipeline.fit(X, y)

        # Assign attributes (useful to interpret features)
        # Use dict() to make a copy of the dictionary. Otherwise, like in pandas,
        # it is just another view of the same data, mind-blowing.
        if self.variables_categorical_ and self.variables_numerical_:
            self.binner_dict_ = dict(
                self._pipeline.named_steps["discretiser"].binner_dict_
            )
            self.encoder_dict_ = dict(
                self._pipeline.named_steps["encoder_num"].encoder_dict_
            )
            tmp_dict = dict(self._pipeline.named_steps["encoder_cat"].encoder_dict_)
            self.encoder_dict_.update(tmp_dict)

        elif self.variables_categorical_:
            self.binner_dict_ = {}
            self.encoder_dict_ = dict(self._pipeline.encoder_dict_)

        else:
            self.binner_dict_ = dict(
                self._pipeline.named_steps["discretiser"].binner_dict_
            )
            self.encoder_dict_ = dict(
                self._pipeline.named_steps["encoder"].encoder_dict_
            )

        # store input features
        self.n_features_in_ = X.shape[1]
        self.feature_names_in_ = list(X.columns)

        return self