Example no. 1
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Applies transformation to the DataFrame.

        Args:
            X: Pandas DataFrame to apply the transformation

        Returns:
            Transformed DataFrame
        """

        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        # Check if input data contains same number of columns as dataframe used to fit.
        _check_input_matches_training_df(X, self.input_shape_[1])

        return X
    def inverse_transform(self, X):
        """
        Convert the data back to the original representation.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The transformed dataframe.

        Returns
        -------

        X : pandas dataframe of shape = [n_samples, n_features]
            The un-transformed dataframe, that is, containing the original values
            of the categorical variables.
        """
        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        # Check that the dataframe contains the same number of columns
        # as the dataframe used to fit the imputer.
        _check_input_matches_training_df(X, self.input_shape_[1])

        # replace encoded categories by the original values
        for feature in self.encoder_dict_.keys():
            inv_map = {v: k for k, v in self.encoder_dict_[feature].items()}
            X[feature] = X[feature].map(inv_map)

        return X
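For intuition, the inverse-mapping step above can be run standalone. A minimal sketch, assuming a made-up encoder_dict_ of {category: code} pairs:

import pandas as pd

# hypothetical mapping learned during fit: {category: code} per variable
encoder_dict_ = {"colour": {"blue": 0, "green": 1, "red": 2}}

X = pd.DataFrame({"colour": [2, 0, 1, 0]})

# invert each {category: code} dict into {code: category} and map back
for feature, mapping in encoder_dict_.items():
    inv_map = {v: k for k, v in mapping.items()}
    X[feature] = X[feature].map(inv_map)

print(X)  # the colour column holds "red", "blue", "green", "blue" again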
Example no. 3
    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Find the correlated features.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y : pandas series. Default = None
            y is not needed in this transformer. You can pass y or None.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find all numerical variables or check those entered are in the dataframe
        self.variables = _find_or_check_numerical_variables(X, self.variables)

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables)

        # set to collect features that are correlated
        self.correlated_features_ = set()

        # create tuples of correlated feature groups
        self.correlated_feature_sets_ = []

        # the correlation matrix
        self.correlated_matrix_ = X[self.variables].corr(method=self.method)

        # create set of examined features, helps to determine feature combinations
        # to evaluate below
        _examined_features = set()

        # for each feature in the dataset (columns of the correlation matrix)
        for feature in self.correlated_matrix_.columns:

            if feature not in _examined_features:

                # append so we can exclude when we create the combinations
                _examined_features.add(feature)

                # here we collect potentially correlated features
                # we need this for the correlated groups sets
                _temp_set = set([feature])

                # features that have not been examined, are not currently examined and
                # were not found correlated
                _features_to_compare = [
                    f for f in self.correlated_matrix_.columns
                    if f not in _examined_features
                ]

                # create combinations:
                for f2 in _features_to_compare:

                    # if the correlation is higher than the threshold
                    # we are interested in absolute correlation coefficient value
                    if abs(self.correlated_matrix_.loc[
                            f2, feature]) > self.threshold:

                        # add feature (f2) to our correlated set
                        self.correlated_features_.add(f2)
                        _temp_set.add(f2)
                        _examined_features.add(f2)

                # if there are correlated features
                if len(_temp_set) > 1:
                    self.correlated_feature_sets_.append(_temp_set)

        self.input_shape_ = X.shape

        return self
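The grouping pass above can be exercised in isolation. A self-contained sketch with made-up data and threshold, reproducing the same single-pass logic (each feature is compared only against features not yet examined):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
a = rng.normal(size=200)
X = pd.DataFrame({
    "a": a,
    "b": a * 2 + rng.normal(scale=0.01, size=200),  # almost a copy of "a"
    "c": rng.normal(size=200),                      # independent noise
})

threshold = 0.8
corr = X.corr(method="pearson")

correlated_feature_sets = []
examined = set()
for feature in corr.columns:
    if feature in examined:
        continue
    examined.add(feature)
    group = {feature}
    # only compare against features not yet examined
    for f2 in [f for f in corr.columns if f not in examined]:
        if abs(corr.loc[f2, feature]) > threshold:
            group.add(f2)
            examined.add(f2)
    if len(group) > 1:
        correlated_feature_sets.append(group)

print(correlated_feature_sets)  # [{'a', 'b'}] -- 'c' stays out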
    def fit(self, X, y=None):
        """
        Finds the correlated features.

        Args:
            X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the variables to transform.

            y: It is not needed in this transformer. Defaults to None.
            Alternatively takes a pandas Series.

        Returns:
            self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find all numerical variables or check those entered are in the dataframe
        self.variables = _find_numerical_variables(X, self.variables)

        # set to collect features that are correlated
        self.correlated_features_ = set()

        # create tuples of correlated feature groups
        self.correlated_feature_sets_ = []

        # the correlation matrix
        self.correlated_matrix_ = X[self.variables].corr(method=self.method)

        # create set of examined features, helps to determine feature combinations
        # to evaluate below
        _examined_features = set()

        # for each feature in the dataset (columns of the correlation matrix)
        for feature in self.correlated_matrix_.columns:

            if feature not in _examined_features:

                # append so we can exclude when we create the combinations
                _examined_features.add(feature)

                # here we collect potentially correlated features
                # we need this for the correlated groups sets
                _temp_set = set([feature])

                # features that have not been examined, are not currently examined and
                # were not found correlated
                _features_to_compare = [
                    f for f in self.correlated_matrix_.columns
                    if f not in _examined_features
                ]

                # create combinations:
                for f2 in _features_to_compare:

                    # if the correlation is higher than the threshold
                    # we are interested in absolute correlation coefficient value
                    if abs(self.correlated_matrix_.loc[
                            f2, feature]) > self.threshold:

                        # add feature (f2) to our correlated set
                        self.correlated_features_.add(f2)
                        _temp_set.add(f2)
                        _examined_features.add(f2)

                # if there are correlated features
                if len(_temp_set) > 1:
                    self.correlated_feature_sets_.append(_temp_set)

        self.input_shape_ = X.shape

        return self
Example no. 5
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Find the important features. Note that the selector trains various models at
        each round of selection, so it might take a while.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
           The input dataframe

        y : array-like of shape (n_samples)
           Target variable. Required to train the estimator.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find numerical variables or check variables entered by user
        self.variables = _find_or_check_numerical_variables(X, self.variables)

        # train model with all features and cross-validation
        model = cross_validate(
            self.estimator,
            X[self.variables],
            y,
            cv=self.cv,
            scoring=self.scoring,
            return_estimator=True,
        )

        # store initial model performance
        self.initial_model_performance_ = model["test_score"].mean()

        # Initialize a dataframe that will contain the list of the feature/coeff
        # importance for each cross validation fold
        feature_importances_cv = pd.DataFrame()

        # Populate the feature_importances_cv dataframe with columns containing
        # the feature importance values for each model returned by the cross
        # validation.
        # There are as many columns as folds.
        for m in model["estimator"]:

            feature_importances_cv[m] = get_feature_importances(m)

        # Add the variables as index to feature_importances_cv
        feature_importances_cv.index = self.variables

        # Aggregate the feature importance returned in each fold
        self.feature_importances_ = feature_importances_cv.mean(axis=1)

        # Sort the feature importance values decreasingly
        self.feature_importances_.sort_values(ascending=False, inplace=True)

        # Extract most important feature from the ordered list of features
        first_most_important_feature = list(self.feature_importances_.index)[0]

        # Run baseline model using only the most important feature
        baseline_model = cross_validate(
            self.estimator,
            X[first_most_important_feature].to_frame(),
            y,
            cv=self.cv,
            scoring=self.scoring,
            return_estimator=True,
        )

        # Save baseline model performance
        baseline_model_performance = baseline_model["test_score"].mean()

        # list to collect selected features
        # It is initialized with the most important feature
        _selected_features = [first_most_important_feature]

        # dict to collect features and their performance_drift
        # It is initialized with the performance drift of
        # the most important feature
        self.performance_drifts_ = {first_most_important_feature: 0}

        # loop over the ordered list of features by feature importance starting
        # from the second element in the list.
        for feature in list(self.feature_importances_.index)[1:]:

            # Add feature and train new model
            model_tmp = cross_validate(
                self.estimator,
                X[_selected_features + [feature]],
                y,
                cv=self.cv,
                scoring=self.scoring,
                return_estimator=True,
            )

            # assign new model performance
            model_tmp_performance = model_tmp["test_score"].mean()

            # Calculate performance drift
            performance_drift = model_tmp_performance - baseline_model_performance

            # Save feature and performance drift
            self.performance_drifts_[feature] = performance_drift

            # If the performance drift is bigger than the threshold,
            # keep the feature
            if performance_drift > self.threshold:

                # add feature to the list of selected features
                _selected_features.append(feature)

                # Update new baseline model performance
                baseline_model_performance = model_tmp_performance

        self.features_to_drop_ = [
            f for f in self.variables if f not in _selected_features
        ]

        self.input_shape_ = X.shape

        return self
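This fit matches a forward-addition-by-performance-drift selector; assuming the surrounding class is (or behaves like) feature_engine's RecursiveFeatureAddition, which sets the same attributes, a hedged usage sketch:

from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso
from feature_engine.selection import RecursiveFeatureAddition

data = load_diabetes(as_frame=True)
X, y = data.data, data.target

# class name and arguments are an assumption based on the attributes set above
sel = RecursiveFeatureAddition(estimator=Lasso(alpha=0.1), scoring="r2",
                               cv=3, threshold=0.01)
sel.fit(X, y)
print(sel.performance_drifts_)  # drift recorded for each candidate feature
print(sel.features_to_drop_)    # features whose addition did not help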
    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Find the correlated feature groups. Determine which feature should be selected
        from each group.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas series. Default = None
            y is needed if selection_method == 'model_performance'.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find all numerical variables or check those entered are in the dataframe
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        if self.selection_method == "model_performance" and y is None:
            raise ValueError("y is needed to fit the transformer")

        # FIND CORRELATED FEATURES
        # ========================
        # create tuples of correlated feature groups
        self.correlated_feature_sets_ = []

        # the correlation matrix
        _correlated_matrix = X[self.variables_].corr(method=self.method)

        # create set of examined features, helps to determine feature combinations
        # to evaluate below
        _examined_features = set()

        # for each feature in the dataset (columns of the correlation matrix)
        for feature in _correlated_matrix.columns:

            if feature not in _examined_features:

                # append so we can exclude when we create the combinations
                _examined_features.add(feature)

                # here we collect potentially correlated features
                # we need this for the correlated groups sets
                _temp_set = set([feature])

                # features that have not been examined, are not currently examined and
                # were not found correlated
                _features_to_compare = [
                    f for f in _correlated_matrix.columns
                    if f not in _examined_features
                ]

                # create combinations:
                for f2 in _features_to_compare:

                    # if the correlation is higher than the threshold
                    # we are interested in absolute correlation coefficient value
                    if abs(_correlated_matrix.loc[f2,
                                                  feature]) > self.threshold:
                        # add feature (f2) to our correlated set
                        _temp_set.add(f2)
                        _examined_features.add(f2)

                # if there are correlated features
                if len(_temp_set) > 1:
                    self.correlated_feature_sets_.append(_temp_set)

        # SELECT 1 FEATURE FROM EACH GROUP
        # ================================

        # list to collect selected features
        # we start it with all features that were either not examined, i.e., categorical
        # variables, or not found correlated
        _selected_features = [
            f for f in X.columns
            if f not in set().union(*self.correlated_feature_sets_)
        ]

        # select the feature with least missing values
        if self.selection_method == "missing_values":
            for feature_group in self.correlated_feature_sets_:
                f = X[feature_group].isnull().sum().sort_values(
                    ascending=True).index[0]
                _selected_features.append(f)

        # select the feature with most unique values
        elif self.selection_method == "cardinality":
            for feature_group in self.correlated_feature_sets_:
                f = X[feature_group].nunique().sort_values(
                    ascending=False).index[0]
                _selected_features.append(f)

        # select the feature with biggest variance
        elif self.selection_method == "variance":
            for feature_group in self.correlated_feature_sets_:
                f = X[feature_group].std().sort_values(
                    ascending=False).index[0]
                _selected_features.append(f)

        # select best performing feature according to estimator
        else:
            for feature_group in self.correlated_feature_sets_:

                temp_perf = []

                # train a model for every feature
                for feature in feature_group:
                    model = cross_validate(
                        self.estimator,
                        X[feature].to_frame(),
                        y,
                        cv=self.cv,
                        return_estimator=False,
                        scoring=self.scoring,
                    )

                    temp_perf.append(model["test_score"].mean())

                # select best performing feature from group
                f = list(feature_group)[temp_perf.index(max(temp_perf))]
                _selected_features.append(f)

        self.features_to_drop_ = [
            f for f in self.variables_ if f not in _selected_features
        ]

        self.n_features_in_ = X.shape[1]

        return self
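The three model-free selection rules (missing_values, cardinality, variance) reduce to one-liners. A minimal sketch on a made-up two-feature group:

import numpy as np
import pandas as pd

X = pd.DataFrame({
    "a": [1.0, 2.0, np.nan, 4.0],
    "b": [1.1, 2.1, 3.1, 4.1],
})
feature_group = ["a", "b"]

# fewest missing values, most unique values, largest standard deviation
by_na = X[feature_group].isnull().sum().sort_values(ascending=True).index[0]
by_card = X[feature_group].nunique().sort_values(ascending=False).index[0]
by_std = X[feature_group].std().sort_values(ascending=False).index[0]

print(by_na, by_card, by_std)  # 'b', 'b', 'a'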
    def fit(self, X, y):
        """
        Learns the numbers that should be used to replace the categories in each
        variable. That is the WoE or ratio of probability.
        
        Parameters
        ----------
        
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the categorical variables.

        y : pandas series.
            Target, must be binary [0,1].

        Attributes
        ----------

        encoder_dict_: dictionary
            The dictionary containing the {category: WoE / ratio} pairs per variable.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find categorical variables or check that those entered by the user
        # are of type object
        self.variables = _find_categorical_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        if y is None:
            raise ValueError(
                'Please provide a target y for this encoding method')

        # check that y is binary
        if any(label not in [0, 1] for label in y.unique()):
            raise ValueError(
                "This encoder is only designed for binary classification. The values "
                "of y can only be 0 or 1."
            )

        temp = pd.concat([X, y], axis=1)
        temp.columns = list(X.columns) + ['target']

        self.encoder_dict_ = {}

        if self.encoding_method == 'woe':
            total_pos = temp['target'].sum()
            total_neg = len(temp) - total_pos
            temp['non_target'] = np.where(temp['target'] == 1, 0, 1)

            for var in self.variables:
                pos = temp.groupby([var])['target'].sum() / total_pos
                neg = temp.groupby([var])['non_target'].sum() / total_neg

                t = pd.concat([pos, neg], axis=1)
                t['woe'] = np.log(t['target'] / t['non_target'])

                if not t.loc[t['target'] == 0, :].empty or not t.loc[
                        t['non_target'] == 0, :].empty:
                    raise ValueError(
                        "The proportion of one of the classes for a category in "
                        "variable {} is zero, and log of zero is not defined".format(var))

                self.encoder_dict_[var] = t['woe'].to_dict()

        else:
            for var in self.variables:
                t = temp.groupby(var)['target'].mean()
                t = pd.concat([t, 1 - t], axis=1)
                t.columns = ['p1', 'p0']

                if self.encoding_method == 'log_ratio':
                    if not t.loc[t['p0'] == 0, :].empty or not t.loc[
                            t['p1'] == 0, :].empty:
                        raise ValueError(
                            "p(0) or p(1) for a category in variable {} is zero, log of zero is not defined"
                            .format(var))
                    else:
                        self.encoder_dict_[var] = (np.log(t.p1 /
                                                          t.p0)).to_dict()

                elif self.encoding_method == 'ratio':
                    if not t.loc[t['p0'] == 0, :].empty:
                        raise ValueError(
                            "p(0) for a category in variable {} is zero, division by 0 is not defined"
                            .format(var))
                    else:
                        self.encoder_dict_[var] = (t.p1 / t.p0).to_dict()

        self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

        self.input_shape_ = X.shape

        return self
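The weight of evidence for a category is log(p(category | y=1) / p(category | y=0)). A worked sketch of the 'woe' branch with made-up data:

import numpy as np
import pandas as pd

temp = pd.DataFrame({
    "colour": ["blue", "blue", "red", "red", "red", "blue"],
    "target": [1, 0, 1, 1, 0, 0],
})

total_pos = temp["target"].sum()                  # 3 positives
total_neg = len(temp) - total_pos                 # 3 negatives
temp["non_target"] = np.where(temp["target"] == 1, 0, 1)

pos = temp.groupby("colour")["target"].sum() / total_pos      # blue 1/3, red 2/3
neg = temp.groupby("colour")["non_target"].sum() / total_neg  # blue 2/3, red 1/3

woe = np.log(pos / neg)
print(woe.to_dict())  # {'blue': -0.693..., 'red': 0.693...}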
Example no. 8
    def fit(self, X, y=None):
        """
        Learns the frequent categories for each variable.
        
        Parameters
        ----------
        
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just selected variables

        y : None
            y is not required. You can pass y or None.

        Attributes
        ----------

        encoder_dict_: dictionary
            The dictionary containing the frequent categories (that will be kept)
            for each variable. Categories not present in this list will be replaced
            by 'Rare' or by the user defined value.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find categorical variables or check that those entered by the user
        # are of type object
        self.variables = _find_categorical_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        self.encoder_dict_ = {}

        for var in self.variables:
            if len(X[var].unique()) > self.n_categories:

                # if the variable has more than the indicated number of categories
                # the encoder will learn the most frequent categories
                t = pd.Series(X[var].value_counts() / float(len(X)))

                # non-rare labels:
                freq_idx = t[t >= self.tol].index

                if self.max_n_categories:
                    self.encoder_dict_[var] = freq_idx[:self.max_n_categories]
                else:
                    self.encoder_dict_[var] = freq_idx

            else:
                # if the total number of categories is smaller than the indicated
                # the encoder will consider all categories as frequent.
                warnings.warn("The number of unique categories for variable {} is less than that indicated in "
                              "n_categories. Thus, all categories will be considered frequent".format(var))
                self.encoder_dict_[var] = X[var].unique()

        self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

        self.input_shape_ = X.shape

        return self
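The frequency rule above in isolation: categories whose relative frequency falls below tol are treated as rare. A sketch with made-up data:

import pandas as pd

s = pd.Series(["a"] * 50 + ["b"] * 40 + ["c"] * 7 + ["d"] * 3)

tol = 0.05  # categories seen in less than 5% of rows are rare
t = s.value_counts() / float(len(s))

frequent = t[t >= tol].index
print(list(frequent))  # ['a', 'b', 'c'] -- 'd' (3%) would be replaced by 'Rare'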
    def fit(self, X, y):
        """

        Args
        ----

        X: pandas dataframe of shape = [n_samples, n_features]
           The input dataframe

        y: array-like of shape (n_samples)
           Target variable. Required to train the estimator.


        Returns
        -------

        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find numerical variables or check variables entered by user
        self.variables = _find_numerical_variables(X, self.variables)

        # train model with all features and cross-validation
        model = cross_validate(
            self.estimator,
            X[self.variables],
            y,
            cv=self.cv,
            scoring=self.scoring,
            return_estimator=True,
        )

        # store initial model performance
        self.initial_model_performance_ = model["test_score"].mean()

        # Initialize a dataframe that will contain the list of the feature/coeff
        # importance for each cross validation fold
        feature_importances_cv = pd.DataFrame()

        # Populate the feature_importances_cv dataframe with columns containing
        # the feature importance values for each model returned by the cross
        # validation.
        # There are as many columns as folds.
        for m in model["estimator"]:

            feature_importances_cv[m] = get_feature_importances(m)

        # Add the variables as index to feature_importances_cv
        feature_importances_cv.index = self.variables

        # Aggregate the feature importance returned in each fold
        self.feature_importances_ = feature_importances_cv.mean(axis=1)

        # Sort the feature importance values increasingly, least important first
        self.feature_importances_.sort_values(ascending=True, inplace=True)

        # list to collect selected features
        self.selected_features_ = []

        # temporary copy where we will remove features recursively
        X_tmp = X[self.variables].copy()

        # we need to update the performance as we remove features
        baseline_model_performance = self.initial_model_performance_

        # dict to collect features and their performance_drift after shuffling
        self.performance_drifts_ = {}

        # evaluate every feature, starting from the least important
        # remember that feature_importances_ is ordered already
        for feature in list(self.feature_importances_.index):

            # remove feature and train new model
            model_tmp = cross_validate(
                self.estimator,
                X_tmp.drop(columns=feature),
                y,
                cv=self.cv,
                scoring=self.scoring,
                return_estimator=False,
            )

            # assign new model performance
            model_tmp_performance = model_tmp["test_score"].mean()

            # Calculate performance drift
            performance_drift = baseline_model_performance - model_tmp_performance

            # Save feature and performance drift
            self.performance_drifts_[feature] = performance_drift

            if performance_drift > self.threshold:

                self.selected_features_.append(feature)

            else:
                # remove feature and adjust initial performance
                X_tmp = X_tmp.drop(columns=feature)

                baseline_model = cross_validate(
                    self.estimator,
                    X_tmp,
                    y,
                    cv=self.cv,
                    return_estimator=False,
                    scoring=self.scoring,
                )

                # update the baseline model performance
                baseline_model_performance = baseline_model["test_score"].mean()

        self.input_shape_ = X.shape

        return self
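The decision rule inside the loop, in isolation: a feature is kept when removing it costs more than threshold in cross-validated score (numbers made up):

baseline_model_performance = 0.85
model_tmp_performance = 0.80  # score after dropping the candidate feature
threshold = 0.01

performance_drift = baseline_model_performance - model_tmp_performance  # 0.05
keep_feature = performance_drift > threshold
print(keep_feature)  # True: dropping the feature hurts, so it is selected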
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Find the important features.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
           The input dataframe
        y : array-like of shape (n_samples)
           Target variable. Required to train the estimator.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # reset the index
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)

        # find numerical variables or check variables entered by user
        self.variables = _find_or_check_numerical_variables(X, self.variables)

        # train model with all features and cross-validation
        model = cross_validate(
            self.estimator,
            X,
            y,
            cv=self.cv,
            return_estimator=True,
            scoring=self.scoring,
        )

        # store initial model performance
        self.initial_model_performance_ = model["test_score"].mean()

        # get performance metric
        scorer = get_scorer(self.scoring)

        # dict to collect features and their performance_drift after shuffling
        self.performance_drifts_ = {}

        # list to collect selected features
        self.selected_features_ = []

        # shuffle features and save feature performance drift into a dict
        for feature in self.variables:

            X_shuffled = X.copy()

            # shuffle individual feature
            X_shuffled[feature] = (
                X_shuffled[feature].sample(frac=1).reset_index(drop=True)
            )

            # determine the performance with the shuffled feature
            performance = np.mean(
                [scorer(m, X_shuffled, y) for m in model["estimator"]]
            )

            # determine drift in performance
            # Note: sklearn negates the log and error scores, so there is no need
            # to manually do the inversion. See:
            # https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules
            performance_drift = self.initial_model_performance_ - performance

            # Save feature and performance drift
            self.performance_drifts_[feature] = performance_drift

        # select features
        for feature in self.performance_drifts_.keys():

            if self.performance_drifts_[feature] > self.threshold:

                self.selected_features_.append(feature)

        self.input_shape_ = X.shape

        return self
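A self-contained sketch of the shuffling step: permute one column, re-score the estimators fitted during cross-validation, and record the drop. Data and estimator are made up:

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import get_scorer
from sklearn.model_selection import cross_validate

rng = np.random.RandomState(0)
X = pd.DataFrame({"signal": rng.normal(size=300), "noise": rng.normal(size=300)})
y = 3 * X["signal"] + rng.normal(scale=0.1, size=300)

model = cross_validate(LinearRegression(), X, y, cv=3,
                       return_estimator=True, scoring="r2")
initial = model["test_score"].mean()
scorer = get_scorer("r2")

for feature in X.columns:
    X_shuffled = X.copy()
    # shuffle one feature, keep everything else intact
    X_shuffled[feature] = (
        X_shuffled[feature].sample(frac=1, random_state=0).reset_index(drop=True)
    )
    performance = np.mean([scorer(m, X_shuffled, y) for m in model["estimator"]])
    print(feature, initial - performance)  # large drift for "signal", ~0 for "noise"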
Example no. 11
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the values that should be used to replace outliers.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y : pandas Series, default=None
            y is not needed in this transformer. You can pass y or None.

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables_ = _find_or_check_numerical_variables(X, self.variables)

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        self.right_tail_caps_ = {}
        self.left_tail_caps_ = {}

        # estimate the end values
        if self.tail in ["right", "both"]:
            if self.capping_method == "gaussian":
                self.right_tail_caps_ = (
                    X[self.variables_].mean() +
                    self.fold * X[self.variables_].std()).to_dict()

            elif self.capping_method == "iqr":
                IQR = X[self.variables_].quantile(0.75) - X[
                    self.variables_].quantile(0.25)
                self.right_tail_caps_ = (X[self.variables_].quantile(0.75) +
                                         (IQR * self.fold)).to_dict()

            elif self.capping_method == "quantiles":
                self.right_tail_caps_ = (
                    X[self.variables_].quantile(1 - self.fold).to_dict())

        if self.tail in ["left", "both"]:
            if self.capping_method == "gaussian":
                self.left_tail_caps_ = (
                    X[self.variables_].mean() -
                    self.fold * X[self.variables_].std()).to_dict()

            elif self.capping_method == "iqr":
                IQR = X[self.variables_].quantile(0.75) - X[
                    self.variables_].quantile(0.25)
                self.left_tail_caps_ = (X[self.variables_].quantile(0.25) -
                                        (IQR * self.fold)).to_dict()

            elif self.capping_method == "quantiles":
                self.left_tail_caps_ = X[self.variables_].quantile(
                    self.fold).to_dict()

        self.n_features_in_ = X.shape[1]

        return self
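The three capping formulas can be checked directly on a single column. A sketch with made-up data (note that fold means a number of standard deviations or IQRs for the gaussian and iqr methods, but a quantile such as 0.05 for the quantiles method):

import numpy as np
import pandas as pd

s = pd.Series(np.random.RandomState(0).normal(loc=10, scale=2, size=1000))

fold = 3
gaussian_right = s.mean() + fold * s.std()

IQR = s.quantile(0.75) - s.quantile(0.25)
iqr_right = s.quantile(0.75) + IQR * fold

quantile_right = s.quantile(1 - 0.05)  # fold=0.05 for the quantile method

print(gaussian_right, iqr_right, quantile_right)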
Example no. 12
def test_is_dataframe(df_vartypes):
    assert_frame_equal(_is_dataframe(df_vartypes), df_vartypes)
    with pytest.raises(TypeError):
        assert _is_dataframe([1, 2, 4])
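_is_dataframe itself is not shown in this section. A plausible minimal reconstruction consistent with the test above (the real implementation may differ):

import pandas as pd

def _is_dataframe(X):
    # hypothetical sketch: accept only pandas DataFrames and return a copy,
    # so transformers never mutate the user's original data
    if not isinstance(X, pd.DataFrame):
        raise TypeError("The data set should be a pandas dataframe")
    return X.copy()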
Example no. 13
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learns the values that should be used to replace outliers.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y : None
            y is not needed in this transformer. You can pass y or None.

        Attributes
        ----------

        right_tail_caps_: dictionary
            The dictionary containing the maximum values at which variables
            will be capped.

        left_tail_caps_ : dictionary
            The dictionary containing the minimum values at which variables
            will be capped.
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables
        self.variables = _find_numerical_variables(X, self.variables)

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables)

        self.right_tail_caps_ = {}
        self.left_tail_caps_ = {}

        # estimate the end values
        if self.tail in ["right", "both"]:
            if self.capping_method == "gaussian":
                self.right_tail_caps_ = (
                    X[self.variables].mean() +
                    self.fold * X[self.variables].std()).to_dict()

            elif self.capping_method == "iqr":
                IQR = X[self.variables].quantile(0.75) - X[
                    self.variables].quantile(0.25)
                self.right_tail_caps_ = (X[self.variables].quantile(0.75) +
                                         (IQR * self.fold)).to_dict()

            elif self.capping_method == "quantiles":
                self.right_tail_caps_ = (
                    X[self.variables].quantile(1 - self.fold).to_dict())

        if self.tail in ["left", "both"]:
            if self.capping_method == "gaussian":
                self.left_tail_caps_ = (
                    X[self.variables].mean() -
                    self.fold * X[self.variables].std()).to_dict()

            elif self.capping_method == "iqr":
                IQR = X[self.variables].quantile(0.75) - X[
                    self.variables].quantile(0.25)
                self.left_tail_caps_ = (X[self.variables].quantile(0.25) -
                                        (IQR * self.fold)).to_dict()

            elif self.capping_method == "quantiles":
                self.left_tail_caps_ = X[self.variables].quantile(
                    self.fold).to_dict()

        self.input_shape_ = X.shape

        return self
    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Find duplicated features.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The input dataframe.
        y : None
            y is not needed for this transformer. You can pass y or None.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find all variables or check those entered are in the dataframe
        self.variables = _find_all_variables(X, self.variables)

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables)

        # create tuples of duplicated feature groups
        self.duplicated_feature_sets_ = []

        # set to collect features that are duplicated
        self.features_to_drop_ = set()  # type: ignore

        # create set of examined features
        _examined_features = set()

        for feature in self.variables:

            # append so we can remove when we create the combinations
            _examined_features.add(feature)

            if feature not in self.features_to_drop_:

                _temp_set = set([feature])

                # features that have not been examined, are not currently examined and
                # were not found duplicates
                _features_to_compare = [
                    f for f in self.variables if f not in
                    _examined_features.union(self.features_to_drop_)
                ]

                # create combinations:
                for f2 in _features_to_compare:

                    if X[feature].equals(X[f2]):
                        self.features_to_drop_.add(f2)
                        _temp_set.add(f2)

                # if there are duplicated features
                if len(_temp_set) > 1:
                    self.duplicated_feature_sets_.append(_temp_set)

        self.input_shape_ = X.shape

        return self
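The pairwise equality check in isolation, on a made-up frame with one duplicated column:

import pandas as pd

X = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [9, 8, 7]})

features_to_drop = set()
examined = set()
duplicated_sets = []

for feature in X.columns:
    examined.add(feature)
    if feature in features_to_drop:
        continue
    group = {feature}
    # compare only against columns not yet examined or already flagged
    for f2 in [f for f in X.columns if f not in examined | features_to_drop]:
        if X[feature].equals(X[f2]):
            features_to_drop.add(f2)
            group.add(f2)
    if len(group) > 1:
        duplicated_sets.append(group)

print(features_to_drop, duplicated_sets)  # {'b'} [{'a', 'b'}]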
Example no. 15
    def fit(self, X, y=None):
        """
        Find duplicated features.

        Parameters
        ----------

        X: pandas dataframe of shape = [n_samples, n_features]
            The input dataframe.

        y: None
            y is not needed for this transformer. You can pass y or None.


        Attributes
        ----------

        duplicated_features_: set
            The duplicated features.

        duplicated_feature_sets_: list
            Groups of duplicated features. Or in other words, features that are
            duplicated with each other. Each list represents a group of duplicated
            features.
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find all variables or check those entered are in the dataframe
        self.variables = _find_all_variables(X, self.variables)

        # create tuples of duplicated feature groups
        self.duplicated_feature_sets_ = []

        # set to collect features that are duplicated
        self.duplicated_features_ = set()

        # create set of examined features
        _examined_features = set()

        for feature in self.variables:

            # append so we can remove when we create the combinations
            _examined_features.add(feature)

            if feature not in self.duplicated_features_:

                _temp_set = set([feature])

                # features that have not been examined, are not currently examined and
                # were not found duplicates
                _features_to_compare = [
                    f for f in self.variables if f not in
                    _examined_features.union(self.duplicated_features_)
                ]

                # create combinations:
                for f2 in _features_to_compare:

                    if X[feature].equals(X[f2]):
                        self.duplicated_features_.add(f2)
                        _temp_set.add(f2)

                # if there are duplicated features
                if len(_temp_set) > 1:
                    self.duplicated_feature_sets_.append(_temp_set)

        self.input_shape_ = X.shape

        return self
Example no. 16
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Apply the transformation to the dataframe. Only the selected variables will be
        modified.

        If the transformer is the OneHotEncoder, the dummy features will be concatenated
        to the input dataset. Note that the original categorical variables will not be
        removed from the dataset after encoding. If this is the desired effect, please
        use Feature-engine's OneHotEncoder instead.

        Parameters
        ----------
        X: Pandas DataFrame
            The data to transform

        Raises
        ------
         TypeError
            If the input is not a Pandas DataFrame

        Returns
        -------
        X: Pandas DataFrame
            The transformed dataset.
        """

        # check that input is a dataframe
        X = _is_dataframe(X)

        # Check that the input data contains the same number of columns
        # as the dataframe used to fit the transformer.
        _check_input_matches_training_df(X, self.n_features_in_)

        if self.transformer_.__class__.__name__ == "OneHotEncoder":
            ohe_results_as_df = pd.DataFrame(
                data=self.transformer_.transform(X[self.variables_]),
                columns=self.transformer_.get_feature_names(self.variables_),
            )
            X = pd.concat([X, ohe_results_as_df], axis=1)

        elif self.transformer_.__class__.__name__ in [
                "SelectKBest",
                "SelectPercentile",
                "SelectFromModel",
        ]:

            # the variables selected by the transformer
            selected_variables = X.columns[self.transformer_.get_support(
                indices=True)]

            # the variables that were not examined, in case there are any
            remaining_variables = [
                var for var in X.columns if var not in self.variables_
            ]

            X = X[list(selected_variables) + list(remaining_variables)]

        else:
            X[self.variables_] = self.transformer_.transform(
                X[self.variables_])

        return X
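Assuming this transform belongs to a wrapper like feature_engine's SklearnTransformerWrapper (an assumption based on the attribute names), a hedged usage sketch:

import pandas as pd
from sklearn.preprocessing import StandardScaler
from feature_engine.wrappers import SklearnTransformerWrapper

X = pd.DataFrame({"x1": [1.0, 2.0, 3.0], "x2": [10.0, 20.0, 30.0]})

wrapper = SklearnTransformerWrapper(transformer=StandardScaler(),
                                    variables=["x1", "x2"])
Xt = wrapper.fit_transform(X)
print(Xt)  # still a DataFrame, with x1 and x2 scaled in place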
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Find the important features.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
           The input dataframe

        y : array-like of shape (n_samples)
           Target variable. Required to train the estimator.

        Returns
        -------
        self
        """
        # check input dataframe
        X = _is_dataframe(X)

        # check variables
        self.variables = _find_all_variables(X, self.variables)

        # check if df contains na
        _check_contains_na(X, self.variables)

        # limit df to variables to smooth code below
        X = X[self.variables].copy()

        # find categorical and numerical variables
        self.variables_categorical_ = list(X.select_dtypes(include="O").columns)
        self.variables_numerical_ = list(
            X.select_dtypes(include=["float", "integer"]).columns
        )

        # obtain cross-validation indices
        skf = StratifiedKFold(
            n_splits=self.cv, shuffle=True, random_state=self.random_state
        )
        skf.get_n_splits(X, y)

        if self.variables_categorical_ and self.variables_numerical_:
            _pipeline = self._make_combined_pipeline()

        elif self.variables_categorical_:
            _pipeline = self._make_categorical_pipeline()

        else:
            _pipeline = self._make_numerical_pipeline()

        # obtain feature performance with cross-validation
        feature_importances_cv = []

        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            _pipeline.fit(X_train, y_train)

            X_test = _pipeline.transform(X_test)

            if self.scoring == "roc_auc_score":
                tmp_split = {
                    f: roc_auc_score(y_test, X_test[f]) for f in self.variables
                }
            else:
                tmp_split = {f: r2_score(y_test, X_test[f]) for f in self.variables}

            feature_importances_cv.append(pd.Series(tmp_split))

        feature_importances_cv = pd.concat(feature_importances_cv, axis=1)

        self.feature_performance_ = feature_importances_cv.mean(  # type: ignore
            axis=1
        ).to_dict()

        self.features_to_drop_ = [
            f
            for f in self.variables
            if self.feature_performance_[f] < self.threshold
        ]

        self.input_shape_ = X.shape

        return self
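The per-feature scoring idea, simplified: each variable is scored alone against the target. This sketch skips the per-feature pipeline and scores the raw column directly, with made-up data:

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

rng = np.random.RandomState(0)
y = pd.Series(rng.randint(0, 2, size=500))
X = pd.DataFrame({
    "good": y * 2 + rng.normal(size=500),  # correlated with the target
    "bad": rng.normal(size=500),           # pure noise
})

feature_performance = {f: roc_auc_score(y, X[f]) for f in X.columns}
print(feature_performance)  # 'good' well above 0.5, 'bad' close to 0.5

threshold = 0.6
features_to_drop = [f for f in X.columns if feature_performance[f] < threshold]
print(features_to_drop)  # ['bad']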
Example no. 18
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Combine the variables with the mathematical operations.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X: Pandas dataframe, shape = [n_samples, n_features + n_operations]
            The dataframe with the operations results added as columns.
        """

        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # Check if input data contains same number of columns as dataframe used to fit.
        _check_input_matches_training_df(X, self.n_features_in_)

        # check if dataset contains na
        if self.missing_values == "raise":
            _check_contains_na(X, self.reference_variables)
            _check_contains_na(X, self.variables_to_combine)

            _check_contains_inf(X, self.reference_variables)
            _check_contains_inf(X, self.variables_to_combine)

        # cannot divide by 0, as will result in error
        if "div" in self.operations:
            if X[self.reference_variables].isin([0]).any().any():
                raise ValueError(
                    "Some of the reference variables contain 0 as values. Check and "
                    "remove those before using this transformer.")

        original_col_names = [var for var in X.columns]
        # Add the new features and their values to the dataframe.
        if "sub" in self.operations:
            for reference in self.reference_variables:
                varname = [
                    str(var) + "_sub_" + str(reference)
                    for var in self.variables_to_combine
                ]
                X[varname] = X[self.variables_to_combine].sub(X[reference],
                                                              axis=0)
        if "div" in self.operations:
            for reference in self.reference_variables:
                varname = [
                    str(var) + "_div_" + str(reference)
                    for var in self.variables_to_combine
                ]
                X[varname] = X[self.variables_to_combine].div(X[reference],
                                                              axis=0)
        if "add" in self.operations:
            for reference in self.reference_variables:
                varname = [
                    str(var) + "_add_" + str(reference)
                    for var in self.variables_to_combine
                ]
                X[varname] = X[self.variables_to_combine].add(X[reference],
                                                              axis=0)
        if "mul" in self.operations:
            for reference in self.reference_variables:
                varname = [
                    str(var) + "_mul_" + str(reference)
                    for var in self.variables_to_combine
                ]
                X[varname] = X[self.variables_to_combine].mul(X[reference],
                                                              axis=0)

        # replace the auto-generated variable names with the user-provided ones.
        if self.new_variables_names:
            X.columns = original_col_names + self.new_variables_names

        return X
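The naming and broadcasting scheme in isolation, for one reference variable and the div operation (made-up data):

import pandas as pd

X = pd.DataFrame({"height": [1.8, 1.6], "weight": [80.0, 60.0], "ref": [2.0, 4.0]})
variables_to_combine = ["height", "weight"]
reference = "ref"

# one new column per combined variable, divided row-wise by the reference
varname = [str(var) + "_div_" + reference for var in variables_to_combine]
X[varname] = X[variables_to_combine].div(X[reference], axis=0)

print(X.columns.tolist())  # [..., 'height_div_ref', 'weight_div_ref']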
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        This transformer does not learn parameters.

        Perform dataframe checks. Creates dictionary of operation to new feature
        name pairs.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples. Can be the entire dataframe, not just the
            variables to transform.

        y : pandas Series, or np.array. Defaults to None.
            It is not needed in this transformer. You can pass y or None.

        Raises
        ------
        TypeError
           - If the input is not a Pandas DataFrame
           - If any user provided variables in variables_to_combine are not numerical
        ValueError
           If the variable(s) contain null values

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # check variables to combine are numerical
        self.variables_to_combine = _find_or_check_numerical_variables(
            X, self.variables_to_combine
        )

        # check if dataset contains na
        _check_contains_na(X, self.variables_to_combine)

        if self.math_operations is None:
            self.math_operations_ = ["sum", "prod", "mean", "std", "max", "min"]
        else:
            self.math_operations_ = self.math_operations

        # dictionary of new_variable_name to operation pairs
        if self.new_variables_names:
            self.combination_dict_ = dict(
                zip(self.new_variables_names, self.math_operations_)
            )
        else:
            if all(isinstance(var, str) for var in self.variables_to_combine):
                vars_ls = self.variables_to_combine
            else:
                vars_ls = [str(var) for var in self.variables_to_combine]

            self.combination_dict_ = {
                f"{operation}({'-'.join(vars_ls)})": operation  # type: ignore
                for operation in self.math_operations_
            }

        self.input_shape_ = X.shape

        return self
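The combination_dict_ naming convention in isolation (made-up variables):

math_operations_ = ["sum", "mean"]
variables_to_combine = ["age", "fare"]

combination_dict_ = {
    f"{operation}({'-'.join(variables_to_combine)})": operation
    for operation in math_operations_
}
print(combination_dict_)  # {'sum(age-fare)': 'sum', 'mean(age-fare)': 'mean'}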
    def transform(self, X):
        """
        Replaces missing data with random values taken from the train set.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The dataframe to be transformed.

        Returns
        -------

        X_transformed : pandas dataframe of shape = [n_samples, n_features]
            The dataframe without missing values in the transformed variables.
        """

        # Check method fit has been called
        check_is_fitted(self)

        # check that input is a dataframe
        X = _is_dataframe(X)

        # Check that the dataframe contains the same number of columns
        # as the dataframe used to fit the imputer.
        _check_input_matches_training_df(X, self.input_shape_[1])

        # random sampling with a general seed
        if self.seed == 'general':
            for feature in self.variables:
                if X[feature].isnull().sum() > 0:
                    # determine number of data points to extract at random
                    n_samples = X[feature].isnull().sum()

                    # extract values
                    random_sample = self.X_[feature].dropna().sample(
                        n_samples,
                        replace=True,
                        random_state=self.random_state)
                    # re-index: pandas needs this to add values in the correct observations
                    random_sample.index = X[X[feature].isnull()].index

                    # replace na
                    X.loc[X[feature].isnull(), feature] = random_sample

        # random sampling observation per observation
        elif self.seed == 'observation':
            for feature in self.variables:
                if X[feature].isnull().sum() > 0:

                    # loop over each observation with missing data
                    for i in X[X[feature].isnull()].index:
                        # find the seed using additional variables
                        internal_seed = _define_seed(X,
                                                     i,
                                                     self.random_state,
                                                     how=self.seeding_method)

                        # extract 1 value at random
                        random_sample = self.X_[feature].dropna().sample(
                            1, replace=True, random_state=internal_seed)
                        random_sample = random_sample.values[0]

                        # replace the missing data point
                        X.loc[i, feature] = random_sample
        return X
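The 'general' seeding branch in isolation: sample as many donor values as there are missing entries, re-index them to the missing rows, and assign. Data is made up; X_train stands for the train set stored as self.X_ at fit time:

import pandas as pd

X_train = pd.DataFrame({"age": [20.0, 30.0, 40.0, 50.0]})  # stored as self.X_
X = pd.DataFrame({"age": [25.0, None, None, 35.0]})

n_samples = X["age"].isnull().sum()  # 2 values to impute
random_sample = X_train["age"].dropna().sample(
    n_samples, replace=True, random_state=0)

# re-index so pandas writes the values at the rows with missing data
random_sample.index = X[X["age"].isnull()].index
X.loc[X["age"].isnull(), "age"] = random_sample
print(X)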