Example 1
def test_find_all_variables(df_vartypes):
    all_vars = ["Name", "City", "Age", "Marks", "dob"]
    user_vars = ["Name", "City"]
    non_existing_vars = ["Grades"]

    assert _find_all_variables(df_vartypes) == all_vars
    assert _find_all_variables(df_vartypes, ["Name", "City"]) == user_vars

    with pytest.raises(TypeError):
        _find_all_variables(df_vartypes, non_existing_vars)
Example 2
def test_find_all_variables(df_vartypes):
    all_vars = ["Name", "City", "Age", "Marks", "dob"]
    all_vars_no_dt = ["Name", "City", "Age", "Marks"]
    user_vars = ["Name", "City"]
    non_existing_vars = ["Grades"]

    assert _find_all_variables(df_vartypes) == all_vars
    assert _find_all_variables(df_vartypes,
                               exclude_datetime=True) == all_vars_no_dt
    assert _find_all_variables(df_vartypes, ["Name", "City"]) == user_vars

    with pytest.raises(KeyError):
        _find_all_variables(df_vartypes, non_existing_vars)
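
For orientation, here is a minimal sketch of a helper that satisfies the contract the second (newer) test describes. The name find_all_variables_sketch and the body are illustrative assumptions, not feature_engine's actual implementation:

# A minimal sketch of the helper under test; illustrative only.
from typing import List, Optional, Union

import pandas as pd


def find_all_variables_sketch(
    X: pd.DataFrame,
    variables: Optional[List[Union[str, int]]] = None,
    exclude_datetime: bool = False,
) -> List[Union[str, int]]:
    if variables is None:
        # take every column, optionally skipping datetime ones
        variables = [
            var for var in X.columns
            if not (exclude_datetime
                    and pd.api.types.is_datetime64_any_dtype(X[var]))
        ]
    else:
        # indexing with a list containing unknown columns makes pandas
        # raise KeyError, matching the second test's expectation
        X[variables]
    return variables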
Example 3
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the variables for which the missing indicators will be created.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas Series, default=None
            y is not needed in this imputation. You can pass None or y.
        """

        # check input dataframe
        X = check_X(X)

        # find variables for which indicator should be added
        self.variables_ = _find_all_variables(X, self.variables)

        if self.missing_only is True:
            self.variables_ = [
                var for var in self.variables_ if X[var].isnull().sum() > 0
            ]

        self._get_feature_names_in(X)

        return self
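
The missing_only branch reduces to a one-line filter over the candidate columns. A standalone sketch with illustrative data:

# Standalone sketch of the missing_only filter above: starting from all
# columns, keep only those that actually contain NaN.
import numpy as np
import pandas as pd

X = pd.DataFrame({"Age": [20, np.nan, 35], "Marks": [0.9, 0.8, 0.7]})

variables_ = list(X.columns)
variables_ = [var for var in variables_ if X[var].isnull().sum() > 0]
print(variables_)  # ['Age']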
Example 4
    def _check_or_select_variables(self, X: pd.DataFrame):
        """
        Finds categorical variables, or alternatively checks that the variables
        entered by the user are of type object (categorical).
        Checks absence of NA.

        Parameters
        ----------
        X: Pandas DataFrame

        Raises
        ------
        TypeError
            If any user provided variable is not categorical
        ValueError
            If there are no categorical variables in the df or the df is empty
            If the variable(s) contain null values
        """
        if not self.ignore_format:
            # find categorical variables or check variables entered by user are object
            self.variables_: List[Union[
                str, int]] = _find_or_check_categorical_variables(
                    X, self.variables)
        else:
            # select all variables or check variables entered by the user
            self.variables_ = _find_all_variables(X, self.variables)
Example 5
    def fit(self, X, y=None):
        """
        The `fit` method allows Scikit-learn transformers to learn the required parameters
        from the training data set.

        If the transformer is OneHotEncoder, OrdinalEncoder or SimpleImputer, all variables indicated
        in the variables parameter will be transformed. When the variables parameter is None, the
        SklearnWrapper will automatically select and transform all features in the dataset,
        numerical or otherwise.

        For all other Scikit-learn transformers only numerical variables will be transformed.
        The SklearnWrapper will check that the variables indicated in the variables parameter
        are numerical, or alternatively, if variables is None, it will automatically select
        the numerical variables in the data set.
        """

        # check input dataframe
        X = _is_dataframe(X)

        if isinstance(self.transformer, (OneHotEncoder, OrdinalEncoder, SimpleImputer)):
            self.variables = _find_all_variables(X, self.variables)
        else:
            self.variables = _find_numerical_variables(X, self.variables)

        self.transformer.fit(X[self.variables])

        self.input_shape_ = X.shape

        return self
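
The selection logic splits on the transformer type: imputers and categorical encoders receive every column, anything else only the numerical ones. A standalone sketch of that branch, using StandardScaler as a stand-in for "any other transformer" (data and names are illustrative):

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

X = pd.DataFrame({"Age": [20, 30], "City": ["A", "B"]})

for transformer in (SimpleImputer(), StandardScaler()):
    if isinstance(transformer, SimpleImputer):
        variables = list(X.columns)                          # all variables
    else:
        variables = list(X.select_dtypes("number").columns)  # numerical only
    print(type(transformer).__name__, variables)
# SimpleImputer ['Age', 'City']
# StandardScaler ['Age']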
Example 6
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Find the variables for which missing data should be evaluated to decide if a
        row should be dropped.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training data set.

        y: pandas Series, default=None
            y is not needed in this imputation. You can pass None or y.
        """

        # check input dataframe
        X = check_X(X)

        # find variables for which missing data will be evaluated
        self.variables_ = _find_all_variables(X, self.variables)

        # If user passes a threshold, then missing_only is ignored:
        if self.threshold is None and self.missing_only is True:
            self.variables_ = [
                var for var in self.variables_ if X[var].isnull().sum() > 0
            ]

        self._get_feature_names_in(X)

        return self
Example 7
    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Find constant and quasi-constant features.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The input dataframe.
        y : None
            y is not needed for this transformer. You can pass y or None.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find all variables or check those entered are present in the dataframe
        self.variables = _find_all_variables(X, self.variables)

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables)

        if self.missing_values == "include":
            X[self.variables] = X[self.variables].fillna("missing_values")

        # find constant features
        if self.tol == 1:
            self.features_to_drop_ = [
                feature for feature in self.variables
                if X[feature].nunique() == 1
            ]

        # find constant and quasi-constant features
        else:
            self.features_to_drop_ = []

            for feature in self.variables:
                # find most frequent value / category in the variable
                predominant = ((X[feature].value_counts() /
                                float(len(X))).sort_values(
                                    ascending=False).values[0])

                if predominant >= self.tol:
                    self.features_to_drop_.append(feature)

        # check we are not dropping all the columns in the df
        if len(self.features_to_drop_) == len(X.columns):
            raise ValueError(
                "The resulting dataframe will have no columns after dropping all "
                "constant or quasi-constant features. Try changing the tol value."
            )

        self.input_shape_ = X.shape

        return self
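
The quasi-constant test compares the share of the predominant value against tol. A standalone sketch of the same check, here using value_counts(normalize=True), which returns relative frequencies already sorted in descending order (data is illustrative):

import pandas as pd

X = pd.DataFrame({
    "constant": [1, 1, 1, 1, 1],
    "quasi": ["a", "a", "a", "a", "b"],  # predominant share is 0.8
    "varied": [1, 2, 3, 4, 5],
})

tol = 0.8
features_to_drop = [
    col for col in X.columns
    # .iloc[0] is the share of the most frequent value
    if X[col].value_counts(normalize=True).iloc[0] >= tol
]
print(features_to_drop)  # ['constant', 'quasi']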
Example 8
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the most frequent category if the imputation method is set to frequent.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: pandas Series, default=None
            y is not needed in this imputation. You can pass None or y.

        Raises
        ------
        TypeError
            - If the input is not a Pandas DataFrame.
            - If user enters non-categorical variables (unless ignore_format is True)
        ValueError
            If there are no categorical variables in the df or the df is empty

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # check or select the right variables
        if not self.ignore_format:
            # find categorical variables or check variables entered by user are
            # categorical
            self.variables_: List[
                Union[str, int]
            ] = _find_or_check_categorical_variables(X, self.variables)
        else:
            # select all variables or check variables entered by the user
            self.variables_ = _find_all_variables(X, self.variables)

        if self.imputation_method == "missing":
            self.imputer_dict_ = {var: self.fill_value for var in self.variables_}

        elif self.imputation_method == "frequent":
            self.imputer_dict_ = {}

            for var in self.variables_:
                mode_vals = X[var].mode()

                # careful: some variables contain multiple modes
                if len(mode_vals) == 1:
                    self.imputer_dict_[var] = mode_vals[0]
                else:
                    raise ValueError(
                        "Variable {} contains multiple frequent categories.".format(var)
                    )

        self.n_features_in_ = X.shape[1]

        return self
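
The "frequent" branch learns one mode per variable and fails loudly when the mode is ambiguous. A standalone sketch with illustrative data:

import pandas as pd

X = pd.DataFrame({"City": ["A", "A", "B", None]})

imputer_dict = {}
for var in ["City"]:
    mode_vals = X[var].mode()  # NaN is ignored by default
    if len(mode_vals) == 1:
        imputer_dict[var] = mode_vals[0]
    else:
        raise ValueError(f"Variable {var} contains multiple frequent categories.")

print(imputer_dict)  # {'City': 'A'}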
Example 9
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Fits the Scikit-learn transformer to the selected variables.

        If you enter None in the variables parameter, all variables will be
        automatically transformed by the OneHotEncoder, OrdinalEncoder or
        SimpleImputer. For the rest of the transformers, only the numerical variables
        will be selected and transformed.

        If you enter a list in the variables attribute, the SklearnTransformerWrapper
        will check that those variables exist in the dataframe and are of type
        numeric for all transformers except the OneHotEncoder, OrdinalEncoder or
        SimpleImputer, which also accept categorical variables.

        Parameters
        ----------
        X: Pandas DataFrame
            The dataset to fit the transformer
        y: pandas Series, default=None
            The target variable.

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        self.transformer_ = clone(self.transformer)

        if (self.transformer_.__class__.__name__ == "OneHotEncoder"
                and self.transformer_.sparse):
            raise AttributeError(
                "The SklearnTransformerWrapper can only wrap the OneHotEncoder if you "
                "set its sparse attribute to False")

        if self.transformer_.__class__.__name__ in [
                "OneHotEncoder",
                "OrdinalEncoder",
                "SimpleImputer",
        ]:
            self.variables_ = _find_all_variables(X, self.variables)

        else:
            self.variables_ = _find_or_check_numerical_variables(
                X, self.variables)

        self.transformer_.fit(X[self.variables_], y)

        self.n_features_in_ = X.shape[1]

        return self
Example 10
    def fit(self, X, y=None):

        """
        Find constant and quasi-constant features.

        Parameters
        ----------

        X: pandas dataframe of shape = [n_samples, n_features]
            The input dataframe.

        y: None
            y is not needed for this transformer. You can pass y or None.


        Attributes
        ----------

        constant_features_: list
            The list of constant and quasi-constant features.
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find all variables or check those entered are present in the dataframe
        self.variables = _find_all_variables(X, self.variables)

        # find constant and quasi-constant
        self.constant_features_ = []

        for feature in self.variables:

            predominant = (
                (X[feature].value_counts() / float(len(X)))
                .sort_values(ascending=False)
                .values[0]
            )

            if predominant >= self.tol:
                self.constant_features_.append(feature)

        # if total constant features is equal to total features raise an error
        if len(self.constant_features_) == len(X.columns):
            raise ValueError(
                "The resulting dataframe will have no columns after dropping all "
                "constant features."
            )

        self.input_shape_ = X.shape

        return self
Example 11
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        The `fit` method allows Scikit-learn transformers to learn the required
        parameters from the training data set.

        If the transformer is OneHotEncoder, OrdinalEncoder or SimpleImputer,
        all variables indicated in the ```variables``` parameter will be transformed.
        When the variables parameter is None, the SklearnWrapper will automatically
        select and transform all features in the dataset, numerical or otherwise.

        For all other Scikit-learn transformers only numerical variables will be
        transformed. The SklearnWrapper will check that the variables indicated in the
        variables parameter are numerical, or alternatively, if variables is None, it
        will automatically select the numerical variables in the data set.

        Parameters
        ----------
        X : Pandas DataFrame
            The dataset to fit the transformer
        y : pandas Series, default=None
            This parameter exists only for compatibility with sklearn.pipeline.Pipeline.

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        if isinstance(self.transformer,
                      (OneHotEncoder, OrdinalEncoder, SimpleImputer)):
            self.variables = _find_all_variables(X, self.variables)

        else:
            self.variables = _find_or_check_numerical_variables(
                X, self.variables)

        self.transformer.fit(X[self.variables])

        self.input_shape_ = X.shape

        return self
Example 12
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Fits the Scikit-learn transformer to the selected variables.

        Parameters
        ----------
        X: Pandas DataFrame
            The dataset to fit the transformer.

        y: pandas Series, default=None
            The target variable.
        """

        # check input dataframe
        X = check_X(X)

        self.transformer_ = clone(self.transformer)

        if self.transformer_.__class__.__name__ in [
                "OneHotEncoder",
                "OrdinalEncoder",
                "SimpleImputer",
                "FunctionTransformer",
        ]:
            self.variables_ = _find_all_variables(X, self.variables)

        else:
            self.variables_ = _find_or_check_numerical_variables(
                X, self.variables)

        self.transformer_.fit(X[self.variables_], y)

        if self.transformer_.__class__.__name__ in _SELECTORS:
            # Find features to drop.
            selected = X[self.variables_].columns[
                self.transformer_.get_support()]
            self.features_to_drop_ = [
                f for f in self.variables_ if f not in selected
            ]

        # save input features
        self.feature_names_in_ = X.columns.tolist()

        self.n_features_in_ = X.shape[1]

        return self
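
When the wrapped transformer is a selector, get_support() marks the retained columns and everything else becomes features_to_drop_. A sketch of that step with scikit-learn's SelectKBest standing in for any member of _SELECTORS (data is illustrative):

import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

X = pd.DataFrame({
    "f1": [0.1, 0.9, 0.2, 0.8],  # tracks the target closely
    "f2": [1.0, 1.0, 0.0, 0.0],
    "f3": [5.0, 5.0, 5.0, 4.0],
})
y = pd.Series([0, 1, 0, 1])

selector = SelectKBest(score_func=f_classif, k=1).fit(X, y)
selected = X.columns[selector.get_support()]
features_to_drop = [f for f in X.columns if f not in selected]
print(list(selected), features_to_drop)  # ['f1'] ['f2', 'f3']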
Example 13
    def _check_fit_input_and_variables(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Checks that input is a dataframe, finds categorical variables, or alternatively
        checks that the variables entered by the user are of type object (categorical).
        Checks absence of NA.

        Parameters
        ----------
        X: Pandas DataFrame

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame.
            If any user provided variable is not categorical
        ValueError
            If there are no categorical variables in the df or the df is empty
            If the variable(s) contain null values

        Returns
        -------
        X: Pandas DataFrame
            The same dataframe entered as parameter
        variables : list
            list of categorical variables
        """

        # check input dataframe
        X = _is_dataframe(X)

        if not self.ignore_format:
            # find categorical variables or check variables entered by user are object
            self.variables_: List[
                Union[str, int]
            ] = _find_or_check_categorical_variables(X, self.variables)
        else:
            # select all variables or check variables entered by the user
            self.variables_ = _find_all_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables_)

        return X
Example 14
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Makes a copy of the train set. Only stores a copy of the variables to impute.
        This copy is then used to randomly extract the values to fill the missing data
        during transform.

        Parameters
        ----------

        X: pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y: None
            y is not needed in this imputation. You can pass None or y.
        """

        # check input dataframe
        X = check_X(X)

        # find variables to impute
        self.variables_ = _find_all_variables(X, self.variables)

        # take a copy of the selected variables
        self.X_ = X[self.variables_].copy()

        # check the variables assigned to the random state
        if self.seed == "observation":
            self.random_state = _check_input_parameter_variables(
                self.random_state)
            if isinstance(self.random_state, (int, str)):
                self.random_state = [self.random_state]
            if self.random_state and any(
                    var not in X.columns for var in self.random_state):
                raise ValueError(
                    "There are variables assigned as random state which are not part "
                    "of the training dataframe.")

        self._get_feature_names_in(X)

        return self
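
The stored copy self.X_ is what a later transform would sample from to fill gaps. A standalone sketch of that idea (data and names are illustrative, not the class's actual transform):

import numpy as np
import pandas as pd

train = pd.DataFrame({"Age": [20, 25, 30, 35]})
X_ = train[["Age"]].copy()                    # what fit() stores

test = pd.DataFrame({"Age": [40, np.nan, np.nan]})
n_missing = int(test["Age"].isna().sum())

# draw observed training values at random to fill the missing slots
fill = X_["Age"].dropna().sample(n=n_missing, replace=True, random_state=0)
test.loc[test["Age"].isna(), "Age"] = fill.to_numpy()
print(test)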
Example 15
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Find the important features.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
           The input dataframe

        y : array-like of shape (n_samples)
           Target variable. Required to train the estimator.

        Returns
        -------
        self
        """
        # check input dataframe
        X = _is_dataframe(X)

        # check variables
        self.variables = _find_all_variables(X, self.variables)

        # check if df contains na
        _check_contains_na(X, self.variables)

        # limit df to variables to smooth code below
        X = X[self.variables].copy()

        # find categorical and numerical variables
        self.variables_categorical_ = list(X.select_dtypes(include="O").columns)
        self.variables_numerical_ = list(
            X.select_dtypes(include=["float", "integer"]).columns
        )

        # obtain cross-validation indices
        skf = StratifiedKFold(
            n_splits=self.cv, shuffle=True, random_state=self.random_state
        )
        skf.get_n_splits(X, y)

        if self.variables_categorical_ and self.variables_numerical_:
            _pipeline = self._make_combined_pipeline()

        elif self.variables_categorical_:
            _pipeline = self._make_categorical_pipeline()

        else:
            _pipeline = self._make_numerical_pipeline()

        # obtain feature performance with cross-validation
        feature_importances_cv = []

        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            _pipeline.fit(X_train, y_train)

            X_test = _pipeline.transform(X_test)

            if self.scoring == "roc_auc_score":
                tmp_split = {
                    f: roc_auc_score(y_test, X_test[f]) for f in self.variables
                }
            else:
                tmp_split = {f: r2_score(y_test, X_test[f]) for f in self.variables}

            feature_importances_cv.append(pd.Series(tmp_split))

        feature_importances_cv = pd.concat(feature_importances_cv, axis=1)

        self.feature_performance_ = feature_importances_cv.mean(  # type: ignore
            axis=1
        ).to_dict()

        self.features_to_drop_ = [
            f
            for f in self.variables
            if self.feature_performance_[f] < self.threshold
        ]

        self.input_shape_ = X.shape

        return self
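
Inside each cross-validation split, every transformed feature is scored directly against the target. A single-split sketch of that scoring step with illustrative numbers:

import pandas as pd
from sklearn.metrics import roc_auc_score

y_test = pd.Series([0, 0, 1, 1])
X_test = pd.DataFrame({"f1": [0.1, 0.4, 0.35, 0.8]})  # encoded feature used as a score

performance = {f: roc_auc_score(y_test, X_test[f]) for f in X_test.columns}
print(performance)  # {'f1': 0.75}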
Example 16
    def fit(self, X, y=None):
        """
        Find duplicated features.

        Parameters
        ----------

        X: pandas dataframe of shape = [n_samples, n_features]
            The input dataframe.

        y: None
            y is not needed for this transformer. You can pass y or None.


        Attributes
        ----------

        duplicated_features_: set
            The duplicated features.

        duplicated_feature_sets_: list
            Groups of duplicated features. Or in other words, features that are
            duplicated with each other. Each list represents a group of duplicated
            features.
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find all variables or check those entered are in the dataframe
        self.variables = _find_all_variables(X, self.variables)

        # create tuples of duplicated feature groups
        self.duplicated_feature_sets_ = []

        # set to collect features that are duplicated
        self.duplicated_features_ = set()

        # create set of examined features
        _examined_features = set()

        for feature in self.variables:

            # mark as examined so it is excluded from later comparisons
            _examined_features.add(feature)

            if feature not in self.duplicated_features_:

                _temp_set = {feature}

                # compare only against features that have not yet been examined
                # and were not already flagged as duplicates
                _features_to_compare = [
                    f for f in self.variables if f not in
                    _examined_features.union(self.duplicated_features_)
                ]

                # create combinations:
                for f2 in _features_to_compare:

                    if X[feature].equals(X[f2]):
                        self.duplicated_features_.add(f2)
                        _temp_set.add(f2)

                # if there are duplicated features
                if len(_temp_set) > 1:
                    self.duplicated_feature_sets_.append(_temp_set)

        self.input_shape_ = X.shape

        return self
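
The scan compares each feature only against columns that were neither examined nor already flagged, so each duplicate pair is tested once. A standalone sketch with illustrative data:

import pandas as pd

X = pd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [9, 8, 7]})

duplicated, examined, groups = set(), set(), []
for feature in X.columns:
    examined.add(feature)
    if feature in duplicated:
        continue
    group = {feature}
    for other in [f for f in X.columns if f not in examined | duplicated]:
        if X[feature].equals(X[other]):
            duplicated.add(other)
            group.add(other)
    if len(group) > 1:
        groups.append(group)

print(duplicated, groups)  # {'b'} [{'a', 'b'}] (set print order may vary)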
Example 17
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Find the important features.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
           The input dataframe.

        y: array-like of shape (n_samples)
           Target variable. Required to train the estimator.
        """
        # check input dataframe
        X, y = check_X_y(X, y)

        # If required exclude variables that are not in the input dataframe
        self._confirm_variables(X)

        # find all variables or check those entered are present in the dataframe
        self.variables_ = _find_all_variables(X, self.variables_, exclude_datetime=True)

        if len(self.variables_) == 1 and self.threshold is None:
            raise ValueError(
                "When evaluating a single feature you need to manually set a value "
                "for the threshold. "
                f"The transformer is evaluating the performance of {self.variables_} "
                f"and the threshold was left to {self.threshold} when initializing "
                f"the transformer."
            )

        # save input features
        self._get_feature_names_in(X)

        # set up the correct estimator
        if self.regression is True:
            est = TargetMeanRegressor(
                bins=self.bins,
                strategy=self.strategy,
            )
        else:
            est = TargetMeanClassifier(
                bins=self.bins,
                strategy=self.strategy,
            )

        self.feature_performance_ = {}

        for variable in self.variables_:
            # clone estimator
            estimator = clone(est)

            # set the estimator to evaluate the required variable
            estimator.set_params(variables=variable)

            model = cross_validate(
                estimator,
                X,
                y,
                cv=self.cv,
                scoring=self.scoring,
            )

            self.feature_performance_[variable] = model["test_score"].mean()

        # select features
        if self.threshold is None:
            threshold = pd.Series(self.feature_performance_).mean()
        else:
            threshold = self.threshold

        self.features_to_drop_ = [
            f for f in self.variables_ if self.feature_performance_[f] < threshold
        ]

        return self
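
When no threshold is supplied, the mean performance across features becomes the cut-off. A standalone sketch with illustrative numbers:

import pandas as pd

feature_performance_ = {"f1": 0.9, "f2": 0.5, "f3": 0.4}
threshold = pd.Series(feature_performance_).mean()  # 0.6
features_to_drop_ = [f for f, s in feature_performance_.items() if s < threshold]
print(features_to_drop_)  # ['f2', 'f3'] - the below-average features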
Example 18
    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Find duplicated features.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The input dataframe.
        y : None
            y is not needed for this transformer. You can pass y or None.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find all variables or check those entered are in the dataframe
        self.variables = _find_all_variables(X, self.variables)

        if self.missing_values == "raise":
            # check if dataset contains na
            _check_contains_na(X, self.variables)

        # create tuples of duplicated feature groups
        self.duplicated_feature_sets_ = []

        # set to collect features that are duplicated
        self.features_to_drop_ = set()  # type: ignore

        # create set of examined features
        _examined_features = set()

        for feature in self.variables:

            # mark as examined so it is excluded from later comparisons
            _examined_features.add(feature)

            if feature not in self.features_to_drop_:

                _temp_set = {feature}

                # compare only against features that have not yet been examined
                # and were not already flagged as duplicates
                _features_to_compare = [
                    f for f in self.variables if f not in
                    _examined_features.union(self.features_to_drop_)
                ]

                # create combinations:
                for f2 in _features_to_compare:

                    if X[feature].equals(X[f2]):
                        self.features_to_drop_.add(f2)
                        _temp_set.add(f2)

                # if there are duplicated features
                if len(_temp_set) > 1:
                    self.duplicated_feature_sets_.append(_temp_set)

        self.input_shape_ = X.shape

        return self