Beispiel #1
0
def test_find_categorical_variables(dataframe_vartypes):
    """Exercise _find_categorical_variables with explicit, missing and mixed lists."""
    categorical = ['Name', 'City']
    mixed = ['Age', 'Marks', 'Name']

    # an explicit list of categorical variables should come back equal
    selected = _find_categorical_variables(dataframe_vartypes, categorical)
    assert selected == categorical

    # with None, the helper is expected to find the same two variables itself
    detected = _find_categorical_variables(dataframe_vartypes, None)
    assert detected == categorical

    # a list that mixes 'Age' and 'Marks' with 'Name' must raise TypeError
    with pytest.raises(TypeError):
        assert _find_categorical_variables(dataframe_vartypes, mixed)
def test_find_categorical_variables(df_vartypes):
    """Cover the accepted and the rejected inputs of _find_categorical_variables."""
    expected = ["Name", "City"]

    # both an explicit list and None must yield the same variables
    for requested in (expected, None):
        assert _find_categorical_variables(df_vartypes, requested) == expected

    # 'Age' and 'Marks' listed together with 'Name' must raise TypeError
    with pytest.raises(TypeError):
        assert _find_categorical_variables(df_vartypes, ["Age", "Marks", "Name"])

    # with only the 'Age' and 'Marks' columns and no list, ValueError is expected
    numeric_only = df_vartypes[["Age", "Marks"]]
    with pytest.raises(ValueError):
        assert _find_categorical_variables(numeric_only, None)
    def fit(self, X, y=None):
        """
        Learns the frequent categories for each variable.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just selected variables.

        y : None
            y is not required. You can pass y or None.

        Attributes
        ----------

        encoder_dict_: dictionary
            The dictionary containing the frequent categories (that will be kept)
            for each variable. Categories not present in this list will be replaced
            by 'Rare' or by the user defined value.

        input_shape_: tuple
            The shape of the dataframe passed to fit.

        Returns
        -------

        self
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find categorical variables or check that those entered by the user
        # are of type object
        self.variables = _find_categorical_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        self.encoder_dict_ = {}

        # the builtin float replaces np.float, an alias that was removed
        # from NumPy; the value does not depend on the variable, so it is
        # computed once
        n_obs = float(len(X))

        for var in self.variables:
            if len(X[var].unique()) > self.n_categories:

                # if the variable has more than the indicated number of categories
                # the encoder will learn the most frequent categories
                freq = X[var].value_counts() / n_obs

                # non-rare labels: relative frequency of at least tol
                self.encoder_dict_[var] = freq[freq >= self.tol].index

            else:
                # if the total number of categories is smaller than the indicated
                # the encoder will consider all categories as frequent.
                warnings.warn(
                    "The number of unique categories for variable {} is less than that indicated in "
                    "n_categories. Thus, all categories will be considered frequent"
                    .format(var))
                self.encoder_dict_[var] = X[var].unique()

        self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

        self.input_shape_ = X.shape

        return self
    def fit(self, X, y=None):
        """
        Learn the categories for which dummy variables will be created.

        When top_categories is set, the most popular categories of each
        variable are kept. Otherwise all unique categories are kept, minus
        the last one found when drop_last is set.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the selected variables.

        y : pandas series, default=None
            Target. It is not needed in this encoder. You can pass y or
            None.

        Attributes
        ----------

        encoder_dict_: dictionary
            The dictionary containing the categories for which dummy variables
            will be created.

        input_shape_: tuple
            The shape of the dataframe passed to fit.
        """
        # validate the input and settle the variables to encode
        X = _is_dataframe(X)
        self.variables = _find_categorical_variables(X, self.variables)
        _check_contains_na(X, self.variables)

        self.encoder_dict_ = {}

        for var in self.variables:
            if self.top_categories:
                # keep only the top_categories most frequent labels
                counts = X[var].value_counts().sort_values(ascending=False)
                self.encoder_dict_[var] = list(
                    counts.head(self.top_categories).index)

            elif self.drop_last:
                # every category except the last one in order of appearance
                self.encoder_dict_[var] = list(X[var].unique())[:-1]

            else:
                self.encoder_dict_[var] = X[var].unique()

        self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

        self.input_shape_ = X.shape

        return self
    def fit(self, X, y=None):
        """
        Learn the integer that replaces each category of each variable.

        With encoding_method 'ordered', categories are numbered by ascending
        mean of the target. With 'arbitrary', they are numbered in the order
        returned by unique(). Numbering starts at 0.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the variables to be
            encoded.

        y : pandas series, default=None
            The Target. Can be None if encoding_method = 'arbitrary'.
            Otherwise, y needs to be passed when fitting the transformer.

        Raises
        ------

        ValueError
            If encoding_method is 'ordered' and y is None.
        """
        # validate the input and settle the variables to encode
        X = _is_dataframe(X)
        self.variables = _find_categorical_variables(X, self.variables)
        _check_contains_na(X, self.variables)

        method = self.encoding_method

        # the target is only needed, and only joined, for ordered encoding
        if method == 'ordered':
            if y is None:
                raise ValueError(
                    'Please provide a target y for this encoding method')

            temp = pd.concat([X, y], axis=1)
            temp.columns = [*X.columns, 'target']

        # find mappings
        self.encoder_dict_ = {}

        for var in self.variables:

            if method == 'ordered':
                target_means = temp.groupby([var])['target'].mean()
                categories = target_means.sort_values(ascending=True).index

            elif method == 'arbitrary':
                categories = X[var].unique()

            self.encoder_dict_[var] = {
                category: position
                for position, category in enumerate(categories)
            }

        self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

        self.input_shape_ = X.shape

        return self
    def _check_fit_input_and_variables(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Run the input checks shared by fit and return the checked dataframe.

        Updates self.variables with the result of
        _find_categorical_variables and then checks those variables for
        missing data.
        """
        # input must be a dataframe
        checked = _is_dataframe(X)

        # find categorical variables, or validate the ones entered by the user
        self.variables = _find_categorical_variables(checked, self.variables)

        # the selected variables are checked for na
        _check_contains_na(checked, self.variables)

        return checked
Beispiel #7
0
    def fit(self, X, y=None):
        """
        Learn the value used to fill each variable.

        With imputation_method 'missing', every variable maps to fill_value.
        With 'frequent', every variable maps to its single most frequent
        category.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the selected variables.

        y : None
            y is not needed in this imputation. You can pass None or y.

        Attributes
        ----------

        imputer_dict_: dictionary
            The dictionary mapping each variable to the most frequent category, or to
            fill_value, depending on the imputation_method.

        input_shape_: tuple
            The shape of the dataframe passed to fit.

        Raises
        ------

        ValueError
            If imputation_method is 'frequent' and mode() does not return
            exactly one value for a variable.
        """
        # validate the input and settle the variables to impute
        X = _is_dataframe(X)
        self.variables = _find_categorical_variables(X, self.variables)

        method = self.imputation_method

        if method == "missing":
            # the same fill value is used for every variable
            self.imputer_dict_ = dict.fromkeys(self.variables, self.fill_value)

        elif method == "frequent":
            self.imputer_dict_ = {}

            for var in self.variables:
                candidates = X[var].mode()

                # careful: some variables contain multiple modes
                if len(candidates) != 1:
                    raise ValueError(
                        "Variable {} contains multiple frequent categories.".
                        format(var))

                self.imputer_dict_[var] = candidates[0]

        self.input_shape_ = X.shape

        return self
    def fit(self, X, y=None):
        """
        Learns the counts or frequencies which will be used to replace the categories.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            The user can pass the entire dataframe.

        y : None
            y is not needed in this encoder. You can pass y or None.

        Attributes
        ----------

        encoder_dict_: dictionary
            Dictionary containing the {category: count / frequency} pairs for
            each variable.

        input_shape_: tuple
            The shape of the dataframe passed to fit.

        Returns
        -------

        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find categorical variables or check that those entered by the user
        # are of type object
        self.variables = _find_categorical_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        self.encoder_dict_ = {}

        # the builtin float replaces np.float, an alias that was removed
        # from NumPy; the number of rows is the same for every variable
        n_obs = float(len(X))

        # learn encoding maps
        for var in self.variables:
            counts = X[var].value_counts()

            if self.encoding_method == 'count':
                self.encoder_dict_[var] = counts.to_dict()

            elif self.encoding_method == 'frequency':
                # fraction of the rows of X taken by each category
                self.encoder_dict_[var] = (counts / n_obs).to_dict()

        self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

        self.input_shape_ = X.shape

        return self
    def fit(self, X, y):
        """
        Learn the mean of the target for each category of each variable.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the variables to be encoded.

        y : pandas series
            The target.

        Attributes
        ----------

        encoder_dict_: dictionary
            The dictionary containing the {category: target mean} pairs used
            to replace categories in every variable.

        input_shape_: tuple
            The shape of the dataframe passed to fit.

        Raises
        ------

        ValueError
            If y is None.
        """
        # validate the input and settle the variables to encode
        X = _is_dataframe(X)
        self.variables = _find_categorical_variables(X, self.variables)
        _check_contains_na(X, self.variables)

        if y is None:
            raise ValueError(
                'Please provide a target y for this encoding method')

        # put the target next to the predictors under the fixed name 'target'
        temp = pd.concat([X, y], axis=1)
        temp.columns = [*X.columns, 'target']

        self.encoder_dict_ = {}

        for var in self.variables:
            target_by_category = temp.groupby(var)['target']
            self.encoder_dict_[var] = target_by_category.mean().to_dict()

        self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

        self.input_shape_ = X.shape

        return self
Beispiel #10
0
    def fit(self, X, y=None):
        """
        Fit a two-step pipeline that encodes the categorical variables.

        The pipeline chains an OrdinalCategoricalEncoder with a
        DecisionTreeDiscretiser, both restricted to the selected variables,
        and is stored in encoder_ after being fitted on X and y.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the categorical variables.

        y : pandas series.
            The target variable. Required to train the decision tree and for
            ordered ordinal encoding.
        """
        # validate the input and settle the variables to encode
        X = _is_dataframe(X)
        self.variables = _find_categorical_variables(X, self.variables)
        _check_contains_na(X, self.variables)

        # first step: categories to integers
        ordinal_step = OrdinalCategoricalEncoder(
            encoding_method=self.encoding_method,
            variables=self.variables,
        )

        # second step: decision tree discretiser on the same variables
        tree_step = DecisionTreeDiscretiser(
            cv=self.cv,
            scoring=self.scoring,
            variables=self.variables,
            param_grid=self.param_grid,
            regression=self.regression,
            random_state=self.random_state,
        )

        steps = [
            ('categorical_encoder', ordinal_step),
            ('tree_discretiser', tree_step),
        ]
        self.encoder_ = Pipeline(steps)

        self.encoder_.fit(X, y)

        self.input_shape_ = X.shape

        return self
    def fit(self, X, y):
        """
        Learns the numbers that should be used to replace the categories in each
        variable. That is the WoE or ratio of probability.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the categorical variables.

        y : pandas series.
            Target, must be binary [0,1].

        Attributes
        ----------

        encoder_dict_: dictionary
            The dictionary containing the {category: WoE / ratio} pairs per variable.

        input_shape_: tuple
            The shape of the dataframe passed to fit.

        Raises
        ------

        ValueError
            If y is None, if y contains values other than 0 and 1, or if a
            category has a zero proportion that would make the logarithm or
            the division undefined for the selected encoding_method.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find categorical variables or check that those entered by the user
        # are of type object
        self.variables = _find_categorical_variables(X, self.variables)

        # check if dataset contains na
        _check_contains_na(X, self.variables)

        if y is None:
            raise ValueError(
                'Please provide a target y for this encoding method')

        # check that y is binary
        if any(value not in (0, 1) for value in y.unique()):
            raise ValueError(
                "This encoder is only designed for binary classification, values of y can be only 0 or 1"
            )

        temp = pd.concat([X, y], axis=1)
        temp.columns = list(X.columns) + ['target']

        self.encoder_dict_ = {}

        if self.encoding_method == 'woe':
            total_pos = temp['target'].sum()
            total_neg = len(temp) - total_pos
            temp['non_target'] = np.where(temp['target'] == 1, 0, 1)

            for var in self.variables:
                # share of all positives / all negatives found in each category
                pos = temp.groupby(var)['target'].sum() / total_pos
                neg = temp.groupby(var)['non_target'].sum() / total_neg

                t = pd.concat([pos, neg], axis=1)

                # validate before taking the logarithm, so that a zero
                # proportion raises the intended ValueError instead of first
                # triggering a NumPy divide-by-zero warning
                if (t['target'] == 0).any() or (t['non_target'] == 0).any():
                    raise ValueError(
                        "The proportion of 1 of the classes for a category in variable {} is zero, and log of zero is "
                        "not defined".format(var))

                t['woe'] = np.log(t['target'] / t['non_target'])

                self.encoder_dict_[var] = t['woe'].to_dict()

        else:
            for var in self.variables:
                # p1: mean of the target per category; p0: its complement
                t = temp.groupby(var)['target'].mean()
                t = pd.concat([t, 1 - t], axis=1)
                t.columns = ['p1', 'p0']

                if self.encoding_method == 'log_ratio':
                    if (t['p0'] == 0).any() or (t['p1'] == 0).any():
                        raise ValueError(
                            "p(0) or p(1) for a category in variable {} is zero, log of zero is not defined"
                            .format(var))

                    self.encoder_dict_[var] = (np.log(t.p1 / t.p0)).to_dict()

                elif self.encoding_method == 'ratio':
                    if (t['p0'] == 0).any():
                        raise ValueError(
                            "p(0) for a category in variable {} is zero, division by 0 is not defined"
                            .format(var))

                    self.encoder_dict_[var] = (t.p1 / t.p0).to_dict()

        self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

        self.input_shape_ = X.shape

        return self