def transform(self, X):
    """
    Group infrequent categories under the replacement label.

    For every variable in ``self.variables``, values present in the
    categories stored in ``self.encoder_dict_`` are kept, and every other
    value is overwritten with ``self.replace_with``.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    X_transformed : pandas dataframe of shape = [n_samples, n_features]
        The dataframe where rare categories have been grouped.
    """
    # the transformer must be fitted before it can be applied
    check_is_fitted(self)

    # input validation: dataframe type, missing data in the selected
    # variables, and same number of columns as the training dataframe
    X = _is_dataframe(X)
    _check_contains_na(X, self.variables)
    _check_input_matches_training_df(X, self.input_shape_[1])

    for variable in self.variables:
        frequent_labels = self.encoder_dict_[variable]
        is_frequent = X[variable].isin(frequent_labels)
        X[variable] = np.where(is_frequent, X[variable], self.replace_with)

    return X
def transform(self, X):
    """
    Replace the variables' values with the output of the fitted encoder.

    The validated dataframe is passed through ``self.encoder_.transform``.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    X_transformed : pandas dataframe of shape = [n_samples, n_features].
        Dataframe with variables encoded with decision tree predictions.
    """
    # the transformer must be fitted before it can be applied
    check_is_fitted(self)

    # input validation: dataframe type and missing data
    X = _is_dataframe(X)
    _check_contains_na(X, self.variables)

    # the number of columns must match the dataframe used in fit
    n_training_columns = self.input_shape_[1]
    _check_input_matches_training_df(X, n_training_columns)

    return self.encoder_.transform(X)
def fit(self, X, y=None):
    """
    Learn the integer that replaces each category of each variable.

    With ``encoding_method == 'ordered'`` the categories are numbered by
    ascending mean of the target; with ``'arbitrary'`` they are numbered in
    the order returned by ``unique()``. Numbering starts at 0.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just
        the variables to be encoded.

    y : pandas series, default=None
        The target. Can be None if encoding_method = 'arbitrary'.
        Otherwise, y needs to be passed when fitting the transformer.

    Raises
    ------
    ValueError
        If encoding_method is 'ordered' and y is None.
    """
    X = _is_dataframe(X)

    # find categorical variables or validate those entered by the user,
    # then make sure they do not contain missing data
    self.variables = _find_categorical_variables(X, self.variables)
    _check_contains_na(X, self.variables)

    use_target = self.encoding_method == 'ordered'

    if use_target:
        if y is None:
            raise ValueError(
                'Please provide a target y for this encoding method')

        # put predictors and target side by side to group by category
        temp = pd.concat([X, y], axis=1)
        temp.columns = list(X.columns) + ['target']

    self.encoder_dict_ = {}

    for variable in self.variables:
        if use_target:
            target_means = temp.groupby([variable])['target'].mean()
            labels = target_means.sort_values(ascending=True).index
        elif self.encoding_method == 'arbitrary':
            labels = X[variable].unique()

        # category -> position in the ordering, starting from 0
        self.encoder_dict_[variable] = dict(zip(labels, range(len(labels))))

    self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)
    self.input_shape_ = X.shape

    return self
def fit(self, X, y=None):
    """
    Learn the categories for which dummy variables will be created.

    If ``top_categories`` is set, the most frequent categories of each
    variable are learned. Otherwise all unique categories are learned,
    minus the last one when ``drop_last`` is set.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just
        the selected variables.

    y : pandas series, default=None
        Target. It is not needed in this encoder. You can pass y or None.

    Attributes
    ----------
    encoder_dict_: dictionary
        The dictionary containing the categories for which dummy variables
        will be created.
    """
    X = _is_dataframe(X)

    # find categorical variables or validate those entered by the user,
    # then make sure they do not contain missing data
    self.variables = _find_categorical_variables(X, self.variables)
    _check_contains_na(X, self.variables)

    self.encoder_dict_ = {}

    for variable in self.variables:
        if self.top_categories:
            # keep only the most frequent categories
            counts = X[variable].value_counts().sort_values(ascending=False)
            categories = list(counts.head(self.top_categories).index)
        elif self.drop_last:
            # all categories but the last one returned by unique()
            categories = list(X[variable].unique())[:-1]
        else:
            categories = X[variable].unique()

        self.encoder_dict_[variable] = categories

    self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)
    self.input_shape_ = X.shape

    return self
def fit(self, X, y=None):
    """
    Validate the input and resolve the numerical variables to work on.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y : None
        Not used by this method.

    Returns
    -------
    X : the object returned by ``_is_dataframe`` for the input.
        NOTE(review): this method returns the dataframe rather than
        ``self`` -- presumably so that callers can keep working on the
        validated frame; confirm against the callers.
    """
    checked = _is_dataframe(X)

    # find numerical variables or validate those entered by the user
    self.variables = _find_numerical_variables(checked, self.variables)

    # the selected variables must not contain missing data
    _check_contains_na(checked, self.variables)

    return checked
def fit(self, X, y):
    """
    Learn the mean value of the target for each category of each variable.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just
        the variables to be encoded.

    y : pandas series
        The target.

    Attributes
    ----------
    encoder_dict_: dictionary
        The dictionary containing the {category: target mean} pairs used
        to replace categories in every variable.

    Raises
    ------
    ValueError
        If y is None.
    """
    X = _is_dataframe(X)

    # find categorical variables or validate those entered by the user,
    # then make sure they do not contain missing data
    self.variables = _find_categorical_variables(X, self.variables)
    _check_contains_na(X, self.variables)

    if y is None:
        raise ValueError(
            'Please provide a target y for this encoding method')

    # put predictors and target side by side to group by category
    temp = pd.concat([X, y], axis=1)
    temp.columns = list(X.columns) + ['target']

    mappings = self.encoder_dict_ = {}

    for variable in self.variables:
        target_by_category = temp.groupby(variable)['target']
        mappings[variable] = target_by_category.mean().to_dict()

    self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)
    self.input_shape_ = X.shape

    return self
def fit(self, X, y=None):
    """
    Learns the counts or frequencies which will be used to replace the
    categories.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples. The user can pass the entire dataframe.

    y : None
        y is not needed in this encoder. You can pass y or None.

    Attributes
    ----------
    encoder_dict_: dictionary
        Dictionary containing the {category: count / frequency} pairs for
        each variable.
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find categorical variables or check that those entered by the user
    # are of type object
    self.variables = _find_categorical_variables(X, self.variables)

    # check if dataset contains na
    _check_contains_na(X, self.variables)

    self.encoder_dict_ = {}

    # Number of observations as a builtin float. The ``np.float`` alias
    # used previously was removed from NumPy (1.24) and raised
    # AttributeError. The value does not depend on the variable, so it is
    # computed once.
    n_obs = float(len(X))

    # learn encoding maps
    for var in self.variables:
        if self.encoding_method == 'count':
            self.encoder_dict_[var] = X[var].value_counts().to_dict()

        elif self.encoding_method == 'frequency':
            self.encoder_dict_[var] = (
                X[var].value_counts() / n_obs).to_dict()

    self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

    self.input_shape_ = X.shape

    return self
def transform(self, X):
    """
    Run the checks that precede a transformation and return the dataframe.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    X : the object returned by ``_is_dataframe`` for the input, once all
        checks have passed.
    """
    # the transformer must be fitted before it can be applied
    check_is_fitted(self)

    checked = _is_dataframe(X)

    # the selected variables must not contain missing data
    _check_contains_na(checked, self.variables)

    # the number of columns must match the dataframe used in fit
    n_training_columns = self.input_shape_[1]
    _check_input_matches_training_df(checked, n_training_columns)

    return checked
def fit(self, X, y=None):
    """
    Fit the encoding pipeline: an ordinal categorical encoder followed by
    a decision tree discretiser, both applied to ``self.variables``.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just
        the categorical variables.

    y : pandas series.
        The target variable. Required to train the decision tree and for
        ordered ordinal encoding.
    """
    X = _is_dataframe(X)

    # find categorical variables or validate those entered by the user,
    # then make sure they do not contain missing data
    self.variables = _find_categorical_variables(X, self.variables)
    _check_contains_na(X, self.variables)

    # step 1 turns categories into numbers, step 2 replaces those numbers
    # with the output of the decision tree discretiser
    steps = [
        ('categorical_encoder',
         OrdinalCategoricalEncoder(
             encoding_method=self.encoding_method,
             variables=self.variables)),
        ('tree_discretiser',
         DecisionTreeDiscretiser(
             cv=self.cv,
             scoring=self.scoring,
             variables=self.variables,
             param_grid=self.param_grid,
             regression=self.regression,
             random_state=self.random_state)),
    ]

    self.encoder_ = Pipeline(steps)
    self.encoder_.fit(X, y)

    self.input_shape_ = X.shape

    return self
def transform(self, X):
    """
    Replaces categories with the learned parameters.

    Each variable in ``self.encoder_dict_`` is mapped through its learned
    dictionary. Values without a mapping become NaN, in which case a
    warning is issued.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features].
        The input samples.

    Returns
    -------
    X_transformed : pandas dataframe of shape = [n_samples, n_features].
        The dataframe containing categories replaced by numbers.
    """
    # Check method fit has been called
    check_is_fitted(self)

    # check that input is a dataframe
    X = _is_dataframe(X)

    # check if dataset contains na
    _check_contains_na(X, self.variables)

    # Check that the dataframe contains the same number of columns
    # than the dataframe used to fit the transformer.
    _check_input_matches_training_df(X, self.input_shape_[1])

    # replace categories by the learned parameters
    for feature, mapping in self.encoder_dict_.items():
        X[feature] = X[feature].map(mapping)

    # Check if NaN values were introduced by the encoding. The columns are
    # selected with a plain list instead of a dict-keys view.
    encoded_columns = list(self.encoder_dict_)
    if X[encoded_columns].isnull().sum().sum() > 0:
        # Adjacent literals are concatenated as-is, so each fragment
        # carries its own trailing space.
        warnings.warn(
            "NaN values were introduced in the returned dataframe by the encoder. "
            "This means that some of the categories in the input dataframe were "
            "not present in the training set used when the fit method was called. "
            "Thus, mappings for those categories does not exist. Try using the "
            "RareLabelCategoricalEncoder to remove infrequent categories before "
            "calling this encoder."
        )

    return X
def fit(self, X, y=None):
    """
    Stores the user-defined capping values.

    The maximum and minimum capping dictionaries passed by the user are
    stored as ``right_tail_caps_`` and ``left_tail_caps_``; a side for
    which no dictionary was provided gets an empty dictionary.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y : None
        y is not needed in this transformer. You can pass y or None.

    Attributes
    ----------
    right_tail_caps_: dictionary
        The dictionary containing the maximum values at which variables
        will be capped.

    left_tail_caps_ : dictionary
        The dictionary containing the minimum values at which variables
        will be capped.
    """
    # check input dataframe
    X = _is_dataframe(X)

    # Find or check for numerical variables. This runs before the missing
    # data check so that the check operates on validated variables.
    self.variables = _find_numerical_variables(X, self.variables)

    if self.missing_values == 'raise':
        # check if dataset contains na
        _check_contains_na(X, self.variables)

    if self.max_capping_dict is not None:
        self.right_tail_caps_ = self.max_capping_dict
    else:
        self.right_tail_caps_ = {}

    if self.min_capping_dict is not None:
        self.left_tail_caps_ = self.min_capping_dict
    else:
        self.left_tail_caps_ = {}

    self.input_shape_ = X.shape

    return self
def transform(self, X):
    """
    Cap the variables at the learned limits, that is, censor outliers.

    Values above the limit in ``right_tail_caps_`` are set to that limit;
    values below the limit in ``left_tail_caps_`` are set to that limit.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The data to be transformed.

    Returns
    -------
    X_transformed : pandas dataframe of shape = [n_samples, n_features]
        The dataframe with the capped variables.
    """
    # the transformer must be fitted before it can be applied
    check_is_fitted(self)

    X = _is_dataframe(X)

    # missing data is only an error when the user asked for it
    if self.missing_values == 'raise':
        _check_contains_na(X, self.variables)

    # the number of columns must match the dataframe used in fit
    _check_input_matches_training_df(X, self.input_shape_[1])

    # cap the right tail
    for variable, upper in self.right_tail_caps_.items():
        X[variable] = np.where(X[variable] > upper, upper, X[variable])

    # cap the left tail
    for variable, lower in self.left_tail_caps_.items():
        X[variable] = np.where(X[variable] < lower, lower, X[variable])

    return X
def transform(self, X):
    """
    Drop the observations that contain outliers.

    Rows with a value above the limit in ``right_tail_caps_`` or below the
    limit in ``left_tail_caps_`` are removed, one variable at a time.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The data to be transformed.

    Returns
    -------
    X_transformed : pandas dataframe
        The dataframe without outlier observations.
    """
    # the transformer must be fitted before it can be applied
    check_is_fitted(self)

    X = _is_dataframe(X)

    # missing data is only an error when the user asked for it
    if self.missing_values == 'raise':
        _check_contains_na(X, self.variables)

    # the number of columns must match the dataframe used in fit
    _check_input_matches_training_df(X, self.input_shape_[1])

    # remove rows beyond the right tail limits
    for variable, upper in self.right_tail_caps_.items():
        is_outlier = np.where(X[variable] > upper, True, False)
        X = X.loc[~is_outlier]

    # remove rows beyond the left tail limits
    for variable, lower in self.left_tail_caps_.items():
        is_outlier = np.where(X[variable] < lower, True, False)
        X = X.loc[~is_outlier]

    return X
def transform(self, X):
    """
    Create the dummy / binary variables.

    For each learned category a column named ``<variable>_<category>`` is
    added, holding 1 where the variable equals the category and 0
    otherwise. The original variables are then dropped.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The data to transform.

    Returns
    -------
    X_transformed : pandas dataframe.
        The shape of the dataframe will be different from the original as
        it includes the dummy variables.
    """
    # the transformer must be fitted before it can be applied
    check_is_fitted(self)

    # input validation: dataframe type, missing data, number of columns
    X = _is_dataframe(X)
    _check_contains_na(X, self.variables)
    _check_input_matches_training_df(X, self.input_shape_[1])

    for variable in self.variables:
        for category in self.encoder_dict_[variable]:
            dummy_name = '_'.join([str(variable), str(category)])
            X[dummy_name] = np.where(X[variable] == category, 1, 0)

    # drop the original non-encoded variables
    X.drop(labels=self.variables, axis=1, inplace=True)

    return X
def inverse_transform(self, X):
    """
    Convert the encoded data back to the original representation.

    The learned {category: value} dictionaries are inverted and applied
    to each encoded variable.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features].
        The transformed dataframe.

    Returns
    -------
    X : pandas dataframe of shape = [n_samples, n_features].
        The un-transformed dataframe, that is, containing the original
        values of the categorical variables.
    """
    # the transformer must be fitted before it can be applied
    check_is_fitted(self)

    # input validation: dataframe type, missing data, number of columns
    X = _is_dataframe(X)
    _check_contains_na(X, self.variables)
    _check_input_matches_training_df(X, self.input_shape_[1])

    # replace encoded values by the original categories
    for variable, mapping in self.encoder_dict_.items():
        original_of = {code: category for category, code in mapping.items()}
        X[variable] = X[variable].map(original_of)

    return X
def fit(self, X, y=None):
    """
    Learn the values that should be used to replace outliers.

    Depending on ``distribution`` the limits are:
    'gaussian': mean +/- fold * std;
    'skewed': 75th quantile + fold * IQR and 25th quantile - fold * IQR;
    'quantiles': the (1 - fold) and fold quantiles.
    ``tail`` selects which side(s) are learned ('right', 'left', 'both').

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y : None
        y is not needed in this transformer. You can pass y or None.

    Attributes
    ----------
    right_tail_caps_: dictionary
        The dictionary containing the maximum values at which variables
        will be capped.

    left_tail_caps_ : dictionary
        The dictionary containing the minimum values at which variables
        will be capped.
    """
    X = _is_dataframe(X)

    # find numerical variables or validate those entered by the user
    self.variables = _find_numerical_variables(X, self.variables)

    # missing data is only an error when the user asked for it
    if self.missing_values == 'raise':
        _check_contains_na(X, self.variables)

    self.right_tail_caps_ = {}
    self.left_tail_caps_ = {}

    cap_right = self.tail in ['right', 'both']
    cap_left = self.tail in ['left', 'both']
    data = X[self.variables]

    if self.distribution == 'gaussian':
        centre = data.mean()
        spread = self.fold * data.std()
        if cap_right:
            self.right_tail_caps_ = (centre + spread).to_dict()
        if cap_left:
            self.left_tail_caps_ = (centre - spread).to_dict()

    elif self.distribution == 'skewed':
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        # inter-quantile range scaled by the user-defined factor
        whisker = (q3 - q1) * self.fold
        if cap_right:
            self.right_tail_caps_ = (q3 + whisker).to_dict()
        if cap_left:
            self.left_tail_caps_ = (q1 - whisker).to_dict()

    elif self.distribution == 'quantiles':
        if cap_right:
            self.right_tail_caps_ = data.quantile(1 - self.fold).to_dict()
        if cap_left:
            self.left_tail_caps_ = data.quantile(self.fold).to_dict()

    self.input_shape_ = X.shape

    return self
def fit(self, X, y=None):
    """
    Learns the frequent categories for each variable.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just
        selected variables.

    y : None
        y is not required. You can pass y or None.

    Attributes
    ----------
    encoder_dict_: dictionary
        The dictionary containing the frequent categories (that will be
        kept) for each variable. Categories not present in this list will
        be replaced by 'Rare' or by the user defined value.
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find categorical variables or check that those entered by the user
    # are of type object
    self.variables = _find_categorical_variables(X, self.variables)

    # check if dataset contains na
    _check_contains_na(X, self.variables)

    self.encoder_dict_ = {}

    # Number of observations as a builtin float. The ``np.float`` alias
    # used previously was removed from NumPy (1.24) and raised
    # AttributeError.
    n_obs = float(len(X))

    for var in self.variables:
        if len(X[var].unique()) > self.n_categories:

            # if the variable has more than the indicated number of
            # categories the encoder will learn the most frequent ones
            freq = X[var].value_counts() / n_obs

            # non-rare labels: those whose frequency reaches the tolerance
            freq_idx = freq[freq >= self.tol].index

            if self.max_n_categories:
                # keep at most max_n_categories labels, taken from the
                # start of the value_counts ordering
                self.encoder_dict_[var] = freq_idx[:self.max_n_categories]
            else:
                self.encoder_dict_[var] = freq_idx

        else:
            # if the total number of categories is smaller than the
            # indicated the encoder will consider all categories as
            # frequent.
            warnings.warn(
                "The number of unique categories for variable {} is less than that indicated in "
                "n_categories. Thus, all categories will be considered frequent"
                .format(var))
            self.encoder_dict_[var] = X[var].unique()

    self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

    self.input_shape_ = X.shape

    return self
def fit(self, X, y):
    """
    Learns the numbers that should be used to replace the categories in
    each variable. That is the WoE or ratio of probability.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples. Can be the entire dataframe, not just
        the categorical variables.

    y : pandas series.
        Target, must be binary [0,1].

    Attributes
    ----------
    encoder_dict_: dictionary
        The dictionary containing the {category: WoE / ratio} pairs per
        variable.

    Raises
    ------
    ValueError
        If y is None, if y contains values other than 0 and 1, or if, for
        some category, a proportion needed in a logarithm or as a
        denominator is zero.
    """
    # check input dataframe
    X = _is_dataframe(X)

    # find categorical variables or check that those entered by the user
    # are of type object
    self.variables = _find_categorical_variables(X, self.variables)

    # check if dataset contains na
    _check_contains_na(X, self.variables)

    if y is None:
        raise ValueError(
            'Please provide a target y for this encoding method')

    # check that y is binary
    if any(x not in [0, 1] for x in y.unique()):
        raise ValueError(
            "This encoder is only designed for binary classification, values of y can be only 0 or 1"
        )

    # put predictors and target side by side to group by category
    temp = pd.concat([X, y], axis=1)
    temp.columns = list(X.columns) + ['target']

    self.encoder_dict_ = {}

    if self.encoding_method == 'woe':
        total_pos = temp['target'].sum()
        total_neg = len(temp) - total_pos
        temp['non_target'] = np.where(temp['target'] == 1, 0, 1)

        for var in self.variables:
            # share of all positives / all negatives falling in each category
            pos = temp.groupby([var])['target'].sum() / total_pos
            neg = temp.groupby([var])['non_target'].sum() / total_neg

            t = pd.concat([pos, neg], axis=1)

            # Validate before taking the logarithm, so that a zero
            # proportion raises the ValueError without first emitting
            # divide-by-zero runtime warnings.
            if (t['target'] == 0).any() or (t['non_target'] == 0).any():
                raise ValueError(
                    "The proportion of 1 of the classes for a category in variable {} is zero, and log of zero is "
                    "not defined".format(var))

            t['woe'] = np.log(t['target'] / t['non_target'])

            self.encoder_dict_[var] = t['woe'].to_dict()

    else:
        for var in self.variables:
            # p1: mean of the target per category; p0: its complement
            t = temp.groupby(var)['target'].mean()
            t = pd.concat([t, 1 - t], axis=1)
            t.columns = ['p1', 'p0']

            if self.encoding_method == 'log_ratio':
                if (t['p0'] == 0).any() or (t['p1'] == 0).any():
                    raise ValueError(
                        "p(0) or p(1) for a category in variable {} is zero, log of zero is not defined"
                        .format(var))

                self.encoder_dict_[var] = (np.log(t.p1 / t.p0)).to_dict()

            elif self.encoding_method == 'ratio':
                if (t['p0'] == 0).any():
                    raise ValueError(
                        "p(0) for a category in variable {} is zero, division by 0 is not defined"
                        .format(var))

                self.encoder_dict_[var] = (t.p1 / t.p0).to_dict()

    self.encoder_dict_ = _check_encoding_dictionary(self.encoder_dict_)

    self.input_shape_ = X.shape

    return self