def transform(self, X):
    """
    Replace the values of the variables with decision tree predictions.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    X_transformed : pandas dataframe of shape = [n_samples, n_features].
        Whatever ``self.encoder_.transform`` returns for the validated input.
    """
    # transform only makes sense after fit
    check_is_fitted(self)

    # input must be a dataframe, free of NA in the encoded variables, and
    # with the same number of columns as the dataframe seen in fit
    validated = _is_dataframe(X)
    _check_contains_na(validated, self.variables)
    _check_input_matches_training_df(validated, self.input_shape_[1])

    # the actual encoding is delegated to the object stored in encoder_
    return self.encoder_.transform(validated)
def transform(self, X):
    """
    Return the dataframe without the variables listed in ``features_to_drop``.

    Parameters
    ----------
    X: pandas dataframe
        The input dataframe from which features will be dropped.

    Returns
    -------
    X_transformed: pandas dataframe,
        shape = [n_samples, n_features - len(features_to_drop)]
        The dataframe with the remaining subset of variables.
    """
    # fit must have been called first
    check_is_fitted(self)

    # validate type and number of columns against the training dataframe
    validated = _is_dataframe(X)
    _check_input_matches_training_df(validated, self.input_shape_[1])

    return validated.drop(columns=self.features_to_drop)
def transform(self, X: pd.DataFrame):
    """
    Drop the features stored in ``features_to_drop_`` and return the rest.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features].
        The input dataframe.

    Returns
    -------
    X_transformed: pandas dataframe of shape = [n_samples, n_selected_features]
        Pandas dataframe with the selected features.
    """
    # refuse to transform before fit
    check_is_fitted(self)

    # the input has to be a dataframe with as many columns as in training
    validated = _is_dataframe(X)
    _check_input_matches_training_df(validated, self.input_shape_[1])

    # keep only the features that were not marked for removal
    selected = validated.drop(columns=self.features_to_drop_)
    return selected
def transform(self, X):
    """
    Replace labels that are not in ``encoder_dict_`` with ``replace_with``.

    For each variable, values found in ``encoder_dict_[variable]`` are kept
    and every other value is substituted by ``self.replace_with``.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    X_transformed : pandas dataframe of shape = [n_samples, n_features]
        The dataframe where rare categories have been grouped.
    """
    # fit must have been called
    check_is_fitted(self)

    # dataframe type, absence of NA in the variables, and column count
    X = _is_dataframe(X)
    _check_contains_na(X, self.variables)
    _check_input_matches_training_df(X, self.input_shape_[1])

    for feature in self.variables:
        # True where the label is one of those stored for this feature
        keep_mask = X[feature].isin(self.encoder_dict_[feature])
        X[feature] = np.where(keep_mask, X[feature], self.replace_with)

    return X
def transform(self, X):
    """
    Return only the columns listed in ``selected_features_``.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features].
        The input dataframe from which features will be selected.

    Returns
    -------
    X_transformed: pandas dataframe of shape = [n_samples, n_selected_features]
        Pandas dataframe with the selected features.
    """
    # transform requires a previous call to fit
    check_is_fitted(self)

    # dataframe type and same number of columns as the training dataframe
    validated = _is_dataframe(X)
    _check_input_matches_training_df(validated, self.input_shape_[1])

    selected = validated[self.selected_features_]
    return selected
def transform(self, X):
    """
    Replace missing data with the learned parameters.

    Every variable in ``imputer_dict_`` has its missing values filled with
    the value stored for it in that dictionary.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    X_transformed : pandas dataframe of shape = [n_samples, n_features]
        The dataframe without missing values in the selected variables.
    """
    # Check method fit has been called
    check_is_fitted(self)

    # check that input is a dataframe
    X = _is_dataframe(X)

    # Check that input data contains same number of columns than
    # the dataframe used to fit the imputer.
    _check_input_matches_training_df(X, self.input_shape_[1])

    # Assign the filled column back to X. Calling fillna(inplace=True) on the
    # X[variable] selection is chained assignment: pandas does not guarantee
    # that it updates X, and with Copy-on-Write enabled it never does.
    for variable, fill_value in self.imputer_dict_.items():
        X[variable] = X[variable].fillna(fill_value)

    return X
def transform(self, X):
    """
    Append one binary missing indicator per variable in ``variables_``.

    Each indicator is named after the original variable plus the suffix
    '_na' and holds 1 where the original value is null and 0 otherwise.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The dataframe to be transformed.

    Returns
    -------
    X_transformed : pandas dataframe
        A copy of the validated dataframe containing the additional binary
        variables.
    """
    # fit must have been called
    check_is_fitted(self)

    # dataframe type and same number of columns as in training
    validated = _is_dataframe(X)
    _check_input_matches_training_df(validated, self.input_shape_[1])

    # work on a copy so the indicators are not added to the validated frame
    result = validated.copy()
    for feature in self.variables_:
        is_missing = result[feature].isnull()
        result[feature + '_na'] = np.where(is_missing, 1, 0)

    return result
def transform(self, X):
    """
    Return only the columns listed in ``selected_features_``.

    The index of the returned dataframe is reset (the old index is dropped).

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features].
        The input dataframe.

    Returns
    -------
    X_transformed: pandas dataframe
        of shape = [n_samples, n_features - len(dropped features)]
        Pandas dataframe with the selected features.
    """
    # transform requires a previous call to fit
    check_is_fitted(self)

    # validate the type, then discard the incoming index
    frame = _is_dataframe(X)
    frame = frame.reset_index(drop=True)

    # same number of columns as the training dataframe
    _check_input_matches_training_df(frame, self.input_shape_[1])

    selected = frame[self.selected_features_]
    return selected
def transform(self, X):
    """
    Return the dataframe without the columns in ``correlated_features_``.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features].
        The input samples.

    Returns
    -------
    X_transformed: pandas dataframe
        shape = [n_samples, n_features - (correlated features)]
        The transformed dataframe with the remaining subset of variables.
    """
    # transform requires a previous call to fit
    check_is_fitted(self)

    # dataframe type and same number of columns as the training dataframe
    validated = _is_dataframe(X)
    _check_input_matches_training_df(validated, self.input_shape_[1])

    return validated.drop(columns=self.correlated_features_)
def transform(self, X):
    """
    Return the dataframe without the columns in ``constant_features_``.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features].
        The input samples.

    Returns
    -------
    X_transformed: pandas dataframe of shape =
        [n_samples, n_features - (constant_features+quasi constant features)]
        The transformed dataframe with the remaining subset of variables.
    """
    # transform requires a previous call to fit
    check_is_fitted(self)

    # dataframe type and same number of columns as the training dataframe
    validated = _is_dataframe(X)
    _check_input_matches_training_df(validated, self.input_shape_[1])

    remaining = validated.drop(columns=self.constant_features_)
    return remaining
def transform(self, X):
    """
    Return the dataframe without the columns in ``correlated_features_``.

    When ``missing_values`` is "raise", the variables are additionally
    checked for NA before the columns are dropped.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features].
        The input samples.

    Returns
    -------
    X_transformed : pandas dataframe
        shape = [n_samples, n_features - (correlated features)]
        The transformed dataframe with the remaining subset of variables.
    """
    # transform requires a previous call to fit
    check_is_fitted(self)

    # dataframe type and same number of columns as the training dataframe
    validated = _is_dataframe(X)
    _check_input_matches_training_df(validated, self.input_shape_[1])

    # the NA check is opt-in through the missing_values setting
    must_check_na = self.missing_values == "raise"
    if must_check_na:
        _check_contains_na(validated, self.variables)

    return validated.drop(columns=self.correlated_features_)
def transform(self, X):
    """
    Apply the transformation to the dataframe.

    Only the selected features will be modified. If transformer is
    OneHotEncoder, dummy features are concatenated to the source dataset.
    Note that the original categorical variables will not be removed from
    the dataset after encoding. If this is the desired effect, please use
    Feature-engine's OneHotCategoricalEncoder instead.

    Parameters
    ----------
    X : pandas dataframe
        The data to transform.

    Returns
    -------
    X : pandas dataframe
        The dataframe with ``self.variables`` replaced by the transformer
        output, or with the dummy columns appended for OneHotEncoder.
    """
    # check that input is a dataframe
    X = _is_dataframe(X)

    # Check that input data contains same number of columns than
    # the dataframe used to fit the imputer.
    _check_input_matches_training_df(X, self.input_shape_[1])

    if isinstance(self.transformer, OneHotEncoder):
        ohe_results_as_df = pd.DataFrame(
            data=self.transformer.transform(X[self.variables]),
            columns=self.transformer.get_feature_names(self.variables),
            # Reuse the index of X: pd.concat(axis=1) aligns rows on the
            # index, so a default RangeIndex here would misalign the dummy
            # rows (and create NaN rows) whenever X has any other index.
            index=X.index,
        )
        X = pd.concat([X, ohe_results_as_df], axis=1)
    else:
        X[self.variables] = self.transformer.transform(X[self.variables])

    return X
def transform(self, X):
    """
    Return only the columns listed in ``selected_features_``.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features].
        The input dataframe.

    Returns
    -------
    X_transformed: pandas dataframe of shape = [n_samples, selected_features]
        Pandas dataframe with the selected features.
    """
    # fit has to run before transform
    check_is_fitted(self)

    # the input must be a dataframe with the training number of columns
    frame = _is_dataframe(X)
    _check_input_matches_training_df(frame, self.input_shape_[1])

    subset = frame[self.selected_features_]
    return subset
def _check_transform_input_and_state(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Verify that the transformer is fitted and that the input is a dataframe
    with the same number of columns as the one used in the fit method.

    Parameters
    ----------
    X: Pandas DataFrame

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame
    ValueError
        If the dataframe is not of same size as that used in fit()

    Returns
    -------
    X: Pandas DataFrame
        The dataframe returned by ``_is_dataframe``.
    """
    # state: fit must have been called
    check_is_fitted(self)

    # input: dataframe type, then column count against n_features_in_
    checked = _is_dataframe(X)
    _check_input_matches_training_df(checked, self.n_features_in_)

    return checked
def transform(self, X: pd.DataFrame):
    """
    Return the dataframe restricted to ``selected_features_``.

    The index of the returned dataframe is reset (the old index is dropped).

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features].
        The input dataframe.

    Returns
    -------
    X_transformed : pandas dataframe
        of shape = [n_samples, n_features - len(dropped features)]
        Pandas dataframe with the selected features.
    """
    # fit has to run before transform
    check_is_fitted(self)

    # validate the type and discard the incoming index in one step
    frame = _is_dataframe(X).reset_index(drop=True)

    # same number of columns as the training dataframe
    _check_input_matches_training_df(frame, self.input_shape_[1])

    return frame[self.selected_features_]
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Apply the transformation to the dataframe.

    Only the selected features will be modified. If transformer is
    OneHotEncoder, dummy features are concatenated to the source dataset.
    Note that the original categorical variables will not be removed from
    the dataset after encoding. If this is the desired effect, please use
    Feature-engine's OneHotEncoder instead.

    If transformer is a feature selector (SelectKBest, SelectPercentile or
    SelectFromModel), the returned dataframe holds the selected variables
    followed by the variables that are not in ``self.variables``.

    Parameters
    ----------
    X : Pandas DataFrame
        The data to transform

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame

    Returns
    -------
    X : Pandas DataFrame
        The transformed dataset.
    """
    # check that input is a dataframe
    X = _is_dataframe(X)

    # Check that input data contains same number of columns than
    # the dataframe used to fit the imputer.
    _check_input_matches_training_df(X, self.input_shape_[1])

    if isinstance(self.transformer, OneHotEncoder):
        ohe_results_as_df = pd.DataFrame(
            data=self.transformer.transform(X[self.variables]),
            columns=self.transformer.get_feature_names(self.variables),
            # Reuse the index of X: pd.concat(axis=1) aligns rows on the
            # index, so a default RangeIndex here would misalign the dummy
            # rows (and create NaN rows) whenever X has any other index.
            index=X.index,
        )
        X = pd.concat([X, ohe_results_as_df], axis=1)

    elif isinstance(
        self.transformer, (SelectKBest, SelectPercentile, SelectFromModel)
    ):
        # get_support returns positions within the columns the selector was
        # given. The other branches apply the transformer to
        # X[self.variables], so the positions are resolved against those
        # columns rather than against all of X.columns.
        # NOTE(review): assumes the selector was fit on X[self.variables] -
        # confirm against fit().
        examined_columns = X[self.variables].columns
        selected_variables = examined_columns[
            self.transformer.get_support(indices=True)
        ]

        # the variables that were not examined, in case there are any
        remaining_variables = [
            var for var in X.columns if var not in self.variables
        ]

        X = X[list(selected_variables) + list(remaining_variables)]

    else:
        X[self.variables] = self.transformer.transform(X[self.variables])

    return X
def _check_transform_input_and_state(self, X):
    """
    Verify that the transformer is fitted and that the input is a dataframe
    with the same number of columns as the dataframe used in fit.

    Parameters
    ----------
    X : pandas dataframe
        The data about to be transformed.

    Returns
    -------
    X : pandas dataframe
        The dataframe returned by ``_is_dataframe``.
    """
    # state: fit must have been called
    check_is_fitted(self)

    # input: dataframe type, then column count against the training shape
    checked = _is_dataframe(X)
    _check_input_matches_training_df(checked, self.input_shape_[1])

    return checked
def transform(self, X):
    """
    Apply the wrapped transformer to ``self.variables``.

    The output of ``self.transformer.transform`` is written back into the
    same columns; the remaining columns are left as they are.
    """
    # the input must be a dataframe with the training number of columns
    X = _is_dataframe(X)
    _check_input_matches_training_df(X, self.input_shape_[1])

    transformed_values = self.transformer.transform(X[self.variables])
    X[self.variables] = transformed_values

    return X
def transform(self, X):
    """
    Run the pre-transform checks and hand back the validated dataframe.

    Checks that fit was called, that the input is a dataframe, that the
    variables contain no NA and that the number of columns matches the
    dataframe used in fit. No values are modified here.

    Parameters
    ----------
    X : pandas dataframe
        The data to check.

    Returns
    -------
    X : pandas dataframe
        The dataframe returned by ``_is_dataframe``.
    """
    # state: fit must have been called
    check_is_fitted(self)

    # input: type, NA in the variables, number of columns
    checked = _is_dataframe(X)
    _check_contains_na(checked, self.variables)
    _check_input_matches_training_df(checked, self.input_shape_[1])

    return checked
def _check_transform_input_and_state(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Verify the transformer state and the input before transforming.

    Checks that fit was called, that the input is a dataframe, that the
    variables contain no NA and that the number of columns matches the
    dataframe used in fit.

    Parameters
    ----------
    X : pandas dataframe
        The data about to be transformed.

    Returns
    -------
    X : pandas dataframe
        The dataframe returned by ``_is_dataframe``.
    """
    # state: fit must have been called
    check_is_fitted(self)

    # input: type, NA in the variables, number of columns
    checked = _is_dataframe(X)
    _check_contains_na(checked, self.variables)
    _check_input_matches_training_df(checked, self.input_shape_[1])

    return checked
def _check_transform_input_and_state(self, X):
    """
    Verify the transformer state and the input before transforming.

    Checks that fit was called, that the input is a dataframe and that the
    number of columns matches the dataframe used in fit. The NA check on the
    variables runs only when ``missing_values`` is "raise".

    Parameters
    ----------
    X : pandas dataframe
        The data about to be transformed.

    Returns
    -------
    X : pandas dataframe
        The dataframe returned by ``_is_dataframe``.
    """
    # state: fit must have been called
    check_is_fitted(self)

    # input must be a dataframe
    checked = _is_dataframe(X)

    # the NA check is opt-in through the missing_values setting
    na_not_allowed = self.missing_values == "raise"
    if na_not_allowed:
        _check_contains_na(checked, self.variables)

    # same number of columns as the dataframe used in fit
    _check_input_matches_training_df(checked, self.input_shape_[1])

    return checked
def transform(self, X):
    """
    Replace categories with the learned parameters.

    Each variable in ``encoder_dict_`` is mapped through its dictionary.
    A warning is issued if the encoded variables contain NaN afterwards.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features].
        The input samples.

    Returns
    -------
    X_transformed : pandas dataframe of shape = [n_samples, n_features].
        The dataframe containing categories replaced by numbers.
    """
    # Check method fit has been called
    check_is_fitted(self)

    # check that input is a dataframe
    X = _is_dataframe(X)

    # check if dataset contains na
    _check_contains_na(X, self.variables)

    # Check that the dataframe contains the same number of columns
    # than the dataframe used to fit the imputer.
    _check_input_matches_training_df(X, self.input_shape_[1])

    # explicit list of the encoded columns; used both to iterate and to
    # index the dataframe instead of passing a dict_keys view
    encoded_variables = list(self.encoder_dict_)

    # replace categories by the learned parameters
    for feature in encoded_variables:
        X[feature] = X[feature].map(self.encoder_dict_[feature])

    # check if NaN values were introduced by the encoding; NA in the input
    # was ruled out above, so any NaN here comes from the mapping
    if X[encoded_variables].isnull().any().any():
        warnings.warn(
            "NaN values were introduced in the returned dataframe by the encoder. "
            "This means that some of the categories in the input dataframe were "
            "not present in the training set used when the fit method was called. "
            "Thus, mappings for those categories does not exist. Try using the "
            "RareLabelCategoricalEncoder to remove infrequent categories before "
            "calling this encoder.")

    return X
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Add one new variable per entry in ``combination_dict_``.

    Each new variable is the row-wise aggregation of
    ``variables_to_combine`` with the operation stored for that entry.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The data to transform.

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame
    ValueError
        - If the variable(s) contain null values when missing_values = raise
        - If the dataframe is not of the same size as that used in fit()

    Returns
    -------
    X: Pandas dataframe, shape = [n_samples, n_features + n_operations]
        The dataframe with the original variables plus the new variables.
    """
    # fit has to run before transform
    check_is_fitted(self)

    # dataframe type and same number of columns as the dataframe used in fit
    X = _is_dataframe(X)
    _check_input_matches_training_df(X, self.n_features_in_)

    # NOTE(review): both checks are kept inside the "raise" branch - confirm
    # that the inf check is meant to be conditional as well.
    if self.missing_values == "raise":
        for check in (_check_contains_na, _check_contains_inf):
            check(X, self.variables_to_combine)

    # one aggregation across columns (axis=1) per requested operation
    for new_variable_name, operation in self.combination_dict_.items():
        source_columns = X[self.variables_to_combine]
        X[new_variable_name] = source_columns.agg(operation, axis=1)

    return X
def transform(self, X):
    """
    Remove the rows whose values fall beyond the stored caps.

    Rows with a value above ``right_tail_caps_[feature]`` or below
    ``left_tail_caps_[feature]`` are dropped, one feature at a time.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The data to be transformed.

    Returns
    -------
    X_transformed : pandas dataframe
        The dataframe without outlier observations.
    """
    # fit must have been called
    check_is_fitted(self)

    # dataframe type, absence of NA in the variables, and column count
    X = _is_dataframe(X)
    _check_contains_na(X, self.variables)
    _check_input_matches_training_df(X, self.input_shape_[1])

    # drop rows above the upper cap of each feature
    for feature, upper_cap in self.right_tail_caps_.items():
        above = np.where(X[feature] > upper_cap, True, False)
        X = X.loc[~above]

    # drop rows below the lower cap of each feature
    for feature, lower_cap in self.left_tail_caps_.items():
        below = np.where(X[feature] < lower_cap, True, False)
        X = X.loc[~below]

    return X
def transform(self, X):
    """
    Cap the variable values at the stored limits.

    Values above ``right_tail_caps_[feature]`` are replaced by that cap and
    values below ``left_tail_caps_[feature]`` are replaced by that cap.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The data to be transformed.

    Returns
    -------
    X_transformed : pandas dataframe of shape = [n_samples, n_features]
        The dataframe with the capped variables.
    """
    # fit must have been called
    check_is_fitted(self)

    # input must be a dataframe
    X = _is_dataframe(X)

    # the NA check is opt-in through the missing_values setting
    na_not_allowed = self.missing_values == 'raise'
    if na_not_allowed:
        _check_contains_na(X, self.variables)

    # same number of columns as the dataframe used in fit
    _check_input_matches_training_df(X, self.input_shape_[1])

    # censor the right tail
    for feature, upper_cap in self.right_tail_caps_.items():
        X[feature] = np.where(X[feature] > upper_cap, upper_cap, X[feature])

    # censor the left tail
    for feature, lower_cap in self.left_tail_caps_.items():
        X[feature] = np.where(X[feature] < lower_cap, lower_cap, X[feature])

    return X
def transform(self, X):
    """
    Create the dummy / binary variables and drop the original variables.

    For every category stored in ``encoder_dict_[feature]`` a column named
    '<feature>_<category>' is added, holding 1 where the feature equals the
    category and 0 otherwise.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The data to transform.

    Returns
    -------
    X_transformed : pandas dataframe.
        The shape of the dataframe will be different from the original as it
        includes the dummy variables.
    """
    # fit must have been called
    check_is_fitted(self)

    # dataframe type, absence of NA in the variables, and column count
    X = _is_dataframe(X)
    _check_contains_na(X, self.variables)
    _check_input_matches_training_df(X, self.input_shape_[1])

    for feature in self.variables:
        prefix = str(feature) + '_'
        for category in self.encoder_dict_[feature]:
            is_category = X[feature] == category
            X[prefix + str(category)] = np.where(is_category, 1, 0)

    # the original, non-encoded variables are removed in place
    X.drop(labels=self.variables, axis=1, inplace=True)

    return X
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Run the pre-transform checks and hand back the validated dataframe.

    Checks that fit was called, that the input is a dataframe with the same
    number of columns as the one used in fit, and that ``variables_``
    contain neither NA nor Inf.

    Parameters
    ----------
    X : Pandas DataFrame

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame
    ValueError
        - If the variable(s) contain null values
        - If the df has different number of features than the df used in fit()

    Returns
    -------
    X : Pandas DataFrame.
        The dataframe returned by ``_is_dataframe``.
    """
    # state: fit must have been called
    check_is_fitted(self)

    # input: dataframe type, then column count against n_features_in_
    checked = _is_dataframe(X)
    _check_input_matches_training_df(checked, self.n_features_in_)

    # content: NA first, then inf, on the fitted variables
    for content_check in (_check_contains_na, _check_contains_inf):
        content_check(checked, self.variables_)

    return checked
def _check_transform_input_and_state(self, X: pd.DataFrame) -> pd.DataFrame:
    """Verify the transformer state and the input before transforming.

    Checks that fit was called, that the input is a dataframe and that it
    has the same number of columns as the one used in the fit method. The
    NA check on the variables runs only when ``missing_values`` is "raise".

    Parameters
    ----------
    X : Pandas DataFrame

    Raises
    ------
    TypeError
        If the input is not a Pandas DataFrame
    ValueError
        If the dataframe is not of same size as that used in fit()

    Returns
    -------
    X : Pandas DataFrame
        The dataframe returned by ``_is_dataframe``.
    """
    # state: fit must have been called
    check_is_fitted(self)

    # input must be a dataframe
    checked = _is_dataframe(X)

    # the NA check is opt-in through the missing_values setting
    na_not_allowed = self.missing_values == "raise"
    if na_not_allowed:
        _check_contains_na(checked, self.variables)

    # same number of columns as the dataframe used in fit
    _check_input_matches_training_df(checked, self.input_shape_[1])

    return checked
def inverse_transform(self, X):
    """
    Convert the data back to the original representation.

    For each variable, the stored mapping in ``encoder_dict_`` is inverted
    (values become keys) and applied to the column.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features].
        The transformed dataframe.

    Returns
    -------
    X : pandas dataframe of shape = [n_samples, n_features].
        The un-transformed dataframe, that is, containing the original
        values of the categorical variables.
    """
    # fit must have been called
    check_is_fitted(self)

    # dataframe type, absence of NA in the variables, and column count
    X = _is_dataframe(X)
    _check_contains_na(X, self.variables)
    _check_input_matches_training_df(X, self.input_shape_[1])

    # swap keys and values of each mapping, then map the column through it
    for feature, mapping in self.encoder_dict_.items():
        inverse_mapping = {
            encoded: original for original, encoded in mapping.items()
        }
        X[feature] = X[feature].map(inverse_mapping)

    return X
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Run the pre-transform checks and hand back the validated DataFrame.

    Checks that fit was called, that the input is a DataFrame, that the
    variables contain no NA and that the number of columns matches the
    DataFrame used in fit. No values are modified here.

    Args:
        X: Pandas DataFrame to apply the transformation

    Returns:
        The DataFrame returned by ``_is_dataframe``.
    """
    # state: fit must have been called
    check_is_fitted(self)

    # input: type, NA in the variables, number of columns
    checked = _is_dataframe(X)
    _check_contains_na(checked, self.variables)
    _check_input_matches_training_df(checked, self.input_shape_[1])

    return checked