def fit(self, X, y, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.

    Raises
    ------
    ValueError
        If X and y differ in length, or if ``handle_missing == 'error'``
        and a column to be encoded contains a null.
    """
    # unite the input into pandas types
    X = util.convert_input(X)
    y = util.convert_input_vector(y, X.index).astype(float)

    if X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.use_default_cols:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        # .any().any() collapses to one bool for any number of columns;
        # Series.bool() only accepts a single-element Series.
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    self.mapping = self.fit_leave_one_out(X, y, cols=self.cols)

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = X_temp.columns.tolist()

    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
        for col in self.drop_cols:
            try:
                self.feature_names.remove(col)
            except ValueError as e:
                # list.remove reports a missing item with ValueError
                if self.verbose > 0:
                    print("Could not remove column from feature names."
                          "Not found in generated cols.\n{}".format(e))

    return self
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Fits an ordinal pre-encoder on the selected columns, stores the result
    of ``generate_mapping()`` as the mapping and records the output
    feature names.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------
    self : encoder
        Returns self.

    Raises
    ------
    ValueError
        If ``handle_missing == 'error'`` and a column to be encoded
        contains a null.
    """
    # first check the type
    X = util.convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        # .any().any() collapses to one bool for any number of columns;
        # Series.bool() only accepts a single-element Series.
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        handle_unknown='value',
        handle_missing='value'
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    self.mapping = self.generate_mapping()

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = list(X_temp.columns)

    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
        for col in self.drop_cols:
            try:
                self.feature_names.remove(col)
            except ValueError as e:
                # list.remove reports a missing item with ValueError
                if self.verbose > 0:
                    print("Could not remove column from feature names."
                          "Not found in generated cols.\n{}".format(e))

    return self
def transform(self, X, y=None, override_return_df=False):
    """Perform the transformation to new categorical data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    y : array-like, shape = [n_samples] when transform by leave one out
        None, when transform without target information (such as
        transform test set)
    override_return_df : bool
        When True a DataFrame is returned even if ``return_df`` is False.

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied.

    Raises
    ------
    ValueError
        If the encoder is not fitted, the input has an unexpected number
        of columns, X and y differ in length, or ``handle_missing ==
        'error'`` and a column to be encoded contains a null.
    """
    # fail fast: self.cols is not usable before fitting
    if self._dim is None:
        raise ValueError('Must train encoder before it can be used to transform data.')

    # unite the input into pandas types
    X = util.convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    # the null check runs on the converted frame so array-like input works
    if self.handle_missing == 'error':
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    # if we are encoding the training data, we have to check the target
    if y is not None:
        y = util.convert_input_vector(y, X.index).astype(float)
        if X.shape[0] != y.shape[0]:
            raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    if not self.cols:
        return X

    X = self.transform_leave_one_out(
        X, y,
        mapping=self.mapping
    )

    if self.drop_invariant:
        for col in self.drop_cols:
            # axis must be passed by keyword on current pandas
            X.drop(col, axis=1, inplace=True)

    if self.return_df or override_return_df:
        return X
    else:
        return X.values
def fit(self, X, y, **kwargs):
    """Fit one copy of the wrapped feature encoder per label class.

    The target is one-hot encoded, the first resulting label column is
    dropped, and a deep copy of ``self.feature_encoder`` is fitted against
    each remaining binary label column. The fitted copies are stored in
    ``self.feature_encoders`` keyed by label column name.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # unite the input into pandas types
    X = utils.convert_input(X)
    y = utils.convert_input(y)
    y.columns = ['target']

    # apply one-hot-encoder on the label
    self.label_encoder = encoders.OneHotEncoder(handle_missing='error', handle_unknown='error', cols=['target'], drop_invariant=True,
                                                use_cat_names=True)
    labels = self.label_encoder.fit_transform(y)
    # presumably strips the 7-character 'target_' prefix added by the
    # one-hot encoder -- confirm against OneHotEncoder's naming
    labels.columns = [column[7:] for column in labels.columns]
    labels = labels.iloc[:, 1:]  # drop one label

    # train the feature encoders
    # DataFrame.items() replaces iteritems(), which newer pandas removed
    for class_name, label in labels.items():
        self.feature_encoders[class_name] = copy.deepcopy(self.feature_encoder).fit(X, label)

    return self
def transform(self, X, override_return_df=False):
    """Perform the transformation to new categorical data.

    Will use the mapping (if available) and the column list (if available,
    otherwise every column) to encode the data ordinarily.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    override_return_df : bool
        When True a DataFrame is returned even if ``return_df`` is False.

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied.

    Raises
    ------
    ValueError
        If the encoder is not fitted, the input has an unexpected number
        of columns, or ``handle_missing == 'error'`` and a column to be
        encoded contains a null.
    """
    # fail fast: self.cols is not usable before fitting
    if self._dim is None:
        raise ValueError(
            'Must train encoder before it can be used to transform data.')

    # first check the type
    X = util.convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    # the null check runs on the converted frame so array-like input works
    if self.handle_missing == 'error':
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    if not self.cols:
        # honour override_return_df here as well as on the main path
        return X if self.return_df or override_return_df else X.values

    X, _ = self.ordinal_encoding(
        X,
        mapping=self.mapping,
        cols=self.cols,
        handle_unknown=self.handle_unknown,
        handle_missing=self.handle_missing
    )

    if self.drop_invariant:
        for col in self.drop_cols:
            # axis must be passed by keyword on current pandas
            X.drop(col, axis=1, inplace=True)

    if self.return_df or override_return_df:
        return X
    else:
        return X.values
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # if the input dataset isn't already a dataframe, convert it to one
    # (using default column names)
    X = util.convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    # train an ordinal pre-encoder
    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown
    )
    # everything below works on rows de-duplicated over the encoded columns
    X = X.drop_duplicates(subset=self.cols) if self.cols else X
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    for col in self.cols:
        self.digits_per_col[col] = self.calc_required_digits(X, col)

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        # transform() may hand back X.values (an ndarray) when return_df is
        # False, which cannot be indexed by column label below; force a
        # DataFrame for this internal pass and restore the setting after.
        saved_return_df = self.return_df
        self.return_df = True
        try:
            X_temp = self.transform(X)
        finally:
            self.return_df = saved_return_df
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]

    return self
def inverse_transform(self, X_in):
    """Perform the inverse transformation to encoded data.

    Parameters
    ----------
    X_in : array-like, shape = [n_samples, n_features]

    Returns
    -------
    p : array, the same size of X_in

    Raises
    ------
    ValueError
        If the encoder is not fitted, the input has an unexpected number
        of columns, or unknown categories were imputed as -1 during
        transform (which cannot be inverted).
    """
    # fail fast before touching the input
    if self._dim is None:
        raise ValueError('Must train encoder before it can be used to inverse_transform data')

    # convert first, then copy: list/ndarray input has no copy(deep=True),
    # and the caller's data must not be modified
    X = util.convert_input(X_in).copy(deep=True)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        if self.drop_invariant:
            raise ValueError("Unexpected input dimension %d, the attribute drop_invariant should "
                             "set as False when transform data" % (X.shape[1],))
        else:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    if not self.cols:
        return X if self.return_df else X.values

    if self.impute_missing and self.handle_unknown == 'impute':
        for col in self.cols:
            if (X[col] == -1).any():
                raise ValueError("inverse_transform is not supported because transform impute "
                                 "the unknown category -1 when encode %s" % (col,))

    for switch in self.mapping:
        col = switch.get('col')
        column_mapping = switch.get('mapping')
        # swap index and values to map codes back to categories;
        # .values replaces Series.get_values(), removed from pandas
        inverse = pd.Series(data=column_mapping.index, index=column_mapping.values)
        X[col] = X[col].map(inverse).astype(switch.get('data_type'))

    return X if self.return_df else X.values
def transform(self, X, override_return_df=False):
    """Perform the transformation to new categorical data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    override_return_df : bool
        When True a DataFrame is returned even if ``return_df`` is False.

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied.

    Raises
    ------
    ValueError
        If the encoder is not fitted, the input has an unexpected number
        of columns, ``handle_missing == 'error'`` and a column to be
        encoded contains a null, or ``handle_unknown == 'error'`` and the
        ordinal step produced -1 in an encoded column.
    """
    # fail fast: self.cols is not usable before fitting
    if self._dim is None:
        raise ValueError('Must train encoder before it can be used to transform data.')

    # first check the type
    X = util.convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    # the null check runs on the converted frame so array-like input works
    if self.handle_missing == 'error':
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    if not self.cols:
        return X

    X_out = self.ordinal_encoder.transform(X)

    if self.handle_unknown == 'error':
        if X_out[self.cols].isin([-1]).any().any():
            raise ValueError('Columns to be encoded can not contain new values')

    X_out = self.basen_encode(X_out, cols=self.cols)

    if self.drop_invariant:
        for col in self.drop_cols:
            # axis must be passed by keyword on current pandas
            X_out.drop(col, axis=1, inplace=True)

    if self.return_df or override_return_df:
        return X_out
    else:
        return X_out.values
def fit(self, X, y=None):
    """Learn the per-column fill values.

    Every column starts with ``self.fill_value``. When ``self.strategy`` is
    "most_frequent", each column's entry is replaced by its most common
    non-null value, if it has one.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    y : ignored

    Returns
    -------
    self
    """
    frame = util.convert_input(X)
    self.columns = frame.columns

    n_columns = len(self.columns)
    self.statistics_ = np.array(n_columns * [self.fill_value], dtype='object')

    if self.strategy != "most_frequent":
        return self

    for position, name in enumerate(frame.columns):
        ranked = Counter(frame[name]).most_common()
        # wrap the hit in a tuple so "nothing found" stays distinguishable
        hit = next(((value,) for value, _ in ranked if not pd.isna(value)), None)
        if hit is not None:
            self.statistics_[position] = hit[0]

    return self
def transform(self, X, y=None):
    """Perform the transformation to new categorical data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    y : array-like, shape = [n_samples] when transform by leave one out
        None, when transform without target info (such as transform test set)

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied.

    Raises
    ------
    ValueError
        If the encoder is not fitted, the input has an unexpected number
        of columns, or X and y differ in length.
    """
    if self._dim is None:
        raise ValueError('Must train encoder before it can be used to transform data.')

    # first check the type
    X = convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    # if we are encoding the training data, we have to check the target
    if y is not None:
        if isinstance(y, pd.DataFrame):
            y = y.iloc[:, 0]
        else:
            y = pd.Series(y, name='target')
        if X.shape[0] != y.shape[0]:
            raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    if not self.cols:
        return X

    X, _ = self.target_encode(
        X, y,
        mapping=self.mapping,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown,
        min_samples_leaf=self.min_samples_leaf,
        smoothing_in=self.smoothing
    )

    if self.drop_invariant:
        for col in self.drop_cols:
            # axis must be passed by keyword on current pandas
            X.drop(col, axis=1, inplace=True)

    if self.return_df:
        return X
    else:
        return X.values
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # if the input dataset isn't already a dataframe, convert it to one
    # (using default column names)
    X = convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = get_obj_cols(X)

    # train an ordinal pre-encoder
    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    for col in self.cols:
        self.digits_per_col[col] = self.calc_required_digits(X, col)

    # do a transform on the training data to get a column list
    X_t = self.transform(X, override_return_df=True)
    self._encoded_columns = X_t.columns.values

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        # request a DataFrame explicitly: with return_df=False a plain
        # transform() result cannot be indexed by column label
        X_temp = self.transform(X, override_return_df=True)
        self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

    return self
def transform(self, X, y=None, override_return_df=False):
    """Perform the transformation to new categorical data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    y : array-like, shape = [n_samples] when transform by leave one out
        None, when transform without target info (such as transform test set)
    override_return_df : bool
        When True a DataFrame is returned even if ``return_df`` is False.

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied.

    Raises
    ------
    ValueError
        If the encoder is not fitted, the input has an unexpected number
        of columns, ``handle_missing == "error"`` and a column to be
        encoded contains a null, or ``handle_unknown == "error"`` and an
        encoded column contains -1.
    """
    # fail fast: self.cols is not usable before fitting
    if self._dim is None:
        raise ValueError(
            "Must train encoder before it can be used to transform data.")

    # unite the input into pandas types
    X = util.convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError("Unexpected input dimension %d, expected %d" % (X.shape[1], self._dim,))

    # the null check runs on the converted frame so array-like input works
    if self.handle_missing == "error":
        if X[self.cols].isnull().any().any():
            raise ValueError("Columns to be encoded can not contain null")

    if not list(self.cols):
        return X

    X = self._transform_cooc_encode(X)

    # TODO:
    if self.handle_unknown == "error":
        if X[self.cols].isin([-1]).any().any():
            raise ValueError("Unexpected categories found in dataframe")

    if self.drop_invariant:
        for col in self.drop_cols:
            # axis must be passed by keyword on current pandas
            X.drop(col, axis=1, inplace=True)

    if self.return_df or override_return_df:
        return X
    else:
        return X.values
def transform(self, X, y=None):
    """Perform the transformation to new categorical data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    y : array-like, shape = [n_samples] when transform by leave one out
        None, when transform without target info (such as transform test set)

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied.

    Raises
    ------
    ValueError
        If the encoder is not fitted, the input has an unexpected number
        of columns, or X and y differ in length.
    """
    if self._dim is None:
        raise ValueError(
            'Must train encoder before it can be used to transform data.')

    # first check the type
    X = convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    # explicit check instead of assert, which is stripped under -O
    if y is not None and X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    if not self.cols:
        return X

    X, _ = self.leave_one_out(
        X, y,
        mapping=self.mapping,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown
    )

    if self.drop_invariant:
        for col in self.drop_cols:
            # axis must be passed by keyword on current pandas
            X.drop(col, axis=1, inplace=True)

    if self.return_df:
        return X
    else:
        return X.values
def inverse_transform(self, X_in):
    """Perform the inverse transformation to encoded data.

    Will attempt best case reconstruction, which means it will return nan
    for handle_missing and handle_unknown settings that break the
    bijection. We issue warnings when some of those cases occur.

    Parameters
    ----------
    X_in : array-like, shape = [n_samples, n_features]

    Returns
    -------
    p : array, the same size of X_in

    Raises
    ------
    ValueError
        If the encoder is not fitted or the input has an unexpected
        number of columns.
    """
    # fail fast
    if self._dim is None:
        raise ValueError('Must train encoder before it can be used to inverse_transform data')

    # first check the type and make deep copy
    X = util.convert_input(X_in, deep=True)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        if self.drop_invariant:
            raise ValueError("Unexpected input dimension %d, the attribute drop_invariant should "
                             "be False when transforming the data" % (X.shape[1],))
        else:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    if not list(self.cols):
        return X if self.return_df else X.values

    if self.handle_unknown == 'value':
        for col in self.cols:
            # vectorised Series.any() instead of iterating in Python
            if (X[col] == -1).any():
                warnings.warn("inverse_transform is not supported because transform impute "
                              "the unknown category -1 when encode %s" % (col,))

    if self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan':
        for col in self.cols:
            if X[col].isnull().any():
                warnings.warn("inverse_transform is not supported because transform impute "
                              "the unknown category nan when encode %s" % (col,))

    for switch in self.mapping:
        col = switch.get('col')
        column_mapping = switch.get('mapping')
        # swap index and values to map codes back to categories
        inverse = pd.Series(data=column_mapping.index, index=column_mapping.values)
        X[col] = X[col].map(inverse).astype(switch.get('data_type'))

    return X if self.return_df else X.values
def fit(self, X, y, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Used as passed (no conversion is applied here), so
        it must expose ``.shape``.

    Returns
    -------
    self : encoder
        Returns self.

    Raises
    ------
    ValueError
        If X and y differ in length, or if ``handle_missing == "error"``
        and a column to be encoded contains a null.
    """
    # unite the input into pandas types
    X = util.convert_input(X)
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            "The length of X is {} but length of y is {}.".format(
                X.shape[0], y.shape[0]))

    self._dim = X.shape[1]

    if self.handle_missing == "error":
        if X[self.cols].isnull().any().any():
            raise ValueError("Columns to be encoded can not contain null")

    # NOTE(review): self is passed explicitly to a bound call and the
    # target is replaced by None; this looks unintended, but the helper's
    # signature is not visible here -- confirm before changing.
    self._fit_cooc_encode(self, X, y=None)

    # TODO: after transform
    if self.drop_invariant:
        self.drop_cols = []
        # transform() may hand back X.values (an ndarray) when return_df is
        # False, which cannot be indexed by column label below; force a
        # DataFrame for this internal pass and restore the setting after.
        saved_return_df = self.return_df
        self.return_df = True
        try:
            X_temp = self.transform(X)
        finally:
            self.return_df = saved_return_df
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
        for col in self.drop_cols:
            try:
                self.feature_names.remove(col)
            except ValueError as e:
                # list.remove reports a missing item with ValueError
                if self.verbose > 0:
                    print("Could not remove column from feature names."
                          "Not found in generated cols.\n{}".format(e))

    return self
def transform(self, X, override_return_df=False):
    """Perform the transformation to new categorical data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    override_return_df : bool
        When True a DataFrame is returned even if ``return_df`` is False.

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied.

    Raises
    ------
    ValueError
        If the encoder is not fitted or the input has an unexpected
        number of columns.
    """
    if self._dim is None:
        raise ValueError(
            'Must train encoder before it can be used to transform data.')

    # first check the type
    X = convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    if not self.cols:
        return X

    original_cols = set(X.columns)
    X = self.ordinal_encoder.transform(X)
    X = self.basen_encode(X, cols=self.cols)

    if self.drop_invariant:
        for col in self.drop_cols:
            # axis must be passed by keyword on current pandas
            X.drop(col, axis=1, inplace=True)

    # impute missing values only in the generated columns, i.e. every
    # current column that is not an untouched input column
    current_cols = set(X.columns)
    fillna_cols = list(current_cols - (original_cols - set(self.cols)))
    X[fillna_cols] = X[fillna_cols].fillna(value=0.0)

    if self.return_df or override_return_df:
        return X
    else:
        return X.values
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Forwarded to ``_fit_count_encode``.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # first check the type
    X = util.convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    self._check_set_create_dict_attrs()

    self._fit_count_encode(X, y)

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = list(X_temp.columns)

    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
        for col in self.drop_cols:
            try:
                self.feature_names.remove(col)
            except ValueError as e:
                # list.remove reports a missing item with ValueError
                if self.verbose > 0:
                    print("Could not remove column from feature names."
                          "Not found in generated cols.\n{}".format(e))

    return self
def fit(self, X, y, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. For a DataFrame the first column is used.

    Returns
    -------
    self : encoder
        Returns self.

    Raises
    ------
    ValueError
        If X and y differ in length.
    """
    # first check the type
    X = convert_input(X)
    if isinstance(y, pd.DataFrame):
        y = y.iloc[:, 0]
    else:
        y = pd.Series(y, name='target')
    if X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = get_obj_cols(X)

    _, categories = self.target_encode(
        X, y,
        mapping=self.mapping,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown,
        smoothing_in=self.smoothing,
        min_samples_leaf=self.min_samples_leaf
    )
    self.mapping = categories

    if self.drop_invariant:
        self.drop_cols = []
        # transform() may hand back X.values (an ndarray) when return_df is
        # False, which cannot be indexed by column label below; force a
        # DataFrame for this internal pass and restore the setting after.
        saved_return_df = self.return_df
        self.return_df = True
        try:
            X_temp = self.transform(X)
        finally:
            self.return_df = saved_return_df
        generated_cols = get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]

    return self
def fit(self, X, y, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values, cast to float. For a DataFrame the first column
        is used.

    Returns
    -------
    self : encoder
        Returns self.

    Raises
    ------
    ValueError
        If X and y differ in length.
    """
    # first check the type
    X = util.convert_input(X)
    if isinstance(y, pd.DataFrame):
        y = y.iloc[:, 0].astype(float)
    else:
        y = pd.Series(y, name='target', dtype=float)
    if X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.use_default_cols:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    self.mapping = self.fit_leave_one_out(X, y, cols=self.cols)

    if self.drop_invariant:
        self.drop_cols = []
        # transform() may hand back X.values (an ndarray) when return_df is
        # False, which cannot be indexed by column label below; force a
        # DataFrame for this internal pass and restore the setting after.
        saved_return_df = self.return_df
        self.return_df = True
        try:
            X_temp = self.transform(X)
        finally:
            self.return_df = saved_return_df
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]

    return self
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # first check the type
    X = util.convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    _, categories = self.ordinal_encoding(
        X,
        mapping=self.mapping,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown
    )
    self.mapping = categories

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        # transform() may hand back X.values (an ndarray) when return_df is
        # False, which cannot be indexed by column label below; force a
        # DataFrame for this internal pass and restore the setting after.
        saved_return_df = self.return_df
        self.return_df = True
        try:
            X_temp = self.transform(X)
        finally:
            self.return_df = saved_return_df
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]

    return self
def transform(self, X):
    """Perform the transformation to new categorical data.

    Will use the mapping (if available) and the column list (if available,
    otherwise every column) to encode the data ordinally.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied.

    Raises
    ------
    ValueError
        If the encoder is not fitted or the input has an unexpected
        number of columns.
    """
    if self._dim is None:
        raise ValueError(
            'Must train encoder before it can be used to transform data.')

    # first check the type
    X = convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    if not self.cols:
        return X

    X, _ = self.ordinal_encoding(
        X,
        mapping=self.mapping,
        cols=self.cols,
        impute_missing=self.impute_missing
    )

    if self.drop_invariant:
        for col in self.drop_cols:
            # axis must be passed by keyword on current pandas
            X.drop(col, axis=1, inplace=True)

    if self.return_df:
        return X
    else:
        return X.values
def transform(self, X, y=None):
    """Perform the transformation to new categorical data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    y : array-like, shape = [n_samples] when transform by leave one out
        None, when transform without target info (such as transform test set)

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied.

    Raises
    ------
    ValueError
        If the encoder is not fitted, the input has an unexpected number
        of columns, or X and y differ in length.
    """
    if self._dim is None:
        raise ValueError('Must train encoder before it can be used to transform data.')

    # first check the type
    X = convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    # explicit check instead of assert, which is stripped under -O
    if y is not None and X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    if not self.cols:
        return X

    X, _ = self.leave_one_out(
        X, y,
        mapping=self.mapping,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown
    )

    if self.drop_invariant:
        for col in self.drop_cols:
            # axis must be passed by keyword on current pandas
            X.drop(col, axis=1, inplace=True)

    if self.return_df:
        return X
    else:
        return X.values
def fit(self, X, y=None, **kwargs):
    """Fits an ordinal encoder to produce a consistent mapping across
    applications and optionally finds generally invariant columns to drop
    consistently.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # if the input dataset isn't already a dataframe, convert it to one
    # (using default column names)
    X = convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = get_obj_cols(X)

    # train an ordinal pre-encoder
    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        # transform() may hand back X.values (an ndarray) when return_df is
        # False, which cannot be indexed by column label below; force a
        # DataFrame for this internal pass and restore the setting after.
        saved_return_df = self.return_df
        self.return_df = True
        try:
            X_temp = self.transform(X)
        finally:
            self.return_df = saved_return_df
        generated_cols = get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]

    return self
def fit_transform(self, X, y=None, **fit_params):
    """Fit one feature encoder per label class and return the encoded data.

    When we are training the feature encoders, we have to use the
    fit_transform() method on the features. The target is one-hot encoded,
    the first label column is dropped, and for each remaining label a deep
    copy of ``self.feature_encoder`` is fit-transformed on X. The encoded
    feature columns are suffixed with ``'_' + class_name``.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    result : DataFrame
        The columns that were not encoded (taken from the last encoder's
        output) followed by the encoded columns of every label class.
    """
    # unite the input into pandas types
    X = utils.convert_input(X)
    y = utils.convert_input(y)
    y.columns = ['target']

    # apply one-hot-encoder on the label
    self.label_encoder = encoders.OneHotEncoder(handle_missing='error', handle_unknown='error', cols=['target'], drop_invariant=True,
                                                use_cat_names=True)
    labels = self.label_encoder.fit_transform(y)
    # presumably strips the 7-character 'target_' prefix added by the
    # one-hot encoder -- confirm against OneHotEncoder's naming
    labels.columns = [column[7:] for column in labels.columns]
    labels = labels.iloc[:, 1:]  # drop one label

    # initialization of the feature encoders
    encoded = None
    feature_encoder = None
    per_class_features = []

    # fit_transform the feature encoders
    # DataFrame.items() replaces iteritems(), which newer pandas removed
    for class_name, label in labels.items():
        feature_encoder = copy.deepcopy(self.feature_encoder)
        encoded = feature_encoder.fit_transform(X, label)

        # decorate the encoded features with the label class suffix
        new_features = encoded[feature_encoder.cols]
        new_features.columns = [str(column) + '_' + class_name for column in new_features.columns]

        per_class_features.append(new_features)
        self.feature_encoders[class_name] = feature_encoder

    # add features that were not encoded; concatenate once instead of
    # growing an initially empty DataFrame inside the loop
    not_encoded = encoded[encoded.columns[~encoded.columns.isin(feature_encoder.cols)]]
    result = pd.concat([not_encoded] + per_class_features, axis=1)

    return result
def transform(self, X):
    """Perform the transformation to new categorical data.

    Will use the mapping (if available) and the column list (if available,
    otherwise every column) to encode the data ordinally.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied.

    Raises
    ------
    ValueError
        If the encoder is not fitted or the input has an unexpected
        number of columns.
    """
    if self._dim is None:
        raise ValueError('Must train encoder before it can be used to transform data.')

    # first check the type
    X = convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim, ))

    if not self.cols:
        return X

    X, _ = self.ordinal_encoding(
        X,
        mapping=self.mapping,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown
    )

    if self.drop_invariant:
        for col in self.drop_cols:
            # axis must be passed by keyword on current pandas
            X.drop(col, axis=1, inplace=True)

    if self.return_df:
        return X
    else:
        return X.values
def inverse_transform(self, X_in):
    """Perform the inverse transformation to encoded data.

    Parameters
    ----------
    X_in : array-like, shape = [n_samples, n_features]

    Returns
    -------
    p : array, the same size of X_in

    Raises
    ------
    ValueError
        If the encoder is not fitted, or the data has an unexpected number
        of columns after the dummy columns have been reversed.
    """
    # fail fast before touching the input
    if self._dim is None:
        raise ValueError(
            'Must train encoder before it can be used to inverse_transform data')

    # convert first, then copy: list/ndarray input has no copy(deep=True),
    # and the caller's data must not be modified
    X = util.convert_input(X_in).copy(deep=True)

    X = self.reverse_dummies(X, self.mapping)

    # then make sure that it is the right size (checked after the dummy
    # columns have been reversed)
    if X.shape[1] != self._dim:
        if self.drop_invariant:
            raise ValueError("Unexpected input dimension %d, the attribute drop_invariant should "
                             "set as False when transform data" % (X.shape[1],))
        else:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    if not self.cols:
        return X if self.return_df else X.values

    warn_on_nan = self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan'

    for switch in self.ordinal_encoder.mapping:
        col = switch.get('col')
        column_mapping = switch.get('mapping')
        # swap index and values to map codes back to categories;
        # .values replaces Series.get_values(), removed from pandas
        inverse = pd.Series(data=column_mapping.index, index=column_mapping.values)
        X[col] = X[col].map(inverse).astype(switch.get('data_type'))

        # warn about the column that actually contains nulls, naming it
        if warn_on_nan and X[col].isnull().any():
            warnings.warn("inverse_transform is not supported because transform impute "
                          "the unknown category nan when encode %s" % (col,))

    return X if self.return_df else X.values
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # if the input dataset isn't already a dataframe, convert it to one
    # (using default column names)
    X = convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = get_obj_cols(X)

    # train an ordinal pre-encoder
    self.ordinal_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    # do a transform on the training data to get a column list
    X_t = self.transform(X, override_return_df=True)
    self._encoded_columns = X_t.columns.values

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        # request a DataFrame explicitly: with return_df=False a plain
        # transform() result cannot be indexed by column label
        X_temp = self.transform(X, override_return_df=True)
        self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]

    return self
def fit(self, X, y, **kwargs):
    """Fit the encoder on X using the target y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # normalise both inputs before anything else
    X = convert_input(X)
    y = pd.Series(y, name='target')

    n_samples = X.shape[0]
    assert n_samples == y.shape[0]

    self._dim = X.shape[1]

    # without an explicit column list, fall back to get_obj_cols
    if self.cols is None:
        self.cols = get_obj_cols(X)

    self.random_state_ = check_random_state(self.random_state)

    fit_result = self.leave_one_out(
        X, y,
        mapping=self.mapping,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown,
    )
    _, learned = fit_result
    self.mapping = learned

    if self.drop_invariant:
        # reset first so the transform below keeps every column
        self.drop_cols = []
        encoded = self.transform(X)
        low_variance = []
        for name in encoded.columns.values:
            if encoded[name].var() <= 10e-5:
                low_variance.append(name)
        self.drop_cols = low_variance

    return self
def fit(self, X, y, **kwargs):
    """Fit the encoder on X using the target y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # bring the inputs into pandas form
    X = convert_input(X)
    y = pd.Series(y, name='target')
    assert y.shape[0] == X.shape[0]

    self._dim = X.shape[1]

    # no explicit columns given: let get_obj_cols choose them
    if self.cols is None:
        self.cols = get_obj_cols(X)

    self.random_state_ = check_random_state(self.random_state)

    options = dict(
        mapping=self.mapping,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown,
    )
    _, fitted_mapping = self.leave_one_out(X, y, **options)
    self.mapping = fitted_mapping

    if not self.drop_invariant:
        return self

    # record the encoded columns whose variance is (almost) zero
    self.drop_cols = []
    transformed = self.transform(X)
    self.drop_cols = list(
        filter(lambda name: transformed[name].var() <= 10e-5, transformed.columns.values)
    )
    return self
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # first check the type
    X = util.convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = X_temp.columns.tolist()

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
        try:
            for x in self.drop_cols:
                self.feature_names.remove(x)
        # list.remove signals a missing item with ValueError; KeyError is kept
        # so anything that was caught before is still caught
        except (KeyError, ValueError) as e:
            if self.verbose > 0:
                print("Could not remove column from feature names."
                      "Not found in generated cols.\n{}".format(e))

    return self
def fit(self, X, y=None, categorical_feature=None, numerical_feature=None, **kwargs):
    """Resolve the feature lists, record per-column missing rates and drop sparse columns.

    Parameters
    ----------
    X : array-like
        Input data; passed through ``util.convert_input`` first.
    y : ignored
    categorical_feature : optional
        When given, replaces ``self.categorical_feature``.
    numerical_feature : optional
        When given, replaces ``self.numerical_feature``.

    Returns
    -------
    X
        The converted input with every column whose missing rate is
        ``>= self.missing_rate`` removed. Note that this method returns the
        data, not ``self``.
    """
    X = util.convert_input(X)

    # explicit arguments take precedence over what is stored on the instance
    if categorical_feature is not None:
        self.categorical_feature = categorical_feature
    if numerical_feature is not None:
        self.numerical_feature = numerical_feature

    # automatically find the categorical features
    nothing_declared = self.categorical_feature is None and self.numerical_feature is None
    if nothing_declared:
        self.logger.warning(
            f"You didn't declare numerical_feature or categorical_feature in {self.__class__.__name__}, "
            f"program will auto find these by dtypes.")
        categorical_dtypes = ["object", "category"]
        self.categorical_feature = X.select_dtypes(include=categorical_dtypes).columns
        self.numerical_feature = X.select_dtypes(exclude=categorical_dtypes).columns
    else:
        if self.categorical_feature is None:
            self.categorical_feature = (
                X.columns.difference(self.numerical_feature)
                if self.numerical_feature is not None
                else np.array([])
            )
        # NOTE(review): this tests the *argument*, not self.numerical_feature,
        # unlike the categorical branch above -- confirm this is intended
        if numerical_feature is None:
            self.numerical_feature = (
                X.columns.difference(self.categorical_feature)
                if self.categorical_feature is not None
                else np.array([])
            )

    # todo: compute the missing rate of every column and drop those where it is too high
    n_rows = X.shape[0]
    missing_rates = np.count_nonzero(pd.isna(X), axis=0) / n_rows
    self.missing_rates = missing_rates.tolist()

    self.drop_mask = drop_mask = missing_rates >= self.missing_rate
    drop_columns = X.columns[drop_mask]
    self.drop_columns = drop_columns.tolist()

    if len(drop_columns):
        self.numerical_feature = np.setdiff1d(self.numerical_feature, drop_columns)
        self.categorical_feature = np.setdiff1d(self.categorical_feature, drop_columns)
        X = X.drop(drop_columns, axis=1)

    return X
def inverse_transform(self, X_in):
    """
    Perform the inverse transformation to encoded data.

    Parameters
    ----------
    X_in : array-like, shape = [n_samples, n_features]

    Returns
    -------
    p: array, the same size of X_in

    Raises
    ------
    ValueError
        If the encoder has not been fitted, or if the decoded data does not
        have the number of columns recorded in ``self._dim``.
    """
    # fail fast
    if self._dim is None:
        raise ValueError('Must train encoder before it can be used to inverse_transform data')

    # first check the type and make deep copy
    X = util.convert_input(X_in, columns=self.feature_names, deep=True)

    X = self.reverse_dummies(X, self.mapping)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        if self.drop_invariant:
            raise ValueError("Unexpected input dimension %d, the attribute drop_invariant should "
                             "be False when transforming the data" % (X.shape[1],))
        else:
            raise ValueError('Unexpected input dimension %d, expected %d' % (
                X.shape[1], self._dim, ))

    if not self.cols:
        return X if self.return_df else X.values

    warn_on_nan = self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan'

    for switch in self.ordinal_encoder.mapping:
        col = switch.get('col')
        column_mapping = switch.get('mapping')
        # swap index and values of the fitted mapping; `.values` replaces
        # `Series.get_values()`, which no longer exists in pandas >= 1.0
        inverse = pd.Series(data=column_mapping.index, index=column_mapping.values)
        X[col] = X[col].map(inverse).astype(switch.get('data_type'))

        # warn about the column that actually contains nulls (the previous code
        # tested one column but reported every name in self.cols)
        if warn_on_nan and X[col].isnull().any():
            warnings.warn("inverse_transform is not supported because transform impute "
                          "the unknown category nan when encode %s" % (col,))

    return X if self.return_df else X.values
def inverse_transform(self, Xt):
    """
    Map encoded data back towards its original categories.

    A warning is always emitted first because this feature is flagged as
    experimental.

    Parameters
    ----------
    Xt : array-like, shape = [n_samples, n_features]

    Returns
    -------
    p: array, the same size of Xt
    """
    warnings.warn('Inverse transform in basen is a currently experimental feature, please be careful')

    # copy, then normalise the type
    X = convert_input(Xt.copy(deep=True))

    if self._dim is None:
        raise ValueError('Must train encoder before it can be used to inverse_transform data')

    X = self.basen_to_interger(X, self.cols, self.base)

    # the decoded frame must be as wide as the data seen during fit
    n_columns = X.shape[1]
    if n_columns != self._dim:
        if self.drop_invariant:
            raise ValueError("Unexpected input dimension %d, the attribute drop_invariant should "
                             "set as False when transform data" % (X.shape[1],))
        raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    if not self.cols:
        return X if self.return_df else X.values

    if self.impute_missing and self.handle_unknown == 'impute':
        # a -1 cannot be traced back to a category, so refuse to decode
        for col in self.cols:
            if any(X[col] == -1):
                raise ValueError("inverse_transform is not supported because transform impute "
                                 "the unknown category -1 when encode %s" % (col,))

    for switch in self.ordinal_encoder.mapping:
        name = switch.get('col')
        # reverse every pair of the fitted mapping: element 1 -> element 0
        lookup = dict((pair[1], pair[0]) for pair in switch.get('mapping'))
        X[name] = X[name].apply(lambda code: lookup.get(code))

    if self.return_df:
        return X
    return X.values
def transform(self, X):
    """Perform the transformation to new categorical data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied.

    Raises
    ------
    ValueError
        If the encoder has not been fitted, or if X does not have the number
        of columns recorded in ``self._dim``.
    """
    if self._dim is None:
        raise ValueError(
            'Must train encoder before it can be used to transform data.')

    # first check the type
    X = convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (
            X.shape[1], self._dim, ))

    if not self.cols:
        return X

    X = self.hashing_trick(X, hashing_method=self.hash_method, N=self.n_components, cols=self.cols)

    if self.drop_invariant:
        for col in self.drop_cols:
            # `axis` must be passed by keyword: pandas >= 2.0 rejects it positionally
            X.drop(col, axis=1, inplace=True)

    if self.return_df:
        return X
    else:
        return X.values
def transform(self, X, override_return_df=False):
    """Perform the transformation to new categorical data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    override_return_df : bool, default False
        When True a DataFrame is returned regardless of ``self.return_df``.

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied.

    Raises
    ------
    ValueError
        If the encoder has not been fitted, or if X does not have the number
        of columns recorded in ``self._dim``.
    """
    if self._dim is None:
        raise ValueError(
            'Must train encoder before it can be used to transform data.')

    # first check the type
    X = util.convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (
            X.shape[1], self._dim, ))

    if not self.cols:
        # honour override_return_df here as well, so a caller that asked for
        # a DataFrame gets one even when there is nothing to encode
        return X if (self.return_df or override_return_df) else X.values

    X = self.ordinal_encoder.transform(X)

    X = self.get_dummies(X, mapping=self.mapping)

    if self.drop_invariant:
        for col in self.drop_cols:
            # `axis` must be passed by keyword: pandas >= 2.0 rejects it positionally
            X.drop(col, axis=1, inplace=True)

    if self.return_df or override_return_df:
        return X
    else:
        return X.values
def inverse_transform(self, Xt):
    """
    Map encoded data back towards its original categories.

    Parameters
    ----------
    Xt : array-like, shape = [n_samples, n_features]

    Returns
    -------
    p: array, the same size of Xt
    """
    # copy, then normalise the type
    X = convert_input(Xt.copy(deep=True))

    if self._dim is None:
        raise ValueError('Must train encoder before it can be used to inverse_transform data')

    X = self.reverse_dummies(X, self.cols)

    # the decoded frame must be as wide as the data seen during fit
    if X.shape[1] != self._dim:
        if self.drop_invariant:
            raise ValueError("Unexpected input dimension %d, the attribute drop_invariant should "
                             "set as False when transform data" % (X.shape[1],))
        raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    if not self.cols:
        return X if self.return_df else X.values

    if self.impute_missing and self.handle_unknown == 'impute':
        # NOTE(review): the test is against 0 while the message mentions -1 -- confirm
        for col in self.cols:
            if any(X[col] == 0):
                raise ValueError("inverse_transform is not supported because transform impute "
                                 "the unknown category -1 when encode %s" % (col,))

    if not self.use_cat_names:
        for switch in self.ordinal_encoder.mapping:
            name = switch.get('col')
            # reverse every pair of the fitted mapping: element 1 -> element 0
            lookup = dict((pair[1], pair[0]) for pair in switch.get('mapping'))
            X[name] = X[name].apply(lambda code: lookup.get(code))

    if self.return_df:
        return X
    return X.values
def fit(self, X, y=None, **kwargs):
    """Fit the encoder on X.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # normalise the input type
    X = convert_input(X)
    self._dim = X.shape[1]

    # no explicit columns given: let get_obj_cols choose them
    if self.cols is None:
        self.cols = get_obj_cols(X)

    options = dict(
        mapping=self.mapping,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown,
    )
    _, learned = self.ordinal_encoding(X, **options)
    self.mapping = learned

    # remember which output columns have (almost) zero variance
    if self.drop_invariant:
        self.drop_cols = []
        encoded = self.transform(X)
        invariant = []
        for name in encoded.columns.values:
            if encoded[name].var() <= 10e-5:
                invariant.append(name)
        self.drop_cols = invariant

    return self
def fit(self, X, y, **kwargs):
    """Fit the encoder on X using the target y.

    Parameters
    ----------
    X : array-like
        Training data; passed through ``convert_input``.
    y : array-like
        Target values; must have as many entries as X has rows.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # normalise both inputs
    X = convert_input(X)
    y = pd.Series(y, name='target')
    assert y.shape[0] == X.shape[0]

    options = dict(
        mapping=self.mapping,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown,
    )
    # only the second element of the result is kept
    _, learned = self.target_encode(X, y, **options)
    self.mapping = learned

    self._dim = X.shape[1]
    return self
def transform(self, X, y=None, override_return_df=False):
    """Perform the transformation to new categorical data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    y : array-like, shape = [n_samples]
    override_return_df : bool, default False
        When True a DataFrame is returned regardless of ``self.return_df``.

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied.

    Raises
    ------
    ValueError
        If the encoder has not been fitted, if X does not have the number of
        columns recorded in ``self._dim``, or if ``handle_missing == 'error'``
        and a column to be encoded contains nulls.
    """
    if self._dim is None:
        raise ValueError(
            'Must train encoder before it can be used to transform data.')

    # first check the type
    X = util.convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (
            X.shape[1], self._dim, ))

    # the null check runs on the converted frame, so array-like input is
    # supported and an unfitted encoder is reported as such above
    if self.handle_missing == 'error':
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    if not list(self.cols):
        return X

    X, _ = self._transform_count_encode(X, y)

    if self.drop_invariant:
        for col in self.drop_cols:
            # `axis` must be passed by keyword: pandas >= 2.0 rejects it positionally
            X.drop(col, axis=1, inplace=True)

    if self.return_df or override_return_df:
        return X
    else:
        return X.values
def fit(self, X, y=None, **kwargs):
    """Fit the encoder on X.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # normalise the input type
    X = convert_input(X)
    self._dim = X.shape[1]

    # no explicit columns given: let get_obj_cols choose them
    if self.cols is None:
        self.cols = get_obj_cols(X)

    # the ordinal pre-encoder shares this encoder's missing/unknown settings
    pre_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        impute_missing=self.impute_missing,
        handle_unknown=self.handle_unknown,
    )
    self.ordinal_encoder = pre_encoder
    self.ordinal_encoder = pre_encoder.fit(X)

    if not self.drop_invariant:
        return self

    # remember which output columns have (almost) zero variance
    self.drop_cols = []
    encoded = self.transform(X)
    self.drop_cols = list(
        filter(lambda name: encoded[name].var() <= 10e-5, encoded.columns.values)
    )
    return self
def fit(self, X, y=None, **kwargs):
    """Record, per column, the values frequent enough to be kept.

    For every column of X, a value is kept when its share of the rows is at
    least ``self.minimum_fraction``. The result is stored in
    ``self.do_not_replace_by_other_`` as one list per column, in column order.

    Parameters
    ----------
    X : array-like
        Input data; passed through ``convert_input``.
    y : ignored

    Returns
    -------
    self
    """
    X = convert_input(X)
    # todo drop var = 0
    n_rows = X.shape[0]
    kept_per_column = []

    # walk over every column
    for column in X.columns:
        series = X[column]
        occurrences = Counter(series)

        # categorical columns contribute their declared categories,
        # everything else its distinct values
        if series.dtype.name == "category":
            candidates = list(series.cat.categories)
        else:
            candidates = list(set(series))

        kept_per_column.append([
            value for value in candidates
            if float(occurrences[value]) / n_rows >= self.minimum_fraction
        ])

    self.do_not_replace_by_other_ = kept_per_column
    return self
def fit(self, X, y=None, **kwargs):
    """Fit an ordinal pre-encoder on X and optionally record invariant columns.

    Fitting produces a consistent mapping across applications; when
    ``self.drop_invariant`` is set, the output columns with (almost) zero
    variance are remembered so they can be dropped consistently.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # normalise the input type (non-DataFrame input gets default column names)
    X = convert_input(X)
    self._dim = X.shape[1]

    # no explicit columns given: let get_obj_cols choose them
    if self.cols is None:
        self.cols = get_obj_cols(X)

    # ordinal pre-encoding step
    pre_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
    self.ordinal_encoder = pre_encoder
    self.ordinal_encoder = pre_encoder.fit(X)

    if self.drop_invariant:
        self.drop_cols = []
        encoded = self.transform(X)
        invariant = []
        for name in encoded.columns.values:
            if encoded[name].var() <= 10e-5:
                invariant.append(name)
        self.drop_cols = invariant

    return self
def transform(self, X, override_return_df=False):
    """Perform the transformation to new categorical data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    override_return_df : bool, default False
        When True a DataFrame is returned regardless of ``self.return_df``.

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied.

    Raises
    ------
    ValueError
        If the encoder has not been fitted, or if X does not have the number
        of columns recorded in ``self._dim``.
    """
    if self._dim is None:
        raise ValueError('Must train encoder before it can be used to transform data.')

    # first check the type
    X = convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    if not self.cols:
        return X

    X = self.ordinal_encoder.transform(X)

    X = self.basen_encode(X, cols=self.cols)

    if self.drop_invariant:
        for col in self.drop_cols:
            # `axis` must be passed by keyword: pandas >= 2.0 rejects it positionally
            X.drop(col, axis=1, inplace=True)

    # any remaining nulls are replaced by 0.0
    X.fillna(0.0, inplace=True)

    if self.return_df or override_return_df:
        return X
    else:
        return X.values
def fit(self, X, y=None, **kwargs):
    """Fit the encoder on X.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # normalise the input type
    X = convert_input(X)
    self._dim = X.shape[1]

    # no explicit columns given: let get_obj_cols choose them
    if self.cols is None:
        self.cols = get_obj_cols(X)

    # ordinal pre-encoding step
    pre_encoder = OrdinalEncoder(verbose=self.verbose, cols=self.cols)
    self.ordinal_encoder = pre_encoder
    self.ordinal_encoder = pre_encoder.fit(X)

    if not self.drop_invariant:
        return self

    # remember which output columns have (almost) zero variance
    self.drop_cols = []
    encoded = self.transform(X)
    self.drop_cols = list(
        filter(lambda name: encoded[name].var() <= 10e-5, encoded.columns.values)
    )
    return self
def fit(self, X, y=None, **kwargs):
    """Fit the encoder on X.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------
    self : encoder
        Returns self.
    """
    # normalise the input type (non-DataFrame input gets default column names)
    X = convert_input(X)
    self._dim = X.shape[1]

    # no explicit columns given: let get_obj_cols choose them
    if self.cols is None:
        self.cols = get_obj_cols(X)

    if not self.drop_invariant:
        return self

    # remember which output columns have (almost) zero variance
    self.drop_cols = []
    encoded = self.transform(X)
    invariant = []
    for name in encoded.columns.values:
        if encoded[name].var() <= 10e-5:
            invariant.append(name)
    self.drop_cols = invariant
    return self
def fit(self, X, y=None, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values. Not used by this method.

    Returns
    -------
    self : encoder
        Returns self.

    Raises
    ------
    ValueError
        If ``handle_missing == 'error'`` and a column to be encoded contains nulls.
    """
    # if the input dataset isn't already a dataframe, convert it to one (using default column names)
    X = util.convert_input(X)

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        # reduce over columns with a second .any(): Series.bool() only accepts
        # a single element and raised as soon as several columns were encoded
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    # train an ordinal pre-encoder
    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        handle_unknown='value',
        handle_missing='value'
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)

    ordinal_mapping = self.ordinal_encoder.category_mapping

    # build one coding entry per encoded column
    mappings_out = []
    for switch in ordinal_mapping:
        values = switch.get('mapping')
        col = switch.get('col')

        column_mapping = self.fit_sum_coding(col, values, self.handle_missing, self.handle_unknown)
        mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, })

    self.mapping = mappings_out

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = X_temp.columns.tolist()

    # drop all output columns with 0 variance.
    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
        try:
            for x in self.drop_cols:
                self.feature_names.remove(x)
        # list.remove signals a missing item with ValueError; KeyError is kept
        # so anything that was caught before is still caught
        except (KeyError, ValueError) as e:
            if self.verbose > 0:
                print("Could not remove column from feature names."
                      "Not found in generated cols.\n{}".format(e))

    return self
def fit(self, X, y, **kwargs):
    """Fit encoder according to X and binary y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Binary target values.

    Returns
    -------
    self : encoder
        Returns self.

    Raises
    ------
    ValueError
        If X and y differ in length, or if ``handle_missing == 'error'`` and a
        column to be encoded contains nulls.
    """
    # Unite parameters into pandas types
    X = util.convert_input(X)
    y = util.convert_input_vector(y, X.index).astype(float)

    # The lengths must be equal
    if X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    self._dim = X.shape[1]

    # If columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        # reduce over columns with a second .any(): Series.bool() only accepts
        # a single element and raised as soon as several columns were encoded
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        handle_unknown='value',
        handle_missing='value'
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)
    X_ordinal = self.ordinal_encoder.transform(X)

    # Training
    self.mapping = self._train(X_ordinal, y)

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = X_temp.columns.tolist()

    # Store column names with approximately constant variance on the training data
    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
        try:
            for x in self.drop_cols:
                self.feature_names.remove(x)
        # list.remove signals a missing item with ValueError; KeyError is kept
        # so anything that was caught before is still caught
        except (KeyError, ValueError) as e:
            if self.verbose > 0:
                print("Could not remove column from feature names."
                      "Not found in generated cols.\n{}".format(e))

    return self
def transform(self, X, y=None, override_return_df=False):
    """Perform the transformation to new categorical data.

    When the data are used for model training, it is important to also pass
    the target in order to apply leave one out.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    y : array-like, shape = [n_samples] when transform by leave one out
        None, when transform without target information (such as transform test set)
    override_return_df : bool, default False
        When True a DataFrame is returned regardless of ``self.return_df``.

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied.

    Raises
    ------
    ValueError
        If the encoder has not been fitted, if X does not have the number of
        columns recorded in ``self._dim``, if X and y differ in length, if
        ``handle_missing == 'error'`` and a column to be encoded contains
        nulls, or if ``handle_unknown == 'error'`` and the ordinal step
        produced a -1.
    """
    if self._dim is None:
        raise ValueError('Must train encoder before it can be used to transform data.')

    # Unite the input into pandas types
    X = util.convert_input(X)

    # Then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    # The null check runs on the converted frame, so array-like input is
    # supported; the second .any() reduces over columns (Series.bool() only
    # accepts a single element and raised when several columns were encoded)
    if self.handle_missing == 'error':
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    # If we are encoding the training data, we have to check the target
    if y is not None:
        y = util.convert_input_vector(y, X.index).astype(float)
        if X.shape[0] != y.shape[0]:
            raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    if not self.cols:
        return X

    # Do not modify the input argument
    X = X.copy(deep=True)

    X = self.ordinal_encoder.transform(X)

    if self.handle_unknown == 'error':
        if X[self.cols].isin([-1]).any().any():
            raise ValueError('Unexpected categories found in dataframe')

    # Loop over columns and replace nominal values with WOE
    X = self._score(X, y)

    # Postprocessing
    # Note: We should not even convert these columns.
    if self.drop_invariant:
        for col in self.drop_cols:
            # `axis` must be passed by keyword: pandas >= 2.0 rejects it positionally
            X.drop(col, axis=1, inplace=True)

    if self.return_df or override_return_df:
        return X
    else:
        return X.values