def fit(self, data):
    """Fit the categorical one-hot encoder.

    Coerces the input to a pandas DataFrame via ``replace_infrequent_df``,
    records the input column names, fits a ``category_encoders``
    OneHotEncoder, and derives the output feature names.

    :param data: a pandas DataFrame, or list
    :return: None; the fitted encoder is stored on ``self.encoder``
    """
    from category_encoders import OneHotEncoder

    # Collapse rare categories / coerce to a DataFrame before fitting.
    frame = self.replace_infrequent_df(data)
    self.input_names = frame.columns

    encoder = OneHotEncoder(
        return_df=self.return_df,
        handle_unknown=self.handle_unknown,
    )
    encoder.fit(frame)
    self.encoder = encoder

    # Cache the post-encoding feature names for downstream use.
    self.feature_names_from_cat_encoder()
def categoricals(self, model_name='onehot_model.pkl', cols=None, owr=False, model_bin=None):
    """Apply (and optionally train/persist) a one-hot encoder on categorical columns.

    Three paths, chosen by the state of the saved model file:
      1. Train: no saved model (or ``owr`` forces overwrite) and no in-memory
         encoder was passed in — fit a new encoder on ``self.data._X``.
      2. Load: a saved model exists — load it and transform only.
      3. Pipeline: an already-fitted ``model_bin`` was passed in — transform only.

    :param model_name: file name for the persisted encoder under ``self.model_path``
    :param cols: categorical columns to encode; defaults to ``self.data.cat_cols``
    :param owr: overwrite — force retraining even if a saved model exists
    :param model_bin: pre-fitted encoder passed through a pipeline (skips disk I/O)
    :return: the fitted/loaded encoder (``model_bin``)
    """
    self.log('Apply onehot encoder on categorical')
    model_path = os.path.join(self.model_path, model_name)
    if cols is None:
        cols = self.data.cat_cols
    # Training path: no usable model on disk or in memory.
    if ((not os.path.isfile(model_path)) or owr) and (model_bin is None):
        self.log('\nTrain model\n')
        # NOTE(review): `impute_missing` was removed from category_encoders
        # in newer (>=2.x) releases — confirm the pinned library version.
        model_bin = OneHotEncoder(
            cols=cols,
            use_cat_names=True,
            handle_unknown='error',
            drop_invariant=False,
            impute_missing=False)
        model_bin.fit(self.data._X)
        self.data._X = model_bin.transform(self.data._X)
        # Record the post-encoding column layout on the model so later
        # transforms can be schema-checked against it.
        setattr(model_bin, 'data_schema', self.data._X.columns.values)
        # Save model
        if self.auto_save:
            joblib.dump(model_bin, model_path)
    elif os.path.isfile(model_path):
        # File exists/prediction: load the persisted encoder and transform.
        model_bin = joblib.load(model_path)
        self.data._X = model_bin.transform(self.data._X)
        self.data.check_schema(model_bin, '_X')
    else:
        # Prediction in pipeline: encoder handed in directly via `model_bin`.
        self.data._X = model_bin.transform(self.data._X)
        self.data.check_schema(model_bin, '_X')
    return model_bin
def encode_low_cardinality_categorical_df(dataframe, fit=False):
    """
    One-hot encode low-cardinality categorical features, dropping invariant
    columns.
    ---
    Arguments
        dataframe: pd.DataFrame
            Pre-processed data (i.e. renamed features) containing low
            cardinality categorical features only
        fit: boolean
            True to train and persist a new encoder; False to load the
            previously persisted one
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    if not fit:
        # Prediction path: reuse the encoder persisted during training.
        encoder = unpickle_obj('low_card_categorical_encoder')
        return encoder.transform(dataframe)

    # Training path: fit a fresh encoder and persist it for later runs.
    encoder = OneHotEncoder(cols=dataframe.columns.values, drop_invariant=True)
    encoder.fit(dataframe)
    pickle_obj(encoder, 'low_card_categorical_encoder')
    return encoder.transform(dataframe)
def fit_onehot(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """
    Create a one-hot encoder by fitting it on the given DataFrame.

    NaN values, and the special value given as `na_value`, are treated as
    unseen: their indicator columns (`<col>_nan`) are removed from the result.

    Args:
        input_df: DataFrame used to fit the encoder
        cols: List of categorical columns to be encoded
        na_value: Default null value for DataFrame

    Returns:
        result_df: encoded input_df DataFrame
        model : encoder model to be passed to `transform_onehot` method
    """
    working = input_df.copy()

    # Map the caller's sentinel null to real NaN so the encoder sees it as missing.
    if na_value is not None:
        for name in cols:
            working[name] = working[name].replace({na_value: np.nan})

    # Indicator columns produced for missing values; removed after encoding.
    drop_cols = ["{}_nan".format(name) for name in cols]

    encoder = OneHotEncoder(cols=cols, use_cat_names=True).fit(working)
    result_df = encoder.transform(working)

    # Drop every NaN-indicator column that actually appeared, in one call.
    present = [c for c in drop_cols if c in result_df.columns]
    result_df = result_df.drop(columns=present)

    model = {
        "encoder": encoder,
        "cols": cols,
        "na_value": na_value,
        "drop_cols": drop_cols,
    }
    return result_df, model
train['type'].unique(), train['color'].unique()

# Quick visual exploration of the features vs the target.
sns.violinplot(x='bone_length', y='type', data=train)
sns.boxplot(x='hair_length', y='type', data=train)
sns.pairplot(train)

# One-hot encode the 'color' feature.
from category_encoders import OneHotEncoder
encoder = OneHotEncoder(cols=['color'], use_cat_names=True)
train = encoder.fit_transform(train)
# BUG FIX: use transform (not fit_transform) on the test set so it reuses
# the category mapping learned from train. Refitting on test can yield a
# different column layout and leaks test-set information into the encoder.
test = encoder.transform(test)
train.head()

# Label-encode the target so it can be used in correlation plots / models.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(train['type'])
print(encoder.classes_)
train['type_no'] = encoder.transform(train['type'])
train.head()

sns.heatmap(train.corr(), xticklabels=list(train), yticklabels=list(train))

target = train['type_no']      # for visualizations
target_string = train['type']  # for final predictions
# Remove the target from the feature matrix.
del train['type']
del train['type_no']
target.head()

from sklearn.model_selection import train_test_split
class RFEncoder(BaseEstimator, TransformerMixin):
    """Categorical encoder that augments one-hot columns with random-forest
    leaf "subset" indicator columns.

    For each categorical column, a RandomForestClassifier is fit on the
    one-hot dummies against ``y``; the forest's decision paths are used to
    derive binary subset features that group category values together.
    """

    def __init__(self, cols=None, handle_missing='value', handle_unknown='value',
                 use_cat_names=False, return_df=True, max_subsets=None,
                 max_depth=3, n_estimators=100, min_count=1, n_jobs=1):
        # cols: columns to encode; inferred from object dtypes when None.
        self.cols = cols
        # 'error' raises on nulls / unseen values in transform; 'value' encodes them.
        self.handle_missing = handle_missing
        self.handle_unknown = handle_unknown
        self.use_cat_names = use_cat_names
        self.return_df = return_df
        # Cap on the number of subset columns kept per categorical column.
        self.max_subsets = max_subsets
        # int: absolute depth; float: fraction of dummy-column count;
        # otherwise a (fraction, cap) pair — see generate_mapping.
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.n_jobs = n_jobs
        # Minimum number of tree nodes that must produce a subset for it to be kept.
        self.min_count = min_count

    def fit(self, X, y=None):
        """Fit the underlying one-hot encoder and build the subset mapping.

        Returns self (sklearn convention).
        """
        self._dim = X.shape[1]
        if self.cols is None:
            self.cols = get_obj_cols(X)
        self.dummy_encoder = OneHotEncoder(cols=self.cols,
                                           handle_unknown='value',
                                           handle_missing='value')
        self.dummy_encoder = self.dummy_encoder.fit(X)
        self.mapping = self.generate_mapping(X, y)
        # Transform once to learn the output feature names.
        X_temp = self.transform(X, override_return_df=True)
        self.feature_names = list(X_temp.columns)
        return self

    def generate_mapping(self, X, y):
        """Build, per categorical column, a DataFrame joining the one-hot
        dummies with forest-derived subset indicator columns."""
        X = self.dummy_encoder.transform(X.copy(deep=True))
        y = y.copy(deep=True)
        mapping = []
        for switch in self.dummy_encoder.mapping:
            col = switch.get('col')
            values = switch.get('mapping').copy(deep=True)

            # Resolve max_depth: absolute int, fraction of dummy columns,
            # or (fraction, cap) pair.
            if isinstance(self.max_depth, int):
                max_depth = self.max_depth
            elif isinstance(self.max_depth, float):
                max_depth = round(self.max_depth * values.shape[1])
            else:
                max_depth = min(self.max_depth[1],
                                round(self.max_depth[0] * values.shape[1]))
            if max_depth == 0:
                # Column has too few categories to derive subsets; skip it.
                continue

            forest = RandomForestClassifier(
                max_depth=max_depth,
                n_estimators=self.n_estimators,
                n_jobs=self.n_jobs,
            )
            forest.fit(X[values.columns], y)

            # Decision paths over the per-category dummy rows yield the
            # candidate subset indicator columns.
            subsets = self.get_subsets(forest.decision_path(values))
            subset_df = pd.DataFrame(
                data=subsets,
                index=values.index,
                columns=[
                    '{col}_subset_{i}'.format(col=col, i=i)
                    for i in range(subsets.shape[1])
                ])
            base_df = values.join(subset_df)
            mapping.append({'col': col, 'mapping': base_df})
        return mapping

    def get_subsets(self, decision_path):
        """Extract deduplicated, filtered, ordered subset indicator columns
        from a forest decision-path matrix."""
        # Column sums = how many rows pass through each node; size-1 nodes
        # are single-category leaves and carry no grouping information.
        subset_sizes = np.asarray(decision_path[0].sum(axis=0))[0]
        subsets = decision_path[0][:, subset_sizes != 1].toarray()
        # Deduplicate identical node columns, keeping occurrence counts.
        subsets, count = np.unique(subsets, return_counts=True, axis=1)
        # Keep subsets produced by at least min_count nodes.
        subsets = subsets[:, count >= self.min_count]
        count = count[count >= self.min_count]
        # Order by frequency (desc), then by subset size (asc).
        subsets = subsets[:, np.argsort(-count)]
        subset_sizes = subsets.sum(axis=0)
        subsets = subsets[:, np.argsort(subset_sizes)]
        if self.max_subsets is not None:
            subsets = subsets[:, :self.max_subsets]
        return subsets

    def transform(self, X, override_return_df=False):
        """Encode X using the fitted mapping.

        Raises ValueError on nulls/unseen values when the respective
        handle_* option is 'error', when called before fit, or on a
        column-count mismatch with the training data.
        """
        if self.handle_missing == 'error':
            if X[self.cols].isnull().any().any():
                raise ValueError('Columns to be encoded can not contain null')
        if self._dim is None:
            raise ValueError(
                'Must train encoder before it can be used to transform data.')
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (
                X.shape[1],
                self._dim,
            ))
        if not list(self.cols):
            # Nothing to encode.
            return X if self.return_df else X.values
        # Ordinal-encode first; unseen categories map to -1.
        X = self.dummy_encoder.ordinal_encoder.transform(X)
        if self.handle_unknown == 'error':
            if X[self.cols].isin([-1]).any().any():
                raise ValueError(
                    'Columns to be encoded can not contain new values')
        X = self.get_dummies(X)
        if self.return_df or override_return_df:
            return X
        else:
            return X.values

    def get_dummies(self, X_in):
        """Replace each encoded column in-place (positionally) with its
        dummy + subset columns from the fitted mapping."""
        X = X_in.copy(deep=True)
        cols = X.columns.values.tolist()
        for switch in self.mapping:
            col = switch.get('col')
            mod = switch.get('mapping')
            # Look up each row's ordinal code in the mapping table.
            base_df = mod.reindex(X[col])
            base_df = base_df.set_index(X.index)
            X = pd.concat([base_df, X], axis=1)
            # Splice the new columns into the original column order.
            old_column_index = cols.index(col)
            cols[old_column_index:old_column_index + 1] = mod.columns
        X = X.reindex(columns=cols)
        return X

    def get_feature_names(self):
        """Return the output feature names; raises if fit has not run."""
        if not isinstance(self.feature_names, list):
            raise ValueError(
                'Must transform data first. Affected feature names are not known before.'
            )
        else:
            return self.feature_names
review_desc= X.review_description # seperating review before encoding X=X.drop("review_description",axis=1) # In[248]: X.head() # In[249]: one_hot= OneHotEncoder(cols=["user_name","country","hint_variety"],use_cat_names=True) # OneHotEncoder from category_encoders package one_hot.fit(X) # In[250]: X=one_hot.transform(X) # In[253]: X.columns # #### Word2vec implementation -
class OneHotEncoder():
    """Maps each categorical value to several columns using one-hot encoding.

    Wraps a category_encoders-style encoder (``OneHot``) and produces
    featuretools Feature objects alongside the encoded matrix.

    Parameters:
        cols: [str]
            list of column names to encode.
        top_n: int
            number of unique category values to encode
            (determines the number of resulting columns)
            selects based off of number of occurences of value
            defaults to 15
            'None' will result in all unique values being encoded.
    """
    name = 'one_hot'

    def __init__(self, cols=None, top_n=15):
        # Underlying library encoder; does the actual category mapping.
        self.encoder = OneHot(cols=cols)
        # Encoded matrix cached by encode_features_list; served by transform().
        self.matrix = None
        self.top_n = top_n

    def fit(self, X, features, y=None):
        """Fits encoder to data table.

        NOTE(review): the passed `y` is ignored — the inner encoder is
        always fit with y=None. Confirm this is intentional.

        returns self
        """
        self.encoder.fit(X, y=None)
        self.features = self.encode_features_list(X, features)
        return self

    def transform(self, X):
        """Encodes matrix and updates features accordingly.

        NOTE(review): this returns the matrix cached during fit and ignores
        the `X` argument entirely — it cannot encode new data. Verify callers
        only ever pass the same X that was fit.

        returns encoded matrix (dataframe)
        """
        assert (self.matrix is not None), "Check that the encoder is fitted."
        return self.matrix

    def fit_transform(self, X, features=None, y=None):
        """First fits, then transforms matrix.

        returns encoded matrix (dataframe)
        """
        return self.fit(X, features, y).transform(X)

    def get_mapping(self, category):
        """Gets the mapping for the one-hot encoder.

        `category` may be a column name (str) or an integer index into the
        encoder's mapping list.

        returns mapping (dict)
        """
        if isinstance(category, str):
            for map in self.encoder.mapping:
                if map['col'] == category:
                    return map['mapping']
        return self.encoder.mapping[category]['mapping']

    def encode_features_list(self, X, features):
        """Build one-hot indicator columns for each encodable feature and the
        corresponding featuretools Feature objects.

        Side effect: caches the encoded matrix on self.matrix.
        """
        X_new = X.copy()
        feature_list = []
        for f in features:
            if f.number_output_features > 1:
                # Multi-output features can't be encoded column-by-column.
                logger.warning(
                    "Feature %s has multiple columns. One-Hot Encoder may not properly encode."
                    "Consider using another encoding method or the `encoder` property value assigned "
                    "to this OneHotEncoder class instance." % (f))
            if f.get_name() in self.encoder.cols:
                val_counts = X[f.get_name()].value_counts().to_frame()
                # NOTE(review): the result of sort_values below is discarded
                # (not assigned / not inplace), so this line is a no-op;
                # value_counts() already returns counts in descending order.
                val_counts.sort_values(f.get_name(), ascending=False)
                if self.top_n is None:
                    self.top_n = len(val_counts)
                # Keep only the top_n most frequent category values.
                unique = val_counts.head(self.top_n).index.tolist()
                index = X_new.columns.get_loc(f.get_name())
                for label in unique:
                    add = ft.Feature([f], primitive=OneHotEnc(label))
                    feature_list.append(add)
                    # Insert the 0/1 indicator column at the original position.
                    X_new.insert(index, add.get_name(),
                                 (X_new[f.get_name()] == label).astype(int),
                                 allow_duplicates=True)
                    index += 1
                # NOTE(review): the "unknown" column is only added when nulls
                # exist, but its values flag anything outside `unique` —
                # confirm this asymmetry is intended.
                has_unknown = X[f.get_name()].isnull().values.any()
                if has_unknown:
                    unknown = ft.Feature([f], primitive=OneHotEnc(np.nan))
                    feature_list.append(unknown)
                    X_new.insert(
                        index, unknown.get_name(),
                        (~X_new[f.get_name()].isin(unique)).astype(int),
                        allow_duplicates=True)
                # Drop the original (now encoded) column.
                X_new.drop([f.get_name()], axis=1, inplace=True)
            else:
                # Feature not selected for encoding; pass it through unchanged.
                feature_list.append(f)
        self.matrix = X_new
        return feature_list

    def get_features(self):
        """Return the Feature objects produced during fit."""
        return self.features

    def get_name(self):
        """Return the encoder's registry name ('one_hot')."""
        return self.name
df['bathrooms_text'].value_counts() df = df[df['bathrooms_text'] != ''] df[['host_is_superhost', 'bathrooms_text', 'has_availability', 'instant_bookable']].astype(float) df.head pip install category_encoders # Instantiate transformer - one hot encoder from category_encoders import OneHotEncoder transformer = OneHotEncoder(use_cat_names=True) # Transform to fit training data transformer.fit(df) # Transform our training data df = transformer.transform(df) X = df.drop('price', axis=1) y= df['price'] X = X.astype(float) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) from keras.layers import BatchNormalization, Dropout import keras model = Sequential([
import pandas as pd

data = pd.read_csv('139394485_T_T100D_MARKET_ALL_CARRIER.csv')

# Define col names for the parameters of the network
pred_vars = ['MONTH', 'ORIGIN', 'DEST', 'DISTANCE']
target_var = 'PASSENGERS'
# BUG FIX: `keep = pred_vars` aliased the same list object, so the append
# below also mutated pred_vars (adding the target to the predictors).
# Build a new list instead.
keep = pred_vars + [target_var]

# Subset only what's needed
data = data[keep]

# Encode the source and target nodes using a category encoder
from category_encoders import OneHotEncoder
ce = OneHotEncoder()
ce.fit(data)

# transform the encoded data
data_encoded = ce.transform(data)
labels = data[target_var]
# BUG FIX: the positional `axis` argument to DataFrame.drop was deprecated
# and removed in pandas 2.0; use the explicit columns= keyword.
data_encoded.drop(columns=[target_var], inplace=True)

# split out a final eval set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    data_encoded, labels, random_state=0, test_size=.25)

# convert to xgb data format
import xgboost as xgb
def get_score(model, X, y, X_test, y_test):
    """Fit *model* on (X, y) and return its ROC-AUC on the held-out test set."""
    model.fit(X, y)
    # Probability of the positive class for AUC computation.
    y_pred = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, y_pred)
    return score


######### Creating objects for 2 classification models.
logit = LogisticRegression(random_state=SEED)
rf = RandomForestClassifier(random_state=SEED)

###################################################################################################
######### Apply One Hot Encoding
from category_encoders import OneHotEncoder
onehot_enc = OneHotEncoder(cols=X_Columns)
onehot_enc.fit(X_train, y_train)
print('Original number of features: \n', X_train.shape[1], "\n")
# FIX: reuse the encoder fitted above instead of calling fit_transform,
# which refit the encoder a second time (discarding the fit done with y).
data_ohe_train = onehot_enc.transform(X_train)
data_ohe_test = onehot_enc.transform(X_test)
print('Features after OHE: \n', data_ohe_train.shape[1])

######### Logistic Regression
onehot_logit_score = get_score(logit, data_ohe_train, y_train, data_ohe_test, y_test)
print('Logistic Regression score with One hot encoding:', onehot_logit_score)

######### Random Forest
onehot_rf_score = get_score(rf, data_ohe_train, y_train, data_ohe_test, y_test)
# BUG FIX: the original printed onehot_logit_score under the Random Forest
# label (copy-paste error); report the RF score.
print('Random Forest score with One hot encoding:', onehot_rf_score)