def categoricals(self, model_name='onehot_model.pkl', cols=None, owr=False, model_bin=None):
    """Onehot encoder on categoricals.

    Trains a new encoder (and optionally persists it), or reuses a
    previously saved / supplied one to transform ``self.data._X``.
    Returns the encoder that was used.
    """
    self.log('Apply onehot encoder on categorical')
    path = os.path.join(self.model_path, model_name)
    if cols is None:
        cols = self.data.cat_cols

    # Train only when no fitted encoder was handed in AND there is no
    # reusable model on disk (or an overwrite was requested).
    must_train = model_bin is None and (owr or not os.path.isfile(path))

    if must_train:
        self.log('\nTrain model\n')
        model_bin = OneHotEncoder(
            cols=cols,
            use_cat_names=True,
            handle_unknown='error',
            drop_invariant=False,
            impute_missing=False)
        model_bin.fit(self.data._X)
        self.data._X = model_bin.transform(self.data._X)
        # Remember the encoded column layout for later schema checks.
        setattr(model_bin, 'data_schema', self.data._X.columns.values)
        if self.auto_save:
            joblib.dump(model_bin, path)
    else:
        if os.path.isfile(path):
            # File exists / prediction: a saved model wins over model_bin.
            model_bin = joblib.load(path)
        # Otherwise: prediction in pipeline with the encoder passed in.
        self.data._X = model_bin.transform(self.data._X)
        self.data.check_schema(model_bin, '_X')
    return model_bin
def _encode_categories(self):
    """Encode categorical variables.

    Quality-style columns carry an intrinsic order, so they receive an
    explicit label (ordinal) mapping; every other categorical column is
    nominal and gets one-hot encoded. Both ``self.X`` and ``self.X_test``
    are rebuilt from the numeric block joined with the encoded blocks.
    """
    logging.info(f'#{self._index()} - Encoding categorical columns...')

    # Split columns into categorical vs numerical.
    cat_cols = self.X.select_dtypes(include='object').columns
    num_cols = self.X.columns.difference(cat_cols)

    # Columns with an inherent quality ordering (NA < Po < ... < Ex).
    ordinal_cols = pd.Index([
        'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
        'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC'
    ])
    nominal_cols = cat_cols.difference(ordinal_cols)

    quality_scale = {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    ordinal_mappings = [{'col': name, 'mapping': quality_scale}
                        for name in ordinal_cols]

    train_numeric = self.X[num_cols]
    test_numeric = self.X_test[num_cols]

    nominal_encoder = OneHotEncoder(use_cat_names=True)
    ordinal_encoder = OrdinalEncoder(drop_invariant=True,
                                     mapping=ordinal_mappings,
                                     handle_unknown='error')

    # Fit on train only; the test set is merely transformed.
    train_nominal = nominal_encoder.fit_transform(self.X[nominal_cols])
    train_ordinal = ordinal_encoder.fit_transform(self.X[ordinal_cols])
    test_nominal = nominal_encoder.transform(self.X_test[nominal_cols])
    test_ordinal = ordinal_encoder.transform(self.X_test[ordinal_cols])

    self.X = train_numeric.join(train_ordinal).join(train_nominal)
    self.X_test = test_numeric.join(test_ordinal).join(test_nominal)

    logging.info(f'#{self._step_index} - DONE!')
def fit_onehot(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """Create a one-hot encoder fitted on the given DataFrame.

    NaN values (and the special value given as ``na_value``) in the listed
    columns are treated as an unseen value: their indicator columns
    (``<col>_nan``) are dropped from the result.

    Args:
        input_df: DataFrame used to fit the encoder.
        cols: List of categorical columns to be encoded.
        na_value: Default null value for the DataFrame.

    Returns:
        result_df: encoded copy of ``input_df``.
        model: encoder bundle to be passed to ``transform_onehot``.
    """
    df = input_df.copy()

    # Normalise the caller-provided sentinel to NaN before fitting.
    if na_value is not None:
        for col in cols:
            df[col] = df[col].replace({na_value: np.nan})

    drop_cols = [f"{col}_nan" for col in cols]

    encoder = OneHotEncoder(cols=cols, use_cat_names=True).fit(df)
    result_df = encoder.transform(df)

    # Remove the NaN indicator columns that actually materialised.
    present = [c for c in drop_cols if c in result_df.columns]
    if present:
        result_df = result_df.drop(columns=present)

    model = {
        "encoder": encoder,
        "cols": cols,
        "na_value": na_value,
        "drop_cols": drop_cols,
    }
    return result_df, model
def encode_low_cardinality_categorical_df(dataframe, fit=False):
    """
    Encode low cardinality categorical features using OneHot Encoding,
    dropping invariant features

    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features),
            low card. categorical features only
        fit: boolean
            Indicates if we should train or load an encoder

    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    if not fit:
        # Prediction path: reuse the persisted encoder.
        encoder = unpickle_obj('low_card_categorical_encoder')
        return encoder.transform(dataframe)

    # Training path: fit a fresh encoder on every column and persist it.
    encoder = OneHotEncoder(cols=dataframe.columns.values, drop_invariant=True)
    encoder.fit(dataframe)
    pickle_obj(encoder, 'low_card_categorical_encoder')
    return encoder.transform(dataframe)
'PassengerId', 'Survived']] ########################################################################### # Split data into train and test # ########################################################################### trainData = fullData.loc[fullData.DataPartition == 'train'] testData = fullData.loc[fullData.DataPartition == 'test'] ########################################################################### # One hot encode # ########################################################################### # https://github.com/scikit-learn-contrib/categorical-encoding # http://contrib.scikit-learn.org/categorical-encoding/onehot.html from category_encoders import OneHotEncoder categories = list(set(trainData.select_dtypes(['category']).columns)) target = trainData.Survived enc = OneHotEncoder(cols=categories,return_df = 1, handle_unknown = 'ignore').fit(trainData, target) trainData = enc.transform(trainData) testData = enc.transform(testData) ########################################################################### # Drop multi collinear levels and no longer required # ########################################################################### dropColumns = ['DataPartition'] trainData = trainData.drop(columns=dropColumns) testData = testData.drop(columns=dropColumns) testData = testData.drop(columns='Survived') ########################################################################### # Start h2o cloud # ########################################################################### import h2o h2o.init() h2o.remove_all # clean slate, in case cluster was already running # upload data to h2o cloud
from category_encoders import OneHotEncoder

# One-hot encode the nominal 'color' feature; fit on the training set only.
encoder = OneHotEncoder(cols=['color'], use_cat_names=True)
train = encoder.fit_transform(train)
# BUG FIX: the original called fit_transform(test), refitting the encoder on
# the test set — that can yield a different column layout and leaks test
# categories. The encoder fitted on train must only *transform* test.
test = encoder.transform(test)
train.head()

from sklearn.preprocessing import LabelEncoder

# Label-encode the target class into an integer column.
encoder = LabelEncoder()
encoder.fit(train['type'])
print(encoder.classes_)
train['type_no'] = encoder.transform(train['type'])
train.head()

# Correlation overview of the (now fully numeric) training frame.
sns.heatmap(train.corr(), xticklabels=list(train), yticklabels=list(train))

target = train['type_no']      # for visualizations
target_string = train['type']  # for final predictions
# Remove the target columns from the feature matrix.
del train['type']
del train['type_no']
target.head()

from sklearn.model_selection import train_test_split

# Hold out 20% for evaluation; fixed seed for reproducibility.
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=42)

from mlxtend.plotting import plot_decision_regions
import matplotlib.gridspec as gridspec
import itertools
class RFEncoder(BaseEstimator, TransformerMixin):
    """Categorical encoder that augments one-hot columns with random-forest
    leaf "subset" indicator columns.

    A dummy OneHotEncoder produces the base indicator columns; a
    RandomForestClassifier is then fitted per original column, and the
    distinct decision paths through the forest become extra binary columns
    (``<col>_subset_<i>``) appended to the one-hot mapping.
    """

    def __init__(self, cols=None, handle_missing='value', handle_unknown='value',
                 use_cat_names=False, return_df=True, max_subsets=None,
                 max_depth=3, n_estimators=100, min_count=1, n_jobs=1):
        # cols: columns to encode; inferred from object dtypes in fit() if None.
        self.cols = cols
        self.handle_missing = handle_missing
        self.handle_unknown = handle_unknown
        self.use_cat_names = use_cat_names
        self.return_df = return_df
        # max_subsets: keep at most this many subset columns per original column.
        self.max_subsets = max_subsets
        # max_depth: int = absolute depth; float = fraction of the one-hot
        # width; otherwise a (fraction, cap) pair — see generate_mapping().
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.n_jobs = n_jobs
        # min_count: minimum number of trees sharing a path for it to be kept.
        self.min_count = min_count

    def fit(self, X, y=None):
        """Fit the dummy one-hot encoder and build the subset mapping."""
        self._dim = X.shape[1]
        if self.cols is None:
            self.cols = get_obj_cols(X)
        self.dummy_encoder = OneHotEncoder(cols=self.cols,
                                           handle_unknown='value',
                                           handle_missing='value')
        self.dummy_encoder = self.dummy_encoder.fit(X)
        self.mapping = self.generate_mapping(X, y)
        # Transform once to learn the final output column names.
        X_temp = self.transform(X, override_return_df=True)
        self.feature_names = list(X_temp.columns)
        return self

    def generate_mapping(self, X, y):
        """Build, per encoded column, a mapping DataFrame of one-hot values
        joined with the forest-derived subset indicator columns."""
        X = self.dummy_encoder.transform(X.copy(deep=True))
        y = y.copy(deep=True)
        mapping = []
        for switch in self.dummy_encoder.mapping:
            col = switch.get('col')
            # `values` holds the one-hot rows for each category level.
            values = switch.get('mapping').copy(deep=True)
            # Resolve max_depth: int = as-is, float = fraction of the one-hot
            # width, otherwise a (fraction, cap) pair.
            if isinstance(self.max_depth, int):
                max_depth = self.max_depth
            elif isinstance(self.max_depth, float):
                max_depth = round(self.max_depth * values.shape[1])
            else:
                max_depth = min(self.max_depth[1],
                                round(self.max_depth[0] * values.shape[1]))
            if max_depth == 0:
                # Column too narrow for any split — no subset columns.
                continue
            forest = RandomForestClassifier(
                max_depth=max_depth,
                n_estimators=self.n_estimators,
                n_jobs=self.n_jobs,
            )
            forest.fit(X[values.columns], y)
            # decision_path() on the per-level one-hot rows yields which tree
            # nodes each category level visits.
            subsets = self.get_subsets(forest.decision_path(values))
            subset_df = pd.DataFrame(data=subsets, index=values.index,
                                     columns=[
                                         '{col}_subset_{i}'.format(col=col, i=i)
                                         for i in range(subsets.shape[1])
                                     ])
            base_df = values.join(subset_df)
            mapping.append({'col': col, 'mapping': base_df})
        return mapping

    def get_subsets(self, decision_path):
        """Turn a forest decision-path indicator matrix into deduplicated,
        filtered, ordered subset columns.

        decision_path is the (indicator, n_nodes_ptr) pair returned by
        RandomForestClassifier.decision_path(); only the indicator matrix
        (element [0]) is used.
        """
        # Drop trivial nodes visited by exactly one sample.
        subset_sizes = np.asarray(decision_path[0].sum(axis=0))[0]
        subsets = decision_path[0][:, subset_sizes != 1].toarray()
        # Deduplicate identical node columns, counting multiplicity.
        subsets, count = np.unique(subsets, return_counts=True, axis=1)
        # Keep columns seen at least min_count times.
        subsets = subsets[:, count >= self.min_count]
        count = count[count >= self.min_count]
        # Order: most frequent first, then by ascending subset size.
        subsets = subsets[:, np.argsort(-count)]
        subset_sizes = subsets.sum(axis=0)
        subsets = subsets[:, np.argsort(subset_sizes)]
        if self.max_subsets is not None:
            subsets = subsets[:, :self.max_subsets]
        return subsets

    def transform(self, X, override_return_df=False):
        """Transform X via the ordinal sub-encoder and the learned mapping.

        Raises ValueError on nulls (handle_missing='error'), on an unfitted
        encoder, on a dimension mismatch, or on unseen categories
        (handle_unknown='error').
        """
        if self.handle_missing == 'error':
            if X[self.cols].isnull().any().any():
                raise ValueError('Columns to be encoded can not contain null')
        # NOTE(review): if fit() was never called, self._dim does not exist
        # and this raises AttributeError rather than the intended ValueError.
        if self._dim is None:
            raise ValueError(
                'Must train encoder before it can be used to transform data.')
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (
                X.shape[1],
                self._dim,
            ))
        if not list(self.cols):
            # Nothing to encode — pass through.
            return X if self.return_df else X.values
        # Map raw categories to ordinal codes first (unseen values become -1).
        X = self.dummy_encoder.ordinal_encoder.transform(X)
        if self.handle_unknown == 'error':
            if X[self.cols].isin([-1]).any().any():
                raise ValueError(
                    'Columns to be encoded can not contain new values')
        X = self.get_dummies(X)
        if self.return_df or override_return_df:
            return X
        else:
            return X.values

    def get_dummies(self, X_in):
        """Replace each encoded column with its mapping columns, keeping the
        original column order."""
        X = X_in.copy(deep=True)
        cols = X.columns.values.tolist()
        for switch in self.mapping:
            col = switch.get('col')
            mod = switch.get('mapping')
            # Row-align the mapping to this column's ordinal codes.
            base_df = mod.reindex(X[col])
            base_df = base_df.set_index(X.index)
            X = pd.concat([base_df, X], axis=1)
            # Splice the mapping columns in place of the original column.
            old_column_index = cols.index(col)
            cols[old_column_index:old_column_index + 1] = mod.columns
        X = X.reindex(columns=cols)
        return X

    def get_feature_names(self):
        """Return output column names; only known after fit()."""
        if not isinstance(self.feature_names, list):
            raise ValueError(
                'Must transform data first. Affected feature names are not known before.'
            )
        else:
            return self.feature_names
def _encode_categories(self):
    """Encode the categorical Titanic features in-place on self.X / self.X_test.

    Derives Title/Family features from raw columns, bins Fare and Age,
    logs per-category survival ratios, then one-hot encodes Title and
    Embarked and drops the intermediate columns.
    """
    logging.info(f'#{self._index()} - Encoding categorical columns...')

    def encode(data):
        # Shared feature engineering applied to both train and test frames.
        # NOTE(review): because this runs independently per frame, the rare-
        # Title grouping below uses each frame's OWN value counts; a title
        # unseen in a frame would raise KeyError in title_names.loc[x] —
        # verify against the datasets actually passed in.

        # encode Sex column (True for male)
        data['Sex'] = data['Sex'] == 'male'
        # encode Name column: split "Family, Title. Given" into two columns
        name_cols = data['Name'].apply(lambda x: pd.Series(
            [str(x).split(",")[0], str(x).split(", ")[1].split(".")[0]],
            index=['Family name', 'Title']))
        data = data.join(name_cols)
        # identify Titles with same meaning
        data['Title'].replace({
            'Mlle': 'Miss',
            'Ms': 'Miss',
            'Mme': 'Mrs'
        }, inplace=True)
        # group rare Titles (fewer than 10 occurrences) under 'Misc'
        title_names = (data['Title'].value_counts() < 10)
        data['Title'] = data['Title'].apply(lambda x: 'Misc'
                                            if title_names.loc[x] else x)
        # create Family size and Alone column from SibSp, Parch cols
        data['Family size'] = data['SibSp'] + data['Parch'] + 1
        data['Alone'] = data['Family size'] == 1
        # make 5 equal size groups from Fares (quantile bins)
        data['Fare'] = pd.qcut(data['Fare'], 5, labels=False)
        # make 5 groups from Ages (equal-width bins)
        data['Age'] = pd.cut(data['Age'], 5, labels=False)
        # rename columns and delete unnecessary features
        data = data.rename(columns={
            'Sex': 'Male',
            'Fare': 'FareBins',
            'Age': 'AgeBins'
        })
        data.drop(['Name', 'SibSp', 'Parch'], axis=1, inplace=True)
        return data

    self.X = encode(self.X)
    self.X_test = encode(self.X_test)

    # Log survival ratio per category for every non-float column.
    for col in self.X.columns:
        if self.X[col].dtype != 'float64':
            table = self.X.join(self.y)[[col, 'Survived'
                                         ]].groupby(col, as_index=False).mean()
            table['Survived'] = (table['Survived'] * 100).map(
                '{:.2f} %'.format)
            logging.info(
                f'Survival ratio by: {col}\n{table}\n{"-" * 10}\n')

    # One-hot encode the remaining nominal columns; fit on train, transform test.
    one_hot_encoder = OneHotEncoder(use_cat_names=True)
    one_hot_columns = one_hot_encoder.fit_transform(
        self.X[['Title', 'Embarked']])
    one_hot_columns_test = one_hot_encoder.transform(
        self.X_test[['Title', 'Embarked']])
    self.X = self.X.join(one_hot_columns)
    self.X_test = self.X_test.join(one_hot_columns_test)
    # Drop the raw columns now represented by the one-hot indicators.
    self.X.drop(['Family name', 'Title', 'Embarked'], axis=1, inplace=True)
    self.X_test.drop(['Family name', 'Title', 'Embarked'],
                     axis=1,
                     inplace=True)
    # NOTE(review): the opening log uses self._index() but this one reads
    # self._step_index directly — confirm both refer to the same counter.
    logging.info(f'#{self._step_index} - DONE!')
# StandardScaler - pre-processor to put numerical column in the same scale scaler = StandardScaler().fit(x_train) scaler values_scale = scaler.transform(x_train) values_scale[:10] x_train = scaler.transform(x_train) # generate the model - could be any model # instance of the classifier decision tree and train the model clf_tree = tree.DecisionTreeClassifier() clf_tree = clf_tree.fit(x_train, y_train) # Apply object ohe and pre-processor on data for test x_test = ohe.transform(x_test) scaler_test = StandardScaler().fit(x_test) x_test = scaler_test.transform(x_test) x_test[:10] # predict clf_tree.predict(x_test) # Validate the model acuracy = clf_tree.score(x_test, y_test) acuracy # Pipeline # will create a kind of alias for each method pip_1 = Pipeline([('ohe', OneHotEncoder()), ('scaler', StandardScaler()),
X.head() # In[249]: one_hot= OneHotEncoder(cols=["user_name","country","hint_variety"],use_cat_names=True) # OneHotEncoder from category_encoders package one_hot.fit(X) # In[250]: X=one_hot.transform(X) # In[253]: X.columns # #### Word2vec implementation - # In[259]: wine2vec.build_vocab(review_desc.values)
df[['host_is_superhost', 'bathrooms_text', 'has_availability', 'instant_bookable']].astype(float) df.head pip install category_encoders # Instantiate transformer - one hot encoder from category_encoders import OneHotEncoder transformer = OneHotEncoder(use_cat_names=True) # Transform to fit training data transformer.fit(df) # Transform our training data df = transformer.transform(df) X = df.drop('price', axis=1) y= df['price'] X = X.astype(float) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) from keras.layers import BatchNormalization, Dropout import keras model = Sequential([ # The Input Layer : Dense(1024, input_dim = X_train.shape[1]),
# Define col names for the parameters of the network
pred_vars = ['MONTH', 'ORIGIN', 'DEST', 'DISTANCE']
target_var = 'PASSENGERS'
# BUG FIX: `keep = pred_vars` only aliased the list, so the append below
# also mutated pred_vars (it would then contain the target). Build a new
# list instead so pred_vars keeps only the predictors.
keep = pred_vars + [target_var]

# Subset only what's needed
data = data[keep]

# Encode the source and target nodes using a catagory encoder
from category_encoders import OneHotEncoder

ce = OneHotEncoder()
ce.fit(data)

# transform the encoded data
data_encoded = ce.transform(data)
labels = data[target_var]
# Use the `columns` keyword: positional `axis` in drop() is deprecated and
# removed in pandas 2.x.
data_encoded.drop(columns=target_var, inplace=True)

# split out a final eval set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_encoded, labels,
                                                    random_state=0,
                                                    test_size=.25)

# convert to xgb data format
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
]] ########################################################################### # Split data into train and test # ########################################################################### trainData = fullData.loc[fullData.DataPartition == 'train'] testData = fullData.loc[fullData.DataPartition == 'test'] ########################################################################### # One hot encode # ########################################################################### # https://github.com/scikit-learn-contrib/categorical-encoding # http://contrib.scikit-learn.org/categorical-encoding/onehot.html categories = list(set(trainData.select_dtypes(['category']).columns)) target = trainData.Survived enc = OneHotEncoder(cols=categories, return_df=1, handle_unknown='ignore').fit(trainData, target) trainData = enc.transform(trainData) testData = enc.transform(testData) ########################################################################### # Drop multi collinear levels and no longer required # ########################################################################### dropColumns = ['DataPartition'] trainData = trainData.drop(columns=dropColumns) testData = testData.drop(columns=dropColumns) testData = testData.drop(columns='Survived') ########################################################################### # Start h2o cloud # ########################################################################### h2o.init() h2o.remove_all # clean slate, in case cluster was already running # upload data to h2o cloud train = h2o.H2OFrame(trainData)
return score ######### Creating objects for 2 classification models. logit = LogisticRegression(random_state=SEED) rf = RandomForestClassifier(random_state=SEED) ################################################################################################### ######### Apply One Hot Encoding from category_encoders import OneHotEncoder onehot_enc = OneHotEncoder(cols=X_Columns) onehot_enc.fit(X_train, y_train) print('Original number of features: \n', X_train.shape[1], "\n") data_ohe_train = onehot_enc.fit_transform(X_train) data_ohe_test = onehot_enc.transform(X_test) print('Features after OHE: \n', data_ohe_train.shape[1]) ######### Logistic Regression onehot_logit_score = get_score(logit, data_ohe_train, y_train, data_ohe_test, y_test) print('Logistic Regression score with One hot encoding:', onehot_logit_score) ######### Random Forest onehot_rf_score = get_score(rf, data_ohe_train, y_train, data_ohe_test, y_test) print('Random Forest score with One hot encoding:', onehot_logit_score) ################################################################################################### ######### Apply Hashing Encoding from category_encoders import HashingEncoder hashing_enc = HashingEncoder(n_components=10000, cols=X_Columns)
class Encoder():
    """Unified wrapper around four category_encoders strategies.

    Features are routed to OrdinalEncoder / OneHotEncoder / CountEncoder /
    TargetEncoder via a per-feature mapping passed to fit()/fit_transform().
    Fitted encoders and feature lists can be saved to / loaded from logdir.
    """

    # Supported encoding strategies (name -> encoder class).
    encode_methods = {
        'OrdinalEncoder': OrdinalEncoder,
        'OneHotEncoder': OneHotEncoder,
        'CountEncoder': CountEncoder,
        'TargetEncoder': TargetEncoder,
    }

    # spark_encode_methods = {
    #     'mean_encoder':,
    #     'target_encoder':,
    #     'label_encoder':,
    #     'onehot_encoder'
    # }

    # NOTE: target/mean-style encoders must NOT be fitted on train and
    # validation data concatenated together (target leakage);
    # label/one-hot encoders may be fitted on the concatenation.

    def __init__(self, sparksess=None, logdir='/encoder', handle_unknown='-99999', save_encoder=False):
        self.spark = sparksess
        self.logdir = logdir
        # BUG FIX: the original line read `self.save_encoder` — a no-op
        # attribute access that never stored the flag (and the name would
        # have shadowed the save_encoder() method anyway). Store the flag
        # under a distinct name.
        self.save_encoder_flag = save_encoder

        self.ordinal_encoder_features = []
        self.onehot_encoder_features = []
        self.count_encoder_features = []
        self.target_encoder_features = []

        # The feature lists above are intentionally shared (aliased) with the
        # encoders' `cols`, so appending features in fit() updates them too.
        self.ordinal_encoder = OrdinalEncoder(
            cols=self.ordinal_encoder_features,
            return_df=True,
            handle_unknown=handle_unknown)
        self.onehot_encoder = OneHotEncoder(
            cols=self.onehot_encoder_features,
            return_df=True,
            handle_unknown=handle_unknown)
        self.count_encoder = CountEncoder(
            cols=self.count_encoder_features,
            return_df=True,
            handle_unknown=handle_unknown)
        self.target_encoder = TargetEncoder(
            cols=self.target_encoder_features,
            return_df=True,
            handle_unknown=handle_unknown)

    def fit(self, x_train, x_val=None, y_train=None, y_val=None, method_mapper=None):
        """Fit all encoders according to the feature->method mapping.

        Parameters
        ----------
        x_train: pd.DataFrame
        x_val: pd.DataFrame
        y_train: pd.DataFrame
        y_val: pd.DataFrame
        method_mapper: dict
            a mapping of feature to EncodeMethod
            example mapping:
            {
                'feature1': OrdinalEncoder,
                'feature2': OneHotEncoder,
                'feature3': CountEncoder,
                'feature4': TargetEncoder,
            }

        Returns
        -------
        (x_train, y_train, x_val, y_val) with x_train / x_val encoded.
        """
        # Route each feature to its encoder's column list.
        for feat in method_mapper:
            if method_mapper[feat] == 'OrdinalEncoder':
                self.ordinal_encoder_features.append(feat)
            elif method_mapper[feat] == 'OneHotEncoder':
                self.onehot_encoder_features.append(feat)
            elif method_mapper[feat] == 'CountEncoder':
                self.count_encoder_features.append(feat)
            elif method_mapper[feat] == 'TargetEncoder':
                self.target_encoder_features.append(feat)
            else:
                raise ValueError(
                    '编码方式只支持[OrdinalEncoder, OneHotEncoder, CountEncoder, TargetEncoder], 接收到%s'
                    % feat)

        if self.spark is None:
            if len(self.ordinal_encoder_features) != 0 or len(
                    self.onehot_encoder_features) != 0:
                # Ordinal/one-hot encoders may be fitted on train+val jointly.
                # BUG FIX: DataFrame.append was deprecated and removed in
                # pandas 2.x (and crashed on x_val=None); use pd.concat and
                # skip the missing pieces.
                x_parts = [x_train] if x_val is None else [x_train, x_val]
                x_whole = pd.concat(x_parts)
                y_whole = None
                if not y_train is None and not y_val is None:
                    y_whole = pd.concat([y_train, y_val])

                x_whole = self.ordinal_encoder.fit_transform(x_whole, y_whole)
                x_whole = self.onehot_encoder.fit_transform(x_whole, y_whole)
                x_train = x_whole[:len(x_train)]
                x_val = x_whole[len(x_train):]

            # Count/target encoders are fitted on train only (no leakage).
            x_train = self.count_encoder.fit_transform(x_train, y_train)
            x_val = self.count_encoder.transform(x_val, y_val)
            x_train = self.target_encoder.fit_transform(x_train, y_train)
            x_val = self.target_encoder.transform(x_val, y_val)

            # BUG FIX: the original tested/called `self.save_encoder`, which
            # resolved to the (always truthy) bound method and then crashed
            # when invoked as the flag; use the stored flag instead.
            if self.save_encoder_flag:
                self.save_encoder()

        return x_train, y_train, x_val, y_val

    def transform(self, x, y=None):
        """Apply all fitted encoders in sequence; returns (x, y)."""
        x = self.ordinal_encoder.transform(x, y)
        x = self.onehot_encoder.transform(x, y)
        x = self.count_encoder.transform(x, y)
        x = self.target_encoder.transform(x, y)
        return x, y

    def fit_transform(self, x_train, x_val=None, y_train=None, y_val=None, method_mapper=None):
        """Fit on the given data, then transform train (and val if present).

        Parameters
        ----------
        x_train: pd.DataFrame
        x_val: pd.DataFrame
        y_train: pd.DataFrame
        y_val: pd.DataFrame
        method_mapper: dict
            a mapping of feature to EncodeMethod (see fit()).
        """
        self.fit(x_train, x_val, y_train, y_val, method_mapper)
        x_train, y_train = self.transform(x_train, y_train)
        if x_val is not None:
            x_val, y_val = self.transform(x_val, y_val)
        return x_train, y_train, x_val, y_val

    def save_encoder(self):
        """Pickle all encoders and dump the feature lists as JSON into a
        timestamped subdirectory of logdir."""
        now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
        os.makedirs(os.path.join(self.logdir, now))
        with open(os.path.join(self.logdir, now, 'OrdinalEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.ordinal_encoder, f)
        with open(os.path.join(self.logdir, now, 'OneHotEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.onehot_encoder, f)
        with open(os.path.join(self.logdir, now, 'CountEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.count_encoder, f)
        with open(os.path.join(self.logdir, now, 'TargetEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.target_encoder, f)
        with open(
                os.path.join(self.logdir, now, 'OrdinalEncoderFeatures.json'),
                'w') as f:
            json.dump(self.ordinal_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'OneHotEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.onehot_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'CountEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.count_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'TargetEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.target_encoder_features, f)

    def load_encoder(self, logdir=None):
        """Restore encoders and feature lists previously written by
        save_encoder().

        BUG FIX: the original opened every file in 'wb' mode and dumped the
        current (empty) state — destroying the saved encoders instead of
        loading them — and ignored its `logdir` argument entirely.

        NOTE: pickle.load executes arbitrary code; only load files you trust.
        """
        logdir = self.logdir if logdir is None else logdir
        with open(os.path.join(logdir, 'OrdinalEncoder.pkl'), 'rb') as f:
            self.ordinal_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'OneHotEncoder.pkl'), 'rb') as f:
            self.onehot_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'CountEncoder.pkl'), 'rb') as f:
            self.count_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'TargetEncoder.pkl'), 'rb') as f:
            self.target_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'OrdinalEncoderFeatures.json'),
                  'r') as f:
            self.ordinal_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'OneHotEncoderFeatures.json'),
                  'r') as f:
            self.onehot_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'CountEncoderFeatures.json'),
                  'r') as f:
            self.count_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'TargetEncoderFeatures.json'),
                  'r') as f:
            self.target_encoder_features = json.load(f)