def _feature_encode(self, data): dummy_cols = [] for col in data.cat_features: # merge categorical features with low frequencies if data.train_df[col].nunique() / len(data.train_df[col]) < 0.1: for name, count in data.train_df[col].value_counts().items(): if count / len(data.train_df[col]) < 0.01: data.train_df[col].replace(name, 'Rare', inplace=True) if data.test_df[col].nunique() / len(data.test_df[col]) < 0.1: for name, count in data.test_df[col].value_counts().items(): if count / len(data.test_df[col]) < 0.01: data.test_df[col].replace(name, 'Rare', inplace=True) # target-encode categorical features with high number of unique values if data.train_df[col].nunique() > 10: from category_encoders.target_encoder import TargetEncoder encoder = TargetEncoder(cols=col) encoder.fit(data.train_df[col], data.train_df[data.target_var]) data.train_df[col] = encoder.transform(data.train_df[col]) data.test_df[col] = encoder.transform(data.test_df[col]) else: dummy_cols.append(col) # create dummy variables from categorical features with low number of unique values data.train_df = pd.get_dummies(data.train_df, columns=dummy_cols, drop_first=True) data.test_df = pd.get_dummies(data.test_df, columns=dummy_cols, drop_first=True) data.target_df = data.train_df[data.target_var]
def target_encode():
    """Target-encode the categorical columns of the tr/te CSVs.

    Reads ``./data/tr.csv`` and ``./data/te.csv``, fits a TargetEncoder
    on the train frame against the integer TARGET column, and writes only
    the encoded categorical columns (renamed with a ``TE_`` prefix) to
    ``./data/target_tr.csv`` and ``./data/target_te.csv``.
    """
    from category_encoders.target_encoder import TargetEncoder
    tr = pd.read_csv('./data/tr.csv')
    te = pd.read_csv('./data/te.csv')
    y = tr['TARGET'].astype(int)
    tr.drop(['TARGET'], axis=1, inplace=True)
    # Fix: the object-dtype columns are now passed explicitly via `cols=`.
    # The previous version collected them but never handed them to the
    # encoder, silently relying on dtype auto-detection.
    cate_col = [col for col in tr.columns if tr[col].dtype == 'object']
    encode_model = TargetEncoder(verbose=1, min_samples_leaf=100,
                                 cols=cate_col)
    encode_model.fit(tr, y)
    # Keep only the encoded categorical columns.
    tr = encode_model.transform(tr)[cate_col]
    te = encode_model.transform(te)[cate_col]
    tr.columns = ['TE_' + col for col in cate_col]
    te.columns = ['TE_' + col for col in cate_col]
    print(tr.info())
    print(te.info())
    tr.to_csv("./data/target_tr.csv", index=False)
    te.to_csv("./data/target_te.csv", index=False)
def fit_target_encoder(train_imputed_categorical_df: pd.DataFrame,
                       train_transformed_target: pd.DataFrame):
    """Fit a TargetEncoder over every column of the categorical frame.

    Returns the fitted encoder, ready to transform train/test frames.
    """
    encoder = TargetEncoder(cols=train_imputed_categorical_df.columns.values)
    encoder.fit(X=train_imputed_categorical_df, y=train_transformed_target)
    return encoder
def label_encoding_fit(X, y, cols):
    """Fit one TargetEncoder per column in `cols` on (X, y) and persist
    each fitted encoder via `write_encoder` under the 'label' tag.
    """
    for column in cols:
        print("Encoding for column: {}".format(column))
        enc = TargetEncoder(cols=[column])
        enc.fit(X[column], y)
        write_encoder(enc, 'label', column)
def _encode():
    """Target-encode the application categorical columns and save them.

    Reads the preprocessed train/test feather files, target-encodes the
    listed categorical columns, and writes the encoded columns (suffixed
    ``_ENC``) plus ``SK_ID_CURR`` to ``./data/app.enc.feather``.
    """
    train = pd.read_feather('./data/application_train.preprocessed.feather')
    test = pd.read_feather('./data/application_test.preprocessed.feather')
    df = pd.concat([train, test], sort=False).reset_index(drop=True)
    cols = [
        'CODE_GENDER',
        'FLAG_OWN_CAR',
        'FLAG_OWN_REALTY',
        'NAME_TYPE_SUITE',
        'NAME_INCOME_TYPE',
        'NAME_EDUCATION_TYPE',  # Level of highest education the client achieved,  # noqa
        'NAME_FAMILY_STATUS',
        'NAME_HOUSING_TYPE',
        'FLAG_MOBIL',
        'FLAG_EMP_PHONE',
        'FLAG_WORK_PHONE',
        'FLAG_CONT_MOBILE',
        'FLAG_PHONE',
        'FLAG_EMAIL',
        'OCCUPATION_TYPE',
        'WEEKDAY_APPR_PROCESS_START',
        'HOUR_APPR_PROCESS_START',
        'REG_REGION_NOT_LIVE_REGION',
        'REG_REGION_NOT_WORK_REGION',
        'LIVE_REGION_NOT_WORK_REGION',
        'REG_CITY_NOT_LIVE_CITY',
        'REG_CITY_NOT_WORK_CITY',
        'LIVE_CITY_NOT_WORK_CITY',
        'ORGANIZATION_TYPE',
        'FONDKAPREMONT_MODE',
        'HOUSETYPE_MODE',
        'WALLSMATERIAL_MODE',
        'EMERGENCYSTATE_MODE',
        'NAME_CONTRACT_TYPE',  # Identification if loan is cash or revolving,
    ]
    encoder = TargetEncoder(cols=cols)
    # Fix: fit only on rows that actually have a target. The previous
    # version fit on the full concatenation, i.e. on test rows whose
    # TARGET is NaN (target leakage / NaN targets fed to the encoder).
    is_train = df['TARGET'].notnull()
    encoder.fit(df.loc[is_train, cols], df.loc[is_train, 'TARGET'])
    res = encoder.transform(df[cols])
    res.columns = ['{}_ENC'.format(c) for c in res.columns]
    res['SK_ID_CURR'] = df['SK_ID_CURR']
    res.to_feather('./data/app.enc.feather')
class target_enc(BaseEstimator, TransformerMixin):
    """Sklearn-compatible wrapper around category_encoders' TargetEncoder,
    restricted to a fixed list of columns.
    """

    def __init__(self, columns):
        # Columns to target-encode; everything else passes through.
        self.columns = columns

    def fit(self, df, y=None):
        """Fit the underlying TargetEncoder on (df, y); returns self."""
        # handle_unknown='value': unseen categories map to the target mean.
        enc = TargetEncoder(handle_unknown='value', cols=self.columns)
        self.encoder = enc.fit(df, y)
        return self

    def transform(self, df, y=None):
        """Return a target-encoded copy of df."""
        return self.encoder.transform(df.copy(), y)
def target_encoder(self):
    """Fit a TargetEncoder on the imputed categorical training frame
    against the transformed target, and return the fitted encoder.
    """
    encoder = TargetEncoder(cols=self.train_imputed_categorical_df.columns.values)
    encoder.fit(X=self.train_imputed_categorical_df,
                y=self.train_transformed_target)
    return encoder
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Sklearn-style transformer that converts a raw listings frame into a
    numeric feature matrix: target-encodes property type and cancellation
    policy, ordinal-encodes room type, one-hot-encodes the origin city,
    expands the amenities string into dummy columns, and imputes missing
    values.
    """

    def __init__(self):
        # Raw input columns retained before any encoding.
        self.cols_to_keep = [
            'city_origin', 'host_total_listings_count', 'host_since',
            'latitude', 'amenities', 'longitude', 'room_type', 'accommodates',
            'bathrooms', 'beds', 'guests_included', 'minimum_nights',
            'number_of_reviews', 'review_scores_rating', 'cancellation_policy',
            'reviews_per_month', 'instant_bookable', 'property_type'
        ]
        # Numeric columns with missing values (mean-imputed in transform()).
        self.num_na = [
            'host_total_listings_count', 'bathrooms', 'beds',
            'review_scores_rating', 'reviews_per_month'
        ]
        # Categorical columns with missing values (mode-imputed in transform()).
        self.cat_na = ['host_since', 'property_type']
        # Amenity dummy columns kept after splitting the amenities string.
        self.amenities_to_keep = [
            'Well-lit path to entrance',
            'translation missing: en.hosting_amenity_50',
            'Paid parking on premises', 'No stairs or steps to enter',
            'Private living room', 'Self check-in', 'Pets allowed',
            'Free street parking', 'Buzzer/wireless intercom',
            'Free parking on premises', 'Extra pillows and blankets',
            'Dishwasher', 'Patio or balcony', 'Cable TV',
            'Luggage dropoff allowed', 'Smoking allowed',
            'Paid parking off premises', 'Carbon monoxide detector',
            'Internet', 'Long term stays allowed', 'Dryer', 'Microwave',
            'Host greets you', 'Lock on bedroom door', 'First aid kit',
            'Coffee maker', 'Oven', 'Private entrance',
            'Family/kid friendly', 'Fire extinguisher', 'Stove',
            'Bed linens', 'Cooking basics', 'Elevator',
            'Dishes and silverware', 'Refrigerator', 'Air conditioning',
            'Smoke detector', 'Iron', 'Hot water',
            'Laptop friendly workspace', 'Shampoo', 'TV'
        ]
        # NOTE(review): attribute name is a typo ('inmputer') and it is never
        # used below — transform() creates fresh SimpleImputers instead.
        # Confirm and remove.
        self.inmputer = SimpleImputer()

    def fit(self, X_df, y=None):
        """Fit the category encoders on X_df against target y; returns self."""
        def regroup_cat(X, liste):
            # Collapse any category not in `liste` into 'other'.
            if X not in liste:
                return ('other')
            else:
                return (X)

        # Property types frequent enough to keep as distinct categories.
        self.prop_to_keep = [
            'Apartment', 'Serviced apartment', 'Condominium', 'Loft'
        ]
        self.prop_transformer = TargetEncoder()
        self.prop_transformer.fit(
            X_df['property_type'].apply(
                lambda x: regroup_cat(x, self.prop_to_keep)), y)
        # Cancellation policies frequent enough to keep as distinct categories.
        self.pol_to_keep = [
            'flexible', 'strict_14_with_grace_period', 'moderate',
            'moderate_new'
        ]
        self.pol_transformer = TargetEncoder()
        self.pol_transformer.fit(
            X_df['cancellation_policy'].apply(
                lambda x: regroup_cat(x, self.pol_to_keep)), y)
        self.room_transformer = OrdinalEncoder()
        self.room_transformer.fit(X_df['room_type'])
        # handle_unknown='ignore': unseen cities get all-zero one-hot rows.
        self.city_transformer = OneHotEncoder(handle_unknown='ignore')
        self.city_transformer.fit(pd.DataFrame(X_df['city_origin']))
        # numeric_transformer = Pipeline(steps = [('impute', SimpleImputer(strategy='median'))])
        return self

    def transform(self, X_df):
        """Return the encoded, imputed feature matrix built from X_df."""
        # NOTE(review): duplicate of the helper defined in fit() — could be
        # hoisted to a module-level function.
        def regroup_cat(X, liste):
            if X not in liste:
                return ('other')
            else:
                return (X)

        def replace_all(text, dic):
            # Apply every (old -> new) substitution in `dic` to `text`.
            for i, j in dic.items():
                text = text.replace(i, j)
            return text

        X_new = X_df[self.cols_to_keep].copy()
        #date: keep only the year the host joined
        X_new['host_since'] = pd.to_datetime(X_new['host_since'],
                                             format='%Y-%m-%d').dt.year
        #amenities: strip the '{', '"', '}' wrappers, then dummy-encode the
        # comma-separated amenity list and keep only the selected columns.
        amenities = X_new['amenities'].apply(
            lambda x: replace_all(x, {
                '{': '',
                '"': '',
                '}': ''
            })).str.get_dummies(sep=',')
        X_new = pd.merge(X_new,
                         amenities[self.amenities_to_keep],
                         left_index=True,
                         right_index=True)
        X_new.drop(['amenities'], axis=1, inplace=True)
        #fill missing
        # NOTE(review): these imputers are fit on whatever frame transform()
        # receives, so train and test calls use different statistics —
        # consider fitting them in fit() instead; confirm intent.
        X_new[self.num_na] = SimpleImputer().fit_transform(X_new[self.num_na])
        X_new[self.cat_na] = SimpleImputer(
            strategy='most_frequent').fit_transform(X_new[self.cat_na])
        #cat encoding
        ## concellation policy encoding (rare policies collapsed to 'other')
        X_new['cancellation_policy'] = self.pol_transformer.transform(
            X_new['cancellation_policy'].apply(
                lambda x: regroup_cat(x, self.pol_to_keep)))
        ## proprety type (rare types collapsed to 'other')
        X_new['property_type'] = self.prop_transformer.transform(
            X_new['property_type'].apply(
                lambda x: regroup_cat(x, self.prop_to_keep)))
        ##room type
        X_new['room_type'] = self.room_transformer.transform(
            X_new['room_type'])
        ###city_origin_encoding — reset_index so the dense one-hot frame
        # (positionally indexed) concatenates row-for-row with X_new.
        X_new = pd.concat(
            [
                X_new.reset_index(drop=True).drop(['city_origin'], axis=1),
                pd.DataFrame(
                    self.city_transformer.transform(
                        pd.DataFrame(X_new['city_origin'])).toarray())
            ],
            axis=1)
        #X_new.drop(['city_origin'], axis=1, inplace=True)
        #instant bookable: map the 't'/'f' flag to 1/0
        X_new['instant_bookable'] = X_new['instant_bookable'].replace({
            "t": 1,
            "f": 0
        })
        return X_new