def run(dataset_version, params):
    """Train an LGBM regressor on `dataset_version`.

    Tunes `n_estimators` via early stopping on the validation split, refits on
    train+val with the tuned params, and writes a submission CSV.

    Parameters
    ----------
    dataset_version : identifier passed to load_data().
    params : dict of LGBMRegressor params; mutated in place with the tuned
        'n_estimators'.
    """
    train, val, test = load_data(dataset_version)
    X = train.drop(columns='target_pct_vunerable')
    y = train.target_pct_vunerable
    # Will use this as local val score and compare with CV score
    X_val = val.drop(columns='target_pct_vunerable')
    y_val = val.target_pct_vunerable
    X_test = test.copy()

    # Create categorical encoder
    cat_cols = X.select_dtypes('object').columns.tolist()
    enc = TargetEncoder(cols=cat_cols)

    # Tune no. estimators on validation set
    X_train = enc.fit_transform(X, y)
    X_val = enc.transform(X_val)
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y,
              eval_set=[(X_val, y_val)],
              eval_metric='rmse',
              verbose=25,
              early_stopping_rounds=50)
    params.update({'n_estimators': model.best_iteration_})

    # Combine validation set back with train set and refit with tuned params.
    data = pd.concat([train, val], axis=0, sort=False)
    X = data.drop(columns='target_pct_vunerable')
    y = data.target_pct_vunerable
    X = enc.fit_transform(X, y)
    model = lgb.LGBMRegressor(**params)
    model.fit(X, y)

    # Make a submission file
    X_test = enc.transform(X_test)
    test_preds = model.predict(X_test)
    sub = pd.DataFrame({'ward': X_test.index, y.name: test_preds})
    now = datetime.now()
    # BUG FIX: original referenced undefined `data_version` (NameError);
    # the parameter is `dataset_version`.
    # NOTE(review): fname is built but the CSV below is written to a fixed
    # path — presumably the timestamped name was the intent; kept as-is.
    fname = f'lgbm_{dataset_version}_{now.year}-{now.month}-{now.day}--{now.hour}-{now.minute}.csv'
    sub.to_csv('../data/submissions/lgbm_best_reproduce.csv', index=False)
def prepare_df(df, columns, target):
    '''
    Prepares a pd.DataFrame by turning missing scikit-learn preprocessors
    into "None" strings and performs target encoding at the input columns.

    Parameters:
    -----------
    df: pd.DataFrame
        Contains a pd.DataFrame with the generated meta-data.
    columns: list
        Contains a list with the columns that contain scikit-learn
        estimators and scikit-learn preprocessors.
    target: str
        Contains a string that represents the name of the column that is
        the target of the dataset.

    Returns:
    --------
    pd.DataFrame
        Contains adjusted pd.DataFrame.
    '''
    # Work on an independent copy with a clean, de-duplicated index.
    df = deepcopy(df).reset_index(drop=True).drop_duplicates()
    y = df[target]

    # Replace missing preprocessors with a "None" placeholder string.
    for component in ('component_1', 'component_2', 'component_3'):
        df[component] = df[component].apply(nan_to_none)

    # For each estimator/preprocessor column add ordinal codes plus a
    # target-encoded variant.
    for column in columns:
        df[column] = df[column].astype('category')
        df[f'{column}_codes'] = df[column].cat.codes
        encoder = TargetEncoder(cols=[column])
        df[f'{column}_encoded'] = encoder.fit_transform(df[column], y)

    return df
def getTestTrainSlipt(self):
    """Return (X_train, X_test, y_train, y_test), target-encoding the
    configured columns first.

    Raises
    ------
    Exception
        If neither `testX` nor `testTrainSplit` was supplied.
    """
    ## If both testX and testTrainSplit are not passed throw exception.
    if ((self.testX is None) and (self.testTrainSplit is None)):
        raise Exception("Please pass testX or testTrainSplit")
    if (self.targetEncodeCols):
        for col in self.targetEncodeCols:
            encoder = TargetEncoder()
            # BUG FIX: TargetEncoder.fit_transform requires the target;
            # the original called it without y.
            self.X[col] = encoder.fit_transform(self.X[col], self.Y)
            # BUG FIX: `if (self.testX)` raises ValueError for a DataFrame
            # (ambiguous truth value), and re-fitting a fresh encoder on the
            # test set is inconsistent — apply the train-fitted encoder.
            if self.testX is not None:
                self.testX[col] = encoder.transform(self.testX[col])
    if (self.testTrainSplit):
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.Y, test_size=self.testTrainSplit, random_state=7)
        return X_train, X_test, y_train, y_test
    else:
        # No split ratio given: return the externally supplied test set.
        return self.X, self.testX, self.Y, self.testY
def getTestTrainSlipt(self):
    """Return (X_train, X_test, y_train, y_test), applying optional target
    encoding, stratified splitting, SMOTE over-sampling or random
    under-sampling.  (Method name kept as-is for caller compatibility.)

    Raises
    ------
    Exception
        If neither `testX` nor `testTrainSplit` was supplied.
    """
    ## If both testX and testTrainSplit are not passed throw exception.
    if ((self.testX is None) and (self.testTrainSplit is None)):
        raise Exception("Please pass testX or testTrainSplit")

    ## If targetEncodeCols is given first target encode them.
    if (self.targetEncodeCols):
        for col in self.targetEncodeCols:
            encoder = TargetEncoder()
            self.X[col] = encoder.fit_transform(self.X[col], self.Y)
            # BUG FIX: `if(self.testX and self.testY)` raises ValueError for
            # DataFrames (ambiguous truth value), and re-fitting the encoder
            # on the test labels leaks the target — apply the encoder fitted
            # on the training data instead.
            if self.testX is not None:
                self.testX[col] = encoder.transform(self.testX[col])

    # External test set supplied and no split requested: return as-is.
    if ((self.testX is not None) and (self.testTrainSplit is None)):
        return self.X, self.testX, self.Y, self.testY

    ## If stratify, smote and testTrainSplits are not passed, then just return.
    if (not self.stratify and not self.applySmote and not self.testTrainSplit):
        return self.X, self.testX, self.Y, self.testY

    # If stratify flag is passed then stratify it using Y variable.
    startifyVar = self.Y if self.stratify else None
    X_train, X_test, y_train, y_test = train_test_split(
        self.X, self.Y, stratify=startifyVar,
        test_size=self.testTrainSplit, random_state=7)

    if (not self.applySmote and not self.underSample):
        return X_train, X_test, y_train, y_test
    else:
        # Fall back to the externally supplied split when no ratio was given.
        X_train = X_train if self.testTrainSplit is not None else self.X
        y_train = y_train if self.testTrainSplit is not None else self.Y
        X_test = X_test if self.testTrainSplit is not None else self.testX
        y_test = y_test if self.testTrainSplit is not None else self.testY
        if (self.applySmote):
            # Resample only the training portion; test stays untouched.
            sm = SMOTE(sampling_strategy=self.sampling)
            # NOTE: fit_sample is the legacy imblearn API (fit_resample in >=0.4).
            X_train_res, y_train_res = sm.fit_sample(X_train, y_train)
            return X_train_res, X_test, y_train_res, y_test
        if (self.underSample):
            underSampler = RandomUnderSampler(sampling_strategy=self.sampling)
            X_train_res, y_train_res = underSampler.fit_sample(X_train, y_train)
            return X_train_res, X_test, y_train_res, y_test
def transform(self, X):
    """Target-encode `self.cols` against `self.target_col` and optionally
    ordinal-encode the same columns afterwards.

    NOTE(review): both encoders are *fit* inside transform(), so every call
    re-fits on the incoming data (train/test leakage risk if called on test
    data), and `self.cols` is mutated when aliases are set — confirm this is
    intentional.
    """
    # Duplicate the source columns under their alias names, then encode the
    # aliases instead of the originals from here on.
    if self.aliases:
        X[self.aliases] = X[self.cols]
        self.cols = self.aliases
    t_enc = TargetEncoder(cols=self.cols)
    X = t_enc.fit_transform(X, X[self.target_col])
    if not self.ordinal_transform:
        return X
    # Second pass: replace the target-encoded columns with ordinal codes.
    o_enc = OrdinalEncoder()
    X[self.cols] = o_enc.fit_transform(X[self.cols])
    return X
class EntityEmbeddingTree(BaseEstimator, TransformerMixin):
    """Transformer that target-encodes categoricals, fits a tuned random
    forest, and emits each sample's per-tree leaf indices (as strings) as the
    transformed features.
    """

    def __init__(self, *, numeric_columns, categorical_columns):
        # Column lists supplied by the caller; stored name-mangled.
        self.__numeric_columns = numeric_columns
        self.__categorical_columns = categorical_columns
        # NOTE(review): __one_hot_encoder is initialised but never used in
        # this class.
        self.__target_encoder, self.__one_hot_encoder = [
            None for _ in range(2)
        ]
        self.__max_target, self.__max_param = [None for _ in range(2)]
        self.__clf = None

    def fit(self, X, y):
        """Fit the target encoder and a hyper-tuned RandomForestClassifier."""
        X = X.copy(deep=True)
        y = y.copy(deep=True)
        self.__target_encoder = TargetEncoder()
        # Numeric NaNs -> sentinel; categorical NaNs -> "missing" string.
        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = X[self.__categorical_columns].fillna(
            "missing").astype(str)
        X[self.__categorical_columns] = self.__target_encoder.fit_transform(
            X[self.__categorical_columns], y)
        # optimize_rf (external helper) returns the best score and params.
        self.__max_target, self.__max_param = optimize_rf(X, y)
        # Clamp min_samples_leaf into [0, 1.0].
        # NOTE(review): a value of exactly 0 is invalid for sklearn —
        # presumably the optimizer yields a fraction in (0, 0.5]; confirm.
        self.__clf = RandomForestClassifier(
            min_samples_leaf=max(
                min(self.__max_param["min_samples_leaf"], 1.0), 0),
            n_estimators=max(int(round(self.__max_param["n_estimators"])), 1))
        self.__clf.fit(X, y)
        gc.collect()
        return self

    def transform(self, X):
        """Apply the fitted encoder and return per-tree leaf indices as str."""
        X = X.copy(deep=True)
        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = X[self.__categorical_columns].fillna(
            "missing").astype(str)
        X[self.__categorical_columns] = self.__target_encoder.transform(
            X[self.__categorical_columns])
        gc.collect()
        # clf.apply() yields one leaf index per tree per sample.
        return pd.DataFrame(self.__clf.apply(X)).astype(str)

    def fit_transform(self, X, y=None, **fit_params):
        # fit() requires y despite the y=None default (sklearn signature).
        self.fit(X=X, y=y)
        return self.transform(X)
from category_encoders import TargetEncoder # In[189]: new_df # In[197]: encoder = TargetEncoder() encoder.fit_transform(new_df['Sex'],new_df['Survived']) # In[198]: encoder = TargetEncoder() encoder.fit_transform(new_df['Embarked'],new_df['Survived']) # In[ ]:
# Feature hashing for word features.
def hash_features(word_list, m):
    """Hash each word into one of `m` buckets and count occurrences."""
    output = [0] * m
    for word in word_list:
        index = hash_fcn(word) % m
        output[index] += 1
    return output


# Signed feature hashing.
# NOTE(review): this redefinition shadows the unsigned version above — rename
# one of them if both are needed in the same module.
def hash_features(word_list, m):
    """Hash each word into one of `m` buckets, adding +/-1 according to a
    sign hash (reduces the expected bias of collisions)."""
    output = [0] * m
    for word in word_list:
        index = hash_fcn(word) % m
        sign_bit = sign_hash(word) % 2
        if sign_bit == 0:
            output[index] -= 1
        else:
            output[index] += 1
    return output


h = FeatureHasher(n_features=m, input_type="string")
# BUG FIX: original called h.trasnform (typo) — AttributeError at runtime.
f = h.transform(df["feat"])

# category_encoders usage examples: target, leave-one-out and WOE encoding.
enc = TargetEncoder(cols=['Name_of_col', 'Another_name'])
training_set = enc.fit_transform(X_train, y_train)

enc = LeaveOneOutEncoder(cols=['Name_of_col', 'Another_name'])
training_set = enc.fit_transform(X_train, y_train)

enc = WOEEncoder(cols=['Name_of_col', 'Another_name'])
training_set = enc.fit_transform(X_train, y_train)
Exited
0    7963
1    2037
Name: Geography, dtype: int64
'''
###############################################################################
#                          3. Data Preprocessing                              #
###############################################################################

# Encoding Categorical Variables
l = LabelEncoder()
df['Gender'] = l.fit_transform(df['Gender'])

# Target-encode Geography into a new 'country' column, then drop the raw one.
encoder = TargetEncoder()
df['country'] = encoder.fit_transform(df['Geography'], df['Exited'])
df.drop(['Geography'], inplace = True, axis = 1)

# Spliting into dependent and independent vectors
x = df.drop(['Exited'], axis = 1)
y = df.Exited
# y = y.values.reshape(-1,1)

# Standard Scaling
S = StandardScaler()
x = S.fit_transform(x)

###############################################################################
#            4. Splitting the dataset into training set and test set          #
    # (continuation of a train_test_split(...) call truncated above this fragment)
    X, y, stratify=y, random_state=42
)

# Class distribution of Crop_Damage per Crop_Type as a fraction of all rows.
pd.Series(train_mod[['Crop_Type','Crop_Damage']].groupby(['Crop_Type']).count()/88858)

clf.fit(X_train, y_train)

submission = sample.copy()
submission['Crop_Damage'] = bc.predict(test_x)
submission.to_csv('bc1.csv',index = False)

from category_encoders import TargetEncoder
# Target-encode Crop_Type; NOTE(review): `t` is not used further in this fragment.
encoder = TargetEncoder()
t = encoder.fit_transform(train_mod['Crop_Type'], train_mod['Crop_Damage'])

from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
# NOTE(review): XGBClassifier(750) passes 750 positionally — the sklearn API
# expects keyword arguments; confirm the intended parameter (n_estimators?).
bc = BaggingClassifier(base_estimator =XGBClassifier(750),
                       n_estimators = 15,
                       verbose= 20,
                       bootstrap = False,
                       max_features = 1 )
bc.fit(X, y)
}  # (closes a params dict truncated above this fragment)

with timer('training'):
    cv_results = []
    # Out-of-fold predictions collected into a copy of the target series.
    val_series = y_train.copy()
    test_df = pd.DataFrame()
    feat_df = pd.DataFrame(index=X_train.columns)
    for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        X_trn = X_train.iloc[trn_idx]
        y_trn = y_train[trn_idx]
        X_val = X_train.iloc[val_idx]
        y_val = y_train[val_idx]
        print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)

        with timer('target encoding'):
            # Encoder is fit on the training fold only, then applied to val/test.
            te = TargetEncoder()
            X_trn = te.fit_transform(X_trn, y_trn)
            X_val = te.transform(X_val)
            X_test_ = te.transform(X_test)
            # BUG(review): fillna is not in-place and the results are discarded,
            # so these three lines are no-ops — should be e.g.
            # X_trn = X_trn.fillna(-9999).
            X_trn.fillna(-9999)
            X_val.fillna(-9999)
            X_test_.fillna(-9999)

        with timer('fit'):
            model = lgb.LGBMClassifier(**lgb_params)
            model.fit(X_trn, y_trn,
                      eval_set=[(X_trn, y_trn), (X_val, y_val)],
                      **fit_params)
            # Positive-class probabilities for the validation fold.
            p = model.predict_proba(X_val)[:, 1]
            val_series.iloc[val_idx] = p
            cv_results.append(roc_auc_score(y_val, p))
            # One column of test predictions per fold (averaged later).
            test_df[i] = model.predict_proba(X_test_)[:, 1]
            feat_df[i] = model.feature_importances_
# Bare expression: displays train_X in the notebook output cell.
train_X

# Columns with object dtype are treated as categorical.
catCols = [
    cname for cname in train_X.columns if train_X[cname].dtype == 'object'
]
catCols

train_X_cat = train_X[catCols].copy()
val_X_cat = val_X[catCols].copy()

from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder

simple_imputer = SimpleImputer(strategy='most_frequent')
# Fit the target encoder on train only, then apply it to the validation set.
target_encoder = TargetEncoder()
train_X_targetenc = target_encoder.fit_transform(train_X_cat, train_y)
val_X_targetenc = target_encoder.transform(val_X_cat)
# NOTE(review): train_X_labelenc is displayed here but not defined in this
# fragment — presumably created in an earlier cell.
train_X_labelenc


def score_dataset(X_train, X_valid, y_train, y_valid):
    # RMSE of a plain linear regression fit on the encoded features.
    model = LinearRegression()
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return np.sqrt(metrics.mean_squared_error(y_valid, preds))


simple_imputer = SimpleImputer()
numCols = [
    cname for cname in train_X.columns if train_X[cname].dtype != "object"
]
"""Encoding categorical variables""" !pip install --upgrade category_encoders #encoding categorical data Gender from sklearn.preprocessing import LabelEncoder l = LabelEncoder() train.loc[:,'Gender'] = l.fit_transform(train.loc[:,'Gender']) # train.loc[:, '12th Completion year'] = l.fit_transform(train.loc[:, '12th Completion year']) # train.loc[:, '10th Completion Year'] = l.fit_transform(train.loc[:, '10th Completion Year']) train.loc[:,'Performance']=l.fit_transform(train.loc[:,'Performance']) from category_encoders import TargetEncoder encoder = TargetEncoder() train['Specialization in study'] = encoder.fit_transform(train['Specialization in study'], train['Performance']) # train['10Y'] = encoder.fit_transform(train['10th Completion Year'], train['Performance']) # train['12Y'] = encoder.fit_transform(train['12th Completion year'], train['Performance']) encoder = TargetEncoder() train['Year of Completion of college'] = encoder.fit_transform(train['Year of Completion of college'], train['Performance']) encoder = TargetEncoder() train['12th Completion year'] = encoder.fit_transform(train['12th Completion year'], train['Performance']) encoder = TargetEncoder() train['10th Completion Year'] = encoder.fit_transform(train['10th Completion Year'], train['Performance']) train.head(5) train.describe()
pd_data = pd_data.drop(columns=['XINGBIE', 'HYZK'])
# Target-encode the three high-cardinality categorical columns.
scaler_3 = TargetEncoder(cols=['ZHIYE', 'DWJJLX', 'DWSSHY'])
# print(pd_data.head())
# # print(pd_data.isnull().sum())
# # exit(0)
# Split back into train/test by membership in the known training ids.
train_data = pd_data[pd_data['id'].isin(train_data_ids)]
test_data = pd_data[~pd_data['id'].isin(train_data_ids)]
# exit(0)
# Fit on train only; apply the fitted encoder to test (no target leakage).
train_data = scaler_3.fit_transform(train_data, train_data['label'])
test_data = scaler_3.transform(test_data)
# print(train_data.columns)
features = [col for col in train_data.columns if col not in del_columns]
# exit(0)
x_train = np.array(train_data[features])
y_train = np.array(train_data['label'])
x_test = np.array(test_data[features])
folds = StratifiedKFold(n_splits=6, shuffle=True, random_state=111)
kfolds = folds.split(x_train, y_train)
# Out-of-fold and test prediction buffers (2 columns — presumably binary
# class probabilities; confirm downstream usage).
oof_lgb = np.zeros((len(train_data), 2))
predictions_lgb = np.zeros((len(test_data), 2))
              # (tail of the `numericke` list literal truncated above this fragment)
              'temp_min3', 'brzina_vjetra3', 'tlak_zraka3', 'oblaci_pokrice3',
              'oborine_mogucnost3', 'temp_prosjek7', 'temp_max7', 'temp_min7',
              'brzina_vjetra7', 'tlak_zraka7', 'oblaci_pokrice7',
              'oborine_mogucnost7', 'index_vrucine1', 'index_vrucine3',
              'index_vrucine7',]
kategoricke = ['nacin_rezervacije', 'status_rezervacije', 'vrsta_sobe',
               'kanal', 'prognoza3', 'prognoza1', 'prognoza7', 'sif_usluge',
               'tip_ro', 'tip_garancije', 'lead_time_dani']
print(len(numericke))
print(len(kategoricke))

# SIMPLE IMPUTER for nulls and missing values
# (numeric columns -> mean, categorical columns -> most frequent)
num_pipeline = Pipeline([('impute', SimpleImputer(strategy='mean'))])
kat_pipeline = Pipeline([('impute', SimpleImputer(strategy='most_frequent'))])
final_pipeline = ColumnTransformer([('continuous', num_pipeline, numericke),
                                    ('cat', kat_pipeline, kategoricke)],
                                   remainder='passthrough')
X_imputed = final_pipeline.fit_transform(X,y)
print(type(X_imputed))

# TARGET ENCODING of the categorical variables
# NOTE(review): ColumnTransformer returns an array without column names, so
# the TargetEncoder here receives positional columns — confirm it encodes the
# intended ones.
te = TargetEncoder()
X_kodirano = te.fit_transform(X_imputed, y)
skalar = MinMaxScaler()
X_fit = skalar.fit_transform(X_kodirano,y)

# Recursive feature elimination with cross-validation over an XGB classifier.
rfc = XGBClassifier()
rfecv = RFECV(estimator=rfc, step=1, cv=StratifiedKFold(5), scoring='f1',
              min_features_to_select=1, verbose=1)
rfecv.fit(X_fit, y)
print('Optimalan broj značajki je: {}'.format(rfecv.n_features_))
print(np.where(rfecv.support_ == False)[0])
# Drop the rejected features from the original frame in place.
X.drop(X.columns[np.where(rfecv.support_ == False)[0]], axis=1, inplace=True)

plt.figure(figsize=(16, 9))
plt.title('Recursive Feature Elimination sa unakrsnom validacijom',
          fontsize=18, fontweight='bold', pad=20)
class Encoder():
    """Wrapper around four category_encoders encoders (ordinal, one-hot,
    count, target), routing each feature to one encoder via `method_mapper`.

    Ordinal/one-hot encoders are fit on train+val concatenated; count/target
    encoders are fit on train only (fitting a target encoder on validation
    data would leak the target).
    """

    encode_methods = {
        'OrdinalEncoder': OrdinalEncoder,
        'OneHotEncoder': OneHotEncoder,
        'CountEncoder': CountEncoder,
        'TargetEncoder': TargetEncoder,
    }

    # spark_encode_methods = {
    #     'mean_encoder':,
    #     'target_encoder':,
    #     'label_encoder':,
    #     'onehot_encoder'
    # }

    def __init__(self,
                 sparksess=None,
                 logdir='/encoder',
                 handle_unknown='-99999',
                 save_encoder=False):
        self.spark = sparksess
        self.logdir = logdir
        # BUG FIX: the original had a bare `self.save_encoder` statement — it
        # discarded the constructor flag (and an assignment would have
        # shadowed the save_encoder() method).  Store the flag privately.
        self._save_encoder = save_encoder

        # These list objects are shared with the encoders below, so features
        # appended in fit() are visible to the encoders' `cols`.
        self.ordinal_encoder_features = []
        self.onehot_encoder_features = []
        self.count_encoder_features = []
        self.target_encoder_features = []

        self.ordinal_encoder = OrdinalEncoder(
            cols=self.ordinal_encoder_features,
            return_df=True,
            handle_unknown=handle_unknown)
        self.onehot_encoder = OneHotEncoder(cols=self.onehot_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)
        self.count_encoder = CountEncoder(cols=self.count_encoder_features,
                                          return_df=True,
                                          handle_unknown=handle_unknown)
        self.target_encoder = TargetEncoder(cols=self.target_encoder_features,
                                            return_df=True,
                                            handle_unknown=handle_unknown)

    def fit(self, x_train, x_val=None, y_train=None, y_val=None,
            method_mapper=None):
        """
        Parameters
        ----------
        x_train: pd.DataFrame
        x_val: pd.DataFrame
        y_train: pd.DataFrame
        y_val: pd.DataFrame
        method_mapper: dict
            a mapping of feature to EncodeMethod
            example mapping:
            {
                'feature1': OrdinalEncoder,
                'feature2': OneHotEncoder,
                'feature3': CountEncoder,
                'feature4': TargetEncoder,
            }
        """
        # Route each feature to its encoder's column list.
        for feat in method_mapper:
            if method_mapper[feat] == 'OrdinalEncoder':
                self.ordinal_encoder_features.append(feat)
            elif method_mapper[feat] == 'OneHotEncoder':
                self.onehot_encoder_features.append(feat)
            elif method_mapper[feat] == 'CountEncoder':
                self.count_encoder_features.append(feat)
            elif method_mapper[feat] == 'TargetEncoder':
                self.target_encoder_features.append(feat)
            else:
                raise ValueError(
                    '编码方式只支持[OrdinalEncoder, OneHotEncoder, CountEncoder, TargetEncoder], 接收到%s'
                    % feat)

        if self.spark is None:
            # Ordinal/one-hot: safe to fit on train+val concatenated.
            if len(self.ordinal_encoder_features) != 0 or len(
                    self.onehot_encoder_features) != 0:
                x_whole = x_train.append(x_val)
                y_whole = None
                if not y_train is None and not y_val is None:
                    y_whole = y_train.append(y_val)
                x_whole = self.ordinal_encoder.fit_transform(x_whole, y_whole)
                x_whole = self.onehot_encoder.fit_transform(x_whole, y_whole)
                x_train = x_whole[:len(x_train)]
                x_val = x_whole[len(x_train):]
            # Count/target: fit on train only, transform val.
            x_train = self.count_encoder.fit_transform(x_train, y_train)
            x_val = self.count_encoder.transform(x_val, y_val)
            x_train = self.target_encoder.fit_transform(x_train, y_train)
            x_val = self.target_encoder.transform(x_val, y_val)

        # BUG FIX: the original tested/called `self.save_encoder` (the bound
        # method — always truthy and called with no effect intended); use the
        # stored flag and invoke the real method.
        if self._save_encoder:
            self.save_encoder()

        return x_train, y_train, x_val, y_val

    def transform(self, x, y=None):
        """Apply all four fitted encoders in sequence."""
        x = self.ordinal_encoder.transform(x, y)
        x = self.onehot_encoder.transform(x, y)
        x = self.count_encoder.transform(x, y)
        x = self.target_encoder.transform(x, y)
        return x, y

    def fit_transform(self, x_train, x_val=None, y_train=None, y_val=None,
                      method_mapper=None):
        """
        Parameters
        ----------
        x_train: pd.DataFrame
        x_val: pd.DataFrame
        y_train: pd.DataFrame
        y_val: pd.DataFrame
        method_mapper: dict
            a mapping of feature to EncodeMethod
            example mapping:
            {
                'feature1': OrdinalEncoder,
                'feature2': OneHotEncoder,
                'feature3': CountEncoder,
                'feature4': TargetEncoder,
            }
        """
        self.fit(x_train, x_val, y_train, y_val, method_mapper)
        x_train, y_train = self.transform(x_train, y_train)
        if x_val is not None:
            x_val, y_val = self.transform(x_val, y_val)
        return x_train, y_train, x_val, y_val

    def save_encoder(self):
        """Persist the fitted encoders and feature lists to a timestamped dir."""
        now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
        os.makedirs(os.path.join(self.logdir, now))
        with open(os.path.join(self.logdir, now, 'OrdinalEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.ordinal_encoder, f)
        with open(os.path.join(self.logdir, now, 'OneHotEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.onehot_encoder, f)
        with open(os.path.join(self.logdir, now, 'CountEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.count_encoder, f)
        with open(os.path.join(self.logdir, now, 'TargetEncoder.pkl'),
                  'wb') as f:
            pickle.dump(self.target_encoder, f)
        with open(
                os.path.join(self.logdir, now, 'OrdinalEncoderFeatures.json'),
                'w') as f:
            json.dump(self.ordinal_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'OneHotEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.onehot_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'CountEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.count_encoder_features, f)
        with open(os.path.join(self.logdir, now, 'TargetEncoderFeatures.json'),
                  'w') as f:
            json.dump(self.target_encoder_features, f)

    def load_encoder(self, logdir=None):
        """Restore encoders and feature lists previously written by
        save_encoder().

        BUG FIX: the original opened every file in 'wb' mode and *dumped* the
        current (unfitted) encoders, clobbering the saved state instead of
        loading it, and it ignored the `logdir` argument.

        SECURITY NOTE: pickle.load executes arbitrary code — only load
        encoder files from trusted locations.
        """
        logdir = self.logdir if logdir is None else logdir
        with open(os.path.join(logdir, 'OrdinalEncoder.pkl'), 'rb') as f:
            self.ordinal_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'OneHotEncoder.pkl'), 'rb') as f:
            self.onehot_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'CountEncoder.pkl'), 'rb') as f:
            self.count_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'TargetEncoder.pkl'), 'rb') as f:
            self.target_encoder = pickle.load(f)
        with open(os.path.join(logdir, 'OrdinalEncoderFeatures.json'),
                  'r') as f:
            self.ordinal_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'OneHotEncoderFeatures.json'),
                  'r') as f:
            self.onehot_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'CountEncoderFeatures.json'),
                  'r') as f:
            self.count_encoder_features = json.load(f)
        with open(os.path.join(logdir, 'TargetEncoderFeatures.json'),
                  'r') as f:
            self.target_encoder_features = json.load(f)
FrequencyEncoding(X_train, X_test, categorical_wo_version + ['SmartScreen'])
utils.reduce_mem_usage(X_train)
utils.reduce_mem_usage(X_test)

# Target-encode only SmartScreen against HasDetections.
te = TargetEncoder(cols=['SmartScreen'],
                   drop_invariant=False,
                   handle_unknown='impute',
                   impute_missing=True,
                   min_samples_leaf=100,
                   return_df=True,
                   smoothing=1.0,
                   verbose=1)
X_train = te.fit_transform(X_train, y_train)
X_test = te.transform(X_test)

# Target-encode against the frequency of AVProductStatesIdentifier
# (used as a pseudo target, blended between train and test with weight alpha).
pseudoTarget = 'AVProductStatesIdentifier_freq'
alpha = 0.5
min_samples_leaf = 100
smooth_coeff = 1.0
impute = True
for col in tqdm(categorical_wo_version):
    global_mean = (1 - alpha) * X_train[pseudoTarget].astype(
        float).mean() + alpha * X_test[pseudoTarget].astype(float).mean()
    # (expression truncated at the end of this fragment)
    summary = (1 - alpha) * X_train[[col, pseudoTarget]].groupby(
        [col])[pseudoTarget].agg(['mean', 'count']) + alpha * X_test[[
# 匿名特征组合 for i in range(15): for j in range(i + 1, 15): df[f'v_{i}_add_v_{j}'] = df[f'v_{i}'] + df[f'v_{j}'] df[f'v_{i}_minus_v_{j}'] = df[f'v_{i}'] - df[f'v_{j}'] df[f'v_{i}_multiply_v_{j}'] = df[f'v_{i}'] * df[f'v_{j}'] df[f'v_{i}_div_v_{j}'] = df[f'v_{i}'] / df[f'v_{j}'] df_train = df[df['is_train'] == 1] df_test = df[df['is_train'] == 0] y_train = df_train['price'] enc = TargetEncoder(cols=['regionCode', 'city', 'model', 'brand']) # 高基离散特征编码 df_train = enc.fit_transform(df_train, y_train) df_test = enc.transform(df_test) # 删除无效编码 delete_features = [ 'SaleID', 'name', 'regDate', 'offerType', 'seller', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage', 'creatDate', 'is_train' ] for feature in delete_features: del df_train[feature] del df_test[feature] del df_test['price'] for column in df_train.columns: print(column)
def PreprocessData(df):
    """Clean the FIFA players frame.

    Drops NaN rows, target-encodes the categorical columns against 'Overall',
    and converts money/weight/height/position strings to numbers.

    Parameters
    ----------
    df : pd.DataFrame of raw player data.

    Returns
    -------
    pd.DataFrame — preprocessed copy; `df` itself is left unmodified.
    """
    # Remove all NaN rows
    df_preprocessed = df.dropna()

    # Target-encode each categorical column against 'Overall' and drop the
    # raw column.  (The original comments said "one hot encoding", but
    # TargetEncoder performs target-mean encoding.)
    categorical_cols = [
        'Nationality', 'Club', 'Preferred Foot', 'Work Rate', 'Body Type',
        'Position'
    ]
    for col in categorical_cols:
        encoder = TargetEncoder()
        df_preprocessed['{} Encoded'.format(col)] = encoder.fit_transform(
            df_preprocessed[col], df_preprocessed['Overall'])
        df_preprocessed.pop(col)

    # Value: "€110.5M" / "€565K" / "€0" -> float (K/M magnitudes kept as-is,
    # matching the original behaviour).
    def _parse_value(item):
        amount = item.split("€")[1]
        if item[-1] == 'K':
            return float(amount.split("K")[0])
        elif item[-1] == 'M':
            return float(amount.split("M")[0])
        return float(amount)

    df_preprocessed['Value'] = [
        _parse_value(item) for item in df_preprocessed.Value
    ]

    # Wage: always expressed in thousands, e.g. "€405K".
    df_preprocessed['Wage'] = [
        float(item.split("€")[1].split("K")[0])
        for item in df_preprocessed.Wage
    ]

    # Weight: "159lbs" -> 159.0
    df_preprocessed['Weight'] = [
        float(item.split("lbs")[0]) for item in df_preprocessed.Weight
    ]

    # Height: feet'inches (e.g. "5'7") -> centimetres.
    def _height_cm(item):
        feet, inch = float(item.split("'")[0]), float(item.split("'")[1])
        return feet * 30.48 + inch * 2.54

    df_preprocessed['Height'] = [
        _height_cm(item) for item in df_preprocessed.Height
    ]

    # Position ratings like "85+3" -> 88.0 (base plus bonus).
    positions = [
        'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM',
        'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB',
        'LB', 'LCB', 'CB', 'RCB', 'RB'
    ]
    for position in positions:
        df_preprocessed[position] = [
            float(item.split("+")[0]) + float(item.split("+")[1])
            for item in df_preprocessed[position]
        ]

    return df_preprocessed
def target_encoding(x_train_cat, x_val_cat, y_train):
    """Target-encode and standardize categorical features.

    Fits a TargetEncoder and StandardScaler on the training data, writes one
    lookup table (raw category -> encoding) per column into the ImmoDB
    database, and applies the fitted transforms to the validation data.

    Parameters
    ----------
    x_train_cat, x_val_cat : pd.DataFrame of categorical columns.
    y_train : target series aligned with x_train_cat.

    Returns
    -------
    (x_train_target, x_val_target) — scaled, target-encoded frames.
    """
    target_encoder = TargetEncoder()
    scaler = StandardScaler()

    x_train_reference = x_train_cat
    x_train_target = target_encoder.fit_transform(x_train_cat, y_train)
    x_train_target = pd.DataFrame(scaler.fit_transform(x_train_target),
                                  columns=x_train_target.columns,
                                  index=x_train_target.index)
    # Side-by-side frame of raw values and their encodings.
    x_train_reference = x_train_reference.join(
        x_train_target.add_suffix("_targetenc"))

    # One reference table per column.  The table for 'sozioökonomische_Lage'
    # keeps the original (misspelled) name 'Encoding_sozioökonmische_Lage'
    # so existing readers of the database keep working.
    special_table_names = {
        'sozioökonomische_Lage': 'Encoding_sozioökonmische_Lage'
    }
    reference_columns = [
        'energietyp', 'energie_effizienzklasse', 'heizung', 'immobilienart',
        'immobilienzustand', 'Grad_der_Verstädterung', 'sozioökonomische_Lage'
    ]
    for col in reference_columns:
        reference = x_train_reference[[col, col + '_targetenc']]
        reference = reference.drop_duplicates(subset=[col])
        reference.to_sql(name=special_table_names.get(col, 'Encoding_' + col),
                         con=main.setup_database(r"Datenbank/ImmoDB.db"),
                         if_exists='replace')

    x_val_target = target_encoder.transform(x_val_cat)
    # BUG FIX: the original called scaler.fit_transform on the validation
    # data, re-fitting the scaler on validation statistics (leakage and an
    # inconsistent scale vs. train) — apply the train-fitted scaler instead.
    x_val_target = pd.DataFrame(scaler.transform(x_val_target),
                                columns=x_val_target.columns,
                                index=x_val_target.index)

    return x_train_target, x_val_target
# (fragment: interior of a CV fold loop truncated above)
X_val = X_train.iloc[val_idx].copy()
y_val = y_train[val_idx]
X_tst = X_test.copy()
print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)

# with timer('weight of evidence'):
#     cat_cols = X_trn.select_dtypes(['object']).columns.tolist()
#     woe = WeightOfEvidence(cols=cat_cols, suffix='woe')
#     X_trn = pd.concat([X_trn, woe.fit_transform(X_trn.loc[:, cat_cols], y_trn)], axis=1)
#     X_val = pd.concat([X_val, woe.transform(X_val.loc[:, cat_cols])], axis=1)
#     X_tst = pd.concat([X_tst, woe.transform(X_tst.loc[:, cat_cols])], axis=1)

with timer('target encoding'):
    # Fit the encoder on the training fold only; apply to val and test.
    cat_cols = X_trn.select_dtypes(['object']).columns.tolist()
    te = TargetEncoder(cols=cat_cols)
    X_trn.loc[:, cat_cols] = te.fit_transform(X_trn.loc[:, cat_cols], y_trn)
    X_val.loc[:, cat_cols] = te.transform(X_val.loc[:, cat_cols])
    # NOTE(review): transforms X_test rather than X_tst — equivalent here
    # because X_tst is a fresh copy of X_test, but inconsistent with the
    # lines above.
    X_tst.loc[:, cat_cols] = te.transform(X_test.loc[:, cat_cols])

# with timer('calc sample weight'):
#     X_trn['is_test'] = 0
#     X_tst['is_test'] = 1
#     df = pd.concat([X_trn, X_tst])
#     X = df.drop('is_test', axis=1)
#     y = df.is_test.ravel()
#     model = lgb.LGBMClassifier(**calc_weight_params)
#     model.fit(X, y)
#     proba = np.sqrt(rankdata(model.predict_proba(X)[:len(X_trn), 1])/len(X_trn))
#     X_trn.drop('is_test', axis=1)
#     X_tst.drop('is_test', axis=1)
def run(name, feats, params, fit_params, fill=-9999):
    """End-to-end LightGBM CV experiment.

    Loads the feature set, drops id/NaN-heavy columns, imputes missing
    values, cross-validates with per-fold target encoding, then writes CV
    predictions, a submission file and feature importances under OUTPUT.

    Parameters
    ----------
    name : str — experiment name, used for logging and output paths.
    feats : feature-set identifiers passed to load_dataset().
    params : dict — LGBMClassifier params; 'colsample_bytree' may be 'auto'.
    fit_params : dict — extra kwargs forwarded to LGBMClassifier.fit().
    fill : scalar or 'mean' — NaN replacement strategy.
    """
    # NOTE(review): `ch` is created but never attached (dead code; only
    # `handler` is added), and repeated calls stack handlers on the logger.
    logger = getLogger(name)
    logger.setLevel(DEBUG)
    ch = StreamHandler()
    ch.setLevel(DEBUG)
    handler = StreamHandler()
    handler.setLevel(DEBUG)
    logger.setLevel(DEBUG)
    logger.addHandler(handler)

    train = pd.read_feather(str(TRAIN))

    with timer('load datasets'):
        X_train, y_train, X_test, cv = load_dataset(feats)
        print('train:', X_train.shape)
        print('test :', X_test.shape)

    with timer('clean datasets'):
        # drop id cols
        id_cols = X_train.filter(regex='(SK_ID_CURR|SK_ID_PREV)').columns
        print('drop id:', id_cols.tolist())
        X_train.drop(id_cols, axis=1, inplace=True)
        X_test.drop(id_cols, axis=1, inplace=True)
        # drop columns which contains many NaN (>95% in either split)
        ref_train = X_train.isnull().mean() > 0.95
        ref_test = X_test.isnull().mean() > 0.95
        nan_cols = X_train.columns[ref_train | ref_test]
        print('drop many nan:', nan_cols.tolist())
        X_train.drop(nan_cols, axis=1, inplace=True)
        X_test.drop(nan_cols, axis=1, inplace=True)
        print('train:', X_train.shape)
        print('test :', X_test.shape)

    with timer('impute missing'):
        if fill == 'mean':
            assert X_train.mean().isnull().sum() == 0
            print('fill nan with mean')
            # Test set is imputed with *train* means (no test-set leakage).
            X_train.fillna(X_train.mean(), inplace=True)
            X_test.fillna(X_train.mean(), inplace=True)
        else:
            print(f'fill nan with {fill}')
            X_train.fillna(fill, inplace=True)
            X_test.fillna(fill, inplace=True)
        assert X_train.isnull().sum().sum() == 0
        assert X_test.isnull().sum().sum() == 0

    if 'colsample_bytree' in params and params['colsample_bytree'] == 'auto':
        # NOTE(review): shape[1] is the number of *features* despite the
        # variable name; 'auto' sets colsample to sqrt(n_feats)/n_feats.
        n_samples = X_train.shape[1]
        params['colsample_bytree'] = np.sqrt(n_samples) / n_samples
        print(f'set colsample_bytree = {params["colsample_bytree"]}')

    with timer('training'):
        cv_results = []
        cv_df = pd.DataFrame(index=range(len(y_train)),
                             columns=range(cv.get_n_splits()))
        test_df = pd.DataFrame()
        feat_df = None
        for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
            X_trn = X_train.iloc[trn_idx].copy()
            y_trn = y_train[trn_idx]
            X_val = X_train.iloc[val_idx].copy()
            y_val = y_train[val_idx]
            X_tst = X_test.copy()
            print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)

            with timer('target encoding'):
                # Encoder is fit on the training fold only.
                cat_cols = X_trn.select_dtypes(['object']).columns.tolist()
                te = TargetEncoder(cols=cat_cols)
                X_trn.loc[:, cat_cols] = te.fit_transform(
                    X_trn.loc[:, cat_cols], y_trn)
                X_val.loc[:, cat_cols] = te.transform(X_val.loc[:, cat_cols])
                X_tst.loc[:, cat_cols] = te.transform(X_test.loc[:, cat_cols])

            with timer('fit'):
                model = lgb.LGBMClassifier(**params)
                model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)],
                          **fit_params)
                p = model.predict_proba(X_val)[:, 1]
                cv_df.loc[val_idx, i] = p
                cv_results.append(roc_auc_score(y_val, p))
                test_df[i] = model.predict_proba(X_tst)[:, 1]
                if feat_df is None:
                    feat_df = pd.DataFrame(index=X_trn.columns)
                feat_df[i] = model.feature_importances_

    valid_score = np.mean(cv_results)
    message = f"""cv: {valid_score:.5f}
scores: {[round(c, 4) for c in cv_results]}
feats: {feats}
model_params: {params}
fit_params: {fit_params}"""
    send_line_notification(message)

    with timer('output results'):
        RESULT_DIR = OUTPUT / (timestamp() + '_' + name)
        RESULT_DIR.mkdir()
        # output cv prediction
        tmp = pd.DataFrame({
            'SK_ID_CURR': train['SK_ID_CURR'],
            'TARGET': cv_df.mean(axis=1)
        })
        tmp.to_csv(RESULT_DIR / f'{name}_cv.csv', index=None)
        # output test prediction (mean over folds)
        pred = test_df.mean(axis=1).ravel()
        generate_submit(pred, f'{name}_{valid_score:.5f}', RESULT_DIR,
                        compression=False)
        # output feature importances, normalized per fold then averaged
        feat_df = (feat_df / feat_df.mean(axis=0)) * 100
        feat_df.mean(axis=1).sort_values(ascending=False).to_csv(RESULT_DIR /
                                                                 'feats.csv')
        imp = feat_df.mean(axis=1).sort_values(ascending=False)[:50]
        imp[::-1].plot.barh(figsize=(20, 15))
        plt.savefig(str(RESULT_DIR / 'feature_importances.pdf'),
                    bbox_inches='tight')

    print('=' * 60)
    print(message)
    print('=' * 60)