def target_encoding(train, target, test=None, feat_to_encode=None, smooth=0.2, random_state=9527):
    """Out-of-fold target-encode `train` (and optionally `test`).

    Each of 5 stratified folds is encoded by an encoder fitted on the other
    folds, so the training encodings are leakage-free; `test` is encoded by
    an encoder re-fitted on the full training data.

    Args:
        train: training DataFrame containing the target column. NOTE: it is
            mutated in place (sorted by index, target column popped).
        target: name of the target column inside `train`.
        test: optional DataFrame to encode with the full-data encoder.
        feat_to_encode: columns to encode; defaults to all remaining columns.
        smooth: smoothing strength passed to TargetEncoder.
        random_state: seed for the StratifiedKFold shuffle.

    Returns:
        (encoded_train, encoded_test_or_None, feature_names, target_series)
    """
    print('Target encoding...')
    train.sort_index(inplace=True)   # stable row order so oof rows realign below
    target = train.pop(target)       # NOTE: removes the target column from the caller's frame
    if feat_to_encode is None:
        feat_to_encode = train.columns.tolist()
    smoothing = smooth
    oof_parts = []
    splitter = StratifiedKFold(n_splits=5, random_state=random_state, shuffle=True)
    for tr_idx, oof_idx in splitter.split(train, target):
        ce_target_encoder = TargetEncoder(cols=feat_to_encode, smoothing=smoothing)
        ce_target_encoder.fit(train.iloc[tr_idx, :], target.iloc[tr_idx])
        oof_parts.append(ce_target_encoder.transform(train.iloc[oof_idx, :]))
    # FIX: DataFrame.append was deprecated and removed in pandas 2.0 — build the
    # out-of-fold frame with pd.concat instead (same row order, index preserved).
    oof = pd.concat(oof_parts, ignore_index=False)
    # Re-fit on the whole training set for encoding unseen (test) data.
    ce_target_encoder = TargetEncoder(cols=feat_to_encode, smoothing=smoothing)
    ce_target_encoder.fit(train, target)
    train = oof.sort_index()
    if test is not None:
        test = ce_target_encoder.transform(test)
    features = list(train)
    print('Target encoding done!')
    return train, test, features, target
def run(dataset_version, params):
    """Train an LGBM regressor with target-encoded categoricals and write a submission.

    Workflow: tune n_estimators via early stopping on the validation split,
    then refit on train+val with the tuned value and predict the test set.

    Args:
        dataset_version: identifier passed through to `load_data`.
        params: LightGBM keyword parameters; mutated in place with the tuned
            'n_estimators'.

    Side effects: writes '../data/submissions/lgbm_best_reproduce.csv'.
    """
    train, val, test = load_data(dataset_version)
    X = train.drop(columns='target_pct_vunerable')
    y = train.target_pct_vunerable
    # Will use this as local val score and compare with CV score
    X_val = val.drop(columns='target_pct_vunerable')
    y_val = val.target_pct_vunerable
    X_test = test.copy()
    # Create categorical encoder over the object-dtype columns.
    cat_cols = X.select_dtypes('object').columns.tolist()
    enc = TargetEncoder(cols=cat_cols)
    # Tune no. estimators on validation set.
    X_train = enc.fit_transform(X, y)
    X_val = enc.transform(X_val)
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y,
              eval_set=[(X_val, y_val)],
              eval_metric='rmse',
              verbose=25,
              early_stopping_rounds=50)
    params.update({'n_estimators': model.best_iteration_})
    # Combine validation set back with train set and refit with tuned params.
    data = pd.concat([train, val], axis=0, sort=False)
    X = data.drop(columns='target_pct_vunerable')
    y = data.target_pct_vunerable
    X = enc.fit_transform(X, y)
    model = lgb.LGBMRegressor(**params)
    model.fit(X, y)
    # Make a submission file.
    X_test = enc.transform(X_test)
    test_preds = model.predict(X_test)
    sub = pd.DataFrame({'ward': X_test.index, y.name: test_preds})
    now = datetime.now()
    # FIX: original f-string referenced the undefined name `data_version`
    # (NameError at runtime); the function parameter is `dataset_version`.
    # `fname` is kept as a timestamped reference name even though the file is
    # written to the fixed path below, matching the original behavior.
    fname = f'lgbm_{dataset_version}_{now.year}-{now.month}-{now.day}--{now.hour}-{now.minute}.csv'
    sub.to_csv('../data/submissions/lgbm_best_reproduce.csv', index=False)
def TargetEncode(data, target):
    """Target-encode all object-dtype columns of `data` in place.

    Fits one TargetEncoder over every categorical column against the
    `target` column, pickles the fitted encoder to 'targetencodemodel.sav'
    for later reuse, then replaces the categorical columns with their
    encodings.

    Args:
        data: DataFrame to encode. NOTE: mutated in place.
        target: name of the target column inside `data`.

    Returns:
        The (mutated) DataFrame with categoricals replaced by encodings.
    """
    # Select all categorical columns.
    data_to_encode = data.select_dtypes(include=['object'])
    print('Data to be encoded: ')
    cols = list(data_to_encode.columns)
    print(len(cols))
    print('\n'.join(cols))
    print('\n')
    print('\n')
    # Fit a single encoder over all categorical columns at once.
    model = TargetEncoder().fit(X=data[cols], y=data[target])
    # File where the target encoding model is saved.
    filename = "targetencodemodel.sav"
    # FIX: use a context manager so the file handle is closed even if
    # pickle.dump raises (original used explicit open/close).
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
    print("Model saved in ", filename)
    print("\n")
    print("\n")
    data[cols] = model.transform(X=data[cols])
    # Return encoded data.
    return data
class DFMeanEncoder(BaseEstimator, TransformerMixin):
    """DataFrame-friendly wrapper around category_encoders.TargetEncoder.

    Encodes only the requested columns (default: all of them) and re-attaches
    the untouched columns, so the output stays a full DataFrame.
    """

    def __init__(self, columns=None, **kwargs):
        # `columns` restricts which columns get mean-encoded; extra kwargs
        # are forwarded verbatim to the underlying TargetEncoder.
        self.columns = columns
        self.model = TargetEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y):
        """Fit the underlying encoder on the selected columns of X against y."""
        if self.columns is None:
            self.columns = X.columns
        # Keep X's column order while filtering down to the requested subset.
        self.transform_cols = [col for col in X.columns if col in self.columns]
        self.model.fit(X[self.transform_cols], y)
        return self

    def transform(self, X):
        """Return X with the fitted columns replaced by their encodings."""
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )
        untouched = X.drop(columns=self.transform_cols)
        encoded = self.model.transform(X[self.transform_cols])
        return pd.concat([untouched, encoded], axis=1)

    def fit_transform(self, X, y):
        """Convenience: fit on (X, y) then transform X."""
        return self.fit(X, y).transform(X)
def data_prepare(self):
    """Load before/after train features plus test features, split off the
    TARGET label, drop SK_ID columns, and target-encode the categoricals
    of the training features."""
    before_path = os.path.join(self.__input_path, "train_feature_before_df.csv")
    after_path = os.path.join(self.__input_path, "train_feature_after_df.csv")
    self.__train_feature_before = pd.read_csv(before_path)
    self.__train_feature_after = pd.read_csv(after_path)
    # Stack the two training slices vertically into one frame.
    self.__train = pd.concat(
        [self.__train_feature_before, self.__train_feature_after])
    self.__test = pd.read_csv(
        os.path.join(self.__input_path, "test_feature_df.csv"))
    self.__train_label = self.__train["TARGET"].copy()
    # Drop the label and any SK_ID* identifier columns from the features.
    id_like = [
        col for col in self.__train.columns.tolist()
        if re.search(r"SK_ID", col)
    ]
    self.__train_feature = (self.__train.drop(["TARGET"] + id_like,
                                              axis=1)).copy()
    # Align the test features to the exact training feature columns.
    self.__test_feature = self.__test[
        self.__train_feature.columns.tolist()].copy()
    self.__categorical_columns = self.__train_feature.select_dtypes(
        include="object").columns.tolist()
    # Mean-encode the categorical columns against the training label.
    te = TargetEncoder()
    te.fit(self.__train_feature[self.__categorical_columns],
           self.__train_label)
    self.__train_feature[self.__categorical_columns] = te.transform(
        self.__train_feature[self.__categorical_columns])
class CategoricalPreprocessing(BaseEstimator, TransformerMixin):
    """Sklearn-style transformer for the categorical columns of a loan dataset.

    - ordinal-encodes `emp_length` via a fixed lookup table
    - converts `issue_d` to months since 2000-01-01
    - converts `earliest_cr_line` to months between it and `issue_d`
    - mode-imputes home_ownership / purpose / addr_state / initial_list_status
    - target-encodes `zip_code` (unknown/missing categories become NaN)
    """

    def __init__(self):
        # Most-frequent imputation for the four low-cardinality categoricals.
        self.mode_imputer = SimpleImputer(strategy="most_frequent")
        self.cat_cols = [
            'home_ownership', 'purpose', 'addr_state', 'initial_list_status'
        ]
        # return_nan: unseen or missing zip codes map to NaN rather than the
        # global mean, so downstream imputation can handle them explicitly.
        self.target_encoder = TargetEncoder(handle_missing='return_nan',
                                            handle_unknown='return_nan')

    def fit(self, X, y=None):
        """Fit the imputer on the categorical columns and the target encoder
        on zip_code against y. Returns self."""
        self.mode_imputer.fit(X[self.cat_cols])
        self.target_encoder.fit(X["zip_code"], y)
        return self

    def transform(self, X, y=None):
        """Return a transformed copy of X; X itself is not modified."""
        Xc = X.copy()
        # encode emp_length as an ordinal 0..10
        lookup = {
            '< 1 year': 0,
            '1 year': 1,
            '2 years': 2,
            '3 years': 3,
            '4 years': 4,
            '5 years': 5,
            '6 years': 6,
            '7 years': 7,
            '8 years': 8,
            '9 years': 9,
            '10+ years': 10
        }
        Xc["emp_length"] = Xc["emp_length"].replace(lookup)
        # issue date -> months elapsed since 2000-01-01
        Xc["issue_d"] = pd.to_datetime(Xc["issue_d"])
        tmp = Xc[
            "issue_d"].values  # keep a copy of the raw date for when we transform earliest credit line
        Xc["issue_d"] = (
            Xc["issue_d"] -
            datetime.datetime(2000, 1, 1)).astype('timedelta64[M]')
        # earliest credit line -> credit-history length in months at issue time
        # NOTE(review): .astype('timedelta64[M]') truncation is deprecated in
        # recent pandas/numpy — confirm the pinned pandas version supports it.
        Xc["earliest_cr_line"] = pd.to_datetime(Xc["earliest_cr_line"])
        Xc["earliest_cr_line"] = (
            tmp - Xc["earliest_cr_line"]).astype('timedelta64[M]')
        # imputation for home_ownership, purpose, addr_state, and initial_list_status
        Xc[self.cat_cols] = self.mode_imputer.transform(Xc[self.cat_cols])
        # encode zip code with the fitted target encoder
        Xc["zip_code"] = self.target_encoder.transform(Xc["zip_code"])
        return Xc

    def fit_transform(self, X, y=None):
        """Fit on (X, y) then transform X."""
        return self.fit(X, y).transform(X)
def target_encoding(X_train, y_train, X_test, cols, cv_id):
    """Out-of-fold target encoding driven by a predefined fold assignment.

    For each fold of `cv_id`, fits a TargetEncoder on the training rows and
    encodes the held-out rows; the test set is encoded by every fold's
    encoder and averaged. Only the encoded `cols` are returned, suffixed
    with '_target'.

    Args:
        X_train: training features. NOTE: its index is cast to int in place.
        y_train: training target; indexed positionally with `trn_idx`, so it
            is assumed to be positionally aligned with X_train — TODO confirm
            callers pass an array or a 0..n-1 indexed Series.
        X_test: test features to encode.
        cols: categorical columns to encode.
        cv_id: fold identifier array for sklearn's PredefinedSplit.

    Returns:
        (train_encoded, test_encoded) restricted to the encoded columns.
    """
    cols = list(cols)
    train_new = X_train.copy()
    test_new = X_test.copy()
    # Zero out the accumulator; encoded folds are summed into it below.
    test_new[:] = 0
    cv = PredefinedSplit(cv_id)
    X_train.index = X_train.index.astype(int)  # NOTE: mutates the caller's frame
    for trn_idx, val_idx in tqdm(cv.split(X_train), total=cv.get_n_splits()):
        enc = TargetEncoder(cols=cols)
        enc.fit(X_train.iloc[trn_idx], y_train[trn_idx])
        # Held-out rows get an encoding fitted without them (no leakage).
        train_new.iloc[val_idx] = enc.transform(X_train.iloc[val_idx])
        test_new += enc.transform(X_test)
    # Average the per-fold test encodings.
    test_new /= cv.get_n_splits()
    train_new = train_new[cols]
    test_new = test_new[cols]
    train_new.columns = train_new.columns + '_target'
    test_new.columns = test_new.columns + '_target'
    print(list(train_new.columns))
    return train_new, test_new
def fit_model(X_train, y_train, X_val, y_val, **params):
    """Fit a regressor chosen by the module-level `args.model` flag.

    'catboost' trains directly on the raw categoricals; 'xgboost' first
    target-encodes the categorical columns. Early stopping uses the
    (X_val, y_val) pair in both cases.

    Returns:
        (model, encoder) — encoder is None for CatBoost, the fitted
        TargetEncoder for XGBoost.

    Raises:
        ValueError: if `args.model` is neither 'catboost' nor 'xgboost'.
    """
    if args.model == "catboost":
        # The two branches only differed in task_type — fold them together.
        device = "GPU" if args.gpu else "CPU"
        model = CatBoostRegressor(**params,
                                  loss_function="RMSE",
                                  random_state=42,
                                  use_best_model=True,
                                  task_type=device)
        model.fit(X_train, y_train,
                  cat_features=cat_cols,
                  early_stopping_rounds=config.EARLY_STOPPING_ROUNDS,
                  eval_set=(X_val, y_val),
                  plot=False)
        return model, None

    if args.model == "xgboost":
        # XGBoost needs numeric input: mean-encode the categoricals first.
        te = TargetEncoder(cols=cat_cols, smoothing=300)
        te.fit(X_train, y_train)
        X_train = te.transform(X_train)
        X_val = te.transform(X_val)
        if args.gpu:
            model = XGBRegressor(**params,
                                 random_state=42,
                                 verbosity=1,
                                 tree_method='gpu_hist',
                                 gpu_id=0,
                                 predictor="cpu_predictor")
        else:
            model = XGBRegressor(**params, random_state=42, verbosity=1)
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  eval_metric="rmse",
                  early_stopping_rounds=config.EARLY_STOPPING_ROUNDS,
                  verbose=True)
        return model, te

    raise ValueError("Invalid value passed to model. Has to be either CatBoost or XGBoost.")
def categorical_encoding(df_X, y, cat_vars, id_train, method=None):
    """Optionally target-encode the categorical columns of df_X.

    When `method` is None the frame is returned untouched; otherwise a
    TargetEncoder is fitted on the `id_train` rows only and applied to the
    whole frame.

    Returns:
        (values_array, column_index) of the (possibly encoded) frame.
    """
    # No encoding requested — pass the data through unchanged.
    if method is None:
        return df_X.values, df_X.columns
    encoder = TargetEncoder(cols=cat_vars,
                            drop_invariant=False,
                            return_df=True,
                            impute_missing=False,
                            handle_unknown='error')
    # Fit only on the training rows to avoid leaking validation targets.
    train_target = pd.Series(y).iloc[id_train]
    encoder.fit(df_X.iloc[id_train], train_target)
    df_X = encoder.transform(df_X)
    return df_X.values, df_X.columns
def _create_feature(cls, conf) -> pd.DataFrame:
    """Build out-of-fold target-encoded features for the application table.

    Each fold's encoder (fitted on the fold's training rows) encodes both
    the fold's validation rows and the whole test set; rows are then
    averaged per SK_ID_CURR, so test rows end up with the mean over folds.

    Args:
        conf: configuration object providing `model.kfold_params`.

    Returns:
        DataFrame indexed by SK_ID_CURR with *_target_encode columns.
    """
    df = Application.get_df(conf)
    # fit with train data and transform both data
    categorical_columns = [
        col for col in df.columns if df[col].dtype == 'object'
    ]
    train_df = df[df['TARGET'].notnull()].copy()
    test_df = df[df['TARGET'].isnull()].copy()
    parts = []
    folds = StratifiedKFold(**conf.model.kfold_params)
    for n_fold, (train_idx, valid_idx) in tqdm(
            enumerate(
                folds.split(train_df[categorical_columns],
                            train_df['TARGET'])),
            total=conf.model.kfold_params.n_splits):
        encoder = TargetEncoder(cols=categorical_columns).fit(
            train_df.iloc[train_idx][categorical_columns + ['SK_ID_CURR']],
            train_df.iloc[train_idx]['TARGET'])
        rename_map = {
            col: f"{col}_target_encode"
            for col in categorical_columns
        }
        valid_te = encoder.transform(
            train_df.iloc[valid_idx][categorical_columns +
                                     ['SK_ID_CURR']]).rename(columns=rename_map)
        test_te = encoder.transform(
            test_df[categorical_columns +
                    ['SK_ID_CURR']]).rename(columns=rename_map)
        parts.append(valid_te)
        parts.append(test_te)
    # FIX: DataFrame.append was removed in pandas 2.0 — collect the per-fold
    # frames in a list and concatenate once (also avoids quadratic copying).
    feature = pd.concat(parts, sort=True) if parts else pd.DataFrame()
    # take mean of oof target mean for test data
    feature = feature.groupby('SK_ID_CURR').mean()
    return feature
class EntityEmbeddingTree(BaseEstimator, TransformerMixin):
    """Transforms rows into the leaf indices of a tuned random forest.

    Categorical columns are target-encoded; numerics get a -9999.0 sentinel
    for missing values. `transform` returns each tree's leaf id per row as
    string-typed columns (entity-embedding-style discrete features).
    """

    def __init__(self, *, numeric_columns, categorical_columns):
        self.__numeric_columns = numeric_columns
        self.__categorical_columns = categorical_columns
        self.__target_encoder, self.__one_hot_encoder = [
            None for _ in range(2)
        ]
        self.__max_target, self.__max_param = [None for _ in range(2)]
        self.__clf = None

    def fit(self, X, y):
        """Encode X, tune a RandomForestClassifier via optimize_rf, fit it."""
        X = X.copy(deep=True)
        y = y.copy(deep=True)
        self.__target_encoder = TargetEncoder()
        # Sentinel-fill numerics; stringify + fill categoricals before encoding.
        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = (
            X[self.__categorical_columns].fillna("missing").astype(str))
        X[self.__categorical_columns] = self.__target_encoder.fit_transform(
            X[self.__categorical_columns], y)
        self.__max_target, self.__max_param = optimize_rf(X, y)
        # Clamp the tuned hyper-parameters into valid ranges.
        leaf_size = max(min(self.__max_param["min_samples_leaf"], 1.0), 0)
        tree_count = max(int(round(self.__max_param["n_estimators"])), 1)
        self.__clf = RandomForestClassifier(min_samples_leaf=leaf_size,
                                            n_estimators=tree_count)
        self.__clf.fit(X, y)
        gc.collect()
        return self

    def transform(self, X):
        """Apply the same preprocessing and return per-tree leaf indices."""
        X = X.copy(deep=True)
        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = (
            X[self.__categorical_columns].fillna("missing").astype(str))
        X[self.__categorical_columns] = self.__target_encoder.transform(
            X[self.__categorical_columns])
        gc.collect()
        return pd.DataFrame(self.__clf.apply(X)).astype(str)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on (X, y) then transform X."""
        self.fit(X=X, y=y)
        return self.transform(X)
def frontend_preproc(df, y):
    '''
    Function that produces the preprocessing of the DataFrame before applying
    the model on the front-end.
    :df: concat of df_input by the user and X features of the model
    :y: target
    '''
    ### Feature Engineering
    ohe_cols = ['gearbox', 'fuel_type', 'warranty', 'dealer', 'doors']
    # One-hot encode the low-cardinality categoricals.
    ohe = OneHotEncoder(categories='auto')
    feature_arr = ohe.fit_transform(df[ohe_cols]).toarray()
    feature_labels = ohe.categories_
    # Build the expanded column names (one per original column/category pair).
    feature_cols = [
        col + '_' + str(category)
        for col, categories in zip(ohe_cols, feature_labels)
        for category in categories
    ]
    ohe_features = pd.DataFrame(feature_arr, columns=feature_cols)
    df = pd.concat([df, ohe_features], axis=1)
    df = df.drop(ohe_cols, axis=1)
    # Target Encoding for the remaining non-numeric columns; the encoder is
    # fitted on all rows but the first (the user-supplied input row).
    cat_cols = df.select_dtypes(exclude=["number"]).columns
    cols_encoded = [c + '_encoded' for c in cat_cols]
    t_encoder = TargetEncoder()
    t_encoder.fit(df[1:][cat_cols], y)
    df[cols_encoded] = t_encoder.transform(df[cat_cols])
    df = df.drop(cat_cols, axis=1)
    # Column Transformation: map every feature to a normal distribution.
    qt = QuantileTransformer(n_quantiles=500,
                             output_distribution='normal',
                             random_state=33)
    transformed = qt.fit_transform(df)
    return pd.DataFrame(transformed, columns=df.columns)
def clean_train_data_target_encoded(data):
    """Split `data` into features/label, preprocess the features, and
    target-encode the listed categorical columns (uses a target encoder
    instead of one-hot/label encoding).

    Returns:
        (encoded_frame_with_label, fitted_encoder)
    """
    data = data.reset_index(drop=True)
    # Last column is the label; everything before it is a feature.
    labels = data.iloc[:, -1].reset_index(drop=True)
    features = process_features(data.iloc[:, :-1])
    encoder = TargetEncoder(cols=[
        "Hair Color", "Wears Glasses", "University Degree", "Gender",
        "Country", "Profession", "Housing Situation",
        "Satisfation with employer"
    ], smoothing=300)
    encoder.fit(features, labels)
    # Re-attach the label column next to the encoded features.
    encoded = encoder.transform(features, labels).reset_index(drop=True)
    data2 = pd.concat([encoded, labels.reset_index(drop=True)], axis=1)
    return (data2, encoder)
class ScatterPlot(object):
    """Reads train.csv, target-encodes the features, projects them to 2-D
    with PCA, and saves a label-colored scatter plot to PCA.png."""

    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path
        self.__train = None
        self.__train_feature, self.__train_label = [None for _ in range(2)]
        self.__encoder = None
        self.__pca, self.__t_sne = [None for _ in range(2)]

    def data_read(self):
        """Load train.csv, drop the id column, split features from target."""
        self.__train = pd.read_csv(os.path.join(self.__input_path, "train.csv"))
        self.__train = self.__train.drop(["id"], axis=1)
        self.__train_feature = self.__train.drop(["target"], axis=1).copy(deep=True)
        self.__train_label = self.__train["target"].copy(deep=True)
        # Treat every feature as categorical for the target encoder.
        self.__train_feature = self.__train_feature.astype(str)

    def data_prepare(self):
        """Target-encode all features, then reduce to two PCA components."""
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature, self.__train_label)
        self.__train_feature = self.__encoder.transform(self.__train_feature)
        self.__pca = PCA(n_components=2, random_state=7)
        self.__train_feature = self.__pca.fit_transform(self.__train_feature)
        self.__train_feature = pd.DataFrame(self.__train_feature,
                                            columns=["col_1", "col_2"])
        # t-SNE alternative (kept for reference):
        # self.__t_sne = TSNE(verbose=True, random_state=7)
        # self.__train_feature = self.__t_sne.fit_transform(self.__train_feature)
        # self.__train_feature = pd.DataFrame(self.__train_feature, columns=["col_1", "col_2"])

    def scatter_plot(self):
        """Render the 2-D projection colored by the label and save it."""
        _, ax = plt.subplots(figsize=(16, 9))
        ax = sns.scatterplot(x="col_1",
                             y="col_2",
                             hue=self.__train_label,
                             data=self.__train_feature,
                             ax=ax)
        ax.get_figure().savefig(os.path.join(self.__output_path, "PCA.png"))
def ProcessRawData(df, schemaCols=None):
    """Prepare the raw income dataset for training or prediction.

    Adds log features, imputes/scales numerics, target-encodes the
    categorical columns (fitting the encoders only when `schemaCols` is
    None, i.e. on the training pass), aligns prediction data to the
    training schema, and applies SelectKBest feature selection.

    Args:
        df: raw DataFrame (mutated in place for several columns).
        schemaCols: training-schema columns. None => training pass (fit the
            global encoders/selector); not None => prediction pass (reuse
            the globals fitted during training).

    Returns:
        (instances, X, Y, columns) — row ids, feature matrix, target vector,
        and the column index after alignment.
    """
    medianSimpleImputer = SimpleImputer(strategy='median')
    standardScaler = preprocessing.StandardScaler()
    # Adding extra features AgeLog and HeightLog
    df['AgeLog'] = np.log(df['Age'].values)
    df['HeightLog'] = np.log(df['Body Height [cm]'].values)
    # Fill missing values
    df[['Year of Record', 'Age', 'AgeLog', 'HeightLog']] = \
        medianSimpleImputer.fit_transform(
            df[['Year of Record', 'Age', 'AgeLog', 'HeightLog']].values)
    # Scale numeric columns
    df[['Year of Record', 'Size of City', 'Body Height [cm]', 'Age', 'AgeLog']] = \
        standardScaler.fit_transform(
            df[['Year of Record', 'Size of City', 'Body Height [cm]', 'Age', 'AgeLog']].values)
    # Scale the target too (training data only); keep the scaler globally so
    # predictions can be inverse-transformed later.
    if 'Income in EUR' in df.columns:
        global YScaler
        YScaler = preprocessing.StandardScaler()
        df[['Income in EUR']] = YScaler.fit_transform(df[['Income in EUR']].values)
    # Reducing complexity of features
    df.Profession = list(df.Profession.map(S2))
    # To be used while writing results to CSV
    instances = df['Instance'].values
    df = df.drop(['Instance'], axis=1)
    print('Columns available 1 - ', df.columns)
    # Target encoding the data. Fit only on the training pass; the prediction
    # pass reuses the globally fitted encoders and only transforms.
    if schemaCols is None:
        global t1, t2, t3, t4, t5
        t1 = TargetEncoder()
        t2 = TargetEncoder()
        t3 = TargetEncoder()
        t4 = TargetEncoder()
        t5 = TargetEncoder()
        t1.fit(df.Country.values, df['Income in EUR'].values)
        t2.fit(df.Profession.values, df['Income in EUR'].values)
        t3.fit(df.Gender.values, df['Income in EUR'].values)
        t4.fit(df['University Degree'].values, df['Income in EUR'].values)
        t5.fit(df['Hair Color'].values, df['Income in EUR'].values)
    df.Country = t1.transform(df.Country.values)
    df.Profession = t2.transform(df.Profession.values)
    df.Gender = t3.transform(df.Gender.values)
    df['University Degree'] = t4.transform(df['University Degree'].values)
    df['Hair Color'] = t5.transform(df['Hair Color'].values)
    # Prediction pass: align the frame to the training schema, filling any
    # column the prediction data lacks with zeros.
    if schemaCols is not None:
        newdf = pd.DataFrame()
        for columnName in schemaCols:
            if columnName not in df.columns:
                newdf[columnName] = 0
            else:
                newdf[columnName] = df[columnName].values
        df = newdf
    df = df.sort_index(axis=1)  # standardize datasets prediction and training to use the same code from there on
    if 'Income in EUR' not in df.columns:
        df['Income in EUR'] = np.zeros(df.values.shape[0])
    if 'Income' in df.columns:
        # FIX: the original `df.drop('Income')` dropped a ROW labeled 'Income'
        # (KeyError on a RangeIndex) and discarded the result. Drop the
        # column and keep the returned frame.
        df = df.drop('Income', axis=1)
    X = df.drop('Income in EUR', axis=1).values
    Y = df['Income in EUR'].values
    print('Shape - ', df.shape)
    global featSel
    if featSel is None:
        print('k = ? ')
        featSel = SelectKBest(f_regression, k=10)
        featSel.fit(X, Y)
    X = featSel.transform(X)
    print('Shape after feature selection - ', X.shape)
    return instances, X, Y, df.columns
class LightGbmOneFold(object):
    """Single-fit LightGBM pipeline: load pre-selected features, target-encode
    the categoricals, train once on all training data, and write feature
    importances plus a submission file.

    NOTE: predictions are scored on the training data itself (no hold-out),
    so the printed AUC is an optimistic training score.
    """

    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path
        # data prepare
        self.__sample_submission = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_feature, self.__test_feature = [None for _ in range(2)]
        self.__train_label = None
        self.__categorical_columns = None
        self.__encoder = None
        # model fit
        self.__folds = None
        self.__train_preds = None
        self.__test_preds = None
        self.__gbm = None

    def data_prepare(self):
        """Read the selected-feature CSVs, split off TARGET, drop SK_ID
        columns, and target-encode categorical columns of train and test."""
        self.__sample_submission = pd.read_csv(os.path.join(self.__input_path, "sample_submission.csv"))
        # selected feature
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_select_feature_df.csv"))
        self.__train_label = self.__train["TARGET"]
        # Drop the label plus any SK_ID* identifier columns.
        self.__train_feature = self.__train.drop(
            ["TARGET"] + [col for col in self.__train.columns.tolist() if re.search(r"SK_ID", col)], axis=1)
        # Align test columns to the training feature columns.
        self.__test_feature = self.__test[self.__train_feature.columns.tolist()]
        self.__categorical_columns = self.__train_feature.select_dtypes("object").columns.tolist()
        # Fit the encoder on training data only, then apply to both frames.
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature.loc[:, self.__categorical_columns], self.__train_label)
        self.__train_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(self.__train_feature.loc[:, self.__categorical_columns])
        )
        self.__test_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(self.__test_feature.loc[:, self.__categorical_columns])
        )

    def model_fit(self):
        """Train one LGBMClassifier on the full training data, predict
        probabilities for train and test, and save feature importances."""
        feature_importance_df = pd.DataFrame()
        self.__gbm = LGBMClassifier(
            n_estimators=5000,
            learning_rate=0.0128,
            max_depth=8,
            num_leaves=11,
            min_split_gain=0.0018,
            min_child_weight=2.6880,
            colsample_bytree=0.5672,
            subsample=0.6406,
            reg_alpha=3.5025,
            reg_lambda=0.9549,
            n_jobs=-1
        )
        self.__gbm.fit(self.__train_feature, self.__train_label, verbose=True)
        # Probability of the positive class for both datasets.
        self.__train_preds = self.__gbm.predict_proba(self.__train_feature)[:, 1]
        self.__test_preds = self.__gbm.predict_proba(self.__test_feature)[:, 1]
        feature_importance_df["feature"] = pd.Series(self.__train_feature.columns)
        feature_importance_df["importance"] = self.__gbm.feature_importances_
        feature_importance_df.to_csv(os.path.join(self.__output_path, "feature_importance.csv"), index=False)
        # Training-set AUC (optimistic — no validation split here).
        print("Train AUC score %.6f" % roc_auc_score(self.__train_label, self.__train_preds))

    def model_predict(self):
        """Write the test predictions into the sample submission file."""
        self.__sample_submission["TARGET"] = self.__test_preds
        self.__sample_submission.to_csv(os.path.join(self.__output_path, "sample_submission.csv"), index=False)
# Split the training frame: all columns but the last are features, the last
# column is the target.
X = training.iloc[:, :-1]
y = training.iloc[:, -1]
# Target encoding for categorical features; the same fitted encoder is
# applied to the prediction dataset so the mappings stay consistent.
te = TargetEncoder()
te.fit(X, y)
X = te.transform(X)
predict_dataset = te.transform(test)
from sklearn.model_selection import train_test_split
# Hold out 30% of the encoded data for validation.
x_train, x_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.3,
                                                  random_state=42)
import pandas as pd
from category_encoders import TargetEncoder
import joblib

# Load the GBK-encoded dataset and drop rows missing any of the key fields
# (price, data label, data name, shop).
data = pd.read_csv('./京东万象数据填充2.csv', encoding='GBK')
data = data.dropna(subset=['价格', '数据标签', '数据名称', '店铺'])

# Target-encode the three categorical columns against the price column.
enc = TargetEncoder(cols=['数据名称', '店铺', '数据标签'])
dataframe = data[['数据名称', '店铺', '数据标签', '数据大小', '浏览量', '价格']]
enc.fit(dataframe, dataframe['价格'])
data1 = enc.transform(dataframe)

# Persist the fitted encoder for reuse and write the encoded data out.
joblib.dump(enc, 'encoding.joblib')
data1.to_csv('final_data.csv', encoding='GBK', sep=',')
# Recombine the imputed null/non-null slices and copy the filled columns
# back onto the main frame.
train_data_1 = pd.concat(
    [train_myVolts_Null_item_type, train_myVolts_Not_Null_item_type], axis=0)
train_data_2 = pd.concat(
    [train_myVolts_Null_cbf_parser, train_myVolts_Not_Null_cbf_parser], axis=0)
train_myVolts['item_type'] = train_data_1['item_type']
train_myVolts['cbf_parser'] = train_data_2['cbf_parser']
train_myVolts['country_by_ip'] = train_myVolts['country_by_ip'].fillna('missing')
print('Values with NANs Train', train_myVolts[feature_cols].isnull().sum())

# Features / target, then target-encode the categoricals.
y = train_myVolts.set_clicked
X = train_myVolts[feature_cols]
from category_encoders import TargetEncoder
t1 = TargetEncoder()
t1.fit(X, y)
X = t1.transform(X)

# 80/20 train/test split with a fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=1234)
# Persist the split for inspection.
X_train.to_csv('X_train4.csv', index=False)
y_train.to_csv('y_train4.csv', index=False)

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
# Logistic regression baseline (~0.9922 vs ~0.9920 for a 500-tree forest).
logreg1 = LogisticRegression()
class LightGbmKfold(object):
    """Stratified 4-fold LightGBM pipeline over selected features plus
    first-layer stacking features (tree / linear / network / genetic-program).

    Flow: data_prepare() loads and target-encodes, model_fit() trains one
    model per fold collecting out-of-fold and averaged test predictions,
    model_predict() writes the submission file.
    """

    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path
        # data prepare
        self.__sample_submission = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_feature, self.__test_feature = [None for _ in range(2)]
        self.__train_feature_stacking_tree, self.__test_feature_stacking_tree = [
            None for _ in range(2)
        ]
        self.__train_feature_stacking_linear, self.__test_feature_stacking_linear = [
            None for _ in range(2)
        ]
        self.__train_feature_stacking_network, self.__test_feature_stacking_network = [
            None for _ in range(2)
        ]
        self.__train_feature_stacking_gp, self.__test_feature_stacking_gp = [
            None for _ in range(2)
        ]
        self.__train_label = None
        self.__categorical_columns = None
        self.__encoder = None
        # model fit
        self.__folds = None
        self.__oof_preds = None
        self.__sub_preds = None
        self.__gbm = None
        # self.__metric_weight = []

    def data_prepare(self):
        """Load all feature CSVs, target-encode the categoricals (encoder
        fitted on training data only), and concatenate the stacking
        features column-wise onto train and test."""
        self.__sample_submission = pd.read_csv(
            os.path.join(self.__input_path, "sample_submission.csv"))
        # selected feature
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_select_feature_df.csv"))
        # stacking tree
        self.__train_feature_stacking_tree = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_tree_train.csv"))
        self.__test_feature_stacking_tree = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_tree_test.csv"))
        # stacking linear
        self.__train_feature_stacking_linear = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_linear_train.csv"))
        self.__test_feature_stacking_linear = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_linear_test.csv"))
        # stacking network
        self.__train_feature_stacking_network = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_network_train.csv"))
        self.__test_feature_stacking_network = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_network_test.csv"))
        # gp (genetic-programming features)
        self.__train_feature_stacking_gp = pd.read_csv(
            os.path.join(self.__input_path, "genetic_train_feature.csv"))
        self.__test_feature_stacking_gp = pd.read_csv(
            os.path.join(self.__input_path, "genetic_test_feature.csv"))
        self.__train_label = self.__train["TARGET"]
        # Drop the label and any SK_ID* identifier columns from the features.
        self.__train_feature = self.__train.drop(["TARGET"] + [
            col for col in self.__train.columns.tolist()
            if re.search(r"SK_ID", col)
        ], axis=1)
        # Align test columns to the training feature columns.
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()]
        self.__categorical_columns = self.__train_feature.select_dtypes(
            "object").columns.tolist()
        # Fit the target encoder on training data only; apply to both frames.
        self.__encoder = TargetEncoder()
        self.__encoder.fit(
            self.__train_feature.loc[:, self.__categorical_columns],
            self.__train_label)
        self.__train_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature.loc[:, self.__categorical_columns]))
        self.__test_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(
                self.__test_feature.loc[:, self.__categorical_columns]))
        # Append the first-layer stacking features column-wise.
        self.__train_feature = pd.concat([
            self.__train_feature, self.__train_feature_stacking_tree,
            self.__train_feature_stacking_linear,
            self.__train_feature_stacking_network,
            self.__train_feature_stacking_gp
        ], axis=1)
        self.__test_feature = pd.concat([
            self.__test_feature, self.__test_feature_stacking_tree,
            self.__test_feature_stacking_linear,
            self.__test_feature_stacking_network,
            self.__test_feature_stacking_gp
        ], axis=1)

    def model_fit(self):
        """Train one early-stopped LGBMClassifier per stratified fold; store
        out-of-fold predictions and the fold-averaged test predictions, and
        save per-fold feature importances."""
        self.__folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=8)
        self.__oof_preds = np.zeros(shape=self.__train_feature.shape[0])
        self.__sub_preds = np.zeros(shape=self.__test_feature.shape[0])
        # self.__sub_preds = np.zeros(shape=(self.__test_feature.shape[0], 5))
        feature_importance_df = pd.DataFrame()
        for n_fold, (trn_idx, val_idx) in enumerate(
                self.__folds.split(self.__train_feature, self.__train_label)):
            trn_x, trn_y = self.__train_feature.iloc[
                trn_idx], self.__train_label.iloc[trn_idx]
            val_x, val_y = self.__train_feature.iloc[
                val_idx], self.__train_label.iloc[val_idx]
            self.__gbm = LGBMClassifier(colsample_bytree=0.6659,
                                        learning_rate=0.0197,
                                        max_depth=8,
                                        min_child_weight=1.0652,
                                        min_split_gain=0.058,
                                        n_estimators=501,
                                        num_leaves=11,
                                        reg_alpha=2.2487,
                                        reg_lambda=6.2587,
                                        subsample=0.9401)
            self.__gbm.fit(trn_x, trn_y,
                           eval_set=[(trn_x, trn_y), (val_x, val_y)],
                           eval_metric="auc",
                           verbose=True,
                           early_stopping_rounds=5)
            # Predict at the best (early-stopped) iteration.
            pred_val = self.__gbm.predict_proba(
                val_x, num_iteration=self.__gbm.best_iteration_)[:, 1]
            pred_test = self.__gbm.predict_proba(
                self.__test_feature,
                num_iteration=self.__gbm.best_iteration_)[:, 1]
            self.__oof_preds[val_idx] = pred_val
            # Average the test predictions equally across folds.
            self.__sub_preds += pred_test / self.__folds.n_splits
            # self.__sub_preds[:, n_fold] = pred_test
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = pd.Series(
                self.__train_feature.columns)
            fold_importance_df["importance"] = self.__gbm.feature_importances_
            fold_importance_df["fold"] = n_fold + 1
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)
            # keep the per-fold metric weight (disabled alternative below)
            # self.__metric_weight.append(roc_auc_score(val_y, self.__oof_preds[val_idx]))
            print(
                "Fold %2d AUC : %.6f" %
                (n_fold + 1, roc_auc_score(val_y, self.__oof_preds[val_idx])))
        feature_importance_df.to_csv(os.path.join(self.__output_path,
                                                  "feature_importance.csv"),
                                     index=False)
        print("Full AUC score %.6f" %
              roc_auc_score(self.__train_label, self.__oof_preds))

    def model_predict(self):
        """Write the fold-averaged test predictions into the submission file."""
        # weight sum (rank-weighted averaging alternative, disabled)
        # self.__metric_weight = pd.Series(self.__metric_weight).rank()
        # self.__metric_weight = self.__metric_weight / self.__metric_weight.sum()
        # self.__metric_weight = self.__metric_weight.values.reshape((5, 1))
        # self.__sub_preds = np.dot(self.__sub_preds, self.__metric_weight)
        self.__sample_submission["TARGET"] = self.__sub_preds
        self.__sample_submission.to_csv(os.path.join(self.__output_path,
                                                     "sample_submission.csv"),
                                        index=False)
# SKLEARN TARGET ENCODING
# !pip install category_encoders
from category_encoders import TargetEncoder

# Load the UCI adult census data; '?' marks missing values.
us_adults = pd.read_csv("./adult.csv", na_values="?")
us_adults.head()
# NOTE(review): `f not in "income"` is a substring test, not equality — a
# column named e.g. "in" would also be excluded. Works for this dataset's
# column names; confirm before reusing elsewhere.
features_original = [f for f in us_adults.columns if f not in "income"]
features_original
target_mapping
# Remap outcome variable to its numeric encoding.
us_adults.loc[:, "income"] = us_adults.income.map(target_mapping)
us_adults.income.value_counts()
# Fit a raw (smoothing=0) target encoder over all feature columns.
te = TargetEncoder(return_df=True, smoothing=0)
te.fit(X=us_adults[features_original], y=us_adults.income)
encoded_df_sk = te.transform(X=us_adults[features_original])
encoded_df_sk.shape
encoded_df_sk.head()
class CatBoostKfold(object):
    """5-fold stratified CatBoost trainer.

    Loads feature/response CSVs, target-encodes categorical columns, imputes
    missing values, trains one CatBoost model per fold, and averages the
    per-fold test predictions into a submission file.
    """

    def __init__(self, *, input_path_1, input_path_2, output_path):
        # input_path_1: base features + sample submission; input_path_2: residual features
        self.__input_path_1 = input_path_1
        self.__input_path_2 = input_path_2
        self.__output_path = output_path
        self.__sample_submission = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_res, self.__test_res = [None for _ in range(2)]
        self.__train_feature, self.__train_label = [None for _ in range(2)]
        self.__test_feature = None
        self.__categorical_index = None
        self.__encoder = None
        self.__numeric_index = None
        self.__folds = None
        # __oof_preds: out-of-fold predictions on train; __sub_preds: fold-averaged test predictions
        self.__oof_preds = None
        self.__sub_preds = None
        self.__cat = None

    def data_prepare(self):
        """Load CSVs, target-encode categorical columns, impute missing values."""
        self.__sample_submission = pd.read_csv(os.path.join(self.__input_path_1, "sample_submission.csv"))
        self.__train = pd.read_csv(os.path.join(self.__input_path_1, "train_feature_df.csv"))
        self.__test = pd.read_csv(os.path.join(self.__input_path_1, "test_feature_df.csv"))
        self.__train_res = pd.read_csv(os.path.join(self.__input_path_2, "feature_train_res.csv"))
        self.__test_res = pd.read_csv(os.path.join(self.__input_path_2, "feature_test_res.csv"))
        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["SK_ID_CURR", "TARGET"], axis=1)
        # Align the test columns with the training feature columns.
        self.__test_feature = self.__test[self.__train_feature.columns]
        self.__train_res = self.__train_res.drop(["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)
        self.__test_res = self.__test_res.drop(["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)
        self.__train_feature = pd.concat([self.__train_feature, self.__train_res], axis=1)
        self.__test_feature = pd.concat([self.__test_feature, self.__test_res], axis=1)
        # Categorical columns: fill NaNs with a sentinel, then target-encode.
        self.__categorical_index = np.where(self.__train_feature.dtypes == "object")[0]
        self.__train_feature.iloc[:, self.__categorical_index] = (
            self.__train_feature.iloc[:, self.__categorical_index].fillna("missing")
        )
        self.__test_feature.iloc[:, self.__categorical_index] = (
            self.__test_feature.iloc[:, self.__categorical_index].fillna("missing")
        )
        self.__encoder = TargetEncoder()
        # NOTE(review): encoder is fit on the full training set, not per fold --
        # this leaks target information into the CV score below; kept as-is.
        self.__encoder.fit(self.__train_feature.iloc[:, self.__categorical_index], self.__train_label)
        self.__train_feature.iloc[:, self.__categorical_index] = (
            self.__encoder.transform(self.__train_feature.iloc[:, self.__categorical_index])
        )
        self.__test_feature.iloc[:, self.__categorical_index] = (
            self.__encoder.transform(self.__test_feature.iloc[:, self.__categorical_index])
        )
        # There are NaNs in test dataset (feature number 77) but there were no NaNs in learn dataset
        # Numeric columns: fill NaNs with a far-out-of-range sentinel on the
        # opposite side of the data (negative when the median is positive).
        self.__numeric_index = np.where(self.__train_feature.dtypes != "object")[0]
        self.__train_feature.iloc[:, self.__numeric_index] = (
            self.__train_feature.iloc[:, self.__numeric_index].apply(
                lambda x: x.fillna(-999999.0) if x.median() > 0 else x.fillna(999999.0)
            )
        )
        self.__test_feature.iloc[:, self.__numeric_index] = (
            self.__test_feature.iloc[:, self.__numeric_index].apply(
                lambda x: x.fillna(-999999.0) if x.median() > 0 else x.fillna(999999.0)
            )
        )
        # Shuffling is needed before blending; strictly unnecessary here because
        # the StratifiedKFold below shuffles anyway. (translated from Chinese)
        self.__train_feature, self.__train_label = shuffle(self.__train_feature, self.__train_label)

    def model_fit(self):
        """Train one CatBoost per stratified fold; collect OOF and averaged test predictions."""
        self.__folds = StratifiedKFold(n_splits=5, shuffle=True)
        self.__oof_preds = np.zeros(shape=self.__train_feature.shape[0])
        self.__sub_preds = np.zeros(shape=self.__test_feature.shape[0])
        for n_fold, (trn_idx, val_idx) in enumerate(self.__folds.split(self.__train_feature, self.__train_label)):
            trn_x, trn_y = self.__train_feature.iloc[trn_idx], self.__train_label.iloc[trn_idx]
            val_x, val_y = self.__train_feature.iloc[val_idx], self.__train_label.iloc[val_idx]
            # Overfitting detector: stop after 200 iterations without AUC improvement.
            self.__cat = CatBoostClassifier(
                iterations=6000,
                od_wait=200,
                od_type="Iter",
                eval_metric="AUC"
            )
            self.__cat.fit(
                trn_x,
                trn_y,
                eval_set=[(val_x, val_y)],
                use_best_model=True
            )
            pred_val = self.__cat.predict_proba(val_x)[:, 1]
            pred_test = self.__cat.predict_proba(self.__test_feature)[:, 1]
            self.__oof_preds[val_idx] = pred_val
            # Average the test predictions over all folds.
            self.__sub_preds += pred_test / self.__folds.n_splits
            print("Fold %2d AUC : %.6f" % (n_fold + 1,
                                           roc_auc_score(val_y, self.__oof_preds[val_idx])))
        print("Full AUC score %.6f" % roc_auc_score(self.__train_label, self.__oof_preds))

    def model_predict(self):
        """Write the fold-averaged test predictions to the submission CSV."""
        self.__sample_submission["TARGET"] = self.__sub_preds
        self.__sample_submission.to_csv(os.path.join(self.__output_path, "sample_submission.csv"), index=False)
# NOTE(review): this chunk is the interior of a CV fold loop -- the `for` header
# is outside this view, and the chunk is cut off after `with timer('fit'):`.
X_tst = X_test.copy()
print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)
# Disabled weight-of-evidence encoding experiment, kept for reference.
# with timer('weight of evidence'):
#     cat_cols = X_trn.select_dtypes(['object']).columns.tolist()
#     woe = WeightOfEvidence(cols=cat_cols, suffix='woe')
#     X_trn = pd.concat([X_trn, woe.fit_transform(X_trn.loc[:, cat_cols], y_trn)], axis=1)
#     X_val = pd.concat([X_val, woe.transform(X_val.loc[:, cat_cols])], axis=1)
#     X_tst = pd.concat([X_tst, woe.transform(X_tst.loc[:, cat_cols])], axis=1)
with timer('target encoding'):
    # Encoder is fit on the training fold only, to avoid target leakage.
    cat_cols = X_trn.select_dtypes(['object']).columns.tolist()
    te = TargetEncoder(cols=cat_cols)
    X_trn.loc[:, cat_cols] = te.fit_transform(X_trn.loc[:, cat_cols], y_trn)
    X_val.loc[:, cat_cols] = te.transform(X_val.loc[:, cat_cols])
    # Transforms X_test's columns, writing into the fresh copy X_tst; since
    # X_tst = X_test.copy() above, the values are identical either way.
    X_tst.loc[:, cat_cols] = te.transform(X_test.loc[:, cat_cols])
# Disabled adversarial-validation sample-weight experiment, kept for reference.
# with timer('calc sample weight'):
#     X_trn['is_test'] = 0
#     X_tst['is_test'] = 1
#     df = pd.concat([X_trn, X_tst])
#     X = df.drop('is_test', axis=1)
#     y = df.is_test.ravel()
#     model = lgb.LGBMClassifier(**calc_weight_params)
#     model.fit(X, y)
#     proba = np.sqrt(rankdata(model.predict_proba(X)[:len(X_trn), 1])/len(X_trn))
#     X_trn.drop('is_test', axis=1)
#     X_tst.drop('is_test', axis=1)
with timer('fit'):
M = pd.read_csv('prediction_data.csv')

# Median-impute the numeric columns of the prediction set.
M['Year of Record'] = simpleimputermedian.fit_transform(
    M['Year of Record'].values.reshape(-1, 1))
M['Age'] = simpleimputermedian.fit_transform(M['Age'].values.reshape(-1, 1))
M['Body Height [cm]'] = simpleimputermedian.fit_transform(
    M['Body Height [cm]'].values.reshape(-1, 1))

# FIX: 'Hair Color' appeared twice in the drop list; each column is listed once.
Mnoncateg = M.drop(
    ['Instance', 'Hair Color', 'Wears Glasses', 'Income'], axis=1)

# NOTE(review): X/Y are built from `datasetnoncateg` (a training frame from an
# earlier cell), not from the prediction frame M above -- confirm intended.
X = datasetnoncateg.drop('Income in EUR', axis=1).values
Y = datasetnoncateg['Income in EUR'].values

# Target encoding; fitting on raw `.values` arrays, so column names are lost.
t1 = TargetEncoder()
t1.fit(X, Y)
X = t1.transform(X)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.33, random_state=0)

# regressor = BayesianRidge()
regressor = RandomForestRegressor()
# regressor = AdaBoostRegressor()
# regressor = linear_model.SGDRegressor(max_iter=1000, tol=1e-3)
fitResult = regressor.fit(Xtrain, Ytrain)
YPredTest = regressor.predict(Xtest)
# learningTest = pd.DataFrame({'Predicted': YPredTest, 'Actual': Ytest})
# Hold-out RMSE of the fitted regressor.
np.sqrt(metrics.mean_squared_error(Ytest, YPredTest))
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7, random_state=100) # Installing category_encoders to import Target Encoder !pip install category_encoders # Importing Target Encoder from category_encoders import TargetEncoder # creating an object "te" for Target Encoder te=TargetEncoder() # Fitting Target Encoder on X_train and y_train (Training Data) te.fit(X_train,y_train) #Transforming X_train (Training Data) X_train=te.transform(X_train) #Transforming X_test (Training Data) X_test=te.transform(X_test) #Importing Logistic Regression from sklearn from sklearn.linear_model import LogisticRegression # Creating object for Logistic Regression lr=LogisticRegression() #Fitting Logistic Regression on X_train annd y_train (Training Data) lr.fit(X_train,y_train) #Predicting X_test (Training Data) y_pred_train=lr.predict(X_test)
# Target: income minus the "Additional_income" component (predict base income).
y = X.Income
y = y - X['Additional_income']
# FIX: positional `axis` in df.drop('col', 1) was deprecated in pandas 1.0 and
# removed in 2.0; use the explicit `columns=` keyword (one call per frame).
X = X.drop(columns=['Income', 'Instance', 'Additional_income'])

y1 = X1.Income
y1 = y1 - X1['Additional_income']
# Kept before dropping: presumably re-added to predictions later -- confirm.
temp = X1['Additional_income']
X1 = X1.drop(columns=['Income', 'Instance', 'Additional_income'])

# Target-encode categoricals, fitted on the first frame's target, applied to both.
t1 = TargetEncoder()
t1.fit(X, y)
X = t1.transform(X)
X1 = t1.transform(X1)

# Min-max scale with statistics fitted on the first frame only.
mm_scaler = preprocessing.MinMaxScaler()
X = mm_scaler.fit_transform(X)
X1 = mm_scaler.transform(X1)

from sklearn.model_selection import train_test_split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.10, random_state=0)
# from sklearn.linear_model import BayesianRidge
# regressor = BayesianRidge()
# reg = regressor.fit(X, y)
class BayesianOptimizationGoss(object):
    """Bayesian hyper-parameter search for a LightGBM (dart) classifier.

    Loads the selected features plus first-layer stacking / genetic-programming
    feature files, target-encodes categorical columns, then maximizes 5-fold
    CV ROC-AUC over the LightGBM parameter space with bayes_opt.
    """

    def __init__(self, *, input_path):
        self.__input_path = input_path
        # data prepare
        self.__train = None
        self.__train_label = None
        self.__train_feature = None
        self.__train_feature_stacking_tree = None
        self.__train_feature_stacking_linear = None
        self.__train_feature_stacking_network = None
        self.__train_feature_gp = None
        self.__encoder = None
        self.__categorical_columns = None
        # parameter tuning
        self.__gbm_bo = None
        self.__gbm_params = None
        self.__gp_params = {"alpha": 1e-4}

    def data_prepare(self):
        """Load feature CSVs, target-encode categoricals, assemble the matrix."""
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__train_feature_stacking_tree = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_tree_train.csv"))
        self.__train_feature_stacking_linear = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_linear_train.csv"))
        self.__train_feature_stacking_network = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_network_train.csv"))
        self.__train_feature_gp = pd.read_csv(
            os.path.join(self.__input_path, "genetic_train_feature.csv"))
        self.__train_label = self.__train["TARGET"]
        # Drop the target and every SK_ID* bookkeeping column.
        self.__train_feature = self.__train.drop(["TARGET"] + [
            col for col in self.__train.columns.tolist()
            if re.search(r"SK_ID", col)
        ], axis=1)
        self.__encoder = TargetEncoder()
        self.__categorical_columns = self.__train_feature.select_dtypes(
            "object").columns.tolist()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        self.__train_feature[self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature[self.__categorical_columns]))
        # NOTE(review): __train_feature_gp is loaded but never concatenated
        # below -- confirm whether it was meant to be included.
        self.__train_feature = pd.concat([
            self.__train_feature, self.__train_feature_stacking_tree,
            self.__train_feature_stacking_linear,
            self.__train_feature_stacking_network
        ], axis=1)

    def parameter_tuning(self):
        """Maximize 5-fold CV ROC-AUC over the LightGBM parameter space."""

        def __cv(drop_rate, max_drop, skip_drop, n_estimators, learning_rate,
                 max_depth, num_leaves, min_split_gain, min_child_weight,
                 colsample_bytree, subsample, reg_alpha, reg_lambda):
            # Clamp each raw suggestion into its valid range before fitting.
            val = cross_val_score(
                LGBMClassifier(
                    boosting_type="dart",
                    drop_rate=max(min(drop_rate, 1.0), 0),
                    max_drop=max(round(max_drop), 1),
                    skip_drop=max(min(skip_drop, 1.0), 0),
                    n_estimators=max(round(n_estimators), 1),
                    learning_rate=max(min(learning_rate, 1.0), 0),
                    max_depth=max(round(max_depth), 1),
                    # BUG FIX: `^` is bitwise XOR in Python, not exponentiation.
                    # The intended cap on num_leaves is 2 ** max_depth.
                    num_leaves=(max(
                        round(2 ** round(max_depth) if num_leaves > 2 **
                              round(max_depth) else round(num_leaves)), 1)),
                    min_split_gain=max(min_split_gain, 0),
                    min_child_weight=max(min_child_weight, 0),
                    colsample_bytree=max(min(colsample_bytree, 1.0), 0),
                    subsample=max(min(subsample, 1.0), 0),
                    reg_alpha=max(reg_alpha, 0),
                    reg_lambda=max(reg_lambda, 0),
                    n_jobs=-1,
                    verbose=-1),
                self.__train_feature,
                self.__train_label,
                scoring="roc_auc",
                # must match the StratifiedKFold used by the blending lightgbm
                cv=StratifiedKFold(n_splits=5, shuffle=True,
                                   random_state=8)).mean()
            return val

        self.__gbm_params = {
            # dart parameters
            "drop_rate": (0, 1.0),
            "max_drop": (10, 200),
            "skip_drop": (0, 1.0),
            # gradient boosting parameters
            "n_estimators": (500, 3000),
            "learning_rate": (0.001, 0.1),
            # tree parameters
            "max_depth": (4, 10),
            "num_leaves": (10, 200),
            "min_split_gain": (0.00001, 0.1),
            "min_child_weight": (1, 100),
            # bagging parameters
            "colsample_bytree": (0.5, 1.0),
            "subsample": (0.5, 1.0),
            # regularization parameters
            "reg_alpha": (0, 10),
            "reg_lambda": (0, 10)
        }
        self.__gbm_bo = BayesianOptimization(__cv, self.__gbm_params)
        self.__gbm_bo.maximize(init_points=30, n_iter=130, **self.__gp_params)
with timer('training'):
    cv_results = []
    val_series = y_train.copy()
    test_df = pd.DataFrame()
    feat_df = pd.DataFrame(index=X_train.columns)
    for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        X_trn = X_train.iloc[trn_idx].copy()
        y_trn = y_train[trn_idx].copy()
        X_val = X_train.iloc[val_idx].copy()
        y_val = y_train[val_idx].copy()
        print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)

        with timer('target encoding'):
            # Fit the encoder on the training fold only to avoid target leakage.
            cat_cols = [f for f in X_trn.columns if X_trn[f].dtype == 'object']
            te = TargetEncoder(cols=cat_cols)
            X_trn.loc[:, cat_cols] = te.fit_transform(X_trn.loc[:, cat_cols], y_trn).values
            X_val.loc[:, cat_cols] = te.transform(X_val.loc[:, cat_cols]).values
            X_test_ = X_test.copy()
            X_test_.loc[:, cat_cols] = te.transform(X_test.loc[:, cat_cols]).values
            # BUG FIX: DataFrame.fillna returns a new frame; the original calls
            # discarded the result, so NaNs were never actually filled.
            X_trn = X_trn.fillna(-9999)
            X_val = X_val.fillna(-9999)
            X_test_ = X_test_.fillna(-9999)

        with timer('fit'):
            model = lgb.LGBMClassifier(**lgb_params)
            model.fit(X_trn, y_trn, eval_set=[(X_trn, y_trn), (X_val, y_val)], **fit_params)
            # Out-of-fold validation predictions and per-fold AUC.
            p = model.predict_proba(X_val)[:, 1]
            val_series.iloc[val_idx] = p
            cv_results.append(roc_auc_score(y_val, p))
            test_df[i] = model.predict_proba(X_test_)[:, 1]
            feat_df[i] = model.feature_importances_
"D:\PythonProjects\ML_Group_Data/tcd-ml-comp-201920-income-pred-group/test.csv" ) train_data = preprocessing(train) test_data = preprocessing(test) y = train_data[target] train_data.drop(target, axis=1, inplace=True) test_data.drop(target, axis=1, inplace=True) enc = TargetEncoder(cols=[ 'Gender', 'Country', 'Profession', 'University Degree', 'Housing Situation', 'Satisfation with employer' ]) enc.fit(train_data, y) train_data = enc.transform(train_data) test_data = enc.transform(test_data) train_data.head() test_data.head() #X_Train, X_Test, y_train, y_test = train_test_split(train_data, y, test_size=0.3, random_state=1) X_Train = train_data y_train = y y_train_log = np.log(y_train) training = lgb.Dataset(X_Train, y_train_log) params = {} params['learning_rate'] = 0.003 params['boosting_type'] = 'gbdt' params['num_leaves'] = 140
with timer('training'):
    cv_results = []
    val_series = y_train.copy()
    test_df = pd.DataFrame()
    feat_df = pd.DataFrame(index=X_train.columns)
    for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        X_trn = X_train.iloc[trn_idx]
        y_trn = y_train[trn_idx]
        X_val = X_train.iloc[val_idx]
        y_val = y_train[val_idx]
        print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)

        with timer('target encoding'):
            # Fit on the training fold only; TargetEncoder with no `cols`
            # encodes every object-dtype column by default.
            te = TargetEncoder()
            X_trn = te.fit_transform(X_trn, y_trn)
            X_val = te.transform(X_val)
            X_test_ = te.transform(X_test)
            # BUG FIX: DataFrame.fillna returns a new frame; the original calls
            # discarded the result, leaving the NaNs in place.
            X_trn = X_trn.fillna(-9999)
            X_val = X_val.fillna(-9999)
            X_test_ = X_test_.fillna(-9999)

        with timer('fit'):
            model = lgb.LGBMClassifier(**lgb_params)
            model.fit(X_trn, y_trn, eval_set=[(X_trn, y_trn), (X_val, y_val)], **fit_params)
            # Out-of-fold validation predictions and per-fold AUC.
            p = model.predict_proba(X_val)[:, 1]
            val_series.iloc[val_idx] = p
            cv_results.append(roc_auc_score(y_val, p))
            test_df[i] = model.predict_proba(X_test_)[:, 1]
            feat_df[i] = model.feature_importances_