def data_prepare(self):
    self.__train_feature_before = pd.read_csv(
        os.path.join(self.__input_path, "train_feature_before_df.csv"))
    self.__train_feature_after = pd.read_csv(
        os.path.join(self.__input_path, "train_feature_after_df.csv"))
    self.__train = pd.concat(
        [self.__train_feature_before, self.__train_feature_after])
    self.__test = pd.read_csv(
        os.path.join(self.__input_path, "test_feature_df.csv"))

    self.__train_label = self.__train["TARGET"].copy()
    self.__train_feature = (self.__train.drop(
        ["TARGET"] + [col for col in self.__train.columns.tolist()
                      if re.search(r"SK_ID", col)],
        axis=1)).copy()
    self.__test_feature = self.__test[
        self.__train_feature.columns.tolist()].copy()

    self.__categorical_columns = self.__train_feature.select_dtypes(
        include="object").columns.tolist()
    encoder = TargetEncoder()
    encoder.fit(self.__train_feature[self.__categorical_columns],
                self.__train_label)
    self.__train_feature[self.__categorical_columns] = encoder.transform(
        self.__train_feature[self.__categorical_columns])
def target_encoding(train, target, test=None, feat_to_encode=None,
                    smooth=0.2, random_state=9527):
    print('Target encoding...')
    train.sort_index(inplace=True)
    target = train.pop(target)
    if feat_to_encode is None:
        feat_to_encode = train.columns.tolist()
    smoothing = smooth
    # Build out-of-fold encodings so each row is encoded with statistics
    # computed without its own target value.
    oof = pd.DataFrame([])
    for tr_idx, oof_idx in StratifiedKFold(
            n_splits=5, random_state=random_state,
            shuffle=True).split(train, target):
        ce_target_encoder = TargetEncoder(cols=feat_to_encode,
                                          smoothing=smoothing)
        ce_target_encoder.fit(train.iloc[tr_idx, :], target.iloc[tr_idx])
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        oof = pd.concat(
            [oof, ce_target_encoder.transform(train.iloc[oof_idx, :])])
    # Refit on the full training data to encode the test set.
    ce_target_encoder = TargetEncoder(cols=feat_to_encode, smoothing=smoothing)
    ce_target_encoder.fit(train, target)
    train = oof.sort_index()
    if test is not None:
        test = ce_target_encoder.transform(test)
    features = list(train)
    print('Target encoding done!')
    return train, test, features, target
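# A minimal, hedged usage sketch for target_encoding above. The toy frame and
# column names are invented for illustration; assumes pandas,
# category_encoders and scikit-learn are importable as in the function body.
import pandas as pd

toy = pd.DataFrame({
    "city": ["a", "b", "a", "c", "b", "a", "c", "b", "a", "c"],
    "label": [1, 0, 1, 0, 0, 1, 0, 1, 1, 0],
})
train_enc, _, feats, y = target_encoding(toy, target="label",
                                         feat_to_encode=["city"])
print(feats)              # ['city']
print(train_enc.head())   # out-of-fold encoded values, original index order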
class DFMeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = TargetEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols], y)
        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )
        # Encode only the configured columns; pass the rest through untouched.
        new_X = X.drop(columns=self.transform_cols)
        new_X = pd.concat(
            [new_X, self.model.transform(X[self.transform_cols])], axis=1)
        return new_X

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X)
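# Hedged usage sketch for DFMeanEncoder: only the listed columns are
# target-encoded, everything else passes through. The frame is made up.
import pandas as pd

X = pd.DataFrame({"color": ["r", "g", "r", "b"], "size": [1, 2, 3, 4]})
y = pd.Series([1, 0, 1, 0])
enc = DFMeanEncoder(columns=["color"])
print(enc.fit_transform(X, y))   # "size" unchanged, "color" now numeric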
class CategoricalPreprocessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mode_imputer = SimpleImputer(strategy="most_frequent")
        self.cat_cols = [
            'home_ownership', 'purpose', 'addr_state', 'initial_list_status'
        ]
        self.target_encoder = TargetEncoder(handle_missing='return_nan',
                                            handle_unknown='return_nan')

    def fit(self, X, y=None):
        self.mode_imputer.fit(X[self.cat_cols])
        self.target_encoder.fit(X["zip_code"], y)
        return self

    def transform(self, X, y=None):
        Xc = X.copy()
        # encode emp_length
        lookup = {
            '< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3,
            '4 years': 4, '5 years': 5, '6 years': 6, '7 years': 7,
            '8 years': 8, '9 years': 9, '10+ years': 10
        }
        Xc["emp_length"] = Xc["emp_length"].replace(lookup)
        # issue date
        Xc["issue_d"] = pd.to_datetime(Xc["issue_d"])
        # keep a copy of the raw issue dates for the earliest-credit-line delta
        tmp = Xc["issue_d"].values
        Xc["issue_d"] = (
            Xc["issue_d"] - datetime.datetime(2000, 1, 1)).astype('timedelta64[M]')
        # earliest credit line, expressed in months before the issue date
        Xc["earliest_cr_line"] = pd.to_datetime(Xc["earliest_cr_line"])
        Xc["earliest_cr_line"] = (
            tmp - Xc["earliest_cr_line"]).astype('timedelta64[M]')
        # impute home_ownership, purpose, addr_state and initial_list_status
        Xc[self.cat_cols] = self.mode_imputer.transform(Xc[self.cat_cols])
        # encode zip code
        Xc["zip_code"] = self.target_encoder.transform(Xc["zip_code"])
        return Xc

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
def categorical_encoding(df_X, y, cat_vars, id_train, method=None):
    if method is None:
        return df_X.values, df_X.columns
    # Fit the encoder on the training rows only, then transform everything.
    target_enc = TargetEncoder(cols=cat_vars,
                               drop_invariant=False,
                               return_df=True,
                               impute_missing=False,
                               handle_unknown='error')
    target_enc.fit(df_X.iloc[id_train], pd.Series(y).iloc[id_train])
    df_X = target_enc.transform(df_X)
    return df_X.values, df_X.columns
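# Possible call pattern for categorical_encoding (hypothetical data). Note
# that impute_missing was dropped from newer category_encoders releases, so
# this assumes an older version that still accepts it; handle_unknown='error'
# means every category must appear among the id_train rows.
import numpy as np
import pandas as pd

df_X = pd.DataFrame({"shop": ["a", "b", "c", "a"], "qty": [1, 2, 3, 4]})
y = np.array([1, 0, 1, 0])
id_train = np.array([0, 1, 2])        # fit rows cover categories a, b, c
values, columns = categorical_encoding(df_X, y, cat_vars=["shop"],
                                       id_train=id_train, method="target")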
def frontend_preproc(df, y):
    '''
    Preprocess the DataFrame before applying the model on the front-end.
    :df: concat of the user's input row and the X features of the model
    :y: target
    '''
    ### Feature Engineering
    ohe_cols = ['gearbox', 'fuel_type', 'warranty', 'dealer', 'doors']

    # OHE
    ohe = OneHotEncoder(categories='auto')
    feature_arr = ohe.fit_transform(df[ohe_cols]).toarray()
    feature_labels = ohe.categories_

    # Using a dictionary to produce all the new OHE columns
    feature_cols = []
    for k, v in dict(zip(ohe_cols, feature_labels)).items():
        for i in v:
            el = k + '_' + str(i)
            feature_cols.append(el)
    ohe_features = pd.DataFrame(feature_arr, columns=feature_cols)
    df = pd.concat([df, ohe_features], axis=1)
    df = df.drop(ohe_cols, axis=1)

    # Target Encoding: fit on df[1:] so the user's input row (which carries
    # no target) is excluded, then transform every row.
    cat_cols = df.select_dtypes(exclude=["number"]).columns
    cols_encoded = list(map(lambda c: c + '_encoded', cat_cols))
    t_encoder = TargetEncoder()
    t_encoder.fit(df[1:][cat_cols], y)
    df[cols_encoded] = t_encoder.transform(df[cat_cols])
    df = df.drop(cat_cols, axis=1)

    # Column Transformation: QuantileTransformer
    qt = QuantileTransformer(n_quantiles=500,
                             output_distribution='normal',
                             random_state=33)
    data = qt.fit_transform(df)
    df = pd.DataFrame(data, columns=df.columns)
    return df
def target_encoding(X_train, y_train, X_test, cols, cv_id):
    cols = list(cols)
    train_new = X_train.copy()
    test_new = X_test.copy()
    test_new[:] = 0
    cv = PredefinedSplit(cv_id)
    X_train.index = X_train.index.astype(int)
    for trn_idx, val_idx in tqdm(cv.split(X_train), total=cv.get_n_splits()):
        enc = TargetEncoder(cols=cols)
        enc.fit(X_train.iloc[trn_idx], y_train[trn_idx])
        # Out-of-fold encodings for train; fold-averaged encodings for test.
        train_new.iloc[val_idx] = enc.transform(X_train.iloc[val_idx])
        test_new += enc.transform(X_test)
    test_new /= cv.get_n_splits()
    train_new = train_new[cols]
    test_new = test_new[cols]
    train_new.columns = train_new.columns + '_target'
    test_new.columns = test_new.columns + '_target'
    print(list(train_new.columns))
    return train_new, test_new
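# Sketch of driving the PredefinedSplit variant above: cv_id assigns each
# training row to a fold (0..k-1). Data is illustrative only; assumes the
# imports the function relies on (tqdm, PredefinedSplit, TargetEncoder).
import numpy as np
import pandas as pd

X_tr = pd.DataFrame({"cat": list("ababab")})
y_tr = np.array([1, 0, 1, 0, 1, 0])
X_te = pd.DataFrame({"cat": list("ab")})
cv_id = np.array([0, 0, 1, 1, 2, 2])   # three predefined folds
tr_new, te_new = target_encoding(X_tr, y_tr, X_te, cols=["cat"], cv_id=cv_id)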
def clean_train_data_target_encoded(data):
    # uses a target encoder instead
    data = data.reset_index(drop=True)
    train_y = data.iloc[:, -1]
    train_y = train_y.reset_index(drop=True)
    train_X = data.iloc[:, :-1]
    train_X = process_features(train_X)
    encoder = TargetEncoder(cols=["Hair Color", "Wears Glasses",
                                  "University Degree", "Gender", "Country",
                                  "Profession", "Housing Situation",
                                  "Satisfation with employer"],
                            smoothing=300)
    encoder.fit(train_X, train_y)
    data2 = pd.concat(
        [encoder.transform(train_X, train_y).reset_index(drop=True),
         train_y.reset_index(drop=True)], axis=1)
    # data2 = data2.fillna(method="ffill")
    return (data2, encoder)
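# Why smoothing=300 above matters (a self-contained illustration, not from
# the original): with heavier smoothing, the encoding of a low-count category
# is pulled toward the global target mean instead of its own noisy mean.
import pandas as pd
from category_encoders import TargetEncoder

X = pd.DataFrame({"Gender": ["m"] * 98 + ["rare"] * 2})
y = pd.Series([0] * 98 + [1, 1])      # global mean 0.02, "rare" mean 1.0
for s in (1, 300):
    te = TargetEncoder(cols=["Gender"], smoothing=s).fit(X, y)
    # the printed value drops toward the 0.02 prior as smoothing grows
    print(s, te.transform(X)["Gender"].iloc[-1])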
class ScatterPlot(object):
    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path
        self.__train = None
        self.__train_feature, self.__train_label = [None for _ in range(2)]
        self.__encoder = None
        self.__pca, self.__t_sne = [None for _ in range(2)]

    def data_read(self):
        self.__train = pd.read_csv(os.path.join(self.__input_path, "train.csv"))
        self.__train = self.__train.drop(["id"], axis=1)
        self.__train_feature, self.__train_label = (
            self.__train.drop(["target"], axis=1).copy(deep=True),
            self.__train["target"].copy(deep=True))
        self.__train_feature = self.__train_feature.astype(str)

    def data_prepare(self):
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature, self.__train_label)
        self.__train_feature = self.__encoder.transform(self.__train_feature)

        self.__pca = PCA(n_components=2, random_state=7)
        self.__train_feature = self.__pca.fit_transform(self.__train_feature)
        self.__train_feature = pd.DataFrame(self.__train_feature,
                                            columns=["col_1", "col_2"])
        # self.__t_sne = TSNE(verbose=True, random_state=7)
        # self.__train_feature = self.__t_sne.fit_transform(self.__train_feature)
        # self.__train_feature = pd.DataFrame(self.__train_feature, columns=["col_1", "col_2"])

    def scatter_plot(self):
        _, ax = plt.subplots(figsize=(16, 9))
        ax = sns.scatterplot(x="col_1", y="col_2", hue=self.__train_label,
                             data=self.__train_feature, ax=ax)
        ax.get_figure().savefig(os.path.join(self.__output_path, "PCA.png"))
def fit_model(X_train, y_train, X_val, y_val, **params):
    if args.model == "catboost":
        # CatBoost consumes raw categorical columns directly.
        if args.gpu:
            model = CatBoostRegressor(**params, loss_function="RMSE",
                                      random_state=42, use_best_model=True,
                                      task_type="GPU")
        else:
            model = CatBoostRegressor(**params, loss_function="RMSE",
                                      random_state=42, use_best_model=True,
                                      task_type="CPU")
        model.fit(X_train, y_train,
                  cat_features=cat_cols,
                  early_stopping_rounds=config.EARLY_STOPPING_ROUNDS,
                  eval_set=(X_val, y_val),
                  plot=False)
        return model, None
    elif args.model == "xgboost":
        # XGBoost needs numeric inputs, so target-encode the categoricals
        # and return the fitted encoder alongside the model.
        te = TargetEncoder(cols=cat_cols, smoothing=300)
        te.fit(X_train, y_train)
        X_train = te.transform(X_train)
        X_val = te.transform(X_val)
        if args.gpu:
            model = XGBRegressor(**params, random_state=42, verbosity=1,
                                 tree_method='gpu_hist', gpu_id=0,
                                 predictor="cpu_predictor")
        else:
            model = XGBRegressor(**params, random_state=42, verbosity=1)
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  eval_metric="rmse",
                  early_stopping_rounds=config.EARLY_STOPPING_ROUNDS,
                  verbose=True)
        return model, te
    else:
        raise ValueError(
            "Invalid value passed to model. Has to be either 'catboost' or 'xgboost'.")
train_myVolts_Null_cbf_parser['cbf_parser'] = (
    train_myVolts_Null_cbf_parser.query_char_count.apply(
        lambda x: cbf_parser_estimator(x)))
train_data_1 = pd.concat(
    [train_myVolts_Null_item_type, train_myVolts_Not_Null_item_type], axis=0)
train_data_2 = pd.concat(
    [train_myVolts_Null_cbf_parser, train_myVolts_Not_Null_cbf_parser], axis=0)
train_myVolts['item_type'] = train_data_1['item_type']
train_myVolts['cbf_parser'] = train_data_2['cbf_parser']
train_myVolts['country_by_ip'] = train_myVolts['country_by_ip'].fillna('missing')
print('Values with NANs Train', train_myVolts[feature_cols].isnull().sum())

y = train_myVolts.set_clicked
X = train_myVolts[feature_cols]

from category_encoders import TargetEncoder
t1 = TargetEncoder()
t1.fit(X, y)
X = t1.transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1234)
## check
X_train.to_csv('X_train4.csv', index=False)
y_train.to_csv('y_train4.csv', index=False)

from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
logreg1 = LogisticRegression()
# logreg1 = RandomForestClassifier(n_estimators=500)
# SKLEARN TARGET ENCODING
# !pip install category_encoders
from category_encoders import TargetEncoder

us_adults = pd.read_csv("./adult.csv", na_values="?")
us_adults.head()

features_original = [f for f in us_adults.columns if f != "income"]
features_original

# Remap outcome variable to 0/1 (this mapping is the usual one for the adult
# dataset; the original notebook defined it in an earlier cell)
target_mapping = {"<=50K": 0, ">50K": 1}
us_adults.loc[:, "income"] = us_adults.income.map(target_mapping)
us_adults.income.value_counts()

te = TargetEncoder(return_df=True, smoothing=0)
te.fit(X=us_adults[features_original], y=us_adults.income)
encoded_df_sk = te.transform(X=us_adults[features_original])
encoded_df_sk.shape
encoded_df_sk.head()
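# With smoothing=0 the encoder should reduce to (roughly) the plain
# per-category target mean; a quick self-contained check of that claim on a
# toy frame (exact behaviour at tiny counts depends on the library version,
# and a divide-by-zero RuntimeWarning may be emitted):
import pandas as pd
from category_encoders import TargetEncoder

df = pd.DataFrame({"workclass": ["a", "a", "b", "b", "b"],
                   "income": [1, 0, 1, 1, 0]})
te0 = TargetEncoder(cols=["workclass"], smoothing=0).fit(
    df[["workclass"]], df["income"])
print(te0.transform(df[["workclass"]])["workclass"].tolist())
print(df.groupby("workclass")["income"].mean())  # a -> 0.5, b -> 2/3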
class CatBoostKfold(object):
    def __init__(self, *, input_path_1, input_path_2, output_path):
        self.__input_path_1 = input_path_1
        self.__input_path_2 = input_path_2
        self.__output_path = output_path

        self.__sample_submission = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_res, self.__test_res = [None for _ in range(2)]
        self.__train_feature, self.__train_label = [None for _ in range(2)]
        self.__test_feature = None
        self.__categorical_index = None
        self.__encoder = None
        self.__numeric_index = None
        self.__folds = None
        self.__oof_preds = None
        self.__sub_preds = None
        self.__cat = None

    def data_prepare(self):
        self.__sample_submission = pd.read_csv(
            os.path.join(self.__input_path_1, "sample_submission.csv"))
        self.__train = pd.read_csv(os.path.join(self.__input_path_1, "train_feature_df.csv"))
        self.__test = pd.read_csv(os.path.join(self.__input_path_1, "test_feature_df.csv"))
        self.__train_res = pd.read_csv(os.path.join(self.__input_path_2, "feature_train_res.csv"))
        self.__test_res = pd.read_csv(os.path.join(self.__input_path_2, "feature_test_res.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["SK_ID_CURR", "TARGET"], axis=1)
        self.__test_feature = self.__test[self.__train_feature.columns]

        self.__train_res = self.__train_res.drop(
            ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)
        self.__test_res = self.__test_res.drop(
            ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)
        self.__train_feature = pd.concat([self.__train_feature, self.__train_res], axis=1)
        self.__test_feature = pd.concat([self.__test_feature, self.__test_res], axis=1)

        self.__categorical_index = np.where(self.__train_feature.dtypes == "object")[0]
        self.__train_feature.iloc[:, self.__categorical_index] = (
            self.__train_feature.iloc[:, self.__categorical_index].fillna("missing")
        )
        self.__test_feature.iloc[:, self.__categorical_index] = (
            self.__test_feature.iloc[:, self.__categorical_index].fillna("missing")
        )
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature.iloc[:, self.__categorical_index],
                           self.__train_label)
        self.__train_feature.iloc[:, self.__categorical_index] = (
            self.__encoder.transform(self.__train_feature.iloc[:, self.__categorical_index])
        )
        self.__test_feature.iloc[:, self.__categorical_index] = (
            self.__encoder.transform(self.__test_feature.iloc[:, self.__categorical_index])
        )

        # There are NaNs in the test set (feature number 77) that never
        # appear in the training set
        self.__numeric_index = np.where(self.__train_feature.dtypes != "object")[0]
        self.__train_feature.iloc[:, self.__numeric_index] = (
            self.__train_feature.iloc[:, self.__numeric_index].apply(
                lambda x: x.fillna(-999999.0) if x.median() > 0 else x.fillna(999999.0)
            )
        )
        self.__test_feature.iloc[:, self.__numeric_index] = (
            self.__test_feature.iloc[:, self.__numeric_index].apply(
                lambda x: x.fillna(-999999.0) if x.median() > 0 else x.fillna(999999.0)
            )
        )

        # Shuffle before blending; strictly unnecessary here because the
        # StratifiedKFold below shuffles anyway.
        self.__train_feature, self.__train_label = shuffle(
            self.__train_feature, self.__train_label)

    def model_fit(self):
        self.__folds = StratifiedKFold(n_splits=5, shuffle=True)
        self.__oof_preds = np.zeros(shape=self.__train_feature.shape[0])
        self.__sub_preds = np.zeros(shape=self.__test_feature.shape[0])

        for n_fold, (trn_idx, val_idx) in enumerate(
                self.__folds.split(self.__train_feature, self.__train_label)):
            trn_x, trn_y = self.__train_feature.iloc[trn_idx], self.__train_label.iloc[trn_idx]
            val_x, val_y = self.__train_feature.iloc[val_idx], self.__train_label.iloc[val_idx]

            self.__cat = CatBoostClassifier(
                iterations=6000,
                od_wait=200,
                od_type="Iter",
                eval_metric="AUC"
            )
            self.__cat.fit(
                trn_x, trn_y,
                eval_set=[(val_x, val_y)],
                use_best_model=True
            )
            pred_val = self.__cat.predict_proba(val_x)[:, 1]
            pred_test = self.__cat.predict_proba(self.__test_feature)[:, 1]

            self.__oof_preds[val_idx] = pred_val
            self.__sub_preds += pred_test / self.__folds.n_splits
            print("Fold %2d AUC : %.6f" % (
                n_fold + 1, roc_auc_score(val_y, self.__oof_preds[val_idx])))
        print("Full AUC score %.6f" % roc_auc_score(self.__train_label, self.__oof_preds))

    def model_predict(self):
        self.__sample_submission["TARGET"] = self.__sub_preds
        self.__sample_submission.to_csv(
            os.path.join(self.__output_path, "sample_submission.csv"), index=False)
class BayesianOptimizationGoss(object):
    def __init__(self, *, input_path):
        self.__input_path = input_path
        # data prepare
        self.__train = None
        self.__train_label = None
        self.__train_feature = None
        self.__train_feature_stacking_tree = None
        self.__train_feature_stacking_linear = None
        self.__train_feature_stacking_network = None
        self.__train_feature_gp = None
        self.__encoder = None
        self.__categorical_columns = None
        # parameter tuning
        self.__gbm_bo = None
        self.__gbm_params = None
        self.__gp_params = {"alpha": 1e-4}

    def data_prepare(self):
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__train_feature_stacking_tree = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_tree_train.csv"))
        self.__train_feature_stacking_linear = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_linear_train.csv"))
        self.__train_feature_stacking_network = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_network_train.csv"))
        self.__train_feature_gp = pd.read_csv(
            os.path.join(self.__input_path, "genetic_train_feature.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(
            ["TARGET"] + [col for col in self.__train.columns.tolist()
                          if re.search(r"SK_ID", col)],
            axis=1)

        self.__encoder = TargetEncoder()
        self.__categorical_columns = self.__train_feature.select_dtypes(
            "object").columns.tolist()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        self.__train_feature[self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature[self.__categorical_columns]))

        self.__train_feature = pd.concat([
            self.__train_feature, self.__train_feature_stacking_tree,
            self.__train_feature_stacking_linear,
            self.__train_feature_stacking_network
        ], axis=1)

    def parameter_tuning(self):
        def __cv(drop_rate, max_drop, skip_drop, n_estimators, learning_rate,
                 max_depth, num_leaves, min_split_gain, min_child_weight,
                 colsample_bytree, subsample, reg_alpha, reg_lambda):
            # Cap num_leaves at 2 ** max_depth (note ** not ^; the original
            # used ^, which is XOR in Python).
            depth = max(round(max_depth), 1)
            leaves_cap = 2 ** depth
            val = cross_val_score(
                LGBMClassifier(
                    boosting_type="dart",
                    drop_rate=max(min(drop_rate, 1.0), 0),
                    max_drop=max(round(max_drop), 1),
                    skip_drop=max(min(skip_drop, 1.0), 0),
                    n_estimators=max(round(n_estimators), 1),
                    learning_rate=max(min(learning_rate, 1.0), 0),
                    max_depth=depth,
                    num_leaves=max(round(leaves_cap if num_leaves > leaves_cap
                                         else num_leaves), 1),
                    min_split_gain=max(min_split_gain, 0),
                    min_child_weight=max(min_child_weight, 0),
                    colsample_bytree=max(min(colsample_bytree, 1.0), 0),
                    subsample=max(min(subsample, 1.0), 0),
                    reg_alpha=max(reg_alpha, 0),
                    reg_lambda=max(reg_lambda, 0),
                    n_jobs=-1,
                    verbose=-1),
                self.__train_feature,
                self.__train_label,
                scoring="roc_auc",
                # must match the CV used by the blended LightGBM models
                cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=8)).mean()
            return val

        self.__gbm_params = {
            # dart parameters
            "drop_rate": (0, 1.0),
            "max_drop": (10, 200),
            "skip_drop": (0, 1.0),
            # gradient boosting parameters
            "n_estimators": (500, 3000),
            "learning_rate": (0.001, 0.1),
            # tree parameters
            "max_depth": (4, 10),
            "num_leaves": (10, 200),
            "min_split_gain": (0.00001, 0.1),
            "min_child_weight": (1, 100),
            # bagging parameters
            "colsample_bytree": (0.5, 1.0),
            "subsample": (0.5, 1.0),
            # regularisation parameters
            "reg_alpha": (0, 10),
            "reg_lambda": (0, 10)
        }
        self.__gbm_bo = BayesianOptimization(__cv, self.__gbm_params)
        self.__gbm_bo.maximize(init_points=30, n_iter=130, **self.__gp_params)
M = pd.read_csv('prediction_data.csv')
M['Year of Record'] = simpleimputermedian.fit_transform(
    M['Year of Record'].values.reshape(-1, 1))
M['Age'] = simpleimputermedian.fit_transform(M['Age'].values.reshape(-1, 1))
M['Body Height [cm]'] = simpleimputermedian.fit_transform(
    M['Body Height [cm]'].values.reshape(-1, 1))
Mnoncateg = M.drop(
    ['Instance', 'Hair Color', 'Wears Glasses', 'Income'], axis=1)

X = datasetnoncateg.drop('Income in EUR', axis=1).values
Y = datasetnoncateg['Income in EUR'].values

# target encoding
t1 = TargetEncoder()
t1.fit(X, Y)
X = t1.transform(X)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.33,
                                                random_state=0)

# regressor = BayesianRidge()
regressor = RandomForestRegressor()
# regressor = AdaBoostRegressor()
# regressor = linear_model.SGDRegressor(max_iter=1000, tol=1e-3)
fitResult = regressor.fit(Xtrain, Ytrain)
YPredTest = regressor.predict(Xtest)
# learningTest = pd.DataFrame({'Predicted': YPredTest, 'Actual': Ytest})
np.sqrt(metrics.mean_squared_error(Ytest, YPredTest))
class BayesianOptimizationGbdt(object):
    def __init__(self, *, input_path):
        self.__input_path = input_path
        # data prepare
        self.__train = None
        self.__train_label = None
        self.__train_feature = None
        self.__encoder = None
        self.__categorical_columns = None
        # parameter tuning
        self.__gbm_bo = None
        self.__gbm_params = None
        self.__gp_params = {"alpha": 1e-4}

    def data_prepare(self):
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(
            ["TARGET"] + [col for col in self.__train.columns.tolist()
                          if re.search(r"SK_ID", col)],
            axis=1)
        self.__encoder = TargetEncoder()
        self.__categorical_columns = self.__train_feature.select_dtypes(
            "object").columns.tolist()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        self.__train_feature[self.__categorical_columns] = (
            self.__encoder.transform(self.__train_feature[self.__categorical_columns])
        )

    def parameter_tuning(self):
        def __cv(n_estimators, learning_rate, max_depth, num_leaves,
                 min_split_gain, min_child_weight, colsample_bytree,
                 subsample, reg_alpha, reg_lambda):
            # When num_leaves > 2 ** max_depth, the leaf-wise tree grows too
            # deep and overfits, so cap it there (note ** not ^; the original
            # used ^, which is XOR in Python).
            depth = max(int(round(max_depth)), 1)
            leaves_cap = 2 ** depth
            val = cross_val_score(
                LGBMClassifier(
                    n_estimators=max(int(round(n_estimators)), 1),
                    learning_rate=max(min(learning_rate, 1.0), 0),
                    max_depth=depth,
                    num_leaves=max(leaves_cap
                                   if int(round(num_leaves)) > leaves_cap
                                   else int(round(num_leaves)), 1),
                    min_split_gain=max(min_split_gain, 0),
                    min_child_weight=max(min_child_weight, 0),
                    colsample_bytree=max(min(colsample_bytree, 1.0), 0),
                    subsample=max(min(subsample, 1.0), 0),
                    reg_alpha=max(reg_alpha, 0),
                    reg_lambda=max(reg_lambda, 0),
                    n_jobs=-1,
                    verbose=-1
                ),
                self.__train_feature,
                self.__train_label,
                scoring="roc_auc",
                cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=7)
            ).mean()
            return val

        self.__gbm_params = {
            # gradient boosting parameters
            "n_estimators": (5500, 6500),
            "learning_rate": (0.001, 0.03),
            # tree parameters
            "max_depth": (4, 10),
            "num_leaves": (10, 200),
            "min_split_gain": (0.00001, 0.1),
            "min_child_weight": (1, 100),
            # bagging parameters
            "colsample_bytree": (0.5, 1.0),
            "subsample": (0.5, 1.0),
            # regularisation parameters
            "reg_alpha": (0, 10),
            "reg_lambda": (0, 10)
        }
        self.__gbm_bo = BayesianOptimization(__cv, self.__gbm_params)
        self.__gbm_bo.maximize(**self.__gp_params)
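# The clamp logic above, factored into a plain function so the intent is
# testable on its own (a sketch, not part of the original class):
def clamp_num_leaves(num_leaves, max_depth):
    """Cap num_leaves at 2 ** max_depth to curb leaf-wise overfitting."""
    cap = 2 ** max(int(round(max_depth)), 1)
    return max(min(int(round(num_leaves)), cap), 2)

assert clamp_num_leaves(200, 4) == 16   # capped at 2 ** 4
assert clamp_num_leaves(10, 10) == 10   # small values pass through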
def train(MODEL="GNB"): # load voter data and merge with Census data df = pd.read_csv(DIR + "/data/nc_voter_geocoded_census_block_trigrams.csv") df = prep_data(df) tes = {} #tes = joblib.load(DIR + "/data/models/transformers_binary.joblib") models = {} # Loop through each race class, create model for each for race in ["W", "B", "A", "I", "HL"]: X = df.copy() # If hispanic, use ethnic_code instead of race code if race == "HL": X["ethnic_code"] = np.where(X["ethnic_code"] == race, True, False) y = X["ethnic_code"] # other wise race code else: X["race_code"] = np.where(X["race_code"] == race, True, False) y = X["race_code"] # target encode names, save target encoder for col in ["first_name", "last_name", "middle_name"]: #te = tes[race][col] te = TargetEncoder() te.fit(X[col], y) X[col] = te.transform(X[col]) # remove target variables and fill in any nas with 0 #sample_weights = X["sample_weights"] #X = X.drop(["race_code", "ethnic_code", "zip", "sample_weights"], axis=1) X = X.fillna(0) sm = SMOTE(n_jobs=-1) X, y = sm.fit_resample(X, y) sample_weights = X["sample_weights"] X = X.drop(["zip", "sample_weights"], axis=1) # train model if MODEL == "LGBM": from lightgbm import LGBMClassifier model = LGBMClassifier(n_jobs=-1) elif MODEL == "GNB": from sklearn.naive_bayes import GaussianNB model = GaussianNB() elif MODEL == "XGB": from xgboost import XGBClassifier model = XGBClassifier(n_jobs=-1) elif MODEL == "SGD": model = SGDClassifier(alpha=0.0, eta0=0.1, fit_intercept=True, l1_ratio=1.0, learning_rate="constant", loss="modified_huber", penalty="elasticnet", power_t=0.0) elif MODEL == "RF": from sklearn.ensemble import RandomForestClassifier model = RandomForestClassifier(n_jobs=-1, max_depth=10) model.fit(X[MODEL_COLS], y, sample_weight=sample_weights) # save model models[race] = model # score model print(race, model.score(X[MODEL_COLS], y)) # Save the models and encoders handle = MODEL.lower() #joblib.dump(tes, DIR + "/data/models/transformers_binary.joblib", compress=True) joblib.dump(models, DIR + "/data/models/models_binary_%s.joblib" % handle, compress=True) #joblib.dump(scalers, DIR + "/data/models/scalers_binary.joblib", compress=True) print("Trained model saved to ./data/models/")
class FeatureSelectionUseVariance(object):
    def __init__(self, *, input_path, output_path):
        self.__input_path = input_path
        self.__output_path = output_path
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_label = None
        self.__train_feature, self.__test_feature = [None for _ in range(2)]
        self.__categorical_columns = None
        self.__encoder = None
        self.__remove_feature = []

    def data_prepare(self):
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_feature_df.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["SK_ID_CURR", "TARGET"], axis=1)
        self.__test_feature = self.__test[self.__train_feature.columns.tolist()]

        # Dropping duplicate columns this way does not work as expected;
        # an alternative is to drop features whose pairwise similarity is
        # exactly 1.
        # self.__train_feature = self.__train_feature.T.drop_duplicates().T
        # self.__test_feature = self.__test_feature[self.__train_feature.columns.tolist()]

        # encoder
        self.__categorical_columns = (self.__train_feature.select_dtypes(
            include="object").columns.tolist())
        self.__train_feature[self.__categorical_columns] = (
            self.__train_feature[self.__categorical_columns].fillna("missing"))
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        self.__train_feature[self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature[self.__categorical_columns]))

        # flag zero-variance features for removal
        for col in self.__train_feature.columns.tolist():
            if self.__train_feature[col].std() == 0.:
                print(col)
                self.__remove_feature.append(col)

    def data_output(self):
        self.__train[[
            col for col in self.__train.columns.tolist()
            if col not in self.__remove_feature
        ]].to_csv(os.path.join(self.__output_path, "train_select_feature_df.csv"),
                  index=False)
        self.__test[[
            col for col in self.__test.columns.tolist()
            if col not in self.__remove_feature
        ]].to_csv(os.path.join(self.__output_path, "test_select_feature_df.csv"),
                  index=False)
class LightGbmOneFold(object):
    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path
        # data prepare
        self.__sample_submission = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_feature, self.__test_feature = [None for _ in range(2)]
        self.__train_label = None
        self.__categorical_columns = None
        self.__encoder = None
        # model fit
        self.__folds = None
        self.__train_preds = None
        self.__test_preds = None
        self.__gbm = None

    def data_prepare(self):
        self.__sample_submission = pd.read_csv(
            os.path.join(self.__input_path, "sample_submission.csv"))
        # selected feature
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_select_feature_df.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(
            ["TARGET"] + [col for col in self.__train.columns.tolist()
                          if re.search(r"SK_ID", col)],
            axis=1)
        self.__test_feature = self.__test[self.__train_feature.columns.tolist()]

        self.__categorical_columns = self.__train_feature.select_dtypes(
            "object").columns.tolist()
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature.loc[:, self.__categorical_columns],
                           self.__train_label)
        self.__train_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature.loc[:, self.__categorical_columns])
        )
        self.__test_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(
                self.__test_feature.loc[:, self.__categorical_columns])
        )

    def model_fit(self):
        feature_importance_df = pd.DataFrame()
        self.__gbm = LGBMClassifier(
            n_estimators=5000,
            learning_rate=0.0128,
            max_depth=8,
            num_leaves=11,
            min_split_gain=0.0018,
            min_child_weight=2.6880,
            colsample_bytree=0.5672,
            subsample=0.6406,
            reg_alpha=3.5025,
            reg_lambda=0.9549,
            n_jobs=-1
        )
        self.__gbm.fit(self.__train_feature, self.__train_label, verbose=True)
        self.__train_preds = self.__gbm.predict_proba(self.__train_feature)[:, 1]
        self.__test_preds = self.__gbm.predict_proba(self.__test_feature)[:, 1]

        feature_importance_df["feature"] = pd.Series(self.__train_feature.columns)
        feature_importance_df["importance"] = self.__gbm.feature_importances_
        feature_importance_df.to_csv(
            os.path.join(self.__output_path, "feature_importance.csv"), index=False)
        print("Train AUC score %.6f" % roc_auc_score(self.__train_label,
                                                     self.__train_preds))

    def model_predict(self):
        self.__sample_submission["TARGET"] = self.__test_preds
        self.__sample_submission.to_csv(
            os.path.join(self.__output_path, "sample_submission.csv"), index=False)
class ROHE(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.__columns = None
        self.__missing = None
        self.__categories = None
        self.__lab_encoder = None
        self.__tar_encoder = None
        self.__ohe_encoder = None

    def fit(self, X, y=None):
        feature, label = X.copy(deep=True), y.copy(deep=True)
        del X, y
        gc.collect()

        self.__columns = list()
        self.__missing = dict()
        self.__categories = dict()
        self.__lab_encoder = dict()
        for column in feature.columns:
            num_unique = feature[column].nunique()
            if num_unique == 1:
                # constant columns carry no information
                continue
            self.__columns.append(column)
            if feature[column].isna().sum():
                self.__missing[column] = "missing"
            else:
                # for fully observed columns, unseen values at transform time
                # fall back to the rarest training category
                self.__missing[column] = feature[column].value_counts(
                    ascending=True).index[0]
            # fill NaNs before fitting the LabelEncoder (the original fitted
            # it on the unfilled column, which fails when NaNs are present)
            feature[column] = feature[column].fillna(self.__missing[column])
            self.__categories[column] = feature[column].unique()
            encoder = LabelEncoder()
            encoder.fit(feature[column])
            feature[column] = encoder.transform(feature[column])
            self.__lab_encoder[column] = encoder

        feature = feature[self.__columns].copy(deep=True)
        self.__tar_encoder = TargetEncoder()
        self.__tar_encoder.fit(feature.astype(str), label)
        self.__ohe_encoder = OneHotEncoder(categories="auto", sparse=True)  # drop="first" bad
        self.__ohe_encoder.fit(self.__tar_encoder.transform(feature.astype(str)))

    def transform(self, X):
        feature = X.copy(deep=True)
        del X
        gc.collect()

        feature = feature[self.__columns].copy(deep=True)
        for column in feature.columns:
            feature[column] = feature[column].fillna(self.__missing[column])
            # map unseen categories to the per-column fill value
            feature[column] = feature[column].apply(
                lambda element: element if element in self.__categories[column]
                else self.__missing[column])
            feature[column] = self.__lab_encoder[column].transform(feature[column])
        return self.__ohe_encoder.transform(
            self.__tar_encoder.transform(feature.astype(str)))

    def fit_transform(self, X, y=None, **fit_params):
        feature, label = X.copy(deep=True), y.copy(deep=True)
        del X, y
        gc.collect()
        self.fit(feature, label)
        return self.transform(feature)
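# Illustrative round trip for ROHE on a toy frame (assumes the sklearn,
# category_encoders, pandas and gc imports used by the class are in scope;
# note sklearn >= 1.2 renames OneHotEncoder's sparse= to sparse_output=).
import pandas as pd

X = pd.DataFrame({"c1": ["a", "b", "a", None], "c2": ["x", "x", "y", "y"]})
y = pd.Series([1, 0, 1, 0])
rohe = ROHE()
train_sparse = rohe.fit_transform(X, y)
# unseen "z" falls back to the fill value learned for c1
new_sparse = rohe.transform(pd.DataFrame({"c1": ["a", "z"], "c2": ["y", "x"]}))
print(train_sparse.shape, new_sparse.shape)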
class LightGbmKfold(object):
    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path
        # data prepare
        self.__sample_submission = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_feature, self.__test_feature = [None for _ in range(2)]
        self.__train_label = None
        self.__categorical_columns = None
        self.__encoder = None
        # model fit
        self.__folds = None
        self.__oof_preds = None
        self.__sub_preds = None
        self.__gbm = None

    def data_prepare(self):
        self.__sample_submission = pd.read_csv(
            os.path.join(self.__input_path, "sample_submission.csv"))
        self.__train = pd.read_csv(os.path.join(self.__input_path, "train_feature_df.csv"))
        self.__test = pd.read_csv(os.path.join(self.__input_path, "test_feature_df.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(
            ["TARGET"] + [col for col in self.__train.columns.tolist()
                          if re.search(r"SK_ID", col)],
            axis=1)
        self.__test_feature = self.__test[self.__train_feature.columns.tolist()]

        self.__categorical_columns = self.__train_feature.select_dtypes(
            "object").columns.tolist()
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature.loc[:, self.__categorical_columns],
                           self.__train_label)
        self.__train_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature.loc[:, self.__categorical_columns])
        )
        self.__test_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(
                self.__test_feature.loc[:, self.__categorical_columns])
        )

    def model_fit(self):
        self.__folds = StratifiedKFold(n_splits=5, shuffle=True)
        self.__oof_preds = np.zeros(shape=self.__train_feature.shape[0])
        self.__sub_preds = np.zeros(shape=self.__test_feature.shape[0])
        feature_importance_df = pd.DataFrame()

        for n_fold, (trn_idx, val_idx) in enumerate(
                self.__folds.split(self.__train_feature, self.__train_label)):
            trn_x, trn_y = self.__train_feature.iloc[trn_idx], self.__train_label.iloc[trn_idx]
            val_x, val_y = self.__train_feature.iloc[val_idx], self.__train_label.iloc[val_idx]

            self.__gbm = LGBMClassifier(
                n_estimators=10000,
                learning_rate=0.02,
                num_leaves=34,
                colsample_bytree=0.9497036,
                subsample=0.8715623,
                max_depth=8,
                reg_alpha=0.041545473,
                reg_lambda=0.0735294,
                min_split_gain=0.0222415,
                min_child_weight=39.3259775
            )
            self.__gbm.fit(
                trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric="auc",
                verbose=True,
                early_stopping_rounds=200
            )
            pred_val = self.__gbm.predict_proba(
                val_x, num_iteration=self.__gbm.best_iteration_)[:, 1]
            pred_test = self.__gbm.predict_proba(
                self.__test_feature, num_iteration=self.__gbm.best_iteration_)[:, 1]

            self.__oof_preds[val_idx] = pred_val
            self.__sub_preds += pred_test / self.__folds.n_splits

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = pd.Series(self.__train_feature.columns)
            fold_importance_df["importance"] = self.__gbm.feature_importances_
            fold_importance_df["fold"] = n_fold + 1
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)
            print("Fold %2d AUC : %.6f" % (
                n_fold + 1, roc_auc_score(val_y, self.__oof_preds[val_idx])))
        feature_importance_df.to_csv(
            os.path.join(self.__output_path, "feature_importance.csv"), index=False)
        print("Full AUC score %.6f" % roc_auc_score(self.__train_label, self.__oof_preds))

    def model_predict(self):
        self.__sample_submission["TARGET"] = self.__sub_preds
        self.__sample_submission.to_csv(
            os.path.join(self.__output_path, "sample_submission.csv"), index=False)
import pandas as pd
from category_encoders import TargetEncoder
import joblib

data = pd.read_csv('./京东万象数据填充2.csv', encoding='GBK')
data = data.dropna(subset=['价格'])
data = data.dropna(subset=['数据标签'])
data = data.dropna(subset=['数据名称'])
data = data.dropna(subset=['店铺'])

enc = TargetEncoder(cols=['数据名称', '店铺', '数据标签'])
# print(type(enc))
dataframe = data[['数据名称', '店铺', '数据标签', '数据大小', '浏览量', '价格']]
enc.fit(dataframe, dataframe['价格'])
data1 = enc.transform(dataframe)
# print(type(data1))
# dataframe = pd.DataFrame({'数据名称': data1['数据名称'], '店铺': data1['店铺'],
#                           '数据标签': data1['数据标签'], '数据大小': data1['数据大小'],
#                           '浏览量': data1['浏览量'], '价格': data1['价格']})

joblib.dump(enc, 'encoding.joblib')
data1.to_csv('final_data.csv', encoding='GBK', sep=',')
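# Later reuse of the serialized encoder (a sketch): load it back and apply
# the learned category statistics to fresh rows with the same schema.
enc_loaded = joblib.load('encoding.joblib')
sample = dataframe.head()   # stand-in for newly collected rows
print(enc_loaded.transform(sample).head())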
# training['Hair Color'] = training['Hair Color'].replace(np.nan, 'Nan_data')
# training['Hair Color'] = training['Hair Color'].replace('0', 'Nan_data')
# training['Hair Color'] = training['Hair Color'].replace('Unknown', 'Nan_data')
# test['Hair Color'] = test['Hair Color'].replace(np.nan, 'Nan_data')
# test['Hair Color'] = test['Hair Color'].replace('0', 'Nan_data')
# test['Hair Color'] = test['Hair Color'].replace('Unknown', 'Nan_data')

X = training.iloc[:, :-1]
y = training.iloc[:, -1]

# Target encoding for categorical features.
te = TargetEncoder()
te.fit(X, y)
X = te.transform(X)
predict_dataset = te.transform(test)

from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.3,
                                                  random_state=42)

# from catboost import CatBoostRegressor
# Using CatBoost
# cat_model3 = CatBoostRegressor(iterations=125000)
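# Caveat worth noting (an aside, not from the original snippet): fitting the
# encoder on all of X before splitting leaks target statistics from the
# validation rows into x_val. A leakage-free ordering fits on the training
# fold only:
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder

x_train, x_val, y_train, y_val = train_test_split(
    training.iloc[:, :-1], training.iloc[:, -1],
    test_size=0.3, random_state=42)
te = TargetEncoder().fit(x_train, y_train)
x_train, x_val = te.transform(x_train), te.transform(x_val)
predict_dataset = te.transform(test)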
class BayesianOptimizationGoss(object):
    def __init__(self, *, input_path):
        self.__input_path = input_path
        # data prepare
        self.__train = None
        self.__train_label = None
        self.__train_feature = None
        self.__encoder = None
        self.__categorical_columns = None
        # parameter tuning
        self.__gbm_bo = None
        self.__gbm_params = None
        self.__gp_params = {"alpha": 1e-3}

    def data_prepare(self):
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(
            ["TARGET"] + [col for col in self.__train.columns.tolist()
                          if re.search(r"SK_ID", col)],
            axis=1)
        self.__encoder = TargetEncoder()
        self.__categorical_columns = self.__train_feature.select_dtypes(
            "object").columns.tolist()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        self.__train_feature[self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature[self.__categorical_columns]))
        self.__train_feature, self.__train_label = shuffle(
            self.__train_feature, self.__train_label)

    def parameter_tuning(self):
        def __cv(top_rate, other_rate, n_estimators, learning_rate, max_depth,
                 num_leaves, min_split_gain, min_child_weight,
                 colsample_bytree, reg_alpha, reg_lambda):
            # Cap num_leaves at 2 ** max_depth (the original applied this
            # clamp to max_depth, and used ^, which is XOR in Python,
            # instead of **).
            depth = max(int(max_depth), 1)
            leaves_cap = 2 ** depth
            val = cross_val_score(
                LGBMClassifier(
                    boosting_type="goss",
                    top_rate=max(min(top_rate, 1.0), 0),
                    other_rate=max(min(1.0 - top_rate, 1.0), 0),
                    n_estimators=max(int(n_estimators), 1),
                    learning_rate=max(min(learning_rate, 1.0), 0),
                    max_depth=depth,
                    num_leaves=max(min(int(num_leaves), leaves_cap), 1),
                    min_split_gain=max(min_split_gain, 0),
                    min_child_weight=max(min_child_weight, 0),
                    colsample_bytree=max(min(colsample_bytree, 1.0), 0),
                    # subsample=max(min(subsample, 1.0), 0),
                    reg_alpha=max(reg_alpha, 0),
                    reg_lambda=max(reg_lambda, 0),
                    n_jobs=4,
                    verbose=-1),
                self.__train_feature,
                self.__train_label,
                scoring="roc_auc",
                cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=7)).mean()
            return val

        self.__gbm_params = {
            # GOSS: top_rate + other_rate = 1
            "top_rate": (0.01, 1),
            "other_rate": (0.01, 1),
            # gradient boosting parameters
            "n_estimators": (1000, 4000),
            "learning_rate": (0.001, 0.1),
            # tree parameters
            "max_depth": (4, 10),
            "num_leaves": (10, 200),
            "min_split_gain": (0.00001, 0.1),
            "min_child_weight": (1, 100),
            # bagging parameters
            "colsample_bytree": (0, 0.999),
            # "subsample": (0, 0.999),
            # regularisation parameters
            "reg_alpha": (0, 10),
            "reg_lambda": (0, 10)
        }
        self.__gbm_bo = BayesianOptimization(__cv, self.__gbm_params)
        self.__gbm_bo.maximize(init_points=10, n_iter=50, kappa=2.576 * 2,
                               **self.__gp_params)
        # report the best score and parameters (old bayes_opt result API)
        print(self.__gbm_bo.res["max"]["max_val"])
        for name in ("top_rate", "other_rate", "n_estimators", "learning_rate",
                     "max_depth", "num_leaves", "min_split_gain",
                     "min_child_weight", "colsample_bytree",
                     "reg_alpha", "reg_lambda"):
            print(self.__gbm_bo.res["max"]["max_params"][name])
def ProcessRawData(df, schemaCols=None):
    medianSimpleImputer = SimpleImputer(strategy='median')
    standardScaler = preprocessing.StandardScaler()

    # Adding extra features AgeLog and HeightLog
    df['AgeLog'] = np.log(df['Age'].values)
    df['HeightLog'] = np.log(df['Body Height [cm]'].values)

    # Fill missing values
    df[['Year of Record', 'Age', 'AgeLog', 'HeightLog']] = (
        medianSimpleImputer.fit_transform(
            df[['Year of Record', 'Age', 'AgeLog', 'HeightLog']].values))

    # Scale numeric columns
    df[['Year of Record', 'Size of City', 'Body Height [cm]', 'Age', 'AgeLog']] = (
        standardScaler.fit_transform(
            df[['Year of Record', 'Size of City', 'Body Height [cm]', 'Age',
                'AgeLog']].values))

    # Scale the target as well, keeping the scaler global so predictions can
    # be inverse-transformed later
    if 'Income in EUR' in df.columns:
        global YScaler
        YScaler = preprocessing.StandardScaler()
        df[['Income in EUR']] = YScaler.fit_transform(df[['Income in EUR']].values)

    # Reducing complexity of features
    df.Profession = list(df.Profession.map(S2))

    # To be used while writing results to CSV
    instances = df['Instance'].values
    df = df.drop(['Instance'], axis=1)
    print('Columns available 1 - ', df.columns)

    # Target encoding the data - could have been done with a single encoder
    # object. Fitting is skipped on the prediction dataset (schemaCols set),
    # which is only transformed.
    if schemaCols is None:
        global t1, t2, t3, t4, t5
        t1 = TargetEncoder()
        t2 = TargetEncoder()
        t3 = TargetEncoder()
        t4 = TargetEncoder()
        t5 = TargetEncoder()
        t1.fit(df.Country.values, df['Income in EUR'].values)
        t2.fit(df.Profession.values, df['Income in EUR'].values)
        t3.fit(df.Gender.values, df['Income in EUR'].values)
        t4.fit(df['University Degree'].values, df['Income in EUR'].values)
        t5.fit(df['Hair Color'].values, df['Income in EUR'].values)
    df.Country = t1.transform(df.Country.values)
    df.Profession = t2.transform(df.Profession.values)
    df.Gender = t3.transform(df.Gender.values)
    df['University Degree'] = t4.transform(df['University Degree'].values)
    df['Hair Color'] = t5.transform(df['Hair Color'].values)

    if schemaCols is not None:
        # align the prediction frame to the training schema
        newdf = pd.DataFrame()
        for columnName in schemaCols:
            if columnName not in df.columns:
                newdf[columnName] = 0
            else:
                newdf[columnName] = df[columnName].values
        df = newdf

    # standardise prediction and training frames so the same code works on both
    df = df.sort_index(axis=1)
    if 'Income in EUR' not in df.columns:
        df['Income in EUR'] = np.zeros(df.values.shape[0])
    if 'Income' in df.columns:
        df = df.drop('Income', axis=1)  # original: df.drop('Income'), which was a no-op bug

    X = df.drop('Income in EUR', axis=1).values
    Y = df['Income in EUR'].values
    print('Shape - ', df.shape)

    global featSel
    if featSel is None:
        print('k = ? ')
        featSel = SelectKBest(f_regression, k=10)
        featSel.fit(X, Y)
    X = featSel.transform(X)
    print('Shape after feature selection - ', X.shape)
    return instances, X, Y, df.columns
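# Downstream use implied by the global YScaler (a sketch; raw_df and model
# are hypothetical stand-ins, not names from the original file): predictions
# come out in the scaled target space and must be inverse-transformed.
featSel = None                       # module-level global the function expects
instances, X, Y, cols = ProcessRawData(raw_df.copy())
preds_scaled = model.predict(X)      # hypothetical fitted regressor
preds = YScaler.inverse_transform(preds_scaled.reshape(-1, 1)).ravel()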
class StackingFirstLayerLinear(object):
    def __init__(self, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path
        self.__skwp = importlib.import_module("SklWrapper")
        # data prepare
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_feature, self.__train_label = [None for _ in range(2)]
        self.__test_feature = None
        self.__categorical_index = None
        self.__numeric_index = None
        # filler encoder scaler
        self.__filler, self.__encoder, self.__scaler = [None for _ in range(3)]
        self.__oof_train, self.__oof_test = [None for _ in range(2)]
        self.__first_layer_train, self.__first_layer_test = [None for _ in range(2)]
        # model fit

    def data_prepare(self):
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_select_feature_df.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(
            ["TARGET"] + [col for col in self.__train.columns.tolist()
                          if re.search(r"SK_ID", col)],
            axis=1)
        self.__test_feature = self.__test[self.__train_feature.columns.tolist()]

        # drop columns with 20% or more missing values
        self.__train_feature = self.__train_feature[list(
            (self.__train_feature.isna().sum() /
             self.__train_feature.isna().count())
            [(self.__train_feature.isna().sum() /
              self.__train_feature.isna().count()) < 0.2].index)]
        self.__test_feature = self.__test_feature[
            self.__train_feature.columns.tolist()]

        # these hold column names, not positional indexes
        self.__categorical_index = self.__train_feature.select_dtypes(
            include="object").columns.tolist()
        self.__numeric_index = self.__train_feature.select_dtypes(
            exclude="object").columns.tolist()

        # filler: Imputer drops columns that are entirely NaN
        self.__filler = Imputer(strategy="median")
        self.__filler.fit(self.__train_feature[self.__numeric_index])
        self.__train_feature[self.__numeric_index] = self.__filler.transform(
            self.__train_feature[self.__numeric_index])
        self.__test_feature[self.__numeric_index] = self.__filler.transform(
            self.__test_feature[self.__numeric_index])

        # encoder
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature[self.__categorical_index],
                           self.__train_label)
        self.__train_feature[self.__categorical_index] = self.__encoder.transform(
            self.__train_feature[self.__categorical_index])
        self.__test_feature[self.__categorical_index] = self.__encoder.transform(
            self.__test_feature[self.__categorical_index])

        # scaler: pandas in, numpy out
        self.__scaler = MinMaxScaler()
        self.__scaler.fit(self.__train_feature)
        self.__train_feature = pd.DataFrame(
            self.__scaler.transform(self.__train_feature),
            columns=self.__train_feature.columns)
        self.__test_feature = pd.DataFrame(
            self.__scaler.transform(self.__test_feature),
            columns=self.__test_feature.columns)

    def model_fit(self):
        def __get_oof(clf, train_feature, train_label, test_feature):
            folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
            oof_train = np.zeros(shape=train_feature.shape[0])
            oof_test = np.zeros(shape=test_feature.shape[0])
            for n_fold, (trn_idx, val_idx) in enumerate(
                    folds.split(train_feature, train_label)):
                trn_x, trn_y = train_feature.iloc[trn_idx], train_label.iloc[trn_idx]
                val_x, val_y = train_feature.iloc[val_idx], train_label.iloc[val_idx]
                clf.train(trn_x, trn_y)
                pred_val = clf.predict(val_x)
                pred_test = clf.predict(test_feature)
                oof_train[val_idx] = pred_val
                oof_test += pred_test / folds.n_splits
            return oof_train.reshape((-1, 1)), oof_test.reshape((-1, 1))

        lr_p1 = self.__skwp.SklWrapper(clf=LogisticRegression,
                                       init_params={"penalty": "l1"})
        lr_p2 = self.__skwp.SklWrapper(clf=LogisticRegression,
                                       init_params={"penalty": "l2"})
        mlp_unit_100 = self.__skwp.SklWrapper(
            clf=MLPClassifier, init_params={"hidden_layer_sizes": (100,)})
        mlp_unit_200 = self.__skwp.SklWrapper(
            clf=MLPClassifier, init_params={"hidden_layer_sizes": (200,)})
        mlp_unit_300 = self.__skwp.SklWrapper(
            clf=MLPClassifier, init_params={"hidden_layer_sizes": (300,)})
        # mlp_unit_5_100 = self.__skwp.SklWrapper(
        #     clf=MLPClassifier,
        #     init_params={"hidden_layer_sizes": (100, 100, 100, 100, 100)})
        # mlp_unit_5_300 = self.__skwp.SklWrapper(
        #     clf=MLPClassifier,
        #     init_params={"hidden_layer_sizes": (300, 300, 300, 300, 300)})
        # mlp_unit_5_900 = self.__skwp.SklWrapper(
        #     clf=MLPClassifier,
        #     init_params={"hidden_layer_sizes": (900, 900, 900, 900, 900)})

        lr_p1_oof_train, lr_p1_oof_test = __get_oof(
            lr_p1, self.__train_feature, self.__train_label, self.__test_feature)
        print("lr l1 oof complete !")
        lr_p2_oof_train, lr_p2_oof_test = __get_oof(
            lr_p2, self.__train_feature, self.__train_label, self.__test_feature)
        print("lr l2 oof complete !")
        mlp_unit_100_oof_train, mlp_unit_100_oof_test = __get_oof(
            mlp_unit_100, self.__train_feature, self.__train_label,
            self.__test_feature)
        print("mlp 100 oof complete !")
        mlp_unit_200_oof_train, mlp_unit_200_oof_test = __get_oof(
            mlp_unit_200, self.__train_feature, self.__train_label,
            self.__test_feature)
        print("mlp 200 oof complete !")
        mlp_unit_300_oof_train, mlp_unit_300_oof_test = __get_oof(
            mlp_unit_300, self.__train_feature, self.__train_label,
            self.__test_feature)
        print("mlp 300 oof complete !")
        # mlp_unit_5_100_oof_train, mlp_unit_5_100_oof_test = __get_oof(
        #     mlp_unit_5_100, self.__train_feature, self.__train_label,
        #     self.__test_feature)
        # print("mlp 5 100 oof complete !")
        # mlp_unit_5_300_oof_train, mlp_unit_5_300_oof_test = __get_oof(
        #     mlp_unit_5_300, self.__train_feature, self.__train_label,
        #     self.__test_feature)
        # print("mlp 5 300 oof complete !")
        # mlp_unit_5_900_oof_train, mlp_unit_5_900_oof_test = __get_oof(
        #     mlp_unit_5_900, self.__train_feature, self.__train_label,
        #     self.__test_feature)
        # print("mlp 5 900 oof complete !")

        self.__oof_train = np.hstack(
            (lr_p1_oof_train, lr_p2_oof_train, mlp_unit_100_oof_train,
             mlp_unit_200_oof_train, mlp_unit_300_oof_train))
        self.__oof_test = np.hstack(
            (lr_p1_oof_test, lr_p2_oof_test, mlp_unit_100_oof_test,
             mlp_unit_200_oof_test, mlp_unit_300_oof_test))

    def model_predict(self):
        self.__oof_train = pd.DataFrame(
            self.__oof_train,
            columns=["lr_p1", "lr_p2", "mlp_unit_100", "mlp_unit_200",
                     "mlp_unit_300"])
        self.__oof_test = pd.DataFrame(
            self.__oof_test,
            columns=["lr_p1", "lr_p2", "mlp_unit_100", "mlp_unit_200",
                     "mlp_unit_300"])
        self.__first_layer_train = self.__oof_train
        self.__first_layer_test = self.__oof_test
        self.__first_layer_train.to_csv(
            os.path.join(self.__output_path, "first_layer_linear_train.csv"),
            index=False)
        self.__first_layer_test.to_csv(
            os.path.join(self.__output_path, "first_layer_linear_test.csv"),
            index=False)
# Splitting the training dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7,
                                                    random_state=100)

# Installing category_encoders to import Target Encoder
!pip install category_encoders

# Importing Target Encoder
from category_encoders import TargetEncoder

# Creating an object "te" for Target Encoder
te = TargetEncoder()

# Fitting Target Encoder on X_train and y_train (training data)
te.fit(X_train, y_train)

# Transforming X_train (training data)
X_train = te.transform(X_train)

# Transforming X_test (test data)
X_test = te.transform(X_test)

# Importing Logistic Regression from sklearn
from sklearn.linear_model import LogisticRegression

# Creating object for Logistic Regression
lr = LogisticRegression()

# Fitting Logistic Regression on X_train and y_train (training data)
lr.fit(X_train, y_train)
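# Quick hold-out evaluation of the fitted classifier (a sketch following on
# from the cells above; the metric choice is ours, not the original's).
from sklearn.metrics import accuracy_score, roc_auc_score
print("accuracy:", accuracy_score(y_test, lr.predict(X_test)))
print("AUC:", roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1]))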
def preprocessing(data):
    #--- Drop columns where all values are missing. Do this first to try and save space ---#
    data.dropna(how='all', axis=1, inplace=True)

    ##########################################################################
    # Creating some additional variables
    ##########################################################################
    print("Generating some domain knowledge features...")
    # Loan-to-income ratio
    data['LOAN_INCOME_RATIO'] = data['AMT_CREDIT'] / data['AMT_INCOME_TOTAL']
    # Annuity-to-income ratio
    data['ANNUITY_INCOME_RATIO'] = data['AMT_ANNUITY'] / data['AMT_INCOME_TOTAL']
    # Collateral-to-income ratio
    data['COLLATERAL_INCOME_RATIO'] = data['AMT_GOODS_PRICE'] / data['AMT_INCOME_TOTAL']
    # LTV
    data['LOAN_TO_VALUE_RATIO'] = data['AMT_CREDIT'] / data['AMT_GOODS_PRICE']
    # Stats on the external scores
    data['EXT_SOURCE_MEAN'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    data['EXT_SOURCE_MIN'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].min(axis=1)
    data['EXT_SOURCE_MAX'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].max(axis=1)
    data['EXT_SOURCE_STD'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    data['EXT_SOURCE_SKEW'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].skew(axis=1)
    # Income to number-of-kids ratio
    data['INC_TO_KIDS'] = data['AMT_INCOME_TOTAL'] / (data['CNT_CHILDREN'] + 1)
    # Fraction of family that are kids
    data['PERC_KIDS'] = data['CNT_CHILDREN'] / data['CNT_FAM_MEMBERS']
    # Indebtedness * kids
    data['KIDS_AMT_ANNUITY_PRODUCT'] = data['CNT_CHILDREN'] * data['AMT_ANNUITY']
    # Fraction of life worked
    data['WORK_FRAC'] = data['DAYS_EMPLOYED'] / data['DAYS_BIRTH']
    # Days old when they got their first car
    data['FIRST_CAR_DAYS'] = -data['DAYS_BIRTH'] - 365 * data['OWN_CAR_AGE']
    # Total ways in which the customer can be contacted
    data['SUM_CONTACT'] = (data['FLAG_MOBIL'] + data['FLAG_EMP_PHONE'] +
                           data['FLAG_WORK_PHONE'] + data['FLAG_CONT_MOBILE'] +
                           data['FLAG_PHONE'] + data['FLAG_EMAIL'])
    # Age-income product
    data['AGE_INCOME_PROD'] = data['DAYS_BIRTH'] * data['AMT_INCOME_TOTAL']
    # Working-age income product
    data['EMPLOYED_INCOME_PROD'] = data['DAYS_EMPLOYED'] * data['AMT_INCOME_TOTAL']

    ##########################################################################
    # Dealing with categorical data (columns with string or object values):
    # filling in missings and creating dummy variables
    ##########################################################################
    print("filling in missing categorical data...")
    cat_data = data.select_dtypes(['object'])
    cat_col = list(data.select_dtypes(['object']).columns.values)
    cat_col.remove('Source')

    #--- Fill missing rows of categorical variables with the string 'missing_' ---#
    cat_miss_col = cat_data.columns[cat_data.isna().any()].tolist()
    for item in cat_miss_col:
        data['%s' % (item)].fillna('missing_', inplace=True)

    encoder = TargetEncoder(verbose=0, impute_missing=True, return_df=False, smoothing=1)
    encoder.fit(X=data.loc[data['Source'] == 'Train', cat_col].values,
                y=data.loc[data['Source'] == 'Train', ['TARGET']].values.reshape(-1,))
    X = encoder.transform(X=data[cat_col].values)
    data.loc[:, cat_col] = pd.DataFrame(X, columns=cat_col, index=list(data.index))

    ##########################################################################
    # Dealing with floating values: imputing and then normalising
    ##########################################################################
    print("Imputing, normalising and scaling...")
    # Initialise preprocessing to normalise, scale and impute missing data
    normaliser = Normalizer()
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
    imputer = Imputer(axis=0, strategy="median", missing_values="NaN")

    # Impute, scale and transform numerical columns
    column_list = list(data.columns)
    column_list.remove('TARGET')
    column_list.remove('Source')
    print("Imputing....")
    imputer.fit(data.loc[data['Source'] == 'Train', column_list])
    data.loc[:, column_list] = imputer.transform(data[column_list])
    print("Scaling....")
    scaler.fit(data.loc[data['Source'] == 'Train', column_list])
    data.loc[:, column_list] = scaler.transform(data[column_list])
    print("Normalising....")
    normaliser.fit(data.loc[data['Source'] == 'Train', column_list])
    data.loc[:, column_list] = normaliser.transform(data[column_list])

    #--- Delete all-zero columns ---#
    data = data.loc[:, (data != 0).any(axis=0)]

    #--- Get the list of variables in the dataset and their correlations ---#
    correlations = []
    index = []
    for col in list(data.columns):
        if col != 'TARGET' and col != 'Source':
            correlations.append(round(data['TARGET'].corr(data[col]), 3))
            index.append(col)
    correlations = pd.DataFrame(data=correlations, index=index, columns=['Correlation'])
    correlations['abs_Correlation'] = abs(correlations['Correlation'])
    correlations.to_csv("correlations.csv")

    ##########################################################################
    # For highly correlated variables, create new variables
    ##########################################################################
    # Products and ratios for all pairs of well-correlated numerical variables
    correlations = correlations[correlations['abs_Correlation'] > 0.05]
    num_vars = list(correlations.index.values)
    combinations = list(itertools.combinations(num_vars, 2))
    print("Generating %s new features...." % (3 * len(combinations)))
    start_time_2 = time()
    iteration = 0
    for i, j in itertools.combinations(num_vars, 2):
        iteration += 1
        data['PROD_%s__%s' % (i, j)] = data[i] * data[j]
        data['DIV_%s__%s' % (i, j)] = data[i] / data[j]
        data['DIV2_%s__%s' % (i, j)] = data[j] / data[i]
        # the original format string had one placeholder for two arguments
        print("iteration: %s, time: %s" % (iteration, time() - start_time_2))

    correlations = []
    index = []
    for col in list(data.columns):
        if col != 'TARGET' and col != 'Source':
            correlations.append(round(data['TARGET'].corr(data[col]), 3))
            index.append(col)
    correlations = pd.DataFrame(data=correlations, index=index, columns=['Correlation'])
    correlations['abs_Correlation'] = abs(correlations['Correlation'])
    correlations.to_csv("correlations_2.csv")
    correlations.to_csv("C:\\Users\\Cemlyn\\OneDrive\\Python_Code_Repository\\correlations_2.csv")

    '''
    print("Creating non-linear versions of numeric variables...")
    #--- If variable is float or number and non-binary then check for non-linear relationships ---#
    dtypes = list(set(data.dtypes))
    '''
    #--- Convert all int numbers to float - this will be memory intensive ---#
    '''
    power_list = [1.0, 2.0, 3.0]
    for col in data:
        #--- if column is numeric and non-binary then create new versions
        if data[col].dtype != 'object' and len(data[col].unique()) > 2:
            corr_list = {}
            data["%s_sqrt" % (col)] = np.sqrt(np.abs(data[col]))
            corr_list['sqrt'] = data['TARGET'].corr(data["%s_sqrt" % (col)])
            for power in power_list:
                data["%s_%s" % (col, power)] = np.power(data[col], power)
                corr = data['TARGET'].corr(data["%s_%s" % (col, power)])
                corr_list[power] = corr
            #--- if a non-linearised variable has higher correlation, keep the form with the highest correlation
            if data['TARGET'].corr(data[col]) < max(corr_list.values()):
                data.drop("%s" % (col), axis=1, inplace=True)
            for x in corr_list.keys():
                if x != max(corr_list.values()):
                    if ("%s_%s" % (col, power)) in data.columns:
                        data.drop("%s_%s" % (col, power), axis=1, inplace=True)
    print(data.info())
    '''
    # Converting all int64 to int32 to save space. Might do the same with float64
    '''
    dtypes = list(set(data.dtypes))
    for types in dtypes:
        df_type = list(data.select_dtypes(types).columns)
        if 'TARGET' in df_type:
            df_type.remove('TARGET')
        # Convert 64-bit values to 32-bit to save space
        if types == 'int64':
            for col in df_type:
                data[col] = data[col].astype('int32')
        if types == 'float64':
            for col in df_type:
                data[col] = data[col].astype('float32')
        df_type = data.select_dtypes(types).columns
    '''
    print(data.info())
    data.to_pickle("Processed_DFS_Data_v03.pkl")
    data.to_pickle("C:\\Users\\Cemlyn\\OneDrive\\Python_Code_Repository\\Processed_DFS_Data_v03.pkl")
    # data[:1000].to_csv("Processed_DFS_Data_sampled.csv")
    return 0
import numpy as np
import pandas as pd
import lightgbm as lgb
from category_encoders import TargetEncoder

# raw string avoids treating the backslashes in the Windows path as escapes
test = pd.read_csv(
    r"D:\PythonProjects\ML_Group_Data/tcd-ml-comp-201920-income-pred-group/test.csv"
)
train_data = preprocessing(train)
test_data = preprocessing(test)

y = train_data[target]
train_data.drop(target, axis=1, inplace=True)
test_data.drop(target, axis=1, inplace=True)

# Target-encode the categorical columns ('Satisfation with employer' is the
# dataset's own spelling of the column name)
enc = TargetEncoder(cols=[
    'Gender', 'Country', 'Profession', 'University Degree',
    'Housing Situation', 'Satisfation with employer'
])
enc.fit(train_data, y)
train_data = enc.transform(train_data)
test_data = enc.transform(test_data)

train_data.head()
test_data.head()

#X_Train, X_Test, y_train, y_test = train_test_split(train_data, y, test_size=0.3, random_state=1)
X_Train = train_data
y_train = y

# Train on the log of the target to tame its skew
y_train_log = np.log(y_train)
training = lgb.Dataset(X_Train, y_train_log)

params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
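# A sketch of how this fragment plausibly continues: finish the parameter
# dict, train the booster, and invert the log transform on the predictions.
# The objective, metric, round count, and variable names below are
# assumptions for illustration, not the original settings.
params['objective'] = 'regression'
params['metric'] = 'mae'

model = lgb.train(params, training, num_boost_round=5000)

# Predictions come back on the log scale, so undo np.log with np.exp
pred = np.exp(model.predict(test_data))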
class LightGbmKfold(object):
    def __init__(self, *, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path
        # data prepare
        self.__sample_submission = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_feature, self.__test_feature = [None for _ in range(2)]
        self.__train_feature_stacking_tree, self.__test_feature_stacking_tree = [
            None for _ in range(2)
        ]
        self.__train_feature_stacking_linear, self.__test_feature_stacking_linear = [
            None for _ in range(2)
        ]
        self.__train_feature_stacking_network, self.__test_feature_stacking_network = [
            None for _ in range(2)
        ]
        self.__train_feature_stacking_gp, self.__test_feature_stacking_gp = [
            None for _ in range(2)
        ]
        self.__train_label = None
        self.__categorical_columns = None
        self.__encoder = None
        # model fit
        self.__folds = None
        self.__oof_preds = None
        self.__sub_preds = None
        self.__gbm = None
        # self.__metric_weight = []

    def data_prepare(self):
        self.__sample_submission = pd.read_csv(
            os.path.join(self.__input_path, "sample_submission.csv"))
        # selected feature
        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"))
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_select_feature_df.csv"))
        # stacking tree
        self.__train_feature_stacking_tree = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_tree_train.csv"))
        self.__test_feature_stacking_tree = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_tree_test.csv"))
        # stacking linear
        self.__train_feature_stacking_linear = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_linear_train.csv"))
        self.__test_feature_stacking_linear = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_linear_test.csv"))
        # stacking network
        self.__train_feature_stacking_network = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_network_train.csv"))
        self.__test_feature_stacking_network = pd.read_csv(
            os.path.join(self.__input_path, "first_layer_network_test.csv"))
        # gp
        self.__train_feature_stacking_gp = pd.read_csv(
            os.path.join(self.__input_path, "genetic_train_feature.csv"))
        self.__test_feature_stacking_gp = pd.read_csv(
            os.path.join(self.__input_path, "genetic_test_feature.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["TARGET"] + [
            col for col in self.__train.columns.tolist()
            if re.search(r"SK_ID", col)
        ], axis=1)
        self.__test_feature = self.__test[
            self.__train_feature.columns.tolist()]

        # target-encode the categorical columns, fitting on train only
        self.__categorical_columns = self.__train_feature.select_dtypes(
            "object").columns.tolist()
        self.__encoder = TargetEncoder()
        self.__encoder.fit(
            self.__train_feature.loc[:, self.__categorical_columns],
            self.__train_label)
        self.__train_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(
                self.__train_feature.loc[:, self.__categorical_columns]))
        self.__test_feature.loc[:, self.__categorical_columns] = (
            self.__encoder.transform(
                self.__test_feature.loc[:, self.__categorical_columns]))

        # append the first-layer stacking and genetic-programming features
        self.__train_feature = pd.concat([
            self.__train_feature, self.__train_feature_stacking_tree,
            self.__train_feature_stacking_linear,
            self.__train_feature_stacking_network,
            self.__train_feature_stacking_gp
        ], axis=1)
        self.__test_feature = pd.concat([
            self.__test_feature, self.__test_feature_stacking_tree,
            self.__test_feature_stacking_linear,
            self.__test_feature_stacking_network,
            self.__test_feature_stacking_gp
        ], axis=1)

    def model_fit(self):
        self.__folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=8)
        self.__oof_preds = np.zeros(shape=self.__train_feature.shape[0])
        self.__sub_preds = np.zeros(shape=self.__test_feature.shape[0])
        # self.__sub_preds = np.zeros(shape=(self.__test_feature.shape[0], 5))
        feature_importance_df = pd.DataFrame()

        for n_fold, (trn_idx, val_idx) in enumerate(
                self.__folds.split(self.__train_feature, self.__train_label)):
            trn_x, trn_y = self.__train_feature.iloc[
                trn_idx], self.__train_label.iloc[trn_idx]
            val_x, val_y = self.__train_feature.iloc[
                val_idx], self.__train_label.iloc[val_idx]

            self.__gbm = LGBMClassifier(colsample_bytree=0.6659,
                                        learning_rate=0.0197,
                                        max_depth=8,
                                        min_child_weight=1.0652,
                                        min_split_gain=0.058,
                                        n_estimators=501,
                                        num_leaves=11,
                                        reg_alpha=2.2487,
                                        reg_lambda=6.2587,
                                        subsample=0.9401)
            self.__gbm.fit(trn_x,
                           trn_y,
                           eval_set=[(trn_x, trn_y), (val_x, val_y)],
                           eval_metric="auc",
                           verbose=True,
                           early_stopping_rounds=5)

            pred_val = self.__gbm.predict_proba(
                val_x, num_iteration=self.__gbm.best_iteration_)[:, 1]
            pred_test = self.__gbm.predict_proba(
                self.__test_feature,
                num_iteration=self.__gbm.best_iteration_)[:, 1]

            self.__oof_preds[val_idx] = pred_val
            self.__sub_preds += pred_test / self.__folds.n_splits
            # self.__sub_preds[:, n_fold] = pred_test

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = pd.Series(
                self.__train_feature.columns)
            fold_importance_df["importance"] = self.__gbm.feature_importances_
            fold_importance_df["fold"] = n_fold + 1
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)

            # save the per-fold weight
            # self.__metric_weight.append(roc_auc_score(val_y, self.__oof_preds[val_idx]))
            print(
                "Fold %2d AUC : %.6f" %
                (n_fold + 1, roc_auc_score(val_y, self.__oof_preds[val_idx])))

        feature_importance_df.to_csv(os.path.join(self.__output_path,
                                                  "feature_importance.csv"),
                                     index=False)
        print("Full AUC score %.6f" %
              roc_auc_score(self.__train_label, self.__oof_preds))

    def model_predict(self):
        # weighted sum
        # self.__metric_weight = pd.Series(self.__metric_weight).rank()
        # self.__metric_weight = self.__metric_weight / self.__metric_weight.sum()
        # self.__metric_weight = self.__metric_weight.values.reshape((5, 1))
        # self.__sub_preds = np.dot(self.__sub_preds, self.__metric_weight)
        self.__sample_submission["TARGET"] = self.__sub_preds
        self.__sample_submission.to_csv(os.path.join(self.__output_path,
                                                     "sample_submission.csv"),
                                        index=False)
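# A minimal driver for the class above -- the paths are hypothetical, and the
# imports the class relies on (os, re, numpy, pandas, category_encoders'
# TargetEncoder, lightgbm's LGBMClassifier, sklearn's StratifiedKFold and
# roc_auc_score) are assumed to sit at the top of the module:
if __name__ == "__main__":
    lgb_kfold = LightGbmKfold(input_path="data/input",
                              output_path="data/output")
    lgb_kfold.data_prepare()   # load features, target-encode categoricals, stack first-layer preds
    lgb_kfold.model_fit()      # 4-fold stratified CV with per-fold and overall AUC reporting
    lgb_kfold.model_predict()  # write the fold-averaged test predictions to sample_submission.csv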