def train_xgb(plot=False):
    """Train an XGBoost classifier with 5-fold stratified CV.

    Loads the (over-sampled) training data, fits one ``XGBClassifier`` per
    fold with early stopping, prints the per-fold AUC, the mean AUC and the
    out-of-fold (OOF) AUC, and draws the ROC curve via ``cal_roc_curve``.

    Args:
        plot: When true, show a bar chart of feature importances per fold.

    Returns:
        The ``'y_pred'`` column holding the OOF positive-class probability
        for every training row, aligned with the index of the target.
    """
    X, y = feature_engineering.get_train_data(use_over_sampler=True)
    # The target is expected to expose a column named 'y' (read back below).
    data = pd.DataFrame(y)

    # Hyper-parameters are identical for every fold, so build them once.
    params = {
        'learning_rate': .05,
        'n_estimators': 2000,
        'max_depth': 8,
        'min_child_weight': 4,
        'gamma': .2,
        'subsample': .8,
        'colsample_bytree': .8,
        'n_jobs': -1,
        'random_state': 0
    }

    scores = []
    # Positional buffer for the OOF predictions; every row is filled exactly
    # once because the validation folds partition the data.
    oof_pred = np.full(len(data), np.nan)

    # random_state only takes effect together with shuffle=True; recent
    # scikit-learn releases raise ValueError when it is set without shuffling.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        # split() yields positional indices, hence .iloc rather than .loc.
        X_train, X_val = X.iloc[tdx], X.iloc[vdx]
        y_train, y_val = y.iloc[tdx], y.iloc[vdx]

        model = XGBClassifier().set_params(**params)
        # NOTE(review): passing early_stopping_rounds to fit() and
        # ntree_limit to predict_proba() is the legacy XGBoost API; confirm
        # the pinned xgboost version still accepts both.
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  early_stopping_rounds=50,
                  verbose=False)

        # plot feature importance
        if plot:
            fscores = pd.Series(model.feature_importances_,
                                X_train.columns).sort_values(ascending=False)
            fscores.plot(kind='bar',
                         title='Feature Importance %d' % i,
                         figsize=(20, 10))
            plt.ylabel('Feature Importance Score')
            plt.show()

        y_pred = model.predict_proba(
            X_val, ntree_limit=model.best_ntree_limit)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        print("AUC score at %d floder: %f" % (i, auc))
        scores.append(auc)
        oof_pred[vdx] = y_pred

    data['y_pred'] = oof_pred
    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-floder total mean_score:", mean_score)
    print("5-floder oof auc score:", oof)
    print("----train %s finish!----" % model.__class__.__name__)
    cal_roc_curve(data['y'], data['y_pred'], model.__class__.__name__)

    return data['y_pred']
def train_tree(model):
    """Evaluate an arbitrary classifier with 5-fold stratified CV.

    Loads the (over-sampled) training data, refits ``model`` on each training
    fold, prints the per-fold AUC, the mean AUC and the out-of-fold (OOF)
    AUC, and draws the ROC curve via ``cal_roc_curve``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
            The very same instance is refit in every fold, so on return it
            holds whatever state the last ``fit`` call left behind.

    Returns:
        The ``'y_pred'`` column holding the OOF positive-class probability
        for every training row, aligned with the index of the target.
    """
    X, y = feature_engineering.get_train_data(use_over_sampler=True)
    # The target is expected to expose a column named 'y' (read back below).
    data = pd.DataFrame(y)

    scores = []
    # Positional buffer for the OOF predictions; every row is filled exactly
    # once because the validation folds partition the data.
    oof_pred = np.full(len(data), np.nan)

    # random_state only takes effect together with shuffle=True; recent
    # scikit-learn releases raise ValueError when it is set without shuffling.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        # split() yields positional indices, hence .iloc rather than .loc.
        X_train, X_val = X.iloc[tdx], X.iloc[vdx]
        y_train, y_val = y.iloc[tdx], y.iloc[vdx]

        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_val)[:, 1]

        auc = roc_auc_score(y_val, y_pred)
        print("AUC score at %d floder: %f" % (i, auc))
        scores.append(auc)
        oof_pred[vdx] = y_pred

    data['y_pred'] = oof_pred
    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-floder total mean_score:", mean_score)
    print("5-floder oof auc score:", oof)
    print("----train %s finish!----" % model.__class__.__name__)
    cal_roc_curve(data['y'], data['y_pred'], model.__class__.__name__)

    return data['y_pred']
def train_cat(plot=False):
    """Train a CatBoost classifier with 5-fold stratified CV.

    Loads the (over-sampled) training data, fits one ``CatBoostClassifier``
    per fold with early stopping, prints the per-fold AUC, the mean AUC and
    the out-of-fold (OOF) AUC, and draws the ROC curve via ``cal_roc_curve``.

    Args:
        plot: When true, show a bar chart of feature importances per fold.

    Returns:
        The ``'y_pred'`` column holding the OOF positive-class probability
        for every training row, aligned with the index of the target.
    """
    X, y = feature_engineering.get_train_data(use_over_sampler=True)
    # The target is expected to expose a column named 'y' (read back below).
    data = pd.DataFrame(y)

    # Hyper-parameters are identical for every fold, so build them once.
    params = {
        'learning_rate': .05,
        'n_estimators': 2000,
        'max_depth': 8,
        'max_bin': 127,
        'reg_lambda': 2,
        'subsample': .7,
        'one_hot_max_size': 2,
        'bootstrap_type': 'Bernoulli',
        'leaf_estimation_method': 'Newton',
        'random_state': 0
    }

    scores = []
    # Positional buffer for the OOF predictions; every row is filled exactly
    # once because the validation folds partition the data.
    oof_pred = np.full(len(data), np.nan)

    # random_state only takes effect together with shuffle=True; recent
    # scikit-learn releases raise ValueError when it is set without shuffling.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        # split() yields positional indices, hence .iloc rather than .loc.
        X_train, X_val = X.iloc[tdx], X.iloc[vdx]
        y_train, y_val = y.iloc[tdx], y.iloc[vdx]

        model = CatBoostClassifier().set_params(**params)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  early_stopping_rounds=50,
                  verbose=False)

        # plot feature importance
        if plot:
            fscores = pd.Series(model.feature_importances_,
                                X_train.columns).sort_values(ascending=False)
            fscores.plot(kind='bar',
                         title='Feature Importance %d' % i,
                         figsize=(20, 10))
            plt.ylabel('Feature Importance Score')
            plt.show()

        y_pred = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        print("AUC score at %d floder: %f" % (i, auc))
        scores.append(auc)
        oof_pred[vdx] = y_pred

    data['y_pred'] = oof_pred
    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-floder total mean_score:", mean_score)
    print("5-floder oof auc score:", oof)
    print("----train %s finish!----" % model.__class__.__name__)
    cal_roc_curve(data['y'], data['y_pred'], model.__class__.__name__)

    return data['y_pred']
def model_stacking():
    """Stack LightGBM, CatBoost and DeepFM with a logistic-regression meta-model.

    Each base learner contributes its out-of-fold (OOF) predictions as one
    meta-feature. A ``LogisticRegression`` is then cross-validated on those
    three columns; per-fold AUC, mean AUC and OOF AUC are printed and the ROC
    curve is drawn via ``cal_roc_curve``.

    Returns:
        The ``'y_pred'`` column holding the stacked OOF positive-class
        probability for every training row.
    """
    # Only the target is needed here; the meta-features are built below.
    _, y = feature_engineering.get_train_data(use_over_sampler=True)
    # The target is expected to expose a column named 'y' (read back below).
    data = pd.DataFrame(y)

    lgbm = LightGBM.train_lgbm(plot=False)
    print(lgbm.shape)
    cat = CatBoost.train_cat(plot=False)
    print(cat.shape)
    deepFM = DeepFM.train_DeepFM()
    print(deepFM.shape)

    # One column per base learner.
    X = pd.concat([lgbm, cat, deepFM], axis=1)
    print(X.shape)

    scores = []
    # Positional buffer for the OOF predictions; every row is filled exactly
    # once because the validation folds partition the data.
    oof_pred = np.full(len(data), np.nan)

    # random_state only takes effect together with shuffle=True; recent
    # scikit-learn releases raise ValueError when it is set without shuffling.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        # split() yields positional indices, hence .iloc rather than .loc.
        X_train, X_val = X.iloc[tdx], X.iloc[vdx]
        y_train, y_val = y.iloc[tdx], y.iloc[vdx]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_val)[:, 1]

        auc = roc_auc_score(y_val, y_pred)
        print("AUC score at %d floder: %f" % (i, auc))
        scores.append(auc)
        oof_pred[vdx] = y_pred

    data['y_pred'] = oof_pred
    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-floder total mean_score:", mean_score)
    print("5-floder oof auc score:", oof)
    print("----train %s finish!----" % 'Stacking')
    cal_roc_curve(data['y'], data['y_pred'], 'Stacking')

    return data['y_pred']
global best_score, best_param if oof >= best_score: best_score = oof print("update best_score: %f" % best_score) best_param = params print("update best params: %s" % best_param) return oof best_score = -9999 best_param = {} flag = True X, y = None, None if __name__ == '__main__': X, y = feature_engineering.get_train_data(use_over_sampler=True) # 调参范围 adj_params = { 'min_child_samples': (3, 50), 'feature_fraction': (0.3, 1), # 'max_depth': (4, 15), 'num_leaves': (30, 1300), 'bagging_fraction': (0.3, 1), # 'bagging_freq': (1, 10), 'lambda_l2': (0.1, 2), # 'lambda_l1': (0.1, 1), # 'min_data_in_leaf': (1, 150) } # 调用贝叶斯优化