def make_predictions_cat(tr_df, tt_df, y, features_columns, cat_params, NFOLDS=6):
    """Train CatBoost with month-grouped K-fold CV and predict on the test set.

    Args:
        tr_df: training DataFrame; must contain 'DT_month' plus features_columns.
        tt_df: test DataFrame; must contain 'TransactionID' plus features_columns.
        y: training target, a pandas Series positionally aligned with tr_df.
        features_columns: list of feature column names to train on.
        cat_params: keyword arguments forwarded to CatBoostClassifier.
        NFOLDS: number of GroupKFold splits (folds grouped by 'DT_month').

    Returns:
        Tuple of (pred_df, oof_auc) where pred_df is indexed by TransactionID
        with an 'isFraud' probability column, and oof_auc is the out-of-fold AUC.

    NOTE(review): relies on a module-level `categorical_features` global —
    confirm it is defined before calling.
    """
    folds = GroupKFold(n_splits=NFOLDS)
    split_groups = tr_df['DT_month']

    X = tr_df[features_columns]
    X_test = tt_df[features_columns]

    pred_df = pd.DataFrame()
    pred_df['TransactionID'] = tt_df.reset_index()['TransactionID']
    pred_df['isFraud'] = np.zeros(len(pred_df))

    predictions = np.zeros(len(tt_df))
    oof = np.zeros(len(tr_df))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=split_groups)):
        # Positional (.iloc) indexing throughout — fold indices are positions,
        # not labels (the original mixed y[trn_idx] and y.iloc[trn_idx]).
        tr_x, tr_y = X.iloc[trn_idx, :], y.iloc[trn_idx]
        vl_x, vl_y = X.iloc[val_idx, :], y.iloc[val_idx]
        print('Fold:', fold_, ' - ', len(tr_x), len(vl_x))

        estimator = CatBoostClassifier(**cat_params)
        estimator.fit(
            tr_x, tr_y,
            eval_set=(vl_x, vl_y),
            cat_features=categorical_features,
            use_best_model=True,
            verbose=True)

        oof_preds = estimator.predict_proba(vl_x)[:, 1]
        # Min-max rescale this fold's validation scores into [0, 1] before
        # storing, matching the original's per-fold normalization.
        oof[val_idx] = (oof_preds - oof_preds.min()) / (oof_preds.max() - oof_preds.min())

        # FIX: use predict_proba (probabilities) rather than predict (hard
        # 0/1 labels) — averaging hard labels into an AUC-scored submission
        # column was a bug.
        pp_p = estimator.predict_proba(X_test)[:, 1]
        predictions += pp_p / NFOLDS

        # FIX: CatBoostClassifier has get_feature_importance(), not
        # feature_importance() (the latter raised AttributeError).
        feature_imp = pd.DataFrame(
            sorted(zip(estimator.get_feature_importance(), X.columns)),
            columns=['Value', 'Feature'])
        print(feature_imp)

        # FIX: original also deleted undefined names tr_data/vl_data,
        # which raised NameError on the first fold.
        del tr_x, tr_y, vl_x, vl_y
        gc.collect()

    pred_df['isFraud'] = predictions
    pred_df = pred_df.set_index('TransactionID')

    oof_auc = metrics.roc_auc_score(y, oof)
    print('---------------------------------------')
    print('OOF AUC:', oof_auc)
    return pred_df, oof_auc
def train_model_generic(X, X_test, y, params, folds, model_type='lgb', n_fold=5,
                        plot_feature_importance=True, averaging='usual', model=None):
    """Cross-validated training supporting several model backends.

    Args:
        X: training features (DataFrame; indexed so .loc works with fold indices).
        X_test: test features (DataFrame).
        y: training target (array/Series indexable by fold indices).
        params: model hyperparameters for the chosen backend.
        folds: a CV splitter exposing .split(X, y).
        model_type: one of 'lgb', 'xgb', 'sklearn', 'glm', 'cat'.
        n_fold: number of folds (used to average predictions).
        plot_feature_importance: for 'lgb', plot averaged importances.
        averaging: 'usual' (mean of fold predictions) or 'rank' (mean of ranks).
        model: a pre-built estimator, required when model_type == 'sklearn'.

    Returns:
        (oof, prediction, feature_importance) for 'lgb' with plotting enabled,
        otherwise (oof, prediction, scores) — preserved from the original API.

    NOTE(review): relies on module-level globals lgb, xgb, sm,
    CatBoostClassifier, TEST_RUN, roc_auc_score, plt, sns — confirm imports.
    """
    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    feature_importance = pd.DataFrame()

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X.loc[train_index], X.loc[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        if model_type == 'lgb':
            train_data = lgb.Dataset(X_train, label=y_train)
            valid_data = lgb.Dataset(X_valid, label=y_valid)
            model = lgb.train(params, train_data, num_boost_round=20000,
                              valid_sets=[train_data, valid_data],
                              verbose_eval=1000,
                              # shorter patience on quick test runs
                              early_stopping_rounds=100 if TEST_RUN else 200)
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train,
                                     feature_names=X_train.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid,
                                     feature_names=X_train.columns)
            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000,
                              evals=watchlist, early_stopping_rounds=200,
                              verbose_eval=500, params=params)
            y_pred_valid = model.predict(
                xgb.DMatrix(X_valid, feature_names=X_train.columns),
                ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(
                xgb.DMatrix(X_test, feature_names=X_train.columns),
                ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model.fit(X_train, y_train)
            # FIX: predict_proba returns an (N, 2) matrix; .reshape(-1,)
            # flattened it to length 2N (shape mismatch / wrong OOF values).
            # Take the positive-class column, as the test path already did.
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]

        if model_type == 'glm':
            model = sm.GLM(y_train, X_train, family=sm.families.Binomial())
            model_results = model.fit()
            # FIX: removed a redundant model_results.predict(X_test) call
            # whose result was discarded.
            y_pred_valid = model_results.predict(X_valid).reshape(-1, )
            y_pred = model_results.predict(X_test)

        if model_type == 'cat':
            model = CatBoostClassifier(iterations=20000, learning_rate=0.05,
                                       loss_function='Logloss',
                                       eval_metric='AUC', **params)
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid),
                      cat_features=[], use_best_model=True, verbose=False)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]

        oof[valid_index] = y_pred_valid.reshape(-1, )
        scores.append(roc_auc_score(y_valid, y_pred_valid))

        if averaging == 'usual':
            prediction += y_pred
        elif averaging == 'rank':
            prediction += pd.Series(y_pred).rank().values

        if model_type == 'lgb':
            # Accumulate per-fold feature importance for later averaging.
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importance()
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)

    prediction /= n_fold
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(
        np.mean(scores), np.std(scores)))

    # FIX: the original tail had duplicated/unreachable return statements
    # and a dangling `else`; reconstructed into a coherent return structure
    # preserving the intended API (importances only for plotted lgb runs).
    if model_type == 'lgb':
        feature_importance["importance"] /= n_fold
        if plot_feature_importance:
            cols = feature_importance[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index
            best_features = feature_importance.loc[
                feature_importance.feature.isin(cols)]
            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance", y="feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LGB Features (avg over folds)')
            plt.show()
            return oof, prediction, feature_importance
        return oof, prediction, scores
    else:
        return oof, prediction, scores
def train_model(X, X_test, y, params, n_fold=5, shuffle_folds=True,
                model_type='lgb', plot_feature_importance=False,
                averaging='usual', model=None, folds_random_state=42):
    """Stratified K-fold training with per-fold threshold tuning and metrics.

    Args:
        X: training features (DataFrame; indexed so .loc works with fold indices).
        X_test: test features (DataFrame).
        y: training target (array/Series indexable by fold indices).
        params: model hyperparameters for the chosen backend.
        n_fold: number of StratifiedKFold splits.
        shuffle_folds: whether the splitter shuffles before splitting.
        model_type: 'lgb', 'lgb_sklearn', 'xgb', 'sklearn', 'glm', or 'cat'.
        plot_feature_importance: for 'lgb', save averaged importance plot.
        averaging: 'usual' (mean of fold predictions) or 'rank' (mean of ranks).
        model: a pre-built estimator, required when model_type == 'sklearn'.
        folds_random_state: RNG seed for the splitter.

    Returns:
        (oof, prediction, scores, feature_importance) where scores is a
        one-row DataFrame of mean precision/recall/f1/accuracy/AUC.

    Side effects: pickles the last fold's model to src/models/model.pkl and,
    when plotting, writes feature_importance.png.

    NOTE(review): relies on module-level globals lgb, xgb, sm,
    CatBoostClassifier, metrics, src — confirm imports.
    """
    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    auc_scores = []
    f1_scores = []
    recall_scores = []
    precision_scores = []
    accuracy_scores = []
    feature_importance = pd.DataFrame()

    folds = StratifiedKFold(n_splits=n_fold, shuffle=shuffle_folds,
                            random_state=folds_random_state)

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X.loc[train_index], X.loc[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        if model_type == 'lgb':
            train_data = lgb.Dataset(X_train, label=y_train)
            valid_data = lgb.Dataset(X_valid, label=y_valid)
            model = lgb.train(params, train_data, num_boost_round=1000000,
                              valid_sets=[train_data, valid_data],
                              verbose_eval=1000, early_stopping_rounds=3000)
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration)

        if model_type == 'lgb_sklearn':
            model = lgb.LGBMClassifier(**params, n_estimators=1000000)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      verbose=1000, early_stopping_rounds=3000)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(
                X_test, num_iteration=model.best_iteration_)[:, 1]

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train,
                                     feature_names=X_train.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid,
                                     feature_names=X_train.columns)
            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000,
                              evals=watchlist, early_stopping_rounds=200,
                              verbose_eval=500, params=params)
            y_pred_valid = model.predict(
                xgb.DMatrix(X_valid, feature_names=X_train.columns),
                ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(
                xgb.DMatrix(X_test, feature_names=X_train.columns),
                ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model.fit(X_train, y_train)
            # FIX: predict_proba returns an (N, 2) matrix; .reshape(-1,)
            # flattened it to length 2N (shape mismatch / wrong OOF values).
            # Take the positive-class column, as the test path already did.
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]

        if model_type == 'glm':
            model = sm.GLM(y_train, X_train, family=sm.families.Binomial())
            model_results = model.fit()
            # FIX: removed a redundant model_results.predict(X_test) call
            # whose result was discarded.
            y_pred_valid = model_results.predict(X_valid).reshape(-1, )
            y_pred = model_results.predict(X_test)

        if model_type == 'cat':
            model = CatBoostClassifier(iterations=20000, learning_rate=0.05,
                                       loss_function='Logloss',
                                       eval_metric='AUC', **params)
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid),
                      cat_features=[], use_best_model=True, verbose=False)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]

        oof[valid_index] = y_pred_valid.reshape(-1, )

        # Grid-search the classification threshold on this fold's validation
        # predictions, maximizing F1.
        best_f1 = 0
        best_t = 0
        for t in np.arange(0.1, 1, 0.05):
            valid_pr = (y_pred_valid > t).astype(int)
            valid_f1 = metrics.f1_score(y_valid, valid_pr)
            if valid_f1 > best_f1:
                best_f1 = valid_f1
                best_t = t
        y_valid_pr = (y_pred_valid > best_t).astype(int)

        auc_scores.append(roc_auc_score(y_valid, y_pred_valid))
        f1_scores.append(f1_score(y_valid, y_valid_pr))
        precision_scores.append(precision_score(y_valid, y_valid_pr))
        recall_scores.append(recall_score(y_valid, y_valid_pr))
        accuracy_scores.append(accuracy_score(y_valid, y_valid_pr))

        if averaging == 'usual':
            prediction += y_pred
        elif averaging == 'rank':
            prediction += pd.Series(y_pred).rank().values

        if model_type == 'lgb':
            # Accumulate per-fold feature importance for later averaging.
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importance()
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)

    prediction /= n_fold

    # Persist the model from the final fold (original behavior).
    with open(os.path.join(src.models.__path__[0], 'model.pkl'), 'wb') as f:
        pickle.dump(model, f, protocol=2)

    scores = pd.DataFrame(
        {
            'precision_score': np.mean(precision_scores),
            'recall_score': np.mean(recall_scores),
            'f1_score': np.mean(f1_scores),
            'accuracy_score': np.mean(accuracy_scores),
            'auc_score': np.mean(auc_scores),
        },
        index=[0])
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(
        np.mean(auc_scores), np.std(auc_scores)))

    # FIX: the original tail had three byte-identical, partly unreachable
    # return statements and a dangling `else`; collapsed into one return
    # (all paths returned the same tuple).
    if model_type == 'lgb':
        feature_importance["importance"] /= n_fold
        if plot_feature_importance:
            cols = feature_importance[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index
            best_features = feature_importance.loc[
                feature_importance.feature.isin(cols)]
            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance", y="feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LGB Features (avg over folds)')
            plt.savefig('feature_importance.png')
    return oof, prediction, scores, feature_importance