import logging

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# SEED, FOLDS, SK_NUM, params, config, target_name, now, log_best and the
# LightGBM wrapper class are assumed to be defined elsewhere in the project.


def train_and_predict_lightgbm(X_train_all, y_train_all, X_test):
    # Bin the continuous target into quantiles so StratifiedKFold can stratify on it.
    qcut_target = pd.qcut(y_train_all, SK_NUM, labels=False)
    # Before training, transform y_train with log(y + 1).
    y_train_all = np.log(y_train_all + 1)  # np.log1p() also works

    y_preds = []
    models = []
    for seed in SEED:
        kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        for train_index, valid_index in kf.split(X_train_all, qcut_target):
            X_train, X_valid = (X_train_all.iloc[train_index, :],
                                X_train_all.iloc[valid_index, :])
            y_train, y_valid = (y_train_all.iloc[train_index],
                                y_train_all.iloc[valid_index])

            # Run LightGBM.
            lgbm = LightGBM()
            y_pred, y_valid_pred, model = lgbm.train_and_predict(
                X_train, X_valid, y_train, y_valid, X_test, params)

            # Save the results.
            y_preds.append(y_pred)
            models.append(model)

            # Log this fold's best score.
            log_best(model, config['loss'])

    # CV score.
    scores = [m.best_score['valid_0'][config['loss']] for m in models]
    score = sum(scores) / len(scores)
    print('===CV scores===')
    print(scores)
    print(score)
    logging.debug('===CV scores===')
    logging.debug(scores)
    logging.debug(score)

    # Build the submission file.
    ID_name = config['ID_name']
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])
    y_sub = sum(y_preds) / len(y_preds)
    # Finally, invert the transform on the predictions with exp(y) - 1.
    y_sub = np.exp(y_sub) - 1  # np.expm1() also works
    sub[target_name] = y_sub
    sub.to_csv('./data/output/sub_{0}_{1:%Y%m%d%H%M%S}_{2}.csv'.format(
        config['model'], now, score), index=False)
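# --- Hedged sketch (not part of the project code): why qcut + StratifiedKFold.
# StratifiedKFold needs class labels, so the continuous target is binned into
# quantiles first; each fold then sees a similar target distribution. The bin
# count, toy data and seeds below are illustrative assumptions.
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

rng = np.random.default_rng(0)
y_demo = pd.Series(rng.lognormal(mean=3.0, sigma=1.0, size=1000))  # skewed target
bins = pd.qcut(y_demo, 10, labels=False)  # 10 quantile bins (stands in for SK_NUM)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_idx, valid_idx in kf.split(np.zeros(len(y_demo)), bins):
    y_tr = np.log1p(y_demo.iloc[train_idx])  # same transform as above
    # expm1 undoes log1p exactly, mirroring the inverse transform at submit time.
    assert np.allclose(np.expm1(y_tr), y_demo.iloc[train_idx])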
    X_train, X_valid = (X_train_all.iloc[train_index, :],
                        X_train_all.iloc[valid_index, :])
    y_train, y_valid = y_train_all[train_index], y_train_all[valid_index]

    # Run LightGBM.
    y_pred, model = train_and_predict(
        X_train, X_valid, y_train, y_valid, X_test, lgbm_params)

    # Save the results.
    y_preds.append(y_pred)
    models.append(model)

    # Log this fold's best score.
    log_best(model, config['loss'])

# CV score.
scores = [m.best_score['valid_0'][config['loss']] for m in models]
score = sum(scores) / len(scores)
print('===CV scores===')
print(scores)
print(score)
logging.debug('===CV scores===')
logging.debug(scores)
logging.debug(score)

# Build the submission file.
ID_name = config['ID_name']
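# --- Hedged sketch: log_best is a project helper not shown in this excerpt.
# A minimal version consistent with how it is called here (model plus loss
# name); the exact signature and output format are assumptions.
import logging

def log_best(model, loss):
    # Record the best iteration and best validation score of a fitted
    # LightGBM model; 'valid_0' is LightGBM's default name for the first
    # validation set.
    logging.debug(model.best_iteration)
    logging.debug(model.best_score['valid_0'][loss])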
    with timer(f'fold{fold_}', logging):
        X_train, X_valid = (X_train_all[folds['fold_id'] != fold_],
                            X_train_all[folds['fold_id'] == fold_])
        y_train, y_valid = (y_train_all[folds['fold_id'] != fold_],
                            y_train_all[folds['fold_id'] == fold_])
        y_pred, model, oof = train_and_predict(
            X_train, X_valid, y_train, y_valid, X_test, PARAMS, CAT, oof)
        # if CALIBRATION:
        #     y_pred = calibration(y_pred, list_sampling_rate[fold_])
        log_best(model, LOSS)
        y_preds.append(y_pred)
        models.append(model)

with timer('save importances', logging):
    save_importances(models, X_train.columns, IMP_PATH, logging)

with timer('calculate score', logging):
    scores = [round(m.best_score['valid_1'][LOSS], 3) for m in models]
    score = sum(scores) / len(scores)

# with timer('transform to rank', logging):
#     for i, preds in enumerate(y_preds):
#         y_preds[i] = preds2rank(preds) / len(preds)
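# --- Hedged sketch: the timer context manager used above is not defined in
# this excerpt. A minimal version, assuming it takes a section name and a
# logger (or the logging module itself):
import time
from contextlib import contextmanager

@contextmanager
def timer(name, logger):
    # Log how many seconds the wrapped block took, tagged with its name.
    t0 = time.time()
    yield
    logger.debug(f'[{name}] done in {time.time() - t0:.1f} s')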
import gc
import logging
import math
import time

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn import metrics

# eval_auc, log_evaluation and log_best are assumed to be project helpers
# defined elsewhere.


def train_model_classification(X, X_test, y, params, folds, model_type='lgb',
                               eval_metric='auc', columns=None, seed=0,
                               plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200,
                               n_estimators=50000, splits=None, n_folds=3,
                               averaging='usual', n_jobs=-1, cat_cols="",
                               valid_rate=0.8, groups=None):
    columns = X.columns if columns is None else columns
    n_splits = folds.n_splits if splits is None else n_folds
    X_test = X_test[columns]

    # Set up scoring parameters.
    metrics_dict = {
        'auc': {
            'lgb_metric_name': eval_auc,
            'catboost_metric_name': 'AUC',
            'sklearn_scoring_function': metrics.roc_auc_score
        },
    }

    result_dict = {}
    # Out-of-fold predictions on the train data and averaged predictions on
    # the test data (used by both the 'usual' and 'rank' averaging modes).
    oof = np.zeros((len(X), 1))
    prediction = np.zeros((len(X_test), 1))

    # List of scores on folds.
    scores = []
    feature_importance = pd.DataFrame()

    if folds == 'train_test_split_time_series':
        # Single chronological train/valid split: the first valid_rate share
        # of the rows is used for training, the remainder for validation.
        n_splits = 1
        split_point = math.floor(X.shape[0] * valid_rate)
        train_index = np.arange(split_point)
        valid_index = np.arange(split_point, X.shape[0])
        print(f'\ntrain started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = (X[columns].iloc[train_index],
                                X[columns].iloc[valid_index])
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        ###### models ##################################################
        if model_type == 'lgb':
            # Logger.
            logging.debug('\n\n=== lgb training =========')
            logger = logging.getLogger('main')
            callbacks = [log_evaluation(logger, period=500)]

            # Build and fit the model.
            model = lgb.LGBMClassifier(**params, n_estimators=n_estimators,
                                       n_jobs=n_jobs, seed=seed)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds,
                      categorical_feature=cat_cols,
                      callbacks=callbacks)

            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(
                X_test, num_iteration=model.best_iteration_)[:, 1]

            # Best score.
            log_best(model)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train,
                                     feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid,
                                     feature_names=X.columns)
            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data,
                              num_boost_round=n_estimators,
                              evals=watchlist,
                              early_stopping_rounds=early_stopping_rounds,
                              verbose_eval=verbose,
                              params=params)
            y_pred_valid = model.predict(
                xgb.DMatrix(X_valid, feature_names=X.columns),
                ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(
                xgb.DMatrix(X_test, feature_names=X.columns),
                ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model.fit(X_train, y_train)
            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](
                y_valid, y_pred_valid)
            # No fold counter exists in this branch, so only the metric is printed.
            print(f'{eval_metric}: {score:.4f}.')
            print('')
            y_pred = model.predict_proba(X_test)[:, 1]

        if model_type == 'cat':
            model = CatBoostClassifier(
                iterations=n_estimators,
                eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                **params)
            model.fit(X_train, y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=cat_cols,
                      use_best_model=True,
                      verbose=verbose)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]

        ##### how to metric ###################################################
        if averaging == 'usual':
            scores.append(
                metrics_dict[eval_metric]['sklearn_scoring_function'](
                    y_valid, y_pred_valid))
            prediction += y_pred.reshape(-1, 1)
        elif averaging == 'rank':
            scores.append(
                metrics_dict[eval_metric]['sklearn_scoring_function'](
                    y_valid, y_pred_valid))
            prediction += pd.Series(y_pred).rank().values.reshape(-1, 1)

        if model_type == 'lgb' and plot_feature_importance:
            # Feature importance.
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)

        gc.collect()

        prediction /= n_splits
        print('\nCV mean score: {0:.4f}, std: {1:.4f}.'.format(
            np.mean(scores), np.std(scores)))

        result_dict['prediction'] = prediction
        result_dict['scores'] = scores

    else:
        # Split and train on folds.
        for fold_n, (train_index, valid_index) in enumerate(
                folds.split(X, y, groups=groups)):
            print(f'\nFold {fold_n + 1} started at {time.ctime()}')
            if type(X) == np.ndarray:
                X_train, X_valid = (X[columns][train_index],
                                    X[columns][valid_index])
                y_train, y_valid = y[train_index], y[valid_index]
            else:
                X_train, X_valid = (X[columns].iloc[train_index],
                                    X[columns].iloc[valid_index])
                y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

            ###### models ##################################################
            if model_type == 'lgb':
                # Logger.
                logging.debug('\n\n=== lgb training =========')
                logger = logging.getLogger('main')
                callbacks = [log_evaluation(logger, period=500)]

                # Build and fit the model.
                model = lgb.LGBMClassifier(**params,
                                           n_estimators=n_estimators,
                                           n_jobs=n_jobs, seed=seed)
                model.fit(
                    X_train, y_train,
                    eval_set=[(X_train, y_train), (X_valid, y_valid)],
                    eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                    verbose=verbose,
                    early_stopping_rounds=early_stopping_rounds,
                    categorical_feature=cat_cols,
                    callbacks=callbacks)

                y_pred_valid = model.predict_proba(X_valid)[:, 1]
                y_pred = model.predict_proba(
                    X_test, num_iteration=model.best_iteration_)[:, 1]

                # Best score.
                log_best(model)

            if model_type == 'xgb':
                train_data = xgb.DMatrix(data=X_train, label=y_train,
                                         feature_names=X.columns)
                valid_data = xgb.DMatrix(data=X_valid, label=y_valid,
                                         feature_names=X.columns)
                watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
                model = xgb.train(dtrain=train_data,
                                  num_boost_round=n_estimators,
                                  evals=watchlist,
                                  early_stopping_rounds=early_stopping_rounds,
                                  verbose_eval=verbose,
                                  params=params)
                y_pred_valid = model.predict(
                    xgb.DMatrix(X_valid, feature_names=X.columns),
                    ntree_limit=model.best_ntree_limit)
                y_pred = model.predict(
                    xgb.DMatrix(X_test, feature_names=X.columns),
                    ntree_limit=model.best_ntree_limit)

            if model_type == 'sklearn':
                model.fit(X_train, y_train)
                y_pred_valid = model.predict(X_valid).reshape(-1, )
                score = metrics_dict[eval_metric]['sklearn_scoring_function'](
                    y_valid, y_pred_valid)
                print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
                print('')
                y_pred = model.predict_proba(X_test)

            if model_type == 'cat':
                model = CatBoostClassifier(
                    iterations=n_estimators,
                    eval_metric=metrics_dict[eval_metric]
                    ['catboost_metric_name'],
                    **params)
                model.fit(X_train, y_train,
                          eval_set=(X_valid, y_valid),
                          cat_features=cat_cols,
                          use_best_model=True,
                          verbose=verbose)
                y_pred_valid = model.predict_proba(X_valid)[:, 1]
                y_pred = model.predict_proba(X_test)[:, 1]

            gc.collect()

            ##### how to metric ###################################################
            if averaging == 'usual':
                oof[valid_index] = y_pred_valid.reshape(-1, 1)
                scores.append(
                    metrics_dict[eval_metric]['sklearn_scoring_function'](
                        y_valid, y_pred_valid))
                prediction += y_pred.reshape(-1, 1)
            elif averaging == 'rank':
                oof[valid_index] = y_pred_valid.reshape(-1, 1)
                scores.append(
                    metrics_dict[eval_metric]['sklearn_scoring_function'](
                        y_valid, y_pred_valid))
                prediction += pd.Series(y_pred).rank().values.reshape(-1, 1)

            if model_type == 'lgb' and plot_feature_importance:
                # Feature importance.
                fold_importance = pd.DataFrame()
                fold_importance["feature"] = columns
                fold_importance["importance"] = model.feature_importances_
                fold_importance["fold"] = fold_n + 1
                feature_importance = pd.concat(
                    [feature_importance, fold_importance], axis=0)

            gc.collect()

        prediction /= n_splits
        print('\nCV mean score: {0:.4f}, std: {1:.4f}.'.format(
            np.mean(scores), np.std(scores)))

        result_dict['oof'] = oof
        result_dict['prediction'] = prediction
        result_dict['scores'] = scores

    ####################################################################
    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= n_splits
            cols = feature_importance[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index
            best_features = feature_importance.loc[
                feature_importance.feature.isin(cols)]
            # plt.figure(figsize=(16, 12))
            # sns.barplot(x="importance", y="feature",
            #             data=best_features.sort_values(by="importance",
            #                                            ascending=False))
            # plt.title('LGB Features (avg over folds)')

            result_dict['feature_importance'] = feature_importance
            result_dict['top_columns'] = cols

    return result_dict
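# --- Hedged usage sketch for train_model_classification. The variable names
# (X_train, X_test, y_train, cat_features) and all parameter values below are
# illustrative assumptions, not project defaults.
from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lgb_params = {'objective': 'binary', 'learning_rate': 0.05, 'num_leaves': 64}
result = train_model_classification(
    X=X_train, X_test=X_test, y=y_train,
    params=lgb_params, folds=folds, model_type='lgb', eval_metric='auc',
    plot_feature_importance=True, early_stopping_rounds=200,
    n_estimators=10000, averaging='usual', cat_cols=cat_features)
oof, test_pred = result['oof'], result['prediction']  # OOF and averaged test preds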