def train_and_predict(X_train, X_valid, y_train, y_valid, X_test, lgbm_params):
    # Build the datasets; the target is log1p-transformed so the model
    # optimises RMSE on log(1 + y) (an RMSLE-style objective)
    y_train = np.log1p(y_train)
    y_valid = np.log1p(y_valid)
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    logging.debug(lgbm_params)

    # Set up the logger
    logger = logging.getLogger('main')
    callbacks = [log_evaluation(logger, period=30)]

    model = lgb.train(
        params=lgbm_params,
        train_set=lgb_train,
        valid_sets=lgb_eval,
        num_boost_round=8000,
        early_stopping_rounds=10,
        callbacks=callbacks
    )

    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    # Undo the log1p transform and clip negative predictions to zero
    y_pred = np.expm1(y_pred)
    y_pred[y_pred < 0] = 0

    # lgb.plot_importance(model, importance_type="gain", max_num_features=20, figsize=(12, 6))
    # plt.show()

    return y_pred, model
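# NOTE: the snippets in this section target the pre-4.0 LightGBM API, where
# early_stopping_rounds / verbose_eval are still accepted by lgb.train() and
# LGBMClassifier.fit(). Below is a minimal usage sketch for the log1p variant
# above (assuming that variant is the one in scope); the synthetic data,
# column names, and parameter values are illustrative assumptions, not part
# of the original snippet.
def _demo_train_and_predict_log1p():
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(500, 5)), columns=[f'f{i}' for i in range(5)])
    y = np.expm1(X['f0']).clip(lower=0)  # non-negative target, as log1p expects
    X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=0)
    X_test = pd.DataFrame(rng.normal(size=(100, 5)), columns=X.columns)

    lgbm_params = {'objective': 'regression', 'metric': 'rmse', 'verbosity': -1}
    y_pred, model = train_and_predict(X_tr, X_va, y_tr, y_va, X_test, lgbm_params)
    return y_pred, model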
def train_and_predict(X_train, X_valid, y_train, y_valid, X_test, lgbm_params, oof):
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    logging.debug(lgbm_params)
    logger = logging.getLogger('main')
    callbacks = [log_evaluation(logger, period=100)]

    num_round = 20000
    model = lgb.train(lgbm_params,
                      lgb_train,
                      num_boost_round=num_round,
                      valid_sets=[lgb_train, lgb_eval],
                      verbose_eval=200,
                      early_stopping_rounds=200,
                      callbacks=callbacks)

    # Write this fold's validation predictions into the caller's
    # out-of-fold array (assumes X_valid keeps its original positional index)
    oof[X_valid.index] = model.predict(X_valid, num_iteration=model.best_iteration)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    return y_pred, model, oof
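# Hedged sketch of how the out-of-fold variant above is typically driven from
# a cross-validation loop. The fold count, data, and parameters are
# assumptions, not part of the original snippet.
def _demo_train_and_predict_oof():
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import KFold

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(500, 5)), columns=[f'f{i}' for i in range(5)])
    y = pd.Series(X['f0'] + rng.normal(size=500))
    X_test = pd.DataFrame(rng.normal(size=(100, 5)), columns=X.columns)
    lgbm_params = {'objective': 'regression', 'metric': 'rmse', 'verbosity': -1}

    oof = np.zeros(len(X))
    preds = np.zeros(len(X_test))
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    for tr_idx, va_idx in kf.split(X):
        # .iloc keeps the original RangeIndex on X_valid, which is what
        # oof[X_valid.index] relies on inside train_and_predict
        y_pred, model, oof = train_and_predict(X.iloc[tr_idx], X.iloc[va_idx],
                                               y.iloc[tr_idx], y.iloc[va_idx],
                                               X_test, lgbm_params, oof)
        preds += y_pred / kf.n_splits
    return oof, preds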
def train_and_predict(X_train, X_valid, y_train, y_valid, X_test, lgbm_params):
    # Build the datasets
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    logging.debug(lgbm_params)

    # Set up the logger
    logger = logging.getLogger('main')
    callbacks = [log_evaluation(logger, period=30)]

    # Train the model with the parameters above
    model = lgb.train(
        lgbm_params,
        lgb_train,
        # validation data used to monitor the model
        valid_sets=lgb_eval,
        # train for at most 1000 rounds
        num_boost_round=1000,
        # stop early if the score does not improve for 10 rounds
        early_stopping_rounds=10,
        # logging
        callbacks=callbacks)

    # Predict on the test data
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    return y_pred, model
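# Every snippet in this section passes log_evaluation(logger, period=...) to
# LightGBM, but the callback itself is defined elsewhere. The sketch below is
# one common implementation of such a logger-backed callback, reconstructed
# from the call sites; the exact original may differ.
def log_evaluation(logger, period=1, level=logging.DEBUG):
    def _callback(env):
        # env.evaluation_result_list holds (dataset_name, metric_name,
        # value, is_higher_better) tuples for the current iteration
        if period > 0 and env.evaluation_result_list and (env.iteration + 1) % period == 0:
            result = '\t'.join(
                f"{name}'s {metric}: {value:g}"
                for name, metric, value, *_ in env.evaluation_result_list)
            logger.log(level, f'[{env.iteration + 1}]\t{result}')
    _callback.order = 10  # run after LightGBM's built-in callbacks
    return _callback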
def fit(self):
    watchlist = [self.train, self.valid]
    logger = logging.getLogger('main')
    callbacks = [log_evaluation(logger, period=10)]
    self.model = lgb.train(self.params['model_params'],
                           self.train,
                           valid_sets=watchlist,
                           callbacks=callbacks)
    return self.model
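# fit() above reads self.train, self.valid, and self.params, so it is a
# method of a wrapper class that is not shown here. The constructor below is
# an illustrative assumption about that class, inferred from the attribute
# names only; the real class may differ.
class ModelWrapper:
    def __init__(self, params, X_train, y_train, X_valid, y_valid):
        self.params = params  # expected to carry a 'model_params' dict for lgb.train
        self.train = lgb.Dataset(X_train, y_train)
        self.valid = lgb.Dataset(X_valid, y_valid, reference=self.train)
        self.model = None

    fit = fit  # reuse the module-level method defined above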
def train_model_classification(X, X_test, y, params, folds, model_type='lgb',
                               eval_metric='auc', columns=None, seed=0,
                               plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200,
                               n_estimators=50000, splits=None, n_folds=3,
                               averaging='usual', n_jobs=-1, cat_cols="",
                               valid_rate=0.8, groups=None):
    # assumes: numpy as np, pandas as pd, lightgbm as lgb, xgboost as xgb,
    # CatBoostClassifier, sklearn's metrics, math, time, gc, logging, and the
    # eval_auc / log_best helpers are available in the surrounding module
    columns = X.columns if columns is None else columns
    n_splits = folds.n_splits if splits is None else n_folds
    X_test = X_test[columns]

    # set up scoring parameters
    metrics_dict = {
        'auc': {
            'lgb_metric_name': eval_auc,
            'catboost_metric_name': 'AUC',
            'sklearn_scoring_function': metrics.roc_auc_score
        },
    }

    result_dict = {}
    if averaging == 'usual':
        # out-of-fold predictions on train data
        oof = np.zeros((len(X), 1))
        # averaged predictions on test data
        prediction = np.zeros((len(X_test), 1))
    elif averaging == 'rank':
        # identical initialisation; the modes differ only in how
        # test predictions are accumulated below
        oof = np.zeros((len(X), 1))
        prediction = np.zeros((len(X_test), 1))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()

    if folds == 'train_test_split_time_series':
        n_splits = 1
        # single chronological split: the first valid_rate share of the rows
        # is used for training, the remainder for validation (the original
        # assigned the same range to both, which would validate on train rows)
        train_index = np.arange(math.floor(X.shape[0] * valid_rate))
        valid_index = np.arange(math.floor(X.shape[0] * valid_rate), X.shape[0])
        print(f'\ntrain started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        ###### models ##################################################
        if model_type == 'lgb':
            # logger
            logging.debug('\n\n=== lgb training =========')
            logger = logging.getLogger('main')
            callbacks = [log_evaluation(logger, period=500)]

            # build the model
            model = lgb.LGBMClassifier(**params,
                                       n_estimators=n_estimators,
                                       n_jobs=n_jobs,
                                       seed=seed)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds,
                      categorical_feature=cat_cols,
                      callbacks=callbacks)

            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1]

            # best score
            log_best(model)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data,
                              num_boost_round=n_estimators,
                              evals=watchlist,
                              early_stopping_rounds=early_stopping_rounds,
                              verbose_eval=verbose,
                              params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1,)
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            # no fold counter exists in this single-split branch, so the
            # original "Fold {fold_n}" prefix (fold_n undefined here) is dropped
            print(f'{eval_metric}: {score:.4f}.')
            print('')

            y_pred = model.predict_proba(X_test)

        if model_type == 'cat':
            model = CatBoostClassifier(iterations=n_estimators,
                                       eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                                       **params)
            model.fit(X_train, y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=cat_cols,
                      use_best_model=True,
                      verbose=verbose)

            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]

        ##### how to metric ###################################################
        if averaging == 'usual':
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
            prediction += y_pred.reshape(-1, 1)
        elif averaging == 'rank':
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
            prediction += pd.Series(y_pred).rank().values.reshape(-1, 1)

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

        gc.collect()

        prediction /= n_splits
        print('\nCV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

        result_dict['prediction'] = prediction
        result_dict['scores'] = scores

    else:
        # split and train on folds
        for fold_n, (train_index, valid_index) in enumerate(folds.split(X, groups=groups)):
            print(f'\nFold {fold_n + 1} started at {time.ctime()}')
            if type(X) == np.ndarray:
                X_train, X_valid = X[columns][train_index], X[columns][valid_index]
                y_train, y_valid = y[train_index], y[valid_index]
            else:
                X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
                y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

            ###### models ##################################################
            if model_type == 'lgb':
                # logger
                logging.debug('\n\n=== lgb training =========')
                logger = logging.getLogger('main')
                callbacks = [log_evaluation(logger, period=500)]

                # build the model
                model = lgb.LGBMClassifier(**params,
                                           n_estimators=n_estimators,
                                           n_jobs=n_jobs,
                                           seed=seed)
                model.fit(X_train, y_train,
                          eval_set=[(X_train, y_train), (X_valid, y_valid)],
                          eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                          verbose=verbose,
                          early_stopping_rounds=early_stopping_rounds,
                          categorical_feature=cat_cols,
                          callbacks=callbacks)

                y_pred_valid = model.predict_proba(X_valid)[:, 1]
                y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1]

                # best score
                log_best(model)

            if model_type == 'xgb':
                train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
                valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

                watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
                model = xgb.train(dtrain=train_data,
                                  num_boost_round=n_estimators,
                                  evals=watchlist,
                                  early_stopping_rounds=early_stopping_rounds,
                                  verbose_eval=verbose,
                                  params=params)
                y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns),
                                             ntree_limit=model.best_ntree_limit)
                y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns),
                                       ntree_limit=model.best_ntree_limit)

            if model_type == 'sklearn':
                model = model
                model.fit(X_train, y_train)

                y_pred_valid = model.predict(X_valid).reshape(-1,)
                score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
                print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
                print('')

                y_pred = model.predict_proba(X_test)

            if model_type == 'cat':
                model = CatBoostClassifier(iterations=n_estimators,
                                           eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                                           **params)
                model.fit(X_train, y_train,
                          eval_set=(X_valid, y_valid),
                          cat_features=cat_cols,
                          use_best_model=True,
                          verbose=verbose)

                y_pred_valid = model.predict_proba(X_valid)[:, 1]
                y_pred = model.predict_proba(X_test)[:, 1]
            gc.collect()

            ##### how to metric ###################################################
            if averaging == 'usual':
                oof[valid_index] = y_pred_valid.reshape(-1, 1)
                scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
                prediction += y_pred.reshape(-1, 1)
            elif averaging == 'rank':
                oof[valid_index] = y_pred_valid.reshape(-1, 1)
                scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
                prediction += pd.Series(y_pred).rank().values.reshape(-1, 1)

            if model_type == 'lgb' and plot_feature_importance:
                # feature importance
                fold_importance = pd.DataFrame()
                fold_importance["feature"] = columns
                fold_importance["importance"] = model.feature_importances_
                fold_importance["fold"] = fold_n + 1
                feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

            gc.collect()

        prediction /= n_splits
        print('\nCV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

        result_dict['oof'] = oof
        result_dict['prediction'] = prediction
        result_dict['scores'] = scores

    ####################################################################
    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= n_splits
            cols = feature_importance[["feature", "importance"]].groupby(
                "feature").mean().sort_values(by="importance", ascending=False)[:50].index

            # best_features is only needed by the commented-out plot below
            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            # plt.figure(figsize=(16, 12))
            # sns.barplot(x="importance", y="feature",
            #             data=best_features.sort_values(by="importance", ascending=False))
            # plt.title('LGB Features (avg over folds)')

            result_dict['feature_importance'] = feature_importance
            result_dict['top_columns'] = cols

    return result_dict
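# Hedged usage sketch for train_model_classification (not part of the
# original): the synthetic data, fold object, and parameters are illustrative
# assumptions. eval_auc and log_best are helpers assumed to be defined
# elsewhere in this codebase, and the function targets the pre-4.0 LightGBM
# sklearn API (verbose / early_stopping_rounds as fit() arguments).
def _demo_train_model_classification():
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import KFold

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(1000, 8)), columns=[f'f{i}' for i in range(8)])
    y = pd.Series((X['f0'] + 0.5 * rng.normal(size=1000) > 0).astype(int))
    X_test = pd.DataFrame(rng.normal(size=(200, 8)), columns=X.columns)

    params = {'objective': 'binary', 'learning_rate': 0.05, 'num_leaves': 31}
    # KFold is used here because the function calls folds.split(X, groups=groups)
    # without passing y, which StratifiedKFold would require
    folds = KFold(n_splits=3, shuffle=True, random_state=0)
    result = train_model_classification(X, X_test, y, params, folds,
                                        model_type='lgb', eval_metric='auc',
                                        n_estimators=2000, verbose=500,
                                        early_stopping_rounds=100,
                                        plot_feature_importance=True,
                                        cat_cols='auto')
    # result carries 'oof', 'prediction', 'scores', 'feature_importance', 'top_columns'
    return result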