def main(debug=False, use_pkl=False):
    num_rows = 10000 if debug else None
    with timer("Preprocessing"):
        df = get_df(num_rows) if not use_pkl else read_pickles('../output/df')
        to_pickles(df, '../output/df', split_size=30)
        print("df shape:", df.shape)
    with timer("Run LightGBM with kfold"):
        kfold_lightgbm(df, num_folds=NUM_FOLDS, stratified=False,
                       debug=debug, use_pkl=use_pkl)
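# `timer` is imported from Utils in this repo, whose source is not shown here.
# A minimal sketch, assuming it is a plain timing contextmanager (the message
# format is a guess):
import time
from contextlib import contextmanager


@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(name, time.time() - t0))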
import numpy as np
import lightgbm
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold, StratifiedKFold

from Preprocessing_326 import get_df
from Learning_lgbm_326 import get_folds
from Utils import NUM_FOLDS, EXCLUDED_FEATURES, read_pickles, line_notify

# References:
# https://github.com/fmfn/BayesianOptimization
# https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code

NUM_ROWS = None
TRAIN_DF = read_pickles('../output/train_df_agg')

# split test & train
#TRAIN_DF = DF[~DF['IS_TEST']]

FEATS = [
    f for f in TRAIN_DF.columns
    if f not in EXCLUDED_FEATURES + ['totals.transactionRevenue_SUM']
]

lgbm_train = lightgbm.Dataset(TRAIN_DF[FEATS],
                              np.log1p(TRAIN_DF['totals.transactionRevenue_SUM']),
                              free_raw_data=False)


def lgbm_eval(num_leaves, colsample_bytree, subsample, max_depth, reg_alpha,
              reg_lambda, min_split_gain, min_child_weight, min_data_in_leaf):
    # NOTE: the source was truncated after `reg_alpha,`; the remaining
    # arguments and the body below are a hedged reconstruction, assuming the
    # function cross-validates one hyperparameter set and returns a score for
    # BayesianOptimization to maximize (hence the negated RMSE).
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.02,
        'num_leaves': int(num_leaves),
        'colsample_bytree': colsample_bytree,
        'subsample': subsample,
        'max_depth': int(max_depth),
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'min_split_gain': min_split_gain,
        'min_child_weight': min_child_weight,
        'min_data_in_leaf': int(min_data_in_leaf),
        'verbose': -1,
    }
    cv_result = lightgbm.cv(params,
                            lgbm_train,
                            num_boost_round=10000,
                            early_stopping_rounds=200,
                            verbose_eval=100,
                            seed=47)
    return -cv_result['rmse-mean'][-1]
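# The referenced fmfn/BayesianOptimization repo drives `lgbm_eval` roughly as
# below; the search bounds here are illustrative assumptions, not tuned values:
lgbm_bo = BayesianOptimization(lgbm_eval, {
    'num_leaves': (16, 128),
    'colsample_bytree': (0.1, 1.0),
    'subsample': (0.1, 1.0),
    'max_depth': (4, 12),
    'reg_alpha': (0.0, 15.0),
    'reg_lambda': (0.0, 15.0),
    'min_split_gain': (0.0, 1.0),
    'min_child_weight': (1, 50),
    'min_data_in_leaf': (5, 50),
})
lgbm_bo.maximize(init_points=5, n_iter=25)

# best parameter set (attribute name varies across bayes_opt versions;
# older releases exposed it as lgbm_bo.res['max'] instead)
print(lgbm_bo.max)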
import gc

import numpy as np
import pandas as pd
import xgboost as xgb


def kfold_xgboost(df, num_folds, stratified=False, debug=False, use_pkl=False):
    # Divide in training/validation and test data
    train_df = df[~df['IS_TEST']]
    test_df = df[df['IS_TEST']]
    print("Starting XGBoost. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    ############################################################################
    # Session Level predictions
    ############################################################################
    print('Starting Session Level predictions...')

    # Cross validation model
    folds_session = get_folds(df=train_df, n_splits=num_folds)

    # Create arrays and dataframes to store results
    oof_preds_session = np.zeros(train_df.shape[0])
    sub_preds_session = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [
        f for f in train_df.columns
        if f not in EXCLUDED_FEATURES + ['totals.transactionRevenue']
    ]

    # build the test df as a DMatrix up front for the final predict
    # (no label is needed for prediction)
    test_df_dmtrx = xgb.DMatrix(test_df[feats])

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds_session):
        train_x, train_y = train_df[feats].iloc[train_idx], np.log1p(
            train_df['totals.transactionRevenue'].iloc[train_idx])
        valid_x, valid_y = train_df[feats].iloc[valid_idx], np.log1p(
            train_df['totals.transactionRevenue'].iloc[valid_idx])

        # set data structure
        xgb_train = xgb.DMatrix(train_x, label=train_y)
        xgb_test = xgb.DMatrix(valid_x, label=valid_y)

        # params
        params = {
            'objective': 'gpu:reg:linear',  # GPU parameter
            'booster': 'gbtree',
            'eval_metric': 'rmse',
            'silent': 1,
            'eta': 0.01,
            'max_depth': 6,
            'min_child_weight': 19,
            'gamma': 0.479411416192221,
            'subsample': 0.976329169063721,
            'colsample_bytree': 0.921410871323335,
            'colsample_bylevel': 0.603858358771505,
            'alpha': 9.86942860885701,
            'lambda': 9.63581598065735,
            'tree_method': 'gpu_hist',  # GPU parameter
            'predictor': 'gpu_predictor',  # GPU parameter
            'seed': int(2**n_fold)
        }

        reg = xgb.train(params,
                        xgb_train,
                        num_boost_round=10000,
                        evals=[(xgb_train, 'train'), (xgb_test, 'test')],
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model('../output/models/xgb_session_' + str(n_fold) + '.txt')

        oof_preds_session[valid_idx] = np.expm1(reg.predict(xgb_test))
        sub_preds_session += np.expm1(reg.predict(test_df_dmtrx)) / num_folds

        fold_importance_df = pd.DataFrame.from_dict(
            reg.get_score(importance_type='gain'),
            orient='index',
            columns=['importance'])
        fold_importance_df["feature"] = fold_importance_df.index.tolist()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' %
              (n_fold + 1,
               rmse(valid_y, np.log1p(oof_preds_session[valid_idx]))))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    del test_df_dmtrx
    gc.collect()

    # show the full RMSE score & send a LINE notification
    full_rmse_session = rmse(np.log1p(train_df['totals.transactionRevenue']),
                             np.log1p(oof_preds_session))
    line_notify('XGBoost Session Level Full RMSE score %.6f' % full_rmse_session)

    # session level feature importance
    display_importances(feature_importance_df,
                        '../output/xgb_importances_session.png',
                        '../output/feature_importance_xgb_session.csv')

    # save the predictions
    train_df.loc[:, 'predictions'] = oof_preds_session
    test_df.loc[:, 'predictions'] = sub_preds_session
    del oof_preds_session, sub_preds_session
    gc.collect()

    # save as csv
    train_df['predictions'].to_csv("../output/oof_xgb_session.csv")
    test_df['predictions'].to_csv("../output/sub_xgb_session.csv")
    ############################################################################
    # User Level predictions
    ############################################################################
    print('Starting User Level predictions...')

    if use_pkl:
        del train_df, test_df
        gc.collect()

        # load pkl
        train_df_agg = read_pickles('../output/train_df_agg_xgb')
        test_df_agg = read_pickles('../output/test_df_agg_xgb')
    else:
        # Aggregate data at User level
        aggregations = {'totals.transactionRevenue': ['sum']}
        for col in feats + ['predictions']:
            aggregations[col] = ['sum', 'max', 'min', 'mean']

        train_df_agg = train_df[
            feats + ['fullVisitorId', 'totals.transactionRevenue', 'predictions']
        ].groupby('fullVisitorId').agg(aggregations)
        del train_df
        gc.collect()

        test_df_agg = test_df[
            feats + ['fullVisitorId', 'totals.transactionRevenue', 'predictions']
        ].groupby('fullVisitorId').agg(aggregations)
        del test_df
        gc.collect()

        # reshape header
        train_df_agg.columns = pd.Index(
            [e[0] + "_" + e[1].upper() for e in train_df_agg.columns.tolist()])
        test_df_agg.columns = pd.Index(
            [e[0] + "_" + e[1].upper() for e in test_df_agg.columns.tolist()])

        # to float32
        train_df_agg = train_df_agg.astype('float32')
        test_df_agg = test_df_agg.astype('float32')

        # save pkl
        to_pickles(train_df_agg, '../output/train_df_agg_xgb',
                   split_size=50, inplace=False)
        to_pickles(test_df_agg, '../output/test_df_agg_xgb',
                   split_size=5, inplace=False)

    # Cross validation model
    folds_agg = get_folds(df=train_df_agg[['totals.pageviews_MEAN']].reset_index(),
                          n_splits=num_folds)

    # Create arrays and dataframes to store results
    oof_preds_agg = np.zeros(train_df_agg.shape[0])
    sub_preds_agg = np.zeros(test_df_agg.shape[0])
    feature_importance_df_agg = pd.DataFrame()
    feats_agg = [
        f for f in train_df_agg.columns
        if f not in EXCLUDED_FEATURES + ['totals.transactionRevenue_SUM']
    ]

    # keep the index for building the submission file
    test_df_agg_index = test_df_agg.index

    # build the test df as a DMatrix up front for the final predict
    # (no label is needed for prediction)
    test_df_agg_dmtrx = xgb.DMatrix(test_df_agg[feats_agg])

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds_agg):
        train_x, train_y = train_df_agg[feats_agg].iloc[train_idx], np.log1p(
            train_df_agg['totals.transactionRevenue_SUM'].iloc[train_idx])
        valid_x, valid_y = train_df_agg[feats_agg].iloc[valid_idx], np.log1p(
            train_df_agg['totals.transactionRevenue_SUM'].iloc[valid_idx])

        # set data structure
        xgb_train = xgb.DMatrix(train_x, label=train_y)
        xgb_test = xgb.DMatrix(valid_x, label=valid_y)

        # grid search is not feasible here, so we reuse the params tuned for lightgbm
        params = {
            'objective': 'gpu:reg:linear',  # GPU parameter
            'booster': 'gbtree',
            'eval_metric': 'rmse',
            'silent': 1,
            'eta': 0.01,
            'max_depth': 7,
            'min_child_weight': 0.14207610657307,
            'gamma': 0.46299516643071,
            'subsample': 0.740095188787127,
            'colsample_bytree': 0.698723156053225,
            'colsample_bylevel': 0.306359150497576,
            'alpha': 14.3019796761524,
            'lambda': 9.48248448679231,
            'tree_method': 'gpu_hist',  # GPU parameter
            'predictor': 'gpu_predictor',  # GPU parameter
            'seed': int(2**n_fold)
        }

        reg = xgb.train(params,
                        xgb_train,
                        num_boost_round=10000,
                        evals=[(xgb_train, 'train'), (xgb_test, 'test')],
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model('../output/models/xgb_user_' + str(n_fold) + '.txt')

        oof_preds_agg[valid_idx] = np.expm1(reg.predict(xgb_test))
        sub_preds_agg += np.expm1(reg.predict(test_df_agg_dmtrx)) / num_folds

        fold_importance_df = pd.DataFrame.from_dict(
            reg.get_score(importance_type='gain'),
            orient='index',
            columns=['importance'])
        fold_importance_df["feature"] = fold_importance_df.index.tolist()
fold_importance_df["fold"] = n_fold + 1 feature_importance_df_agg = pd.concat( [feature_importance_df_agg, fold_importance_df], axis=0) print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, np.log1p(oof_preds_agg[valid_idx])))) del reg, train_x, train_y, valid_x, valid_y, fold_importance_df gc.collect() # Full RMSEスコアの表示&LINE通知 full_rmse_agg = rmse( np.log1p(train_df_agg['totals.transactionRevenue_SUM']), np.log1p(oof_preds_agg)) line_notify('Visitor Level Full RMSE score %.6f' % full_rmse_agg) # session level feature importance display_importances(feature_importance_df_agg, '../output/xgb_importances_agg.png', '../output/feature_importance_xgb_agg.csv') if not debug: # 提出データの予測値を保存 submission = pd.DataFrame() submission['PredictedLogRevenue'] = sub_preds_agg submission.index = test_df_agg_index submission['PredictedLogRevenue'] = np.log1p( submission['PredictedLogRevenue']) submission['PredictedLogRevenue'] = submission[ 'PredictedLogRevenue'].apply(lambda x: 0.0 if x < 0 else x) submission['PredictedLogRevenue'] = submission[ 'PredictedLogRevenue'].fillna(0) submission.to_csv(submission_file_name, index=True) # out of foldの予測値を保存 train_df_agg['OOF_PRED'] = oof_preds_agg train_df_agg[['OOF_PRED', 'totals.transactionRevenue_SUM']].to_csv(oof_file_name, index=True) # API経由でsubmit submit(submission_file_name, comment='cv: %.6f' % full_rmse_agg)
import numpy as np
import lightgbm
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold, StratifiedKFold

from Preprocessing_326 import get_df
from Learning_lgbm_326 import get_folds
from Utils import NUM_FOLDS, EXCLUDED_FEATURES, read_pickles, line_notify

# References:
# https://github.com/fmfn/BayesianOptimization
# https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code

NUM_ROWS = None

#DF = get_df(NUM_ROWS)
DF = read_pickles('../output/df')
#DF = read_pickles('../output/train_df_agg')

# split test & train
TRAIN_DF = DF[~DF['IS_TEST']]

FEATS = [
    f for f in TRAIN_DF.columns
    if f not in EXCLUDED_FEATURES + ['totals.transactionRevenue']
]

lgbm_train = lightgbm.Dataset(TRAIN_DF[FEATS],
                              np.log1p(TRAIN_DF['totals.transactionRevenue']),
                              free_raw_data=False)

folds = get_folds(df=TRAIN_DF, n_splits=NUM_FOLDS)
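# `to_pickles` / `read_pickles` come from Utils and split a large dataframe
# across several pickle files (avoiding single-file pickle size limits).
# A minimal sketch assuming that behavior; the chunked file layout is a guess:
import os

import numpy as np
import pandas as pd


def to_pickles(df, path, split_size=3, inplace=True):
    # write df as path/000.pkl ... path/NNN.pkl
    if inplace:
        df.reset_index(drop=True, inplace=True)
    os.makedirs(path, exist_ok=True)
    for i, chunk in enumerate(np.array_split(df, split_size)):
        chunk.to_pickle('{}/{:03d}.pkl'.format(path, i))


def read_pickles(path):
    # read the chunks back in order and re-concatenate
    files = sorted(os.listdir(path))
    return pd.concat([pd.read_pickle(os.path.join(path, f)) for f in files])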
import gc

import lightgbm as lgb
import numpy as np
import pandas as pd


def kfold_lightgbm(df, num_folds, stratified=False, debug=False, use_pkl=False):
    # Divide in training/validation and test data
    train_df = df[~df['IS_TEST']]
    test_df = df[df['IS_TEST']]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    ############################################################################
    # Session Level predictions
    ############################################################################
    print('Starting Session Level predictions...')

    # Cross validation model
    folds_session = get_folds(df=train_df, n_splits=num_folds)

    # Create arrays and dataframes to store results
    oof_preds_session = np.zeros(train_df.shape[0])
    sub_preds_session = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [
        f for f in train_df.columns
        if f not in EXCLUDED_FEATURES + ['totals.transactionRevenue']
    ]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds_session):
        train_x, train_y = train_df[feats].iloc[train_idx], np.log1p(
            train_df['totals.transactionRevenue'].iloc[train_idx])
        valid_x, valid_y = train_df[feats].iloc[valid_idx], np.log1p(
            train_df['totals.transactionRevenue'].iloc[valid_idx])

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

        # parameters are rough (not carefully tuned)
        params = {
            'device': 'gpu',
            # 'gpu_use_dp': True,
            'task': 'train',
            'boosting': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
            'num_leaves': 64,
            'colsample_bytree': 0.769143040610826,
            'subsample': 0.295302403483027,
            'max_depth': 8,
            'reg_alpha': 9.37961252311552,
            'reg_lambda': 2.82500347706399,
            'min_split_gain': 0.153268455490808,
            'min_child_weight': 44,
            'min_data_in_leaf': 45,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }

        reg = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model('../output/models/lgbm_session_' + str(n_fold) + '.txt')

        oof_preds_session[valid_idx] = np.expm1(
            reg.predict(valid_x, num_iteration=reg.best_iteration))
        sub_preds_session += np.expm1(
            reg.predict(test_df[feats],
                        num_iteration=reg.best_iteration)) / num_folds

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(
            reg.feature_importance(importance_type='gain',
                                   iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' %
              (n_fold + 1,
               rmse(valid_y, np.log1p(oof_preds_session[valid_idx]))))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # show the full RMSE score & send a LINE notification
    full_rmse_session = rmse(np.log1p(train_df['totals.transactionRevenue']),
                             np.log1p(oof_preds_session))
    line_notify('Session Level Full RMSE score %.6f' % full_rmse_session)

    # session level feature importance
    display_importances(feature_importance_df,
                        '../output/lgbm_importances_session.png',
                        '../output/feature_importance_lgbm_session.csv')

    # save the predictions
    train_df.loc[:, 'predictions'] = oof_preds_session
    test_df.loc[:, 'predictions'] = sub_preds_session
    del oof_preds_session, sub_preds_session
    gc.collect()

    # save as csv
    train_df['predictions'].to_csv("../output/oof_lgbm_session.csv")
    test_df['predictions'].to_csv("../output/sub_lgbm_session.csv")
    ############################################################################
    # User Level predictions
    ############################################################################
    print('Starting User Level predictions...')

    if use_pkl:
        del train_df, test_df
        gc.collect()

        # load pkl
        train_df_agg = read_pickles('../output/train_df_agg')
        test_df_agg = read_pickles('../output/test_df_agg')
    else:
        # Aggregate data at User level
        aggregations = {'totals.transactionRevenue': ['sum']}
        for col in feats + ['predictions']:
            aggregations[col] = ['sum', 'max', 'min', 'mean']

        train_df_agg = train_df[
            feats + ['fullVisitorId', 'totals.transactionRevenue', 'predictions']
        ].groupby('fullVisitorId').agg(aggregations)
        del train_df
        gc.collect()

        test_df_agg = test_df[
            feats + ['fullVisitorId', 'totals.transactionRevenue', 'predictions']
        ].groupby('fullVisitorId').agg(aggregations)
        del test_df
        gc.collect()

        # reshape header
        train_df_agg.columns = pd.Index(
            [e[0] + "_" + e[1].upper() for e in train_df_agg.columns.tolist()])
        test_df_agg.columns = pd.Index(
            [e[0] + "_" + e[1].upper() for e in test_df_agg.columns.tolist()])

        # to float32
        train_df_agg = train_df_agg.astype('float32')
        test_df_agg = test_df_agg.astype('float32')

        # save pkl
        to_pickles(train_df_agg, '../output/train_df_agg',
                   split_size=50, inplace=False)
        to_pickles(test_df_agg, '../output/test_df_agg',
                   split_size=5, inplace=False)

    # Cross validation model
    folds_agg = get_folds(df=train_df_agg[['totals.pageviews_MEAN']].reset_index(),
                          n_splits=num_folds)

    # Create arrays and dataframes to store results
    oof_preds_agg = np.zeros(train_df_agg.shape[0])
    sub_preds_agg = np.zeros(test_df_agg.shape[0])
    feature_importance_df_agg = pd.DataFrame()
    feats_agg = [
        f for f in train_df_agg.columns
        if f not in EXCLUDED_FEATURES + ['totals.transactionRevenue_SUM']
    ]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds_agg):
        train_x, train_y = train_df_agg[feats_agg].iloc[train_idx], np.log1p(
            train_df_agg['totals.transactionRevenue_SUM'].iloc[train_idx])
        valid_x, valid_y = train_df_agg[feats_agg].iloc[valid_idx], np.log1p(
            train_df_agg['totals.transactionRevenue_SUM'].iloc[valid_idx])

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

        # params estimated by bayesian opt
        params = {
            'device': 'gpu',
            # 'gpu_use_dp': True,
            'task': 'train',
            'boosting': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
            'num_leaves': 36,
            'colsample_bytree': 0.174047605805866,
            'subsample': 0.702214902667035,
            'max_depth': 8,
            'reg_alpha': 9.91242460129322,
            'reg_lambda': 0.357672819483952,
            'min_split_gain': 0.631115489088361,
            'min_child_weight': 15,
            'min_data_in_leaf': 9,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }

        reg = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model('../output/models/lgbm_user_' + str(n_fold) + '.txt')

        oof_preds_agg[valid_idx] = np.expm1(
            reg.predict(valid_x, num_iteration=reg.best_iteration))
        sub_preds_agg += np.expm1(
            reg.predict(test_df_agg[feats_agg],
                        num_iteration=reg.best_iteration)) / num_folds

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats_agg
        fold_importance_df["importance"] = np.log1p(
            reg.feature_importance(importance_type='gain',
                                   iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df_agg = pd.concat(
            [feature_importance_df_agg, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' %
              (n_fold + 1, rmse(valid_y, np.log1p(oof_preds_agg[valid_idx]))))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # show the full RMSE score & send a LINE notification
    full_rmse_agg = rmse(np.log1p(train_df_agg['totals.transactionRevenue_SUM']),
                         np.log1p(oof_preds_agg))
    line_notify('Visitor Level Full RMSE score %.6f' % full_rmse_agg)

    # user level feature importance
    display_importances(feature_importance_df_agg,
                        '../output/lgbm_importances_agg.png',
                        '../output/feature_importance_lgbm_agg.csv')

    if not debug:
        # save predictions for the submission data
        test_df_agg.loc[:, 'PredictedLogRevenue'] = sub_preds_agg
        submission = test_df_agg[['PredictedLogRevenue']].copy()
        submission['PredictedLogRevenue'] = np.log1p(
            submission['PredictedLogRevenue'])
        submission['PredictedLogRevenue'] = submission[
            'PredictedLogRevenue'].apply(lambda x: 0.0 if x < 0 else x)
        submission['PredictedLogRevenue'] = submission[
            'PredictedLogRevenue'].fillna(0)
        submission.to_csv(submission_file_name, index=True)

        # save the out-of-fold predictions
        train_df_agg['OOF_PRED'] = oof_preds_agg
        train_df_agg[['OOF_PRED', 'totals.transactionRevenue_SUM']].to_csv(
            oof_file_name, index=True)

        # submit via the API
        submit(submission_file_name, comment='cv: %.6f' % full_rmse_agg)
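# `get_folds` (Learning_lgbm_326) and `display_importances` are imported from
# elsewhere in the repo. The sketches below are hedged assumptions, not the
# repo's actual code: a visitor-level GroupKFold (the standard approach in this
# competition, keeping all sessions of a visitor in one fold) and a
# fold-averaged importance plot matching the (png, csv) call sites above.
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import GroupKFold


def get_folds(df=None, n_splits=5):
    """Return (train_idx, valid_idx) pairs that keep each visitor in one fold."""
    unique_vis = np.array(sorted(df['fullVisitorId'].unique()))
    folds = GroupKFold(n_splits=n_splits)
    fold_ids = []
    ids = np.arange(df.shape[0])
    for trn_vis, val_vis in folds.split(X=unique_vis, y=unique_vis,
                                        groups=unique_vis):
        fold_ids.append((
            ids[df['fullVisitorId'].isin(unique_vis[trn_vis])],
            ids[df['fullVisitorId'].isin(unique_vis[val_vis])],
        ))
    return fold_ids


def display_importances(feature_importance_df, png_path, csv_path):
    # average the per-fold gains, save the table, and plot the top features
    mean_imp = (feature_importance_df.groupby('feature')['importance']
                .mean().sort_values(ascending=False))
    mean_imp.to_csv(csv_path)
    mean_imp.head(50)[::-1].plot(kind='barh', figsize=(8, 12))
    plt.title('Feature importance (mean over folds)')
    plt.tight_layout()
    plt.savefig(png_path)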