Example #1
def main(debug=False, use_pkl=False):
    num_rows = 10000 if debug else None
    with timer("Preprocessing"):
        df = get_df(num_rows) if not use_pkl else read_pickles('../output/df')
        to_pickles(df, '../output/df', split_size=30)
        print("df shape:", df.shape)
    with timer("Run LightGBM with kfold"):
        kfold_lightgbm(df, num_folds=NUM_FOLDS, stratified=False, debug=debug, use_pkl=use_pkl)
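main() leans on helpers from the project's own modules (get_df, read_pickles, to_pickles, kfold_lightgbm, NUM_FOLDS; see the imports in the later examples). timer is not shown anywhere on this page, so the following is only a minimal sketch of the context manager it is assumed to be:

from contextlib import contextmanager
import time

@contextmanager
def timer(name):
    # Assumed shape of the timer helper used in main() above.
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(name, time.time() - t0))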
Example #2
import lightgbm
import numpy as np

from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold, StratifiedKFold

from Preprocessing_326 import get_df
from Learning_lgbm_326 import get_folds
from Utils import NUM_FOLDS, EXCLUDED_FEATURES, read_pickles, line_notify

# References:
# https://github.com/fmfn/BayesianOptimization
# https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code

NUM_ROWS = None

TRAIN_DF = read_pickles('../output/train_df_agg')

# split test & train
#TRAIN_DF = DF[~DF['IS_TEST']]
FEATS = [
    f for f in TRAIN_DF.columns
    if f not in EXCLUDED_FEATURES + ['totals.transactionRevenue_SUM']
]

lgbm_train = lightgbm.Dataset(TRAIN_DF[FEATS],
                              np.log1p(
                                  TRAIN_DF['totals.transactionRevenue_SUM']),
                              free_raw_data=False)


def lgbm_eval(num_leaves, colsample_bytree, subsample, max_depth, reg_alpha,
              reg_lambda, min_split_gain, min_child_weight, min_data_in_leaf):
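    # NOTE: the original body of lgbm_eval was cut off in extraction; the
    # signature above is completed from the hyperparameters tuned elsewhere
    # on this page. What follows is only a minimal sketch of a
    # Bayesian-optimization objective over the lgbm_train Dataset defined
    # above, not the author's exact code.
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.02,
        'num_leaves': int(num_leaves),
        'colsample_bytree': colsample_bytree,
        'subsample': subsample,
        'max_depth': int(max_depth),
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'min_split_gain': min_split_gain,
        'min_child_weight': min_child_weight,
        'min_data_in_leaf': int(min_data_in_leaf),
        'verbose': -1,
    }
    cv_result = lightgbm.cv(params,
                            lgbm_train,
                            num_boost_round=10000,
                            nfold=NUM_FOLDS,
                            stratified=False,
                            early_stopping_rounds=200,
                            seed=47)
    # BayesianOptimization maximizes, so return the negated best CV RMSE
    return -min(cv_result['rmse-mean'])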
Example #3
import gc

import numpy as np
import pandas as pd
import xgboost as xgb

# rmse, display_importances, get_folds, read_pickles, to_pickles, line_notify
# and submit come from the project's shared modules (see the sketches below).


def kfold_xgboost(df, num_folds, stratified=False, debug=False, use_pkl=False):

    # Split into train and test data
    train_df = df[~df['IS_TEST']]
    test_df = df[df['IS_TEST']]

    print("Starting XGBoost. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    ############################################################################
    # Session Level predictions
    ############################################################################

    print('Starting Session Level predictions...')

    # Cross validation model
    folds_session = get_folds(df=train_df, n_splits=num_folds)

    # Create arrays and dataframes to store results
    oof_preds_session = np.zeros(train_df.shape[0])
    sub_preds_session = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [
        f for f in train_df.columns
        if f not in EXCLUDED_FEATURES + ['totals.transactionRevenue']
    ]

    # Build a DMatrix of the test df up front for the final predictions
    test_df_dmtrx = xgb.DMatrix(test_df[feats],
                                label=test_df['totals.transactionRevenue'])

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds_session):
        train_x, train_y = train_df[feats].iloc[train_idx], np.log1p(
            train_df['totals.transactionRevenue'].iloc[train_idx])
        valid_x, valid_y = train_df[feats].iloc[valid_idx], np.log1p(
            train_df['totals.transactionRevenue'].iloc[valid_idx])

        # set data structure
        xgb_train = xgb.DMatrix(train_x, label=train_y)
        xgb_test = xgb.DMatrix(valid_x, label=valid_y)

        # params
        params = {
            'objective': 'gpu:reg:linear',  # GPU parameter
            'booster': 'gbtree',
            'eval_metric': 'rmse',
            'silent': 1,
            'eta': 0.01,
            'max_depth': 6,
            'min_child_weight': 19,
            'gamma': 0.479411416192221,
            'subsample': 0.976329169063721,
            'colsample_bytree': 0.921410871323335,
            'colsample_bylevel': 0.603858358771505,
            'alpha': 9.86942860885701,
            'lambda': 9.63581598065735,
            'tree_method': 'gpu_hist',  # GPU parameter
            'predictor': 'gpu_predictor',  # GPU parameter
            'seed': int(2**n_fold)
        }

        reg = xgb.train(params,
                        xgb_train,
                        num_boost_round=10000,
                        evals=[(xgb_train, 'train'), (xgb_test, 'test')],
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model('../output/models/xgb_session_' + str(n_fold) + '.txt')

        oof_preds_session[valid_idx] = np.expm1(reg.predict(xgb_test))
        sub_preds_session += np.expm1(reg.predict(test_df_dmtrx)) / num_folds

        fold_importance_df = pd.DataFrame.from_dict(
            reg.get_score(importance_type='gain'),
            orient='index',
            columns=['importance'])
        fold_importance_df["feature"] = fold_importance_df.index.tolist()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' %
              (n_fold + 1, rmse(valid_y, np.log1p(
                  oof_preds_session[valid_idx]))))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    del test_df_dmtrx
    gc.collect()

    # Print the full RMSE score and send a LINE notification
    full_rmse_session = rmse(np.log1p(train_df['totals.transactionRevenue']),
                             np.log1p(oof_preds_session))
    line_notify('XGBoost Session Level Full RMSE score %.6f' %
                full_rmse_session)

    # session level feature importance
    display_importances(feature_importance_df,
                        '../output/xgb_importances_session.png',
                        '../output/feature_importance_xgb_session.csv')

    # Save the predictions
    train_df.loc[:, 'predictions'] = oof_preds_session
    test_df.loc[:, 'predictions'] = sub_preds_session

    del oof_preds_session, sub_preds_session
    gc.collect()

    # Save as CSV
    train_df['predictions'].to_csv("../output/oof_xgb_session.csv")
    test_df['predictions'].to_csv("../output/sub_xgb_session.csv")

    ############################################################################
    # User Level predictions
    ############################################################################

    print('Starting User Level predictions...')

    if use_pkl:

        del train_df, test_df
        gc.collect()

        # load pkl
        train_df_agg = read_pickles('../output/train_df_agg_xgb')
        test_df_agg = read_pickles('../output/test_df_agg_xgb')
    else:
        # Aggregate data at User level
        aggregations = {'totals.transactionRevenue': ['sum']}
        for col in feats + ['predictions']:
            aggregations[col] = ['sum', 'max', 'min', 'mean']

        train_df_agg = train_df[
            feats +
            ['fullVisitorId', 'totals.transactionRevenue', 'predictions'
             ]].groupby('fullVisitorId').agg(aggregations)
        del train_df
        gc.collect()

        test_df_agg = test_df[
            feats +
            ['fullVisitorId', 'totals.transactionRevenue', 'predictions'
             ]].groupby('fullVisitorId').agg(aggregations)
        del test_df
        gc.collect()

        # flatten the aggregated MultiIndex column names
        train_df_agg.columns = pd.Index(
            [e[0] + "_" + e[1].upper() for e in train_df_agg.columns.tolist()])
        test_df_agg.columns = pd.Index(
            [e[0] + "_" + e[1].upper() for e in test_df_agg.columns.tolist()])

        # to float32
        train_df_agg = train_df_agg.astype('float32')
        test_df_agg = test_df_agg.astype('float32')

        # save pkl
        to_pickles(train_df_agg,
                   '../output/train_df_agg_xgb',
                   split_size=50,
                   inplace=False)
        to_pickles(test_df_agg,
                   '../output/test_df_agg_xgb',
                   split_size=5,
                   inplace=False)

    # Cross validation model
    folds_agg = get_folds(df=train_df_agg[['totals.pageviews_MEAN'
                                           ]].reset_index(),
                          n_splits=num_folds)

    # Create arrays and dataframes to store results
    oof_preds_agg = np.zeros(train_df_agg.shape[0])
    sub_preds_agg = np.zeros(test_df_agg.shape[0])
    feature_importance_df_agg = pd.DataFrame()
    feats_agg = [
        f for f in train_df_agg.columns
        if f not in EXCLUDED_FEATURES + ['totals.transactionRevenue_SUM']
    ]

    # Keep the index for building the submission file
    test_df_agg_index = test_df_agg.index

    # Build a DMatrix of the aggregated test df up front for the final predictions
    test_df_agg_dmtrx = xgb.DMatrix(
        test_df_agg[feats_agg],
        label=test_df_agg['totals.transactionRevenue_SUM'])

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds_agg):
        train_x, train_y = train_df_agg[feats_agg].iloc[train_idx], np.log1p(
            train_df_agg['totals.transactionRevenue_SUM'].iloc[train_idx])
        valid_x, valid_y = train_df_agg[feats_agg].iloc[valid_idx], np.log1p(
            train_df_agg['totals.transactionRevenue_SUM'].iloc[valid_idx])

        # set data structure
        xgb_train = xgb.DMatrix(train_x, label=train_y)
        xgb_test = xgb.DMatrix(valid_x, label=valid_y)

        # Grid search isn't feasible here, so we reuse the params tuned for LightGBM
        params = {
            'objective': 'gpu:reg:linear',  # GPU parameter
            'booster': 'gbtree',
            'eval_metric': 'rmse',
            'silent': 1,
            'eta': 0.01,
            'max_depth': 7,
            'min_child_weight': 0.14207610657307,
            'gamma': 0.46299516643071,
            'subsample': 0.740095188787127,
            'colsample_bytree': 0.698723156053225,
            'colsample_bylevel': 0.306359150497576,
            'alpha': 14.3019796761524,
            'lambda': 9.48248448679231,
            'tree_method': 'gpu_hist',  # GPU parameter
            'predictor': 'gpu_predictor',  # GPU parameter
            'seed': int(2**n_fold)
        }

        reg = xgb.train(params,
                        xgb_train,
                        num_boost_round=10000,
                        evals=[(xgb_train, 'train'), (xgb_test, 'test')],
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model('../output/models/xgb_user_' + str(n_fold) + '.txt')

        oof_preds_agg[valid_idx] = np.expm1(reg.predict(xgb_test))
        sub_preds_agg += np.expm1(reg.predict(test_df_agg_dmtrx)) / num_folds

        fold_importance_df = pd.DataFrame.from_dict(
            reg.get_score(importance_type='gain'),
            orient='index',
            columns=['importance'])
        fold_importance_df["feature"] = fold_importance_df.index.tolist()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df_agg = pd.concat(
            [feature_importance_df_agg, fold_importance_df], axis=0)

        print('Fold %2d RMSE : %.6f' %
              (n_fold + 1, rmse(valid_y, np.log1p(oof_preds_agg[valid_idx]))))
        del reg, train_x, train_y, valid_x, valid_y, fold_importance_df
        gc.collect()

    # Print the full RMSE score and send a LINE notification
    full_rmse_agg = rmse(
        np.log1p(train_df_agg['totals.transactionRevenue_SUM']),
        np.log1p(oof_preds_agg))
    line_notify('Visitor Level Full RMSE score %.6f' % full_rmse_agg)

    # user level feature importance
    display_importances(feature_importance_df_agg,
                        '../output/xgb_importances_agg.png',
                        '../output/feature_importance_xgb_agg.csv')

    if not debug:
        # Save the predicted values for the submission data
        submission = pd.DataFrame()
        submission['PredictedLogRevenue'] = sub_preds_agg
        submission.index = test_df_agg_index
        submission['PredictedLogRevenue'] = np.log1p(
            submission['PredictedLogRevenue'])
        submission['PredictedLogRevenue'] = submission[
            'PredictedLogRevenue'].apply(lambda x: 0.0 if x < 0 else x)
        submission['PredictedLogRevenue'] = submission[
            'PredictedLogRevenue'].fillna(0)
        submission.to_csv(submission_file_name, index=True)

        # Save the out-of-fold predictions
        train_df_agg['OOF_PRED'] = oof_preds_agg
        train_df_agg[['OOF_PRED',
                      'totals.transactionRevenue_SUM']].to_csv(oof_file_name,
                                                               index=True)

        # Submit via the API
        submit(submission_file_name, comment='cv: %.6f' % full_rmse_agg)
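Both kfold functions on this page call rmse and display_importances without defining them; they presumably live in the repo's shared modules. A minimal sketch of what they are assumed to do:

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import mean_squared_error


def rmse(y_true, y_pred):
    # Root mean squared error on the (already log1p-transformed) targets.
    return np.sqrt(mean_squared_error(y_true, y_pred))


def display_importances(feature_importance_df, png_path, csv_path):
    # Average the per-fold gains, save the ranking as CSV and a bar plot.
    mean_gain = (feature_importance_df
                 .groupby('feature')['importance']
                 .mean()
                 .sort_values(ascending=False))
    mean_gain.to_csv(csv_path)
    plt.figure(figsize=(8, 10))
    sns.barplot(x=mean_gain.head(40).values, y=mean_gain.head(40).index)
    plt.title('Feature importance (mean gain over folds)')
    plt.tight_layout()
    plt.savefig(png_path)
    plt.close()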
Example #4
import lightgbm
import numpy as np

from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold, StratifiedKFold

from Preprocessing_326 import get_df
from Learning_lgbm_326 import get_folds
from Utils import NUM_FOLDS, EXCLUDED_FEATURES, read_pickles, line_notify

# References:
# https://github.com/fmfn/BayesianOptimization
# https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code

NUM_ROWS = None

#DF = get_df(NUM_ROWS)
DF = read_pickles('../output/df')
#DF = read_pickles('../output/train_df_agg')

# split test & train
TRAIN_DF = DF[~DF['IS_TEST']]
FEATS = [
    f for f in TRAIN_DF.columns
    if f not in EXCLUDED_FEATURES + ['totals.transactionRevenue']
]

lgbm_train = lightgbm.Dataset(TRAIN_DF[FEATS],
                              np.log1p(TRAIN_DF['totals.transactionRevenue']),
                              free_raw_data=False)

folds = get_folds(df=TRAIN_DF, n_splits=NUM_FOLDS)
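Example #4 stops after preparing the Dataset and the folds. Assuming an lgbm_eval objective like the one sketched in Example #2, the optimization loop would be driven roughly like this (the parameter bounds are illustrative, not taken from the source):

lgbm_bo = BayesianOptimization(lgbm_eval, {
    'num_leaves': (16, 128),
    'colsample_bytree': (0.1, 1.0),
    'subsample': (0.1, 1.0),
    'max_depth': (4, 12),
    'reg_alpha': (0.0, 15.0),
    'reg_lambda': (0.0, 15.0),
    'min_split_gain': (0.0, 1.0),
    'min_child_weight': (1, 50),
    'min_data_in_leaf': (5, 60),
})
lgbm_bo.maximize(init_points=15, n_iter=25)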
Example #5
import gc

import lightgbm as lgb
import numpy as np
import pandas as pd


def kfold_lightgbm(df, num_folds, stratified=False, debug=False, use_pkl=False):

    # Split into train and test data
    train_df = df[~df['IS_TEST']]
    test_df = df[df['IS_TEST']]

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    ############################################################################
    # Session Level predictions
    ############################################################################

    print('Starting Session Level predictions...')

    # Cross validation model
    folds_session = get_folds(df=train_df, n_splits=num_folds)

    # Create arrays and dataframes to store results
    oof_preds_session = np.zeros(train_df.shape[0])
    sub_preds_session = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in EXCLUDED_FEATURES+['totals.transactionRevenue']]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds_session):
        train_x, train_y = train_df[feats].iloc[train_idx], np.log1p(train_df['totals.transactionRevenue'].iloc[train_idx])
        valid_x, valid_y = train_df[feats].iloc[valid_idx], np.log1p(train_df['totals.transactionRevenue'].iloc[valid_idx])

        # set data structure
        lgb_train = lgb.Dataset(train_x,
                                label=train_y,
                                free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,
                               label=valid_y,
                               free_raw_data=False)

        # These parameters are rough guesses (not tuned)
        params = {
                'device' : 'gpu',
#                'gpu_use_dp':True,
                'task': 'train',
                'boosting': 'gbdt',
                'objective': 'regression',
                'metric': 'rmse',
                'learning_rate': 0.01,
                'num_leaves': 64,
                'colsample_bytree': 0.769143040610826,
                'subsample': 0.295302403483027,
                'max_depth': 8,
                'reg_alpha': 9.37961252311552,
                'reg_lambda': 2.82500347706399,
                'min_split_gain': 0.153268455490808,
                'min_child_weight': 44,
                'min_data_in_leaf': 45,
                'verbose': -1,
                'seed': int(2**n_fold),
                'bagging_seed': int(2**n_fold),
                'drop_seed': int(2**n_fold)
                }

        reg = lgb.train(
                        params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100
                        )

        # save model
        reg.save_model('../output/models/lgbm_session_'+str(n_fold)+'.txt')

        oof_preds_session[valid_idx] = np.expm1(reg.predict(valid_x, num_iteration=reg.best_iteration))
        sub_preds_session += np.expm1(reg.predict(test_df[feats], num_iteration=reg.best_iteration)) / num_folds

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, np.log1p(oof_preds_session[valid_idx]))))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Print the full RMSE score and send a LINE notification
    full_rmse_session = rmse(np.log1p(train_df['totals.transactionRevenue']), np.log1p(oof_preds_session))
    line_notify('Session Level Full RMSE score %.6f' % full_rmse_session)

    # session level feature importance
    display_importances(feature_importance_df,
                        '../output/lgbm_importances_session.png',
                        '../output/feature_importance_lgbm_session.csv')

    # Save the predictions
    train_df.loc[:, 'predictions'] = oof_preds_session
    test_df.loc[:, 'predictions'] = sub_preds_session

    del oof_preds_session, sub_preds_session
    gc.collect()

    # Save as CSV
    train_df['predictions'].to_csv("../output/oof_lgbm_session.csv")
    test_df['predictions'].to_csv("../output/sub_lgbm_session.csv")

    ############################################################################
    # User Level predictions
    ############################################################################

    print('Starting User Level predictions...')

    if use_pkl:

        del train_df, test_df
        gc.collect()

        # load pkl
        train_df_agg = read_pickles('../output/train_df_agg')
        test_df_agg = read_pickles('../output/test_df_agg')
    else:
        # Aggregate data at User level
        aggregations = {'totals.transactionRevenue': ['sum']}
        for col in feats+['predictions']:
            aggregations[col] = ['sum', 'max', 'min', 'mean']

        train_df_agg = train_df[feats+['fullVisitorId','totals.transactionRevenue', 'predictions']].groupby('fullVisitorId').agg(aggregations)
        del train_df
        gc.collect()

        test_df_agg = test_df[feats + ['fullVisitorId','totals.transactionRevenue', 'predictions']].groupby('fullVisitorId').agg(aggregations)
        del test_df
        gc.collect()

        # flatten the aggregated MultiIndex column names
        train_df_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in train_df_agg.columns.tolist()])
        test_df_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in test_df_agg.columns.tolist()])

        # to float32
        train_df_agg = train_df_agg.astype('float32')
        test_df_agg = test_df_agg.astype('float32')

        # save pkl
        to_pickles(train_df_agg, '../output/train_df_agg', split_size=50, inplace=False)
        to_pickles(test_df_agg, '../output/test_df_agg', split_size=5, inplace=False)

    # Cross validation model
    folds_agg = get_folds(df=train_df_agg[['totals.pageviews_MEAN']].reset_index(), n_splits=num_folds)

    # Create arrays and dataframes to store results
    oof_preds_agg = np.zeros(train_df_agg.shape[0])
    sub_preds_agg = np.zeros(test_df_agg.shape[0])
    feature_importance_df_agg = pd.DataFrame()
    feats_agg = [f for f in train_df_agg.columns if f not in EXCLUDED_FEATURES+['totals.transactionRevenue_SUM']]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds_agg):
        train_x, train_y = train_df_agg[feats_agg].iloc[train_idx], np.log1p(train_df_agg['totals.transactionRevenue_SUM'].iloc[train_idx])
        valid_x, valid_y = train_df_agg[feats_agg].iloc[valid_idx], np.log1p(train_df_agg['totals.transactionRevenue_SUM'].iloc[valid_idx])

        # set data structure
        lgb_train = lgb.Dataset(train_x,
                                label=train_y,
                                free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,
                               label=valid_y,
                               free_raw_data=False)

        # params estimated by bayesian opt
        params = {
                'device' : 'gpu',
#                'gpu_use_dp':True,
                'task': 'train',
                'boosting': 'gbdt',
                'objective': 'regression',
                'metric': 'rmse',
                'learning_rate': 0.01,
                'num_leaves': 36,
                'colsample_bytree': 0.174047605805866,
                'subsample': 0.702214902667035,
                'max_depth': 8,
                'reg_alpha': 9.91242460129322,
                'reg_lambda': 0.357672819483952,
                'min_split_gain': 0.631115489088361,
                'min_child_weight': 15,
                'min_data_in_leaf': 9,
                'verbose': -1,
                'seed': int(2**n_fold),
                'bagging_seed': int(2**n_fold),
                'drop_seed': int(2**n_fold)
                }

        reg = lgb.train(
                        params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100
                        )

        # save model
        reg.save_model('../output/models/lgbm_user_'+str(n_fold)+'.txt')

        oof_preds_agg[valid_idx] = np.expm1(reg.predict(valid_x, num_iteration=reg.best_iteration))
        sub_preds_agg += np.expm1(reg.predict(test_df_agg[feats_agg], num_iteration=reg.best_iteration)) / num_folds

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats_agg
        fold_importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df_agg = pd.concat([feature_importance_df_agg, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, np.log1p(oof_preds_agg[valid_idx]))))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Print the full RMSE score and send a LINE notification
    full_rmse_agg = rmse(np.log1p(train_df_agg['totals.transactionRevenue_SUM']), np.log1p(oof_preds_agg))
    line_notify('Visitor Level Full RMSE score %.6f' % full_rmse_agg)

    # user level feature importance
    display_importances(feature_importance_df_agg,
                        '../output/lgbm_importances_agg.png',
                        '../output/feature_importance_lgbm_agg.csv')

    if not debug:
        # Save the predicted values for the submission data
        test_df_agg.loc[:, 'PredictedLogRevenue'] = sub_preds_agg
        submission = test_df_agg[['PredictedLogRevenue']].copy()
        submission['PredictedLogRevenue'] = np.log1p(submission['PredictedLogRevenue'])
        submission['PredictedLogRevenue'] = submission['PredictedLogRevenue'].apply(lambda x: 0.0 if x < 0 else x)
        submission['PredictedLogRevenue'] = submission['PredictedLogRevenue'].fillna(0)
        submission.to_csv(submission_file_name, index=True)

        # Save the out-of-fold predictions
        train_df_agg['OOF_PRED'] = oof_preds_agg
        train_df_agg[['OOF_PRED', 'totals.transactionRevenue_SUM']].to_csv(oof_file_name, index=True)

        # Submit via the API
        submit(submission_file_name, comment='cv: %.6f' % full_rmse_agg)
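All of the training functions above rely on get_folds from Learning_lgbm_326, which is not shown on this page. In the GA-competition kernels this is typically a visitor-grouped split, so that no fullVisitorId appears in both train and validation; a minimal sketch under that assumption:

from sklearn.model_selection import GroupKFold


def get_folds(df, n_splits=5):
    # Group-aware CV: keep all sessions of one fullVisitorId in a single fold.
    # Assumes df carries a 'fullVisitorId' column (hence the reset_index()
    # before the user-level calls above).
    gkf = GroupKFold(n_splits=n_splits)
    return list(gkf.split(df, groups=df['fullVisitorId']))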