def main():
    # load submission files
    print('load files...')
    sub_weekday = pd.read_csv('../output/submission_lgbm_weekday.csv')
    sub_holiday = pd.read_csv('../output/submission_lgbm_holiday.csv')

    # load oof files
    oof_weekday = pd.read_csv('../output/oof_lgbm_cv_weekday.csv')
    oof_holiday = pd.read_csv('../output/oof_lgbm_cv_holiday.csv')

    # merge (DataFrame.append was removed in pandas 2.0, so use pd.concat)
    sub = pd.concat([sub_weekday, sub_holiday])
    oof = pd.concat([oof_weekday, oof_holiday])

    del sub_weekday, sub_holiday, oof_weekday, oof_holiday
    gc.collect()

    # to pivot
    print('to pivot...')
    sub = sub.pivot(index='id', columns='d', values='demand').reset_index()
    oof = oof.pivot(index='id', columns='d', values='demand').reset_index()

    # split test1 / test2 (copy to avoid SettingWithCopyWarning below)
    sub1 = oof[['id'] + COLS_TEST1].copy()
    sub2 = sub[['id'] + COLS_TEST2].copy()

    # change column names
    sub1.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
    sub2.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]

    # replace test1 id
    sub1['id'] = sub1['id'].str.replace('_evaluation', '_validation')

    # merge
    sub = pd.concat([sub1, sub2])

    # postprocessing
    cols_f = [f'F{i}' for i in range(1, 29)]
    cols_d = [c for c in oof.columns if c.startswith('d_')]

    sub.loc[:, cols_f] = sub[cols_f].where(sub[cols_f] > 0, 0)
    oof.loc[:, cols_d] = oof[cols_d].where(oof[cols_d] > 0, 0)

    # save csv
    sub.to_csv(submission_file_name, index=False)
    oof.to_csv(oof_file_name, index=False)

    # calc out of fold WRMSSE score
    print('calc oof cv scores...')
    scores = calc_score_cv(oof)
    score = np.mean(scores)
    print(f'scores: {scores}')

    # submission by API
    #    submit(submission_file_name, comment='model410 cv: %.6f' % score)

    # LINE notify
    line_notify('{} done. WRMSSE:{}'.format(sys.argv[0], round(score, 6)))
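# `line_notify` is called throughout these snippets but never defined in them.
# A minimal sketch, assuming a LINE Notify personal access token stored in the
# LINE_NOTIFY_TOKEN environment variable; the project's real helper may differ.
# The optional third argument mirrors later call sites such as
# line_notify(msg, img_path, False).
import os
import requests

def line_notify(message, image_path=None, raise_on_error=True):
    """Send a message (and optionally an image) through the LINE Notify API."""
    headers = {'Authorization': 'Bearer ' + os.environ['LINE_NOTIFY_TOKEN']}
    if image_path is not None:
        with open(image_path, 'rb') as f:
            r = requests.post('https://notify-api.line.me/api/notify',
                              headers=headers,
                              data={'message': message},
                              files={'imageFile': f})
    else:
        r = requests.post('https://notify-api.line.me/api/notify',
                          headers=headers,
                          data={'message': message})
    if raise_on_error:
        r.raise_for_status()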
def main():
    # load pkls
    df = read_pickles('../feats/sales_diff')
    df_calendar = loadpkl('../feats/calendar.pkl')
    df_sell_prices = loadpkl('../feats/sell_prices.pkl')

    # merge
    df = df.merge(df_calendar, on='d',how='left')
    df = df.merge(df_sell_prices, on=['store_id','item_id','wm_yr_wk'],how='left')

    del df_calendar, df_sell_prices
    gc.collect()

    # drop pre-release rows
    df = df[df['wm_yr_wk']>=df['release']]

    # make lag features
    df = make_lags(df,28)

    # label encoding (pd.factorize marks NaN as -1; restore those as NaN)
    cols_string = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    for c in cols_string:
        df[c], _ = pd.factorize(df[c])
        df[c] = df[c].replace(-1, np.nan)

    # add price features
    df_grouped = df[['id','sell_price']].groupby('id')['sell_price']
    df['shift_price_t1'] = df_grouped.transform(lambda x: x.shift(1))
    df['price_change_t1'] = (df['shift_price_t1'] - df['sell_price']) / (df['shift_price_t1'])
    df['rolling_price_max_t365'] = df_grouped.transform(lambda x: x.shift(1).rolling(365).max())
    df['price_change_t365'] = (df['rolling_price_max_t365'] - df['sell_price']) / (df['rolling_price_max_t365'])
    df['rolling_price_std_t7'] = df_grouped.transform(lambda x: x.rolling(7).std())
    df['rolling_price_std_t30'] = df_grouped.transform(lambda x: x.rolling(30).std())

    # features release date
    df['release'] = df['release'] - df['release'].min()

    # price momentum by month & year
    df['price_momentum_m'] = df['sell_price']/df.groupby(['store_id','item_id','month'])['sell_price'].transform('mean')
    df['price_momentum_y'] = df['sell_price']/df.groupby(['store_id','item_id','year'])['sell_price'].transform('mean')

    # days for CustomTimeSeriesSplitter
    df['d_numeric'] = df['d'].apply(lambda x: str(x)[2:]).astype(int)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save as feather
    to_feature(df, '../feats/f105')

    # save feature name list
    features_json = {'features':df.columns.tolist()}
    to_json(features_json,'../configs/105_all_features_diff.json')

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
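# `reduce_mem_usage` is a widely shared Kaggle utility; a minimal sketch of the
# usual implementation (the project's version may print different diagnostics).
# It matters here because a later snippet explicitly casts float16 columns back
# to float32 before target encoding.
import numpy as np
import pandas as pd

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype that holds their range."""
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue
        c_min, c_max = df[col].min(), df[col].max()
        if pd.api.types.is_integer_dtype(df[col]):
            candidates, info = (np.int8, np.int16, np.int32, np.int64), np.iinfo
        else:
            candidates, info = (np.float16, np.float32, np.float64), np.finfo
        for t in candidates:
            if c_min >= info(t).min and c_max <= info(t).max:
                df[col] = df[col].astype(t)
                break
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage: {:.2f} MB -> {:.2f} MB'.format(start_mem, end_mem))
    return df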
Example 3
def post_process(lp, img_path):
    if len(lp) == 0:
        msg = "Open Gate: " + AZURE + "\nLP : Not detected"
    else:
        msg = "Open Gate: " + AZURE + "\nLP : " + lp
    try:
        with open(img_path, 'rb') as img:
            r = requests.post(TF_SERVING, files={'media': img})
        if r.status_code == 404:
            line_notify(msg, img_path, False)
        else:
            with open('lp.jpg', 'wb') as f:
                f.write(r.content)
            line_notify(msg, 'lp.jpg', False)
    except requests.exceptions.RequestException:
        # avoid a bare except; only swallow request errors
        print("Error sending request")
Example 4
def main(is_eval=False):
    # load csv
    df = pd.read_csv('../input/sell_prices.csv')

    # release week ref https://www.kaggle.com/kyakovlev/m5-simple-fe
    release_df = df.groupby(['store_id', 'item_id'
                             ])['wm_yr_wk'].agg(['min']).reset_index()
    release_df.columns = ['store_id', 'item_id', 'release']

    # merge release week
    df = df.merge(release_df, on=['store_id', 'item_id'], how='left')

    # days from release
    df['days_from_release'] = df['wm_yr_wk'] - df['release']

    # basic aggregations
    df['price_max'] = df.groupby(['store_id',
                                  'item_id'])['sell_price'].transform('max')
    df['price_min'] = df.groupby(['store_id',
                                  'item_id'])['sell_price'].transform('min')
    df['price_std'] = df.groupby(['store_id',
                                  'item_id'])['sell_price'].transform('std')
    df['price_mean'] = df.groupby(['store_id',
                                   'item_id'])['sell_price'].transform('mean')

    # normalized price
    df['price_norm'] = df['sell_price'] / df['price_max']

    # label encoding
    df['price_nunique'] = df.groupby(['store_id', 'item_id'
                                      ])['sell_price'].transform('nunique')
    df['item_nunique'] = df.groupby(['store_id', 'sell_price'
                                     ])['item_id'].transform('nunique')

    # momentum
    df['price_momentum'] = df['sell_price'] / df.groupby(
        ['store_id', 'item_id'])['sell_price'].transform(lambda x: x.shift(1))

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save pkl
    save2pkl('../feats/sell_prices.pkl', df)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
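# `save2pkl` and `loadpkl` are thin pickle wrappers used across these snippets;
# a minimal sketch (note the path-first argument order used by the call sites).
import pickle

def save2pkl(path, obj):
    """Pickle `obj` to `path`."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def loadpkl(path):
    """Load a pickled object from `path`."""
    with open(path, 'rb') as f:
        return pickle.load(f)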
Example 5
def main():
    # load submission files
    sub = pd.read_csv("../input/sample_submit.tsv", sep='\t', header=None)
    sub_lgbm = pd.read_csv("../output/submission_lgbm.tsv",
                           sep='\t',
                           header=None)
    sub_xgb = pd.read_csv("../output/submission_xgb.tsv",
                          sep='\t',
                          header=None)

    # rename columns
    sub.columns = ['index', 'visitors']
    sub_lgbm.columns = ['index', 'visitors']
    sub_xgb.columns = ['index', 'visitors']

    # merge
    sub.loc[:,
            'visitors'] = 0.5 * sub_lgbm['visitors'] + 0.5 * sub_xgb['visitors']

    del sub_lgbm, sub_xgb
    gc.collect()

    # load out-of-fold predictions
    oof_lgbm = pd.read_csv("../output/oof_lgbm.csv")
    oof_xgb = pd.read_csv("../output/oof_xgb.csv")
    oof_preds = 0.5 * oof_lgbm['OOF_PRED'] + 0.5 * oof_xgb['OOF_PRED']

    # load train_df
    train_df = loadpkl('../output/train_df.pkl')
    train_df = train_df.sort_values('index')

    # calculate local CV score
    local_mae = mean_absolute_error(train_df['visitors'], oof_preds)

    # LINE notify
    line_notify('Blend Local MAE score %.6f' % local_mae)

    del oof_lgbm, oof_xgb
    gc.collect()

    # save submit file
    sub[['index',
         'visitors']].sort_values('index').to_csv(submission_file_name,
                                                  index=False,
                                                  header=False,
                                                  sep='\t')
Example 6
def main(num_rows=None):
    # load csv & pkl
    profiles = pd.read_csv('../input/data_set_phase2/profiles.csv')

    # change columns name
    profiles.columns = ['pid']+['profile_{}'.format(i) for i in range(0,66)]

    # feature engineering
    feats = [f for f in profiles.columns.to_list() if f not in ['pid']]

    profiles['profile_sum'] = profiles[feats].sum(axis=1)
    profiles['profile_mean'] = profiles[feats].mean(axis=1)
    profiles['profile_std'] = profiles[feats].std(axis=1)

    profiles['profile_sum_count'] = profiles['profile_sum'].map(profiles['profile_sum'].value_counts())

    # svd features
    svd = TruncatedSVD(n_components=20, n_iter=20, random_state=326)
    svd_x = svd.fit_transform(profiles[feats].values)
    svd_x = pd.DataFrame(svd_x)
    svd_x.columns = ['profile_svd_{}'.format(i) for i in range(20)]
    svd_x['pid'] = profiles['pid']

    # merge
    profiles = profiles.merge(svd_x, on='pid', how='left')

    # NMF features
    nmf = NMF(n_components=20, init='random', random_state=326)
    nmf_x = nmf.fit_transform(profiles[feats].values)
    nmf_x = pd.DataFrame(nmf_x)
    nmf_x.columns = ['profile_nmf_{}'.format(i) for i in range(20)]
    nmf_x['pid'] = profiles['pid']

    # merge
    profiles = profiles.merge(nmf_x, on='pid', how='left')

    # k-means clustering
    kmeans_model = KMeans(n_clusters=10, random_state=326)
    kmeans_model.fit(profiles[feats].values)
    profiles['profile_k_means'] = kmeans_model.labels_

    # save as pkl
    save2pkl('../features/profiles.pkl', profiles)

    line_notify('{} finished.'.format(sys.argv[0]))
def main():
    # reg for bayesian optimization
    reg_bo = BayesianOptimization(
        xgb_eval, {
            'gamma': (0, 1),
            'max_depth': (3, 8),
            'min_child_weight': (0, 45),
            'subsample': (0.001, 1),
            'colsample_bytree': (0.001, 1),
            'colsample_bylevel': (0.001, 1),
            'alpha': (9, 20),
            '_lambda': (0, 10)
        })

    reg_bo.maximize(init_points=15, n_iter=25)

    res = pd.DataFrame(reg_bo.res['max']['max_params'], index=['max_params'])

    res.to_csv('../output/max_params_xgb.csv')

    line_notify('Bayes Opt XGBoost finished.')
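# `xgb_eval` is the black-box function the optimizer above maximizes; it is not
# defined in these snippets. A minimal sketch, assuming a prepared xgb.DMatrix
# named DTRAIN (hypothetical) and an RMSE regression objective; the real
# evaluator may differ. The bounds match the search space above, and `_lambda`
# is renamed because `lambda` is a Python keyword.
import gc
import xgboost as xgb

def xgb_eval(gamma, max_depth, min_child_weight, subsample,
             colsample_bytree, colsample_bylevel, alpha, _lambda):
    params = {
        'objective': 'reg:squarederror',
        'eta': 0.05,
        'gamma': gamma,
        'max_depth': int(max_depth),          # the optimizer proposes floats
        'min_child_weight': min_child_weight,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'colsample_bylevel': colsample_bylevel,
        'alpha': alpha,
        'lambda': _lambda,
        'seed': 326,
    }
    cv = xgb.cv(params, DTRAIN, num_boost_round=10000, nfold=5,
                metrics='rmse', early_stopping_rounds=200, seed=326)
    gc.collect()
    # BayesianOptimization maximizes, so return the negative CV RMSE
    return -cv['test-rmse-mean'].iloc[-1]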
Example 8
def main():

    # clf for bayesian optimization
    clf_bo = BayesianOptimization(
        lgbm_eval, {
            'num_leaves': (16, 64),
            'colsample_bytree': (0.001, 1),
            'subsample': (0.001, 1),
            'max_depth': (8, 16),
            'reg_alpha': (0, 10),
            'reg_lambda': (0, 10),
            'min_split_gain': (0, 1),
            'min_child_weight': (0, 45),
            'min_data_in_leaf': (0, 500),
        })

    clf_bo.maximize(init_points=15, n_iter=25)

    res = pd.DataFrame(clf_bo.res['max']['max_params'], index=['max_params'])

    res.to_csv('../output/max_params_lgbm.csv')

    line_notify('Bayes Opt LightGBM finished.')
def main():
    # load submission files
    print('load files...')
    sub = pd.read_csv(submission_file_name)

    # load out of fold files
    oof = pd.read_csv(oof_file_name)

    # to pivot
    print('to pivot...')
    oof = oof.pivot(index='id', columns='d', values='demand').reset_index()

    # fill na
    oof.fillna(0, inplace=True)

    # postprocessing
    cols_f = [f'F{i}' for i in range(1, 29)]
    cols_d = [c for c in oof.columns if c.startswith('d_')]
    sub.loc[:, cols_f] = sub[cols_f].where(sub[cols_f] > 0, 0)
    oof.loc[:, cols_d] = oof[cols_d].where(oof[cols_d] > 0, 0)

    # save csv
    sub.to_csv(submission_file_name, index=False)
    oof.to_csv(oof_file_name_pivot, index=False)

    # calc out of fold WRMSSE score
    print('calc oof cv scores...')
    scores = calc_score_cv(oof)
    score = np.mean(scores)
    print(f'scores: {scores}')

    # submission by API
    #    submit(submission_file_name, comment='model401 cv: %.6f' % score)

    # LINE notify
    line_notify('{} done. WRMSSE:{}'.format(sys.argv[0], round(score, 6)))
Example 10
def main(num_rows=None):
    # load csv
    train_queries = pd.read_csv(
        '../input/data_set_phase2/train_queries_phase2.csv', nrows=num_rows)
    test_queries = pd.read_csv('../input/data_set_phase2/test_queries.csv',
                               nrows=num_rows)
    train_clicks = pd.read_csv(
        '../input/data_set_phase2/train_clicks_phase2.csv')

    # phase 1 csv
    train_queries1 = pd.read_csv(
        '../input/data_set_phase2/train_queries_phase1.csv')
    train_clicks1 = pd.read_csv(
        '../input/data_set_phase2/train_clicks_phase1.csv')

    # merge click
    train_queries = pd.merge(train_queries,
                             train_clicks[['sid', 'click_mode']],
                             on='sid',
                             how='left')
    train_queries1 = pd.merge(train_queries1,
                              train_clicks1[['sid', 'click_mode']],
                              on='sid',
                              how='left')

    # merge phase 1 data
    train_queries = pd.concat([train_queries1, train_queries], ignore_index=True)

    # fill na (no click)
    train_queries['click_mode'].fillna(0, inplace=True)

    # set test target as nan
    test_queries['click_mode'] = np.nan

    # merge train & test
    queries_df = pd.concat([train_queries, test_queries], ignore_index=True)

    del train_queries, test_queries, train_queries1, train_clicks, train_clicks1
    gc.collect()

    # to datetime
    queries_df['req_time'] = pd.to_datetime(queries_df['req_time'])

    # distance features
    queries_df['x_o'] = queries_df['o'].apply(
        lambda x: x.split(',')[0]).astype(float)
    queries_df['y_o'] = queries_df['o'].apply(
        lambda x: x.split(',')[1]).astype(float)
    queries_df['x_d'] = queries_df['d'].apply(
        lambda x: x.split(',')[0]).astype(float)
    queries_df['y_d'] = queries_df['d'].apply(
        lambda x: x.split(',')[1]).astype(float)

    # count features
    queries_df['queries_o_count'] = queries_df['o'].map(
        queries_df['o'].value_counts())
    queries_df['queries_d_count'] = queries_df['d'].map(
        queries_df['d'].value_counts())

    queries_df['queries_x_o_count'] = queries_df['x_o'].map(
        queries_df['x_o'].value_counts())
    queries_df['queries_y_o_count'] = queries_df['y_o'].map(
        queries_df['y_o'].value_counts())
    queries_df['queries_x_d_count'] = queries_df['x_d'].map(
        queries_df['x_d'].value_counts())
    queries_df['queries_y_d_count'] = queries_df['y_d'].map(
        queries_df['y_d'].value_counts())

    queries_df['queries_distance'] = np.sqrt(
        (queries_df['x_o'] - queries_df['x_d'])**2 +
        (queries_df['y_o'] - queries_df['y_d'])**2)

    queries_df['o_d'] = queries_df['o'].astype(
        str) + '_' + queries_df['d'].astype(str)
    queries_df['queries_o_d_count'] = queries_df['o_d'].map(
        queries_df['o_d'].value_counts())

    # datetime features
    queries_df['queries_weekday'] = queries_df['req_time'].dt.weekday
    queries_df['queries_hour'] = queries_df['req_time'].dt.hour
    queries_df['queries_is_holiday'] = queries_df['req_time'].apply(
        lambda x: is_holiday(x)).astype(int)

    queries_df['queries_weekday_count'] = queries_df['queries_weekday'].map(
        queries_df['queries_weekday'].value_counts())
    queries_df['queries_hour_count'] = queries_df['queries_hour'].map(
        queries_df['queries_hour'].value_counts())

    # coordinate & datetime features
    queries_df['o_d_is_holiday'] = queries_df['queries_is_holiday'].astype(
        str) + '_' + queries_df['o_d']
    queries_df['o_d_weekday'] = queries_df['queries_weekday'].astype(
        str) + '_' + queries_df['o_d']
    queries_df['o_d_hour'] = queries_df['queries_hour'].astype(
        str) + '_' + queries_df['o_d']

    queries_df['o_is_holiday'] = queries_df['queries_is_holiday'].astype(
        str) + '_' + queries_df['o']
    queries_df['o_weekday'] = queries_df['queries_weekday'].astype(
        str) + '_' + queries_df['o']
    queries_df['o_hour'] = queries_df['queries_hour'].astype(
        str) + '_' + queries_df['o']

    queries_df['d_is_holiday'] = queries_df['queries_is_holiday'].astype(
        str) + '_' + queries_df['d']
    queries_df['d_weekday'] = queries_df['queries_weekday'].astype(
        str) + '_' + queries_df['d']
    queries_df['d_hour'] = queries_df['queries_hour'].astype(
        str) + '_' + queries_df['d']

    queries_df['queries_o_d_is_holiday_count'] = queries_df[
        'o_d_is_holiday'].map(queries_df['o_d_is_holiday'].value_counts())
    queries_df['queries_o_d_weekday_count'] = queries_df['o_d_weekday'].map(
        queries_df['o_d_weekday'].value_counts())
    queries_df['queries_o_d_hour_count'] = queries_df['o_d_hour'].map(
        queries_df['o_d_hour'].value_counts())

    queries_df['queries_o_is_holiday_count'] = queries_df['o_is_holiday'].map(
        queries_df['o_is_holiday'].value_counts())
    queries_df['queries_o_weekday_count'] = queries_df['o_weekday'].map(
        queries_df['o_weekday'].value_counts())
    queries_df['queries_o_hour_count'] = queries_df['o_hour'].map(
        queries_df['o_hour'].value_counts())

    queries_df['queries_d_is_holiday_count'] = queries_df['d_is_holiday'].map(
        queries_df['d_is_holiday'].value_counts())
    queries_df['queries_d_weekday_count'] = queries_df['d_weekday'].map(
        queries_df['d_weekday'].value_counts())
    queries_df['queries_d_hour_count'] = queries_df['d_hour'].map(
        queries_df['d_hour'].value_counts())

    # rounded value features
    queries_df['x_o_round'] = queries_df['x_o'].round(1)
    queries_df['y_o_round'] = queries_df['y_o'].round(1)
    queries_df['x_d_round'] = queries_df['x_d'].round(1)
    queries_df['y_d_round'] = queries_df['y_d'].round(1)
    queries_df['queries_distance_round'] = queries_df[
        'queries_distance'].round(1)

    queries_df['o_round'] = queries_df['x_o_round'].astype(
        str) + '_' + queries_df['y_o_round'].astype(str)
    queries_df['d_round'] = queries_df['x_d_round'].astype(
        str) + '_' + queries_df['y_d_round'].astype(str)
    queries_df['o_d_round'] = queries_df['o_round'].astype(
        str) + '_' + queries_df['d_round'].astype(str)

    queries_df['queries_x_o_round_count'] = queries_df['x_o_round'].map(
        queries_df['x_o_round'].value_counts())
    queries_df['queries_y_o_round_count'] = queries_df['y_o_round'].map(
        queries_df['y_o_round'].value_counts())
    queries_df['queries_x_d_round_count'] = queries_df['x_d_round'].map(
        queries_df['x_d_round'].value_counts())
    queries_df['queries_y_d_round_count'] = queries_df['y_d_round'].map(
        queries_df['y_d_round'].value_counts())
    queries_df['queries_distance_round_count'] = queries_df[
        'queries_distance_round'].map(
            queries_df['queries_distance_round'].value_counts())
    queries_df['queries_o_round_count'] = queries_df['o_round'].map(
        queries_df['o_round'].value_counts())
    queries_df['queries_d_round_count'] = queries_df['d_round'].map(
        queries_df['d_round'].value_counts())
    queries_df['queries_o_d_round_count'] = queries_df['o_d_round'].map(
        queries_df['o_d_round'].value_counts())

    # factorize
    queries_df['x_o_round'], _ = pd.factorize(queries_df['x_o_round'])
    queries_df['y_o_round'], _ = pd.factorize(queries_df['y_o_round'])
    queries_df['x_d_round'], _ = pd.factorize(queries_df['x_d_round'])
    queries_df['y_d_round'], _ = pd.factorize(queries_df['y_d_round'])
    queries_df['queries_distance_round'], _ = pd.factorize(
        queries_df['queries_distance_round'])

    # target encoding
    cols_encoding = [
        'x_o_round', 'y_o_round', 'x_d_round', 'y_d_round',
        'queries_distance_round'
    ]
    queries_df = targetEncodingMultiClass(queries_df, 'click_mode',
                                          cols_encoding)

    # drop string features
    cols_drop = [
        'o', 'd', 'o_d', 'o_d_is_holiday', 'o_d_weekday', 'o_d_hour',
        'o_is_holiday', 'o_weekday', 'o_hour', 'd_is_holiday', 'd_weekday',
        'd_hour', 'o_round', 'd_round', 'o_d_round'
    ]

    queries_df.drop(cols_drop, axis=1, inplace=True)

    # reduce memory usage
    queries_df = reduce_mem_usage(queries_df)

    # save as pkl
    save2pkl('../features/queries.pkl', queries_df)

    # save configs
    configs = json.load(open('../configs/101_lgbm_queries.json'))
    configs['features'] = queries_df.columns.to_list()
    to_json(configs, '../configs/101_lgbm_queries.json')

    line_notify('{} finished.'.format(sys.argv[0]))
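# `targetEncodingMultiClass` is not defined in these snippets. A minimal
# out-of-fold sketch consistent with the `<col>_target_<class>` columns that
# later examples consume; the project's real helper may add smoothing.
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

def targetEncodingMultiClass(df, target, cols, n_classes=12, n_splits=5):
    """For each column and class k, add the out-of-fold frequency of class k
    within each category value; unlabelled (test) rows get full-train stats."""
    labelled = df.index[df[target].notnull()]
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=326)
    for col in cols:
        for k in range(n_classes):
            new_col = '{}_target_{}'.format(col, k)
            is_k = (df[target] == k).astype(float)
            df[new_col] = np.nan
            for trn, val in folds.split(labelled):
                trn_idx, val_idx = labelled[trn], labelled[val]
                means = is_k.loc[trn_idx].groupby(df.loc[trn_idx, col]).mean()
                df.loc[val_idx, new_col] = df.loc[val_idx, col].map(means)
            means = is_k.loc[labelled].groupby(df.loc[labelled, col]).mean()
            test_idx = df.index.difference(labelled)
            df.loc[test_idx, new_col] = df.loc[test_idx, col].map(means)
    return df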
Example 11
def main(is_eval=False):
    # load csv
    if is_eval:
        df = pd.read_csv('../input/sales_train_evaluation.csv')
    else:
        df = pd.read_csv('../input/sales_train_validation.csv')

    sub = pd.read_csv('../input/sample_submission.csv')

    # split test data
    sub['is_test1'] = sub['id'].str.contains('_validation')
    sub['is_test2'] = sub['id'].str.contains('_evaluation')

    test1 = sub[sub['is_test1']].copy()
    test2 = sub[sub['is_test2']].copy()

    del sub
    gc.collect()

    # drop flags
    test1.drop(['is_test1', 'is_test2'], axis=1, inplace=True)
    test2.drop(['is_test1', 'is_test2'], axis=1, inplace=True)

    # change column name
    test1.columns = ['id'] + COLS_TEST1
    test2.columns = ['id'] + COLS_TEST2

    # change id
    test2['id'] = test2['id'].str.replace('_evaluation', '_validation')

    # merge
    if not is_eval:
        df = df.merge(test1, on='id', how='left')

    df = df.merge(test2, on='id', how='left')

    del test1, test2
    gc.collect()

    # reduce memory usage
    df = reduce_mem_usage(df)

    # date columns
    cols_date = [c for c in df.columns if c.startswith('d_')]

    # melt sales data
    print('Melting sales data...')
    id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    df = pd.melt(df, id_vars=id_vars, var_name='d', value_name='demand')

    print('Melted sales train validation has {} rows and {} columns'.format(
        df.shape[0], df.shape[1]))

    # add numeric date
    df['d_numeric'] = df['d'].apply(lambda x: int(x[2:]))

    # drop old data (~2012/12/31)
    print('drop old data...')
    df = df[df['d_numeric'] >= 704]

    # drop christmas data
    print('drop christmas data...')
    df = df[df['d_numeric'] != 331]  # 2011-12-25
    df = df[df['d_numeric'] != 697]  # 2012-12-25
    df = df[df['d_numeric'] != 1062]  # 2013-12-25
    df = df[df['d_numeric'] != 1427]  # 2014-12-25
    df = df[df['d_numeric'] != 1792]  # 2015-12-25

    # add is zero flag
    df['is_zero'] = (df['demand'] == 0).astype(int)

    # save pkl
    to_pickles(df, '../feats/sales', split_size=3)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
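# `to_pickles` / `read_pickles` split a large DataFrame into several pickle
# files and reassemble them; a minimal sketch of that pair (the real helpers
# may name the chunk files differently).
import os
import numpy as np
import pandas as pd

def to_pickles(df, dir_path, split_size=3):
    """Pickle `df` as `split_size` row chunks under `dir_path`."""
    os.makedirs(dir_path, exist_ok=True)
    for i, chunk in enumerate(np.array_split(df, split_size)):
        chunk.to_pickle(os.path.join(dir_path, '{:03d}.pkl'.format(i)))

def read_pickles(dir_path):
    """Read every chunk in `dir_path` and concatenate them back together."""
    paths = sorted(os.listdir(dir_path))
    return pd.concat([pd.read_pickle(os.path.join(dir_path, p)) for p in paths],
                     ignore_index=True)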
Example 12
                                  verbose_eval=100,
                                  seed=326,
                                 )
    gc.collect()
    return eval_dict['multi_logloss-mean'][-1]

if __name__ == '__main__':
    study = optuna.create_study()
    study.optimize(objective, n_trials=100)

    print('Number of finished trials: {}'.format(len(study.trials)))

    print('Best trial:')
    trial = study.best_trial

    print('  Value: {}'.format(trial.value))

    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    # save result
    hist_df = study.trials_dataframe()
    hist_df.to_csv("../output/optuna_result_lgbm_queries.csv")

    # save json
    CONFIGS['params'] = trial.params
    to_json(CONFIGS, '../configs/101_lgbm_queries.json')

    line_notify('{} finished. Value: {}'.format(sys.argv[0],trial.value))
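# Only the tail of the Optuna objective survived in the snippet above. A
# minimal sketch of what the full function may have looked like, assuming a
# prepared lgb.Dataset named LGB_TRAIN (hypothetical) and search ranges
# borrowed from the BayesianOptimization example earlier.
import gc
import lightgbm as lgb

def objective(trial):
    params = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': 12,
        'learning_rate': 0.05,
        'num_leaves': trial.suggest_int('num_leaves', 16, 64),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.001, 1),
        'subsample': trial.suggest_uniform('subsample', 0.001, 1),
        'max_depth': trial.suggest_int('max_depth', 8, 16),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0, 10),
        'min_child_weight': trial.suggest_uniform('min_child_weight', 0, 45),
        'verbose': -1,
        'seed': 326,
    }
    eval_dict = lgb.cv(params,
                       LGB_TRAIN,
                       num_boost_round=10000,
                       early_stopping_rounds=200,
                       verbose_eval=100,
                       seed=326,
                       )
    gc.collect()
    # Optuna minimizes by default, so return the final CV multi-class log loss
    return eval_dict['multi_logloss-mean'][-1]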
Example 13
        metrics=['rmse'],
        nfold=NUM_FOLDS,
        #                     folds=folds.split(TRAIN_DF[FEATS], TRAIN_DF['park_japanese_holiday']),
        num_boost_round=10000,  # large value; early stopping will cut it short
        early_stopping_rounds=200,
        verbose_eval=100,
        seed=47)
    gc.collect()
    return clf['test-rmse-mean'].iloc[-1]

if __name__ == '__main__':
    study = optuna.create_study()
    study.optimize(objective, n_trials=30)

    print('Number of finished trials: {}'.format(len(study.trials)))

    print('Best trial:')
    trial = study.best_trial

    print('  Value: {}'.format(trial.value))

    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    # save result
    hist_df = study.trials_dataframe()
    hist_df.to_csv("../output/optuna_result_xgb.csv")

    line_notify('optuna XGBoost finished.')
Example 14
def kfold_xgboost(train_df, test_df, num_folds, stratified=False, debug=False):

    print("Starting XGBoost. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=326)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros((train_df.shape[0], 12))
    sub_preds = np.zeros((test_df.shape[0], 12))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # dmatrix for test_df
    test_df_dmtrx = xgb.DMatrix(test_df[feats])

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['click_mode'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'click_mode'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'click_mode'].iloc[valid_idx]

        # set data structure
        xgb_train = xgb.DMatrix(train_x, label=train_y)
        xgb_test = xgb.DMatrix(valid_x, label=valid_y)

        # params
        params = {
            'device': 'gpu',
            'objective': 'multi:softmax',
            'booster': 'gbtree',
            'eval_metric': 'mlogloss',
            'num_class': 12,
            'eta': 0.05,
            'colsample_bytree': 0.3490457769968177,
            'subsample': 0.543646263362097,
            'max_depth': 11,
            'alpha': 4.762312990232561,
            'lambda': 9.98131082276387,
            'gamma': 0.19161156850826594,
            'min_child_weight': 15.042054927368088,
            'tree_method': 'gpu_hist',  # GPU parameter
            'predictor': 'gpu_predictor',  # GPU parameter
            'silent': 1,
            'seed': int(2**n_fold)
        }

        # train model
        clf = xgb.train(params,
                        xgb_train,
                        num_boost_round=10000,
                        evals=[(xgb_train, 'train'), (xgb_test, 'test')],
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        clf.save_model('../output/xgb_' + str(n_fold) + '.txt')

        oof_preds[valid_idx] = clf.predict(xgb_test, output_margin=True)
        sub_preds += clf.predict(test_df_dmtrx,
                                 output_margin=True) / folds.n_splits

        # save feature importances
        fold_importance_df = pd.DataFrame.from_dict(
            clf.get_score(importance_type='gain'),
            orient='index',
            columns=['importance'])
        fold_importance_df["feature"] = fold_importance_df.index.tolist()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)

        print('Fold %2d F1 Score : %.6f' %
              (n_fold + 1,
               f1_score(valid_y,
                        np.argmax(oof_preds[valid_idx], axis=1),
                        average='weighted')))
        del clf, train_x, train_y, valid_x, valid_y, xgb_train, xgb_test
        gc.collect()

    # Full F1 Score & LINE Notify
    full_f1 = f1_score(train_df['click_mode'],
                       np.argmax(oof_preds, axis=1),
                       average='weighted')
    print('Full F1 Score %.6f' % full_f1)
    line_notify('Full F1 Score %.6f' % full_f1)

    # display importances
    display_importances(feature_importance_df, '../imp/xgb_importances.png',
                        '../imp/feature_importance_xgb.csv')

    if not debug:
        # save prediction for submit
        test_df['recommend_mode'] = np.argmax(sub_preds, axis=1)
        test_df = test_df.reset_index()

        # post processing: if only one plan exists and it was not recommended,
        # fall back to that plan's transport mode (use .loc, not chained indexing)
        mask = (test_df['plan_num_plans'] == 1) & (test_df['recommend_mode'] != 0)
        test_df.loc[mask, 'recommend_mode'] = test_df.loc[mask, 'plan_0_transport_mode']

        test_df[['sid', 'recommend_mode']].to_csv(submission_file_name,
                                                  index=False)

        # save out of fold prediction
        train_df.loc[:, 'recommend_mode'] = np.argmax(oof_preds, axis=1)
        train_df = train_df.reset_index()
        train_df[['sid', 'click_mode', 'recommend_mode']].to_csv(oof_file_name,
                                                                 index=False)

        # save prediction for submit
        sub_preds = pd.DataFrame(sub_preds)
        sub_preds.columns = [
            'pred_xgb_plans{}'.format(c) for c in sub_preds.columns
        ]
        sub_preds['sid'] = test_df['sid']
        sub_preds['click_mode'] = test_df['click_mode']

        # save out of fold prediction
        oof_preds = pd.DataFrame(oof_preds)
        oof_preds.columns = [
            'pred_xgb_plans{}'.format(c) for c in oof_preds.columns
        ]
        oof_preds['sid'] = train_df['sid']
        oof_preds['click_mode'] = train_df['click_mode']

        # merge
        df = pd.concat([oof_preds, sub_preds], ignore_index=True)

        # save as pkl
        save2pkl('../features/xgb_pred.pkl', df)

        line_notify('{} finished.'.format(sys.argv[0]))
Example 15
def main():
    # load predictions
    pred_lgbm = loadpkl('../features/lgbm_pred.pkl')
    pred_xgb = loadpkl('../features/xgb_pred.pkl')
    plans = loadpkl('../features/plans.pkl')

    # define columns name list
    cols_pred_lgbm = ['pred_lgbm_plans{}'.format(i) for i in range(0, 12)]
    cols_pred_xgb = ['pred_xgb_plans{}'.format(i) for i in range(0, 12)]
    cols_transport_mode = [
        'plan_{}_transport_mode'.format(i) for i in range(0, 7)
    ]

    # merge plans & pred
    pred = pred_lgbm[['sid', 'click_mode']]
    pred = pd.merge(pred,
                    plans[cols_transport_mode + ['sid', 'plan_num_plans']],
                    on='sid',
                    how='left')

    del plans
    gc.collect()

    # scaling predictions
    pred_lgbm[cols_pred_lgbm] = scalingPredictions(pred_lgbm[cols_pred_lgbm])
    pred_xgb[cols_pred_xgb] = scalingPredictions(pred_xgb[cols_pred_xgb])

    # reset index
    pred_lgbm.reset_index(inplace=True, drop=True)
    pred_xgb.reset_index(inplace=True, drop=True)

    # fill predictions for non-exist plans as zero
    for i in range(1, 12):
        tmp = np.zeros(len(pred))
        for c in cols_transport_mode:
            tmp += (pred[c] == i).astype(int)
        pred_lgbm['pred_lgbm_plans{}'.format(
            i)] = pred_lgbm['pred_lgbm_plans{}'.format(i)] * (tmp > 0)
        pred_xgb['pred_xgb_plans{}'.format(
            i)] = pred_xgb['pred_xgb_plans{}'.format(i)] * (tmp > 0)

    # get best weight for lgbm & xgboost
    oof_pred_lgbm = pred_lgbm[pred_lgbm['click_mode'].notnull()]
    oof_pred_xgb = pred_xgb[pred_xgb['click_mode'].notnull()]

    w = getBestWeights(oof_pred_lgbm.click_mode, oof_pred_lgbm, oof_pred_xgb,
                       '../imp/weight.png')

    # calc prediction for each class
    cols_pred = []
    for i in range(0, 12):
        pred['pred_{}'.format(i)] = w * pred_lgbm['pred_lgbm_plans{}'.format(
            i)] + (1.0 - w) * pred_xgb['pred_xgb_plans{}'.format(i)]
        cols_pred.append('pred_{}'.format(i))

    # get out of fold values (copy, since the multiples below are applied in place)
    oof_pred = pred[pred['click_mode'].notnull()].copy()

    # get best multiples
    m4 = getBestMultiple(oof_pred, 'pred_4', cols_pred, '../imp/multiple4.png')
    pred['pred_4'] *= m4
    oof_pred['pred_4'] *= m4

    m0 = getBestMultiple(oof_pred, 'pred_0', cols_pred, '../imp/multiple0.png')
    pred['pred_0'] *= m0
    oof_pred['pred_0'] *= m0

    m3 = getBestMultiple(oof_pred, 'pred_3', cols_pred, '../imp/multiple3.png')
    pred['pred_3'] *= m3
    oof_pred['pred_3'] *= m3

    m6 = getBestMultiple(oof_pred, 'pred_6', cols_pred, '../imp/multiple6.png')
    pred['pred_6'] *= m6
    oof_pred['pred_6'] *= m6

    # get recommend mode
    pred['recommend_mode'] = np.argmax(pred[cols_pred].values, axis=1)

    # if number of plans = 1 and recommend mode != 0, set recommend mode to the plan 0 mode
    mask = (pred['plan_num_plans'] == 1) & (pred['recommend_mode'] != 0)
    pred.loc[mask, 'recommend_mode'] = pred.loc[mask, 'plan_0_transport_mode']

    # split train & test
    sub_pred = pred[pred['click_mode'].isnull()]
    oof_pred = pred[pred['click_mode'].notnull()]

    # out of fold score
    oof_f1_score = f1_score(oof_pred['click_mode'],
                            oof_pred['recommend_mode'],
                            average='weighted')

    # save csv
    oof_pred[['sid', 'click_mode', 'recommend_mode']].to_csv(oof_file_name,
                                                             index=False)
    sub_pred[['sid', 'recommend_mode']].to_csv(submission_file_name,
                                               index=False)

    # line notify
    line_notify('{} finished. f1 score: {}'.format(sys.argv[0], oof_f1_score))
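# `getBestMultiple` (and the similar `getBestWeights`) is not defined in these
# snippets. A minimal sketch: grid-search a multiplier for one class score that
# maximizes the out-of-fold weighted F1 and save the score curve; the real
# helper may search a different range.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

def getBestMultiple(oof_pred, col, cols_pred, out_path):
    candidates = np.arange(0.5, 2.0, 0.05)
    scores = []
    for m in candidates:
        scaled = oof_pred[cols_pred].values.copy()
        scaled[:, cols_pred.index(col)] *= m
        scores.append(f1_score(oof_pred['click_mode'],
                               np.argmax(scaled, axis=1),
                               average='weighted'))
    plt.figure()
    plt.plot(candidates, scores)
    plt.xlabel('multiple for {}'.format(col))
    plt.ylabel('weighted F1')
    plt.savefig(out_path)
    plt.close()
    return candidates[int(np.argmax(scores))]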
Example 16
def kfold_lightgbm(train_df, test_df, num_folds):
    print('Starting LightGBM. Train shape: {}'.format(train_df.shape))

    # Cross validation
    folds = CustomTimeSeriesSplitter(end_train=1941)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    valid_idxs=[]
    avg_best_iteration = 0 # average of best iteration

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df)):
        # split train/valid
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['demand'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['demand'].iloc[valid_idx]

        # save validation indexes
        valid_idxs += list(valid_idx)

        # set data structure
        lgb_train = lgb.Dataset(train_x,
                                label=train_y,
                                free_raw_data=False)

        lgb_test = lgb.Dataset(valid_x,
                               label=valid_y,
                               free_raw_data=False)

        params = {
            # 'device': 'gpu',
            # 'gpu_use_dp': True,
            'boosting': 'gbdt',
            'metric': ['rmse'],
            'objective': 'tweedie',
            'learning_rate': 0.05,
            'tweedie_variance_power': 1.1,
            'subsample': 0.5,
            'subsample_freq': 1,
            'num_leaves': 2**8 - 1,
            'min_data_in_leaf': 2**8 - 1,
            'feature_fraction': 0.8,
            'verbose': -1,
            'seed': 326,
            'bagging_seed': 326,
            'drop_seed': 326,
            'num_threads': -1
        }

        # train model
        reg = lgb.train(
                        params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=10
                        )

        # save model
        reg.save_model(f'../output/lgbm_holiday_{n_fold}.txt')

        # save predictions
        oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)

        # save best iteration
        avg_best_iteration += reg.best_iteration / folds.n_splits

        # save feature importances
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = feats
        fold_importance_df['importance'] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df['fold'] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # display importances
    display_importances(feature_importance_df,
                        '../imp/lgbm_importances_cv_holiday.png',
                        '../imp/feature_importance_lgbm_cv_holiday.csv')

    # Full RMSE score and LINE Notify (valid_idxs are positional, so use iloc)
    full_rmse = rmse(train_df['demand'].iloc[valid_idxs], oof_preds[valid_idxs])
    line_notify('Full RMSE score %.6f' % full_rmse)

    # save out of fold prediction
    train_df.loc[:,'demand'] = oof_preds
    train_df[['id','d','demand']].to_csv(oof_file_name, index=False)

    # save number of best iteration
    configs['num_boost_round'] = int(avg_best_iteration)
    configs['rmse'] = full_rmse
    to_json(configs, '../configs/310_train_holiday.json')

    # LINE notify
    line_notify('{} done. best iteration:{}'.format(sys.argv[0],int(avg_best_iteration)))
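# `CustomTimeSeriesSplitter` is not defined in these snippets. A minimal
# walk-forward sketch consistent with how it is called (a `d_numeric` day
# column and an `n_splits` attribute); the real splitter may use different
# window sizes or gaps between train and validation.
import numpy as np

class CustomTimeSeriesSplitter:
    def __init__(self, end_train=1941, n_splits=3, valid_days=28):
        self.end_train = end_train
        self.n_splits = n_splits
        self.valid_days = valid_days

    def split(self, df):
        """Yield (train_idx, valid_idx): train on everything before a 28-day
        window, validate on that window, last window ending at end_train."""
        d = df['d_numeric'].values
        for i in reversed(range(self.n_splits)):
            valid_end = self.end_train - i * self.valid_days
            valid_start = valid_end - self.valid_days
            train_idx = np.where(d <= valid_start)[0]
            valid_idx = np.where((d > valid_start) & (d <= valid_end))[0]
            yield train_idx, valid_idx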
Example 17
def kfold_lightgbm(train_df,
                   test_df,
                   num_folds,
                   stratified=False,
                   debug=False):

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=326)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros((train_df.shape[0], 12))
    sub_preds = np.zeros((test_df.shape[0], 12))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['click_mode'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'click_mode'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'click_mode'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x,
                                label=train_y,
                                categorical_feature=CAT_COLS,
                                free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,
                               label=valid_y,
                               categorical_feature=CAT_COLS,
                               free_raw_data=False)

        # params
        params = {
            'device': 'gpu',
            'task': 'train',
            'boosting': 'gbdt',
            'objective': 'multiclass',
            'metric': 'multiclass',
            'learning_rate': 0.01,
            'num_class': 12,
            'num_leaves': 52,
            'colsample_bytree': 0.3490457769968177,
            'subsample': 0.543646263362097,
            'max_depth': 11,
            'reg_alpha': 4.762312990232561,
            'reg_lambda': 9.98131082276387,
            'min_split_gain': 0.19161156850826594,
            'min_child_weight': 15.042054927368088,
            'min_data_in_leaf': 17,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }

        clf = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_test],
            valid_names=['train', 'test'],
            #                        feval=eval_f,
            num_boost_round=10000,
            early_stopping_rounds=200,
            verbose_eval=100)

        # save model
        clf.save_model('../output/lgbm_3_{}.txt'.format(n_fold))

        oof_preds[valid_idx] = clf.predict(valid_x,
                                           num_iteration=clf.best_iteration)
        sub_preds += clf.predict(
            test_df[feats], num_iteration=clf.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(
            clf.feature_importance(importance_type='gain',
                                   iteration=clf.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d F1 Score : %.6f' %
              (n_fold + 1,
               f1_score(valid_y,
                        np.argmax(oof_preds[valid_idx], axis=1),
                        average='weighted')))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Full F1 Score & LINE Notify
    full_f1 = f1_score(train_df['click_mode'],
                       np.argmax(oof_preds, axis=1),
                       average='weighted')
    print('Full F1 Score %.6f' % full_f1)
    line_notify('Full F1 Score %.6f' % full_f1)

    # display importances
    display_importances(feature_importance_df, '../imp/lgbm_importances_3.png',
                        '../imp/feature_importance_lgbm_3.csv')

    if not debug:
        # save prediction for submit
        test_df['recommend_mode'] = np.argmax(sub_preds, axis=1)
        test_df = test_df.reset_index()

        # post processing: if only one plan exists and it was not recommended,
        # fall back to that plan's transport mode (use .loc, not chained indexing)
        mask = (test_df['plan_num_plans'] == 1) & (test_df['recommend_mode'] != 0)
        test_df.loc[mask, 'recommend_mode'] = test_df.loc[mask, 'plan_0_transport_mode']

        # save csv
        test_df[['sid', 'recommend_mode']].to_csv(submission_file_name,
                                                  index=False)

        # save out of fold prediction
        train_df.loc[:, 'recommend_mode'] = np.argmax(oof_preds, axis=1)
        train_df = train_df.reset_index()
        train_df[['sid', 'click_mode', 'recommend_mode']].to_csv(oof_file_name,
                                                                 index=False)

        # save prediction for submit
        sub_preds = pd.DataFrame(sub_preds)
        sub_preds.columns = [
            'pred_lgbm_plans{}'.format(c) for c in sub_preds.columns
        ]
        sub_preds['sid'] = test_df['sid']
        sub_preds['click_mode'] = test_df['click_mode']

        # save out of fold prediction
        oof_preds = pd.DataFrame(oof_preds)
        oof_preds.columns = [
            'pred_lgbm_plans{}'.format(c) for c in oof_preds.columns
        ]
        oof_preds['sid'] = train_df['sid']
        oof_preds['click_mode'] = train_df['click_mode']

        # merge
        df = pd.concat([oof_preds, sub_preds], ignore_index=True)

        # save as pkl
        save2pkl('../features/lgbm_pred_3.pkl', df)

        line_notify('{} finished.'.format(sys.argv[0]))
def kfold_lightgbm(train_df, test_df, num_folds):
    print('Starting LightGBM. Train shape: {}'.format(train_df.shape))

    # Cross validation
    folds = GroupKFold(n_splits=num_folds)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
    group = train_df['month'].astype(str) + '_' + train_df['year'].astype(str)

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], groups=group)):
        # split train/valid
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'demand'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'demand'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)

        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

        params = {
            #                'device' : 'gpu',
            #                'gpu_use_dp':True,
            'boosting': 'gbdt',
            'metric': ['rmse'],
            'objective': 'tweedie',
            'learning_rate': 0.05,
            'tweedie_variance_power': 1.1,
            'subsample': 0.5,
            'subsample_freq': 1,
            'num_leaves': 2**8 - 1,
            'min_data_in_leaf': 2**8 - 1,
            'feature_fraction': 0.8,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold),
            'num_threads': -1
        }

        # train model
        reg = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model(f'../output/lgbm_group_k_fold_21days_{n_fold}.txt')

        # save predictions
        oof_preds[valid_idx] = reg.predict(valid_x,
                                           num_iteration=reg.best_iteration)
        sub_preds += reg.predict(
            test_df[feats], num_iteration=reg.best_iteration) / folds.n_splits

        # save feature importances
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = feats
        fold_importance_df['importance'] = np.log1p(
            reg.feature_importance(importance_type='gain',
                                   iteration=reg.best_iteration))
        fold_importance_df['fold'] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)

        print('Fold %2d RMSE : %.6f' %
              (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # display importances
    display_importances(
        feature_importance_df,
        '../imp/lgbm_importances_group_k_fold_21days.png',
        '../imp/feature_importance_lgbm_group_k_fold_21days.csv')

    # Full RMSE score and LINE Notify
    full_rmse = rmse(train_df['demand'], oof_preds)
    line_notify('Full RMSE score %.6f' % full_rmse)

    # save out of fold prediction
    train_df.loc[:, 'demand'] = oof_preds
    train_df = train_df.reset_index()
    train_df[['id', 'd', 'demand']].to_csv(oof_file_name, index=False)

    # reshape prediction for submit (a second reset_index here would leak a
    # spurious 'index' column into the csv)
    test_df.loc[:, 'demand'] = sub_preds
    test_df = test_df.reset_index()
    preds = test_df[['id', 'd', 'demand']]

    # save csv
    preds.to_csv(submission_file_name, index=False)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
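# `rmse`, used for the fold and full CV scores above, is presumably the usual
# helper; a minimal sketch.
import numpy as np
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Root mean squared error."""
    return np.sqrt(mean_squared_error(y_true, y_pred))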
Example 19
def main():
    # load predictions
    pred_lgbm1 = loadpkl('../features/lgbm_pred_1.pkl')
    pred_lgbm2 = loadpkl('../features/lgbm_pred_2.pkl')
    pred_lgbm3 = loadpkl('../features/lgbm_pred_3.pkl')
    plans = read_pickles('../features/plans')
    preds = [pred_lgbm1, pred_lgbm2, pred_lgbm3]

    # define columns name list
    cols_pred_lgbm = ['pred_lgbm_plans{}'.format(i) for i in range(0, 12)]
    cols_transport_mode = [
        'plan_{}_transport_mode'.format(i) for i in range(0, 7)
    ]

    # remove columns
    cols_drop = [
        c for c in plans.columns if c not in cols_transport_mode +
        ['sid', 'plan_num_plans', 'click_mode']
    ]
    plans.drop(cols_drop, axis=1, inplace=True)

    # postprocessing
    sub_preds = []
    oof_preds = []
    for i, pred_lgbm in enumerate(preds):

        # merge plans & pred
        pred = pred_lgbm[['sid', 'click_mode']]
        pred = pd.merge(pred,
                        plans[cols_transport_mode + ['sid', 'plan_num_plans']],
                        on='sid',
                        how='left')

        # scaling predictions
        pred_lgbm[cols_pred_lgbm] = scalingPredictions(
            pred_lgbm[cols_pred_lgbm])

        # reset index
        pred_lgbm.reset_index(inplace=True, drop=True)

        # fill predictions for non-exist plans as zero
        for j in range(1, 12):
            tmp = np.zeros(len(pred))
            for c in cols_transport_mode:
                tmp += (pred[c] == j).astype(int)
            pred_lgbm['pred_lgbm_plans{}'.format(
                j)] = pred_lgbm['pred_lgbm_plans{}'.format(j)] * (tmp > 0)

        # get best weight for lgbm & xgboost
        oof_pred_lgbm = pred_lgbm[pred_lgbm['click_mode'].notnull()]

        # calc prediction for each class
        cols_pred = []
        for j in range(0, 12):
            pred['pred_{}'.format(j)] = pred_lgbm['pred_lgbm_plans{}'.format(
                j)]
            cols_pred.append('pred_{}'.format(j))

        # get out of fold values (copy, since the multiples below are applied in place)
        oof_pred = pred[pred['click_mode'].notnull()].copy()

        # get best multiples
        m0 = getBestMultiple(oof_pred, 'pred_0', cols_pred,
                             '../imp/multiple0_{}.png'.format(i + 1))
        pred['pred_0'] *= m0
        oof_pred['pred_0'] *= m0

        m3 = getBestMultiple(oof_pred, 'pred_3', cols_pred,
                             '../imp/multiple3_{}.png'.format(i + 1))
        pred['pred_3'] *= m3
        oof_pred['pred_3'] *= m3

        m4 = getBestMultiple(oof_pred, 'pred_4', cols_pred,
                             '../imp/multiple4_{}.png'.format(i + 1))
        pred['pred_4'] *= m4
        oof_pred['pred_4'] *= m4

        # get recommend mode
        pred['recommend_mode'] = np.argmax(pred[cols_pred].values, axis=1)

        # if number of plans = 1 and recommend mode != 0, fill recommend mode with the plan 0 mode
        mask = (pred['plan_num_plans'] == 1) & (pred['recommend_mode'] != 0)
        pred.loc[mask, 'recommend_mode'] = pred.loc[mask, 'plan_0_transport_mode']

        # split train & test
        _sub_pred = pred[pred['click_mode'].isnull()]
        _oof_pred = pred[pred['click_mode'].notnull()]

        sub_preds.append(_sub_pred)
        oof_preds.append(_oof_pred)

        del pred, _sub_pred, _oof_pred
        gc.collect()

    # merge preds
    sub_pred = pd.concat(sub_preds, ignore_index=True)
    sub_pred = pd.merge(
        plans[plans['click_mode'].isnull()][['sid', 'click_mode']],
        sub_pred[['sid', 'recommend_mode']],
        on='sid',
        how='left')

    oof_pred = pd.concat(oof_preds, ignore_index=True)
    oof_pred = pd.merge(
        plans[plans['click_mode'].notnull()][['sid', 'click_mode']],
        oof_pred[['sid', 'recommend_mode']],
        on='sid',
        how='left')

    del sub_preds, oof_preds, plans

    # out of fold score
    oof_f1_score = f1_score(oof_pred['click_mode'],
                            oof_pred['recommend_mode'],
                            average='weighted')

    # save csv
    oof_pred[['sid', 'click_mode', 'recommend_mode']].to_csv(oof_file_name,
                                                             index=False)
    sub_pred[['sid', 'recommend_mode']].to_csv(submission_file_name,
                                               index=False)

    # line notify
    line_notify('{} finished. f1 score: {}'.format(sys.argv[0], oof_f1_score))
Example 20
        folds=folds.split(TRAIN_DF[FEATS], TRAIN_DF['park_japanese_holiday']),
        num_boost_round=10000,  # large value; early stopping will cut it short
        early_stopping_rounds=200,
        verbose_eval=100,
        seed=47,
    )
    gc.collect()
    return clf['rmse-mean'][-1]


if __name__ == '__main__':
    study = optuna.create_study()
    study.optimize(objective, n_trials=100)

    print('Number of finished trials: {}'.format(len(study.trials)))

    print('Best trial:')
    trial = study.best_trial

    print('  Value: {}'.format(trial.value))

    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    # save result
    hist_df = study.trials_dataframe()
    hist_df.to_csv("../output/optuna_result_lgbm.csv")

    line_notify('optuna LightGBM finished.')
Example 21
def main(num_rows=None):
    # load pkls
    df = read_pickles('../features/plans')
    queries = loadpkl('../features/queries.pkl')
    profiles = loadpkl('../features/profiles.pkl')
    queries_pred = loadpkl('../features/queries_pred.pkl')
    queries_profiles_pred = loadpkl('../features/queries_profiles_pred.pkl')

    # merge
    df = pd.merge(df, queries, on=['sid', 'click_mode'], how='left')
    df = pd.merge(df, profiles, on='pid', how='left')
    df = pd.merge(df, queries_pred, on='sid', how='left')
    df = pd.merge(df, queries_profiles_pred, on='sid', how='left')

    del queries, profiles, queries_pred, queries_profiles_pred
    gc.collect()

    # reduce memory usage
    df = reduce_mem_usage(df)

    # count features
    df['pid_count'] = df['pid'].map(df['pid'].value_counts())

    # time diff in seconds
    df['plan_req_time_diff'] = (df['plan_time'] - df['req_time']).dt.total_seconds()

    # distance ratio
    cols_plan_distance = ['plan_{}_distance'.format(i) for i in range(0, 7)]

    for i, c in enumerate(cols_plan_distance):
        df['plan_queries_distance_ratio{}'.format(
            i)] = df[c] / df['queries_distance']
        df['plan_queries_distance_diff{}'.format(
            i)] = df[c] - df['queries_distance']

    # stats features for preds
    cols_pred_queries = ['pred_queries{}'.format(i) for i in range(0, 12)]
    cols_pred_queries_profiles = [
        'pred_queries_profiles{}'.format(i) for i in range(0, 12)
    ]

    df['pred_queries_mean'] = df[cols_pred_queries].mean(axis=1)
    df['pred_queries_sum'] = df[cols_pred_queries].sum(axis=1)
    df['pred_queries_max'] = df[cols_pred_queries].max(axis=1)
    df['pred_queries_min'] = df[cols_pred_queries].min(axis=1)
    df['pred_queries_var'] = df[cols_pred_queries].var(axis=1)
    df['pred_queries_skew'] = df[cols_pred_queries].skew(axis=1)

    df['pred_queries_profiles_mean'] = df[cols_pred_queries_profiles].mean(
        axis=1)
    df['pred_queries_profiles_sum'] = df[cols_pred_queries_profiles].sum(
        axis=1)
    df['pred_queries_profiles_max'] = df[cols_pred_queries_profiles].max(
        axis=1)
    df['pred_queries_profiles_min'] = df[cols_pred_queries_profiles].min(
        axis=1)
    df['pred_queries_profiles_var'] = df[cols_pred_queries_profiles].var(
        axis=1)
    df['pred_queries_profiles_skew'] = df[cols_pred_queries_profiles].skew(
        axis=1)

    # stats features for each classes
    print('stats features...')
    for i in tqdm(range(0, 12)):
        cols = [
            'pred_queries{}'.format(i), 'pred_queries_profiles{}'.format(i)
        ]
        df['pred_mean{}'.format(i)] = df[cols].mean(axis=1)
        df['pred_sum{}'.format(i)] = df[cols].sum(axis=1)
        df['pred_max{}'.format(i)] = df[cols].max(axis=1)
        df['pred_min{}'.format(i)] = df[cols].min(axis=1)
        df['pred_var{}'.format(i)] = df[cols].var(axis=1)
        df['pred_skew{}'.format(i)] = df[cols].skew(axis=1)

        cols_target = [c for c in df.columns if '_target_{}'.format(i) in c]
        df['target_mean{}'.format(i)] = df[cols_target].mean(axis=1)
        df['target_sum{}'.format(i)] = df[cols_target].sum(axis=1)
        df['target_max{}'.format(i)] = df[cols_target].max(axis=1)
        df['target_min{}'.format(i)] = df[cols_target].min(axis=1)
        df['target_var{}'.format(i)] = df[cols_target].var(axis=1)
        df['target_skew{}'.format(i)] = df[cols_target].skew(axis=1)

    # post processing
    cols_transport_mode = [
        'plan_{}_transport_mode'.format(i) for i in range(0, 7)
    ]
    print('post processing...')
    for i in tqdm(range(1, 12)):
        tmp = np.zeros(len(df))
        for c in cols_transport_mode:
            tmp += (df[c] == i).astype(int)

        cols_target = [c for c in df.columns if '_target_{}'.format(i) in c]
        for c in cols_target + [
                'pred_queries{}'.format(i), 'pred_queries_profiles{}'.format(i)
        ]:
            df[c] = df[c] * (tmp > 0)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # split data by city
    df1 = df[df['y_o'] > 37.5]
    df2 = df[df['y_o'] < 27.5]
    df3 = df[df['x_o'] > 120.0]

    del df
    gc.collect()

    # cols for target encoding
    cols_target_encoding = [
        'plan_weekday', 'plan_hour', 'plan_is_holiday', 'plan_weekday_hour',
        'plan_is_holiday_hour', 'plan_num_plans', 'plan_num_free_plans',
        'x_o_round', 'y_o_round', 'x_d_round', 'y_d_round',
        'queries_distance_round'
    ]

    cols_ratio_plan = [
        'plan_price_distance_ratio_max_plan',
        'plan_price_distance_ratio_min_plan', 'plan_price_eta_ratio_max_plan',
        'plan_price_eta_ratio_min_plan', 'plan_distance_eta_ratio_max_plan',
        'plan_distance_eta_ratio_min_plan',
        'plan_price_distance_prod_max_plan', 'plan_price_eta_prod_max_plan',
        'plan_price_distance_prod_min_plan', 'plan_price_eta_prod_min_plan',
        'plan_distance_eta_prod_max_plan', 'plan_distance_eta_prod_min_plan',
        'plan_price_distance_eta_prod_max_plan',
        'plan_price_distance_eta_prod_min_plan',
        'plan_distance_ratio_0_max_plan', 'plan_distance_ratio_0_min_plan',
        'plan_price_ratio_0_max_plan', 'plan_price_ratio_0_min_plan',
        'plan_eta_ratio_0_max_plan', 'plan_eta_ratio_0_min_plan',
        'plan_price_distance_prod_ratio_0_max_plan',
        'plan_price_distance_prod_ratio_0_min_plan',
        'plan_price_eta_prod_ratio_0_max_plan',
        'plan_price_eta_prod_ratio_0_min_plan',
        'plan_distance_eta_prod_ratio_0_max_plan',
        'plan_distance_eta_prod_ratio_0_min_plan',
        'plan_price_distance_eta_prod_ratio_0_max_plan',
        'plan_price_distance_eta_prod_ratio_0_min_plan'
    ]

    cols_min_max_plan = [
        'plan_distance_max_plan', 'plan_distance_min_plan',
        'plan_price_max_plan', 'plan_price_min_plan', 'plan_eta_max_plan',
        'plan_eta_min_plan'
    ]

    cols_transport_mode = [
        'plan_{}_transport_mode'.format(i) for i in range(0, 7)
    ]

    cols_target_encoding = cols_target_encoding + cols_ratio_plan + cols_min_max_plan + cols_transport_mode + [
        'profile_k_means'
    ]

    # target encoding for each city
    print('target encoding...')
    for i, df in tqdm(enumerate([df1, df2, df3])):

        # target encoding
        df = targetEncodingMultiClass(df, 'click_mode', cols_target_encoding)

        # change dtype
        for col in df.columns.tolist():
            if df[col].dtypes == 'float16':
                df[col] = df[col].astype(np.float32)

        # remove missing variables
        col_missing = removeMissingVariables(df, 0.75)
        df.drop(col_missing, axis=1, inplace=True)

        # remove correlated variables
        col_drop = removeCorrelatedVariables(df, 0.95)
        df.drop(col_drop, axis=1, inplace=True)

        # save as feather
        to_feature(df, '../features/feats{}'.format(i + 1))

        # save feature name list
        features_json = {'features': df.columns.tolist()}
        to_json(features_json,
                '../features/00{}_all_features.json'.format(i + 1))

        del df
        gc.collect()

    line_notify('{} finished.'.format(sys.argv[0]))
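# targetEncodingMultiClass() is not defined in this snippet; the sketch below
# is an assumption about what it could look like: each class gets its own
# one-vs-rest mean encoding, and rows with a missing target (test rows) are
# excluded from the category means, matching the '_target_{i}' columns
# consumed above.
def target_encoding_multiclass_sketch(df, target_col, cols, n_classes=12):
    # hypothetical helper, not the original implementation
    for cls in range(n_classes):
        onehot = (df[target_col] == cls).astype(float)
        onehot[df[target_col].isnull()] = np.nan  # test rows don't contribute
        for c in cols:
            means = onehot.groupby(df[c]).mean()
            df['{}_target_{}_enc'.format(c, cls)] = df[c].map(means)
    return df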
def train_lightgbm(train_df, test_df, debug=False):
    print("Starting LightGBM. Train shape: {}".format(train_df.shape))

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # set data structure
    lgb_train = lgb.Dataset(train_df[feats],
                            label=train_df['demand'],
                            free_raw_data=False)

    params = {
        #           'device' : 'gpu',
        #           'gpu_use_dp':True,
        'task': 'train',
        'boosting': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.1,
        'bagging_fraction': 0.85,
        'bagging_freq': 1,
        'colsample_bytree': 0.85,
        'colsample_bynode': 0.85,
        'min_data_per_leaf': 25,
        'num_leaves': 200,
        'lambda_l1': 0.5,
        'lambda_l2': 0.5,
        'verbose': -1,
        'seed': 326,
        'bagging_seed': 326,
        'drop_seed': 326,
        #            'num_threads':-1
    }

    # train model
    reg = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train],
        verbose_eval=10,
        num_boost_round=configs['num_boost_round'],
    )

    # save model
    reg.save_model('../output/lgbm_diff.txt')

    # save predictions
    oof_preds += reg.predict(train_df[feats], num_iteration=reg.best_iteration)
    sub_preds += reg.predict(test_df[feats], num_iteration=reg.best_iteration)

    # save feature importances
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = np.log1p(
        reg.feature_importance(importance_type='gain',
                               iteration=reg.best_iteration))
    fold_importance_df["fold"] = 1
    feature_importance_df = pd.concat(
        [feature_importance_df, fold_importance_df], axis=0)

    del reg
    gc.collect()

    # Full RMSE score and LINE Notify
    full_rmse = rmse(train_df['demand'], oof_preds)
    line_notify('Full RMSE score %.6f' % full_rmse)

    # display importances
    display_importances(feature_importance_df,
                        '../imp/lgbm_importances_diff.png',
                        '../imp/feature_importance_lgbm_diff.csv')

    if not debug:
        # save out of fold prediction
        train_df.loc[:, 'demand'] = oof_preds
        train_df = train_df.reset_index()
        train_df[['id', 'demand']].to_csv(oof_file_name, index=False)

        # reshape prediction for submit
        test_df.loc[:, 'demand'] = sub_preds
        test_df = test_df.reset_index()
        preds = test_df[['id', 'd', 'demand']].reset_index()
        preds = preds.pivot(index='id', columns='d',
                            values='demand').reset_index()

        # split test1 / test2
        preds1 = preds[['id'] + COLS_TEST1]
        preds2 = preds[['id'] + COLS_TEST2]

        # change column names
        preds1.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
        preds2.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]

        # replace test2 id
        preds2['id'] = preds2['id'].str.replace('_validation', '_evaluation')

        # merge
        preds = preds1.append(preds2)

        # save csv
        preds.to_csv(submission_file_name, index=False)

        # submission by API
        # submit(submission_file_name, comment='model301 cv: %.6f' % full_rmse)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
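# rmse() is referenced above but not defined in this snippet; a minimal sketch
# under the assumption that it is the usual root mean squared error:
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    # square root of scikit-learn's mean squared error
    return np.sqrt(mean_squared_error(y_true, y_pred))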
def main():
    print('load files...')
    # load submission files
    sub_28days = pd.read_csv(
        '../output/submission_lgbm_group_k_fold_28days.csv')
    sub_21days = pd.read_csv(
        '../output/submission_lgbm_group_k_fold_21days.csv')
    sub_14days = pd.read_csv(
        '../output/submission_lgbm_group_k_fold_14days.csv')
    sub_7days = pd.read_csv('../output/submission_lgbm_group_k_fold_7days.csv')

    # load out of fold files
    oof_28days = pd.read_csv('../output/oof_lgbm_group_k_fold_28days.csv')
    oof_21days = pd.read_csv('../output/oof_lgbm_group_k_fold_21days.csv')
    oof_14days = pd.read_csv('../output/oof_lgbm_group_k_fold_14days.csv')
    oof_7days = pd.read_csv('../output/oof_lgbm_group_k_fold_7days.csv')

    # to pivot
    print('to pivot...')
    sub_28days = sub_28days.pivot(index='id', columns='d',
                                  values='demand').reset_index()
    sub_21days = sub_21days.pivot(index='id', columns='d',
                                  values='demand').reset_index()
    sub_14days = sub_14days.pivot(index='id', columns='d',
                                  values='demand').reset_index()
    sub_7days = sub_7days.pivot(index='id', columns='d',
                                values='demand').reset_index()

    oof_28days = oof_28days.pivot(index='id', columns='d',
                                  values='demand').reset_index()
    oof_21days = oof_21days.pivot(index='id', columns='d',
                                  values='demand').reset_index()
    oof_14days = oof_14days.pivot(index='id', columns='d',
                                  values='demand').reset_index()
    oof_7days = oof_7days.pivot(index='id', columns='d',
                                values='demand').reset_index()

    # change columns name
    sub_28days.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
    sub_21days.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
    sub_14days.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
    sub_7days.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]

    # validation columns
    valid_col_28days_fold1 = [f'd_{i+1}' for i in range(1913 + 21, 1913 + 28)]
    valid_col_21days_fold1 = [f'd_{i+1}' for i in range(1913 + 14, 1913 + 21)]
    valid_col_14days_fold1 = [f'd_{i+1}' for i in range(1913 + 7, 1913 + 14)]
    valid_col_7days_fold1 = [f'd_{i+1}' for i in range(1913, 1913 + 7)]

    valid_col_28days_fold2 = [f'd_{i+1}' for i in range(1885 + 21, 1885 + 28)]
    valid_col_21days_fold2 = [f'd_{i+1}' for i in range(1885 + 14, 1885 + 21)]
    valid_col_14days_fold2 = [f'd_{i+1}' for i in range(1885 + 7, 1885 + 14)]
    valid_col_7days_fold2 = [f'd_{i+1}' for i in range(1885, 1885 + 7)]

    valid_col_28days_fold3 = [f'd_{i+1}' for i in range(1576 + 21, 1576 + 28)]
    valid_col_21days_fold3 = [f'd_{i+1}' for i in range(1576 + 14, 1576 + 21)]
    valid_col_14days_fold3 = [f'd_{i+1}' for i in range(1576 + 7, 1576 + 14)]
    valid_col_7days_fold3 = [f'd_{i+1}' for i in range(1576, 1576 + 7)]
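    # note: d_1913 is the last training day in the M5 dataset, so the fold-1
    # columns cover the public validation window (d_1914..d_1941); 1885 and
    # 1576 anchor the two earlier 28-day validation windows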

    # merge oof files
    oof = oof_28days[['id'] + valid_col_28days_fold1].merge(
        oof_28days[['id'] + valid_col_28days_fold2], on='id', how='left')
    oof = oof.merge(oof_28days[['id'] + valid_col_28days_fold3],
                    on='id',
                    how='left')

    oof = oof.merge(oof_21days[['id'] + valid_col_21days_fold1],
                    on='id',
                    how='left')
    oof = oof.merge(oof_21days[['id'] + valid_col_21days_fold2],
                    on='id',
                    how='left')
    oof = oof.merge(oof_21days[['id'] + valid_col_21days_fold3],
                    on='id',
                    how='left')

    oof = oof.merge(oof_14days[['id'] + valid_col_14days_fold1],
                    on='id',
                    how='left')
    oof = oof.merge(oof_14days[['id'] + valid_col_14days_fold2],
                    on='id',
                    how='left')
    oof = oof.merge(oof_14days[['id'] + valid_col_14days_fold3],
                    on='id',
                    how='left')

    oof = oof.merge(oof_7days[['id'] + valid_col_7days_fold1],
                    on='id',
                    how='left')
    oof = oof.merge(oof_7days[['id'] + valid_col_7days_fold2],
                    on='id',
                    how='left')
    oof = oof.merge(oof_7days[['id'] + valid_col_7days_fold3],
                    on='id',
                    how='left')

    # split columns
    col_28days = [f'F{i+1}' for i in range(21, 28)]
    col_21days = [f'F{i+1}' for i in range(14, 21)]
    col_14days = [f'F{i+1}' for i in range(7, 14)]
    col_7days = [f'F{i+1}' for i in range(0, 7)]

    # merge
    sub = sub_7days[['id'] + col_7days].merge(sub_14days[['id'] + col_14days],
                                              on='id',
                                              how='left')
    sub = sub.merge(sub_21days[['id'] + col_21days], on='id', how='left')
    sub = sub.merge(sub_28days[['id'] + col_28days], on='id', how='left')

    # split test1 / test2
    sub1 = oof[['id'] + COLS_TEST1]
    sub2 = sub[['id'] + ['F' + str(d + 1) for d in range(28)]]

    # change column names
    sub1.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]

    # replace test1 id
    sub1['id'] = sub1['id'].str.replace('_evaluation', '_validation')

    # merge
    sub = sub1.append(sub2)

    # postprocessing
    cols_f = [f'F{i}' for i in range(1, 29)]
    cols_d = [c for c in oof.columns if 'd_' in c]
    sub.loc[:, cols_f] = sub[cols_f].where(sub[cols_f] > 0, 0)
    oof.loc[:, cols_d] = oof[cols_d].where(oof[cols_d] > 0, 0)

    # calc out of fold WRMSSE score
    print('calc oof cv scores...')
    scores = calc_score_cv(oof)
    score = np.mean(scores)
    print(f'scores: {scores}')

    # save csv
    sub.to_csv(submission_file_name, index=False)
    oof.to_csv(oof_file_name_pivot, index=False)

    # submission by API
    #    submit(submission_file_name, comment='model409 cv: %.6f' % score)

    # LINE notify
    line_notify('{} done. WRMSSE:{}'.format(sys.argv[0], round(score, 6)))
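# The pivot calls above reshape long-format predictions (one row per id and
# day) into the wide submission layout (one row per id, one column per day).
# A toy illustration:
toy = pd.DataFrame({'id': ['a', 'a', 'b', 'b'],
                    'd': ['d_1', 'd_2', 'd_1', 'd_2'],
                    'demand': [1.0, 2.0, 3.0, 4.0]})
toy_wide = toy.pivot(index='id', columns='d', values='demand').reset_index()
# toy_wide now has one row per id with columns ['id', 'd_1', 'd_2']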
def kfold_xgboost(df, num_folds, stratified = False, debug= False):

    # Divide in training/validation and test data
    train_df = df[df['visitors'].notnull()]
    test_df = df[df['visitors'].isnull()]

    print("Starting XGBoost. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    # save pkl
    save2pkl('../output/train_df.pkl', train_df)
    save2pkl('../output/test_df.pkl', test_df)

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=47)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=47)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # build a DMatrix version of the test df up front for the final predictions
    # (no label is needed for prediction)
    test_df_dmtrx = xgb.DMatrix(test_df[feats])

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['park_japanese_holiday'])):
        train_x, train_y = train_df[feats].iloc[train_idx], np.log1p(train_df['visitors'].iloc[train_idx])
        valid_x, valid_y = train_df[feats].iloc[valid_idx], np.log1p(train_df['visitors'].iloc[valid_idx])

        # set data structure
        xgb_train = xgb.DMatrix(train_x,
                                label=train_y)
        xgb_test = xgb.DMatrix(valid_x,
                               label=valid_y)

        # params
        params = {
                'objective':'gpu:reg:linear', # GPU parameter
                'booster': 'gbtree',
                'eval_metric':'rmse',
                'silent':1,
                'eta': 0.01,
                'max_depth': 8,
                'min_child_weight': 19,
                'gamma': 0.089444100759612,
                'subsample': 0.91842954303314,
                'colsample_bytree': 0.870658058238432,
                'colsample_bylevel': 0.995353255250289,
                'alpha':19.9615600411437,
                'lambda': 2.53962270252528,
                'tree_method': 'gpu_hist', # GPU parameter
                'predictor': 'gpu_predictor', # GPU parameter
                'seed':int(2**n_fold)
                }

        reg = xgb.train(
                        params,
                        xgb_train,
                        num_boost_round=10000,
                        evals=[(xgb_train,'train'),(xgb_test,'test')],
                        early_stopping_rounds= 200,
                        verbose_eval=100
                        )

        # save model
        reg.save_model('../output/xgb_'+str(n_fold)+'.txt')

        oof_preds[valid_idx] = np.expm1(reg.predict(xgb_test))
        sub_preds += np.expm1(reg.predict(test_df_dmtrx)) / num_folds

        fold_importance_df = pd.DataFrame.from_dict(reg.get_score(importance_type='gain'), orient='index', columns=['importance'])
        fold_importance_df["feature"] = fold_importance_df.index.tolist()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d MAE : %.6f' % (n_fold + 1, mean_absolute_error(np.expm1(valid_y), oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    del test_df_dmtrx
    gc.collect()

    # display full MAE score & LINE notify
    full_mae = mean_absolute_error(train_df['visitors'], oof_preds)
    line_notify('XGBoost Full MAE score %.6f' % full_mae)

    if not debug:
        # save predictions for submission
        test_df.loc[:,'visitors'] = sub_preds
        test_df[['index', 'visitors']].sort_values('index').to_csv(submission_file_name, index=False, header=False, sep='\t')

        # save out of fold predictions
        train_df.loc[:,'OOF_PRED'] = oof_preds
        train_df[['index', 'OOF_PRED']].sort_values('index').to_csv(oof_file_name, index= False)

    return feature_importance_df
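# The folds above fit on np.log1p(visitors) and invert the predictions with
# np.expm1, a common variance-stabilising transform for non-negative, skewed
# targets; the round trip is exact up to floating point:
x = np.array([0.0, 1.0, 100.0, 12345.0])
assert np.allclose(np.expm1(np.log1p(x)), x)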
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):

    # Divide in training/validation and test data
    train_df = df[df['visitors'].notnull()]
    test_df = df[df['visitors'].isnull()]

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    # save pkl
    save2pkl('../output/train_df.pkl', train_df)
    save2pkl('../output/test_df.pkl', test_df)

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=47)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['park_japanese_holiday'])):
        train_x, train_y = train_df[feats].iloc[train_idx], np.log1p(
            train_df['visitors'].iloc[train_idx])
        valid_x, valid_y = train_df[feats].iloc[valid_idx], np.log1p(
            train_df['visitors'].iloc[valid_idx])

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

        # hyperparameters are rough guesses (not carefully tuned)
        params = {
            'device': 'gpu',
            'gpu_use_dp': True,
            'task': 'train',
            'boosting': 'goss',
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
            'num_leaves': 64,
            'colsample_bytree': 0.977334338875847,
            'subsample': 0.027687793278932,
            'max_depth': 20,
            'reg_alpha': 9.72886163508719,
            'reg_lambda': 9.9935502633216,
            'min_split_gain': 0.178508066955524,
            'min_child_weight': 43.4750700383884,
            'min_data_in_leaf': 18,
            'other_rate': 0.925113620582013,
            'top_rate': 0.006970683025472,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }

        reg = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model('../output/lgbm_' + str(n_fold) + '.txt')

        oof_preds[valid_idx] = np.expm1(
            reg.predict(valid_x, num_iteration=reg.best_iteration))
        sub_preds += np.expm1(
            reg.predict(test_df[feats],
                        num_iteration=reg.best_iteration)) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(
            reg.feature_importance(importance_type='gain',
                                   iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d MAE : %.6f' %
              (n_fold + 1,
               mean_absolute_error(np.expm1(valid_y), oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # display full MAE score & LINE notify
    full_mae = mean_absolute_error(train_df['visitors'], oof_preds)
    line_notify('LightGBM Full MAE score %.6f' % full_mae)

    if not debug:
        # save predictions for submission
        test_df.loc[:, 'visitors'] = sub_preds
        test_df[['index',
                 'visitors']].sort_values('index').to_csv(submission_file_name,
                                                          index=False,
                                                          header=False,
                                                          sep='\t')

        # save out of fold predictions
        train_df.loc[:, 'OOF_PRED'] = oof_preds
        train_df[['index',
                  'OOF_PRED']].sort_values('index').to_csv(oof_file_name,
                                                           index=False)

    return feature_importance_df
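# save2pkl()/loadpkl() are used throughout but not defined in these snippets;
# a minimal sketch, assuming they are thin pickle wrappers:
import pickle

def save2pkl(path, obj):
    # serialise an object to disk
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def loadpkl(path):
    # load a previously pickled object
    with open(path, 'rb') as f:
        return pickle.load(f)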
# assumed call head, mirroring the commented-out kfold_xgb call below,
# which takes the same arguments:
models, model_params, feature_importance_df, train_preds, test_preds, scores, model_name = kfold_lgbm(
    train_df,
    test_df,
    target_col=target_col,
    model_loss=loss_type,
    num_folds=folds,
    feats_exclude=feats_exclude,
    stratified=False,
    use_gpu=use_GPU)
"""
models, model_params, feature_importance_df, train_preds, test_preds, scores, model_name = kfold_xgb(
    train_df, test_df, target_col=target_col, model_loss=loss_type,
    num_folds=folds, feats_exclude=feats_exclude, stratified=False, use_gpu=use_GPU)
"""
# CV score
create_score_log(scores)
score = np.mean(np.array(scores))
line_notify('Full RMSE score %.6f' % score)


# saves the submit file and related artifacts in one place. Ideally this would be more loosely coupled and live in util...
def output(train_df, test_df, models, model_params, feature_importance_df,
           train_preds, test_preds, scores, now, model_name):
    score = sum(scores) / len(scores)
    folder_path = make_output_dir(score, now, model_name)
    for i, m in enumerate(models):
        save2pkl('{0}/model_{1:0=2}.pkl'.format(folder_path, i), m)
    with open('{0}/model_params.json'.format(folder_path), 'w') as f:
        json.dump(model_params, f, indent=4)
    with open('{0}/model_valid_scores.json'.format(folder_path), 'w') as f:
        json.dump({i: s for i, s in enumerate(scores)}, f, indent=4)
    save_importances(feature_importance_df,
                     '{}/importances.png'.format(folder_path),
                     # csv path assumed, mirroring display_importances(df, png, csv)
                     '{}/importances.csv'.format(folder_path))
def train_lightgbm(train_df,test_df):
    print('Starting LightGBM. Train shape: {}'.format(train_df.shape))

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # set data structure
    lgb_train = lgb.Dataset(train_df[feats],
                            label=train_df['demand'],
                            free_raw_data=False)

    params = {
        # 'device': 'gpu',
        # 'gpu_use_dp': True,
        'boosting': 'gbdt',
        'metric': ['rmse'],
        'objective': 'tweedie',
        'learning_rate': 0.05,
        'tweedie_variance_power': 1.1,
        'subsample': 0.5,
        'subsample_freq': 1,
        'num_leaves': 2**8 - 1,
        'min_data_in_leaf': 2**8 - 1,
        'feature_fraction': 0.8,
        'verbose': -1,
        'seed': 326,
        'bagging_seed': 326,
        'drop_seed': 326,
        'num_threads': -1
    }

    # train model
    reg = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train],
        verbose_eval=10,
        num_boost_round=int(np.mean(configs['num_boost_round'])),
    )

    # save model
    reg.save_model('../output/lgbm_weekday.txt')

    # save predictions
    oof_preds += reg.predict(train_df[feats], num_iteration=reg.best_iteration)
    sub_preds += reg.predict(test_df[feats], num_iteration=reg.best_iteration)

    # save feature importances
    fold_importance_df = pd.DataFrame()
    fold_importance_df['feature'] = feats
    fold_importance_df['importance'] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
    fold_importance_df['fold'] = 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    del reg
    gc.collect()

    # Full RMSE score and LINE Notify
    full_rmse = rmse(train_df['demand'], oof_preds)
    line_notify('Full RMSE score %.6f' % full_rmse)

    # display importances
    display_importances(feature_importance_df,
                        '../imp/lgbm_importances_weekday.png',
                        '../imp/feature_importance_lgbm_weekday.csv')

    # save out of fold prediction
    train_df.loc[:,'demand'] = oof_preds
    train_df = train_df.reset_index()
    train_df[['id','d','demand']].to_csv(oof_file_name, index=False)

    # reshape prediction for submit
    test_df.loc[:,'demand'] = sub_preds
    test_df = test_df.reset_index()
    preds = test_df[['id','d','demand']].reset_index()

    # save csv
    preds.to_csv(submission_file_name, index=False)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
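# Note on the objective above: 'tweedie' with tweedie_variance_power between
# 1 and 2 interpolates between a Poisson-like loss (power -> 1) and a
# Gamma-like loss (power -> 2); 1.1 stays close to Poisson, which suits a
# zero-inflated, non-negative demand target.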
def main(is_eval=False):
    # load csv
    df = pd.read_csv('../input/calendar.csv')

    # to datetime
    df['date'] = pd.to_datetime(df['date'])

    # seasonality
    df['seasonality'] = np.cos(np.pi * (df['date'].dt.dayofyear / 366 * 2 - 1))

    # drop string columns
    df.drop('weekday', axis=1, inplace=True)

    df['day'] = df['date'].dt.day
    df['week'] = df['date'].dt.weekofyear
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['year'] = (df['year'] - df['year'].min())
    df['weekofmonth'] = df['day'].apply(lambda x: ceil(x / 7))

    df['dayofweek'] = df['date'].dt.dayofweek
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

    # features holiday
    df['date'] = df['date'].apply(lambda x: x.date())  # to date

    holidays_us = []
    for y in range(2011, 2017):
        for ptr in holidays.UnitedStates(years=y).items():
            holidays_us.append(ptr[0])

    holidays_ca = []
    for y in range(2011, 2017):
        for ptr in holidays.UnitedStates(state='CA', years=y).items():
            holidays_ca.append(ptr[0])

    holidays_tx = []
    for y in range(2011, 2017):
        for ptr in holidays.UnitedStates(state='TX', years=y).items():
            holidays_tx.append(ptr[0])

    holidays_wi = []
    for y in range(2011, 2017):
        for ptr in holidays.UnitedStates(state='WI', years=y).items():
            holidays_wi.append(ptr[0])

    df['is_holiday_us'] = df['date'].apply(lambda x: 1
                                           if x in holidays_us else 0)
    df['is_holiday_ca'] = df['date'].apply(lambda x: 1
                                           if x in holidays_ca else 0)
    df['is_holiday_tx'] = df['date'].apply(lambda x: 1
                                           if x in holidays_tx else 0)
    df['is_holiday_wi'] = df['date'].apply(lambda x: 1
                                           if x in holidays_wi else 0)

    # preprocess event_name_1
    # to datetime
    df['date'] = pd.to_datetime(df['date'])

    # Moon Phase
    df['moon'] = df['date'].apply(get_moon_phase)

    # add ramadan end dates
    ramadan_end_dates = [
        '2011-8-29', '2012-8-18', '2013-8-7', '2014-7-27', '2015-7-16',
        '2016-7-5'
    ]
    for d in ramadan_end_dates:
        df.loc[df['date'] == d, 'event_name_1'] = 'Ramadan ends'

    # add Pesach start dates
    pesach_start_dates = [
        '2011-4-18', '2012-4-6', '2013-3-25', '2014-4-14', '2015-4-3',
        '2016-4-22'
    ]
    for d in pesach_start_dates:
        df.loc[df['date'] == d, 'event_name_1'] = 'Pesach Start'

    # add purim start dates
    purim_start_dates = [
        '2011-3-19', '2012-3-7', '2013-2-23', '2014-3-15', '2015-3-4',
        '2016-3-23'
    ]
    for d in purim_start_dates:
        df.loc[df['date'] == d, 'event_name_1'] = 'Purim Start'

    # add chanukah start dates
    chanukah_start_dates = [
        '2011-12-21', '2012-12-9', '2013-11-28', '2014-12-17', '2015-12-7',
        '2016-12-25'
    ]
    for d in chanukah_start_dates:
        df.loc[df['date'] == d, 'event_name_1'] = 'Chanukah Start'

    # add isin features
    is_nba_final = []
    is_lent = []
    is_ramadan = []
    is_pesach = []
    is_purim = []
    is_chanukah = []

    tmp_nba = 0
    tmp_lent = 0
    tmp_ramadan = 0
    tmp_pesach = 0
    tmp_purim = 0
    tmp_chanukah = 0

    for e in df['event_name_1']:
        if e == 'NBAFinalsStart':
            tmp_nba = 1
        is_nba_final.append(tmp_nba)
        if e == 'NBAFinalsEnd':
            tmp_nba = 0

        if e == 'LentStart':
            tmp_lent = 1
        is_lent.append(tmp_lent)
        if e == 'Easter':
            tmp_lent = 0

        if e == 'Ramadan starts':
            tmp_ramadan = 1
        is_ramadan.append(tmp_ramadan)
        if e == 'Ramadan ends':
            tmp_ramadan = 0

        if e == 'Pesach Start':
            tmp_pesach = 1
        is_pesach.append(tmp_pesach)
        if e == 'Pesach End':
            tmp_pesach = 0

        if e == 'Purim Start':
            tmp_purim = 1
        is_purim.append(tmp_purim)
        if e == 'Purim End':
            tmp_purim = 0

        if e == 'Chanukah Start':
            tmp_chanukah = 1
        is_chanukah.append(tmp_chanukah)
        if e == 'Chanukah End':
            tmp_chanukah = 0

    df['is_NBA_final'] = is_nba_final
    df['is_lent'] = is_lent
    df['is_ramadan'] = is_ramadan
    df['is_pesach'] = is_pesach
    df['is_purim'] = is_purim
    df['is_chanukah'] = is_chanukah

    # add blackfriday flag
    blackfriday_dates = [
        '2011-11-25', '2012-11-23', '2013-11-29', '2014-11-28', '2015-11-27'
    ]
    df['is_blackfriday'] = 0
    for d in blackfriday_dates:
        df.loc[df['date'] == d, 'is_blackfriday'] = 1

    # factorize numerical columns
    cols_string = [
        'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2'
    ]
    for c in cols_string:
        df[c], _ = pd.factorize(df[c])
        df[c].replace(-1, np.nan, inplace=True)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save pkl
    save2pkl('../feats/calendar.pkl', df)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
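# get_moon_phase() is referenced above but not defined in this snippet; a
# minimal sketch, assuming a simple synodic-month approximation is enough:
import datetime

def get_moon_phase(d):
    # hypothetical helper: days since 2001-01-24 (approximately a new moon),
    # folded into the ~29.53-day synodic cycle and scaled to [0, 1)
    days = (d - datetime.datetime(2001, 1, 24)).days
    return (days % 29.530588853) / 29.530588853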
def kfold_lightgbm(train_df,
                   test_df,
                   num_folds,
                   stratified=False,
                   debug=False):

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=326)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros((train_df.shape[0], 12))
    sub_preds = np.zeros((test_df.shape[0], 12))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['click_mode'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'click_mode'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'click_mode'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x,
                                label=train_y,
                                categorical_feature=cat_cols,
                                free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,
                               label=valid_y,
                               categorical_feature=cat_cols,
                               free_raw_data=False)

        # params
        params = {
            'device': 'gpu',
            'task': 'train',
            'boosting': 'gbdt',
            'objective': 'multiclass',
            'metric': 'multiclass',
            'learning_rate': 0.1,
            'num_class': 12,
            'colsample_bytree': 0.723387165617351,
            'max_depth': 8,
            'min_child_weight': 42.6805833563236,
            'min_data_in_leaf': 34,
            'min_split_gain': 0.010945157429729,
            'num_leaves': 48,
            'reg_alpha': 1.87287994755334,
            'reg_lambda': 4.8093341415383,
            'subsample': 0.483962708535824,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }

        clf = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        clf.save_model('../output/lgbm_queries_{}.txt'.format(n_fold))

        oof_preds[valid_idx] = clf.predict(valid_x,
                                           num_iteration=clf.best_iteration)
        sub_preds += clf.predict(
            test_df[feats], num_iteration=clf.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(
            clf.feature_importance(importance_type='gain',
                                   iteration=clf.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d F1 Score : %.6f' %
              (n_fold + 1,
               f1_score(valid_y,
                        np.argmax(oof_preds[valid_idx], axis=1),
                        average='weighted')))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Full F1 Score & LINE Notify
    full_f1 = f1_score(train_df['click_mode'],
                       np.argmax(oof_preds, axis=1),
                       average='weighted')
    line_notify('Full F1 Score %.6f' % full_f1)

    # display importances
    display_importances(feature_importance_df,
                        '../imp/lgbm_importances_queries_profiles.png',
                        '../imp/feature_importance_lgbm_queries_profiles.csv')

    if not debug:
        # save prediction for submit
        sub_preds = pd.DataFrame(sub_preds)
        sub_preds.columns = [
            'pred_queries_profiles{}'.format(c) for c in sub_preds.columns
        ]
        sub_preds['sid'] = test_df.index

        # save out of fold prediction
        oof_preds = pd.DataFrame(oof_preds)
        oof_preds.columns = [
            'pred_queries_profiles{}'.format(c) for c in oof_preds.columns
        ]
        oof_preds['sid'] = train_df.index

        # merge
        df = oof_preds.append(sub_preds)

        # save as pkl
        save2pkl('../features/queries_profiles_pred.pkl', df)

        line_notify('{} finished.'.format(sys.argv[0]))
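# line_notify() is used throughout but not defined in these snippets; a
# minimal sketch, assuming it posts to the LINE Notify REST API:
def line_notify(message):
    import requests
    token = 'YOUR_LINE_NOTIFY_TOKEN'  # placeholder, not a real token
    requests.post('https://notify-api.line.me/api/notify',
                  headers={'Authorization': 'Bearer ' + token},
                  data={'message': message})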
def main(num_rows=None):
    # load csv
    train_plans = pd.read_csv('../input/data_set_phase2/train_plans_phase2.csv',nrows=num_rows)
    test_plans = pd.read_csv('../input/data_set_phase2/test_plans.csv',nrows=num_rows)
    train_clicks = pd.read_csv('../input/data_set_phase2/train_clicks_phase2.csv')

    # phase 1 csv
    train_plans1 = pd.read_csv('../input/data_set_phase2/train_plans_phase1.csv')
    train_clicks1 = pd.read_csv('../input/data_set_phase2/train_clicks_phase1.csv')

    # merge click
    train_plans = pd.merge(train_plans, train_clicks[['sid','click_mode']], on='sid', how='left')
    train_plans1 = pd.merge(train_plans1, train_clicks1[['sid','click_mode']], on='sid', how='left')

    # merge phase 1 data
    train_plans = train_plans1.append(train_plans)

    # fill na (no click)
    train_plans['click_mode'].fillna(0, inplace=True)

    # set test target as nan
    test_plans['click_mode'] = np.nan

    # merge train & test
    plans = train_plans.append(test_plans)

    del train_plans, test_plans, train_plans1, train_clicks, train_clicks1
    gc.collect()

    # reset index
    plans.reset_index(inplace=True,drop=True)

    # convert json
    for key in tqdm(['distance', 'price', 'eta', 'transport_mode']):
        plans[key] = plans.plans.apply(lambda x: loadJSON(x,key))

    # flatten
    plans_df = [FlattenDataSimple(plans, key) for key in tqdm(['distance', 'price', 'eta', 'transport_mode'])]
    plans_df = pd.concat(plans_df,axis=1)

    # merge plan_time & click_mode
    plans_df = pd.merge(plans_df.reset_index(), plans[['sid','plan_time', 'click_mode']], on='sid',how='outer')

    del plans
    gc.collect()

    # reduce memory usage
    plans_df = reduce_mem_usage(plans_df)

    # cleaning
    for c in plans_df.columns.to_list():
        if 'price' in c:
            plans_df[c] = plans_df[c].replace('',0)

    plans_df['plan_time'] = pd.to_datetime(plans_df['plan_time'])

    # datetime features
    plans_df['plan_weekday'] = plans_df['plan_time'].dt.weekday
    plans_df['plan_hour'] = plans_df['plan_time'].dt.hour
    plans_df['plan_is_holiday'] = plans_df['plan_time'].apply(lambda x: is_holiday(x)).astype(int)
    plans_df['plan_weekday_hour'] = plans_df['plan_weekday'].astype(str)+'_'+plans_df['plan_hour'].astype(str)
    plans_df['plan_is_holiday_hour'] = plans_df['plan_is_holiday'].astype(str)+'_'+plans_df['plan_hour'].astype(str)
    plans_df['plan_time_diff'] = plans_df.index.map(plans_df.sort_values('plan_time')['plan_time'].diff().dt.seconds)

    # factorize
    plans_df['plan_weekday_hour'], _ = pd.factorize(plans_df['plan_weekday_hour'])
    plans_df['plan_is_holiday_hour'], _ = pd.factorize(plans_df['plan_is_holiday_hour'])

    # count features
    plans_df['plan_weekday_count'] = plans_df['plan_weekday'].map(plans_df['plan_weekday'].value_counts())
    plans_df['plan_hour_count'] = plans_df['plan_hour'].map(plans_df['plan_hour'].value_counts())
    plans_df['plan_weekday_hour_count'] = plans_df['plan_weekday_hour'].map(plans_df['plan_weekday_hour'].value_counts())
    plans_df['plan_is_holiday_hour_count'] = plans_df['plan_is_holiday_hour'].map(plans_df['plan_is_holiday_hour'].value_counts())

    # stats features
    cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(0,7)]
    cols_distance = ['plan_{}_distance'.format(i) for i in range(0,7)]
    cols_price = ['plan_{}_price'.format(i) for i in range(0,7)]
    cols_eta = ['plan_{}_eta'.format(i) for i in range(0,7)]

    plans_df['plan_distance_mean'] = plans_df[cols_distance].mean(axis=1)
    plans_df['plan_distance_sum'] = plans_df[cols_distance].sum(axis=1)
    plans_df['plan_distance_max'] = plans_df[cols_distance].max(axis=1)
    plans_df['plan_distance_min'] = plans_df[cols_distance].min(axis=1)
    plans_df['plan_distance_var'] = plans_df[cols_distance].var(axis=1)
    plans_df['plan_distance_skew'] = plans_df[cols_distance].skew(axis=1)

    plans_df['plan_price_mean'] = plans_df[cols_price].mean(axis=1)
    plans_df['plan_price_sum'] = plans_df[cols_price].sum(axis=1)
    plans_df['plan_price_max'] = plans_df[cols_price].max(axis=1)
    plans_df['plan_price_min'] = plans_df[cols_price].min(axis=1)
    plans_df['plan_price_var'] = plans_df[cols_price].var(axis=1)
    plans_df['plan_price_skew'] = plans_df[cols_price].skew(axis=1)

    plans_df['plan_eta_mean'] = plans_df[cols_eta].mean(axis=1)
    plans_df['plan_eta_sum'] = plans_df[cols_eta].sum(axis=1)
    plans_df['plan_eta_max'] = plans_df[cols_eta].max(axis=1)
    plans_df['plan_eta_min'] = plans_df[cols_eta].min(axis=1)
    plans_df['plan_eta_var'] = plans_df[cols_eta].var(axis=1)
    plans_df['plan_eta_skew'] = plans_df[cols_eta].skew(axis=1)

    # min-max plan (categorical)
    plans_df['plan_distance_max_plan'] = plans_df[cols_distance].idxmax(axis=1).apply(lambda x: x[:6]+'_transport_mode' if type(x)==str else np.nan)
    plans_df['plan_distance_min_plan'] = plans_df[cols_distance].idxmin(axis=1).apply(lambda x: x[:6]+'_transport_mode' if type(x)==str else np.nan)
    plans_df['plan_price_max_plan'] = plans_df[cols_price].idxmax(axis=1).apply(lambda x: x[:6]+'_transport_mode' if type(x)==str else np.nan)
    plans_df['plan_price_min_plan'] = plans_df[cols_price].idxmin(axis=1).apply(lambda x: x[:6]+'_transport_mode' if type(x)==str else np.nan)
    plans_df['plan_eta_max_plan'] = plans_df[cols_eta].idxmax(axis=1).apply(lambda x: x[:6]+'_transport_mode' if type(x)==str else np.nan)
    plans_df['plan_eta_min_plan'] = plans_df[cols_eta].idxmin(axis=1).apply(lambda x: x[:6]+'_transport_mode' if type(x)==str else np.nan)

    # map plans
    cols_min_max_plan = ['plan_distance_max_plan','plan_distance_min_plan',
                         'plan_price_max_plan', 'plan_price_min_plan',
                         'plan_eta_max_plan', 'plan_eta_min_plan']
    for c in tqdm(cols_transport_mode):
        for p in cols_min_max_plan:
            # replace the column label (e.g. 'plan_0_transport_mode') with the
            # actual transport mode value; .loc avoids chained assignment
            mask = plans_df[p] == c
            plans_df.loc[mask, p] = plans_df.loc[mask, c]

    # count features
    plans_df['plan_distance_max_plan_count'] = plans_df['plan_distance_max_plan'].map(plans_df['plan_distance_max_plan'].value_counts())
    plans_df['plan_distance_min_plan_count'] = plans_df['plan_distance_min_plan'].map(plans_df['plan_distance_min_plan'].value_counts())
    plans_df['plan_price_max_plan_count'] = plans_df['plan_price_max_plan'].map(plans_df['plan_price_max_plan'].value_counts())
    plans_df['plan_price_min_plan_count'] = plans_df['plan_price_min_plan'].map(plans_df['plan_price_min_plan'].value_counts())
    plans_df['plan_eta_max_plan_count'] = plans_df['plan_eta_max_plan'].map(plans_df['plan_eta_max_plan'].value_counts())
    plans_df['plan_eta_min_plan_count'] = plans_df['plan_eta_min_plan'].map(plans_df['plan_eta_min_plan'].value_counts())

    # count features
    cols_mode = ['plan_{}_transport_mode'.format(i) for i in range(0,7)]
    cols_mode_count = []
    for c in cols_mode:
        plans_df[c+'_count'] = plans_df[c].map(plans_df[c].value_counts())
        cols_mode_count.append(c+'_count')

    # number features
    plans_df['plan_num_plans'] = plans_df[cols_mode].notnull().sum(axis=1)
    plans_df['plan_num_free_plans'] = (plans_df[cols_price]==0).sum(axis=1)

    # rank features
    plans_df[[ c +'_rank' for c in cols_distance]] = plans_df[cols_distance].rank(axis=1)
    plans_df[[ c +'_rank' for c in cols_price]] = plans_df[cols_price].rank(axis=1)
    plans_df[[ c +'_rank' for c in cols_eta]] = plans_df[cols_eta].rank(axis=1)
    plans_df[[ c +'_rank' for c in cols_mode_count]] = plans_df[cols_mode_count].rank(axis=1)

    # ratio features
    for i in range(0,7):
        plans_df['plan_{}_price_distance_ratio'.format(i)] = plans_df['plan_{}_price'.format(i)] / plans_df['plan_{}_distance'.format(i)]
        plans_df['plan_{}_price_eta_ratio'.format(i)] = plans_df['plan_{}_price'.format(i)] / plans_df['plan_{}_eta'.format(i)]
        plans_df['plan_{}_distance_eta_ratio'.format(i)] = plans_df['plan_{}_distance'.format(i)] / plans_df['plan_{}_eta'.format(i)]

    # prod features
    for i in range(0,7):
        plans_df['plan_{}_price_distance_prod'.format(i)] = plans_df['plan_{}_price'.format(i)] * plans_df['plan_{}_distance'.format(i)]
        plans_df['plan_{}_price_eta_prod'.format(i)] = plans_df['plan_{}_price'.format(i)] * plans_df['plan_{}_eta'.format(i)]
        plans_df['plan_{}_distance_eta_prod'.format(i)] = plans_df['plan_{}_distance'.format(i)] * plans_df['plan_{}_eta'.format(i)]
        plans_df['plan_{}_price_distance_eta_prod'.format(i)] = plans_df['plan_{}_price'.format(i)] * plans_df['plan_{}_distance'.format(i)]* plans_df['plan_{}_eta'.format(i)]

    # ratio features with plan 0
    for i in range(1,7):
        plans_df['plan_{}_distance_ratio_0'.format(i)] = plans_df['plan_{}_distance'.format(i)]/plans_df['plan_0_distance']
        plans_df['plan_{}_price_ratio_0'.format(i)] = plans_df['plan_{}_price'.format(i)]/plans_df['plan_0_price']
        plans_df['plan_{}_eta_ratio_0'.format(i)] = plans_df['plan_{}_eta'.format(i)]/plans_df['plan_0_eta']

        plans_df['plan_{}_price_distance_prod_ratio_0'.format(i)] = plans_df['plan_{}_price_distance_prod'.format(i)] / plans_df['plan_0_price_distance_prod']
        plans_df['plan_{}_price_eta_prod_ratio_0'.format(i)] = plans_df['plan_{}_price_eta_prod'.format(i)] / plans_df['plan_0_price_eta_prod']
        plans_df['plan_{}_distance_eta_prod_ratio_0'.format(i)] = plans_df['plan_{}_distance_eta_prod'.format(i)] / plans_df['plan_0_distance_eta_prod']
        plans_df['plan_{}_price_distance_eta_prod_ratio_0'.format(i)] = plans_df['plan_{}_price_distance_eta_prod'.format(i)] / plans_df['plan_0_price_distance_eta_prod']

    # stats features of ratio
    cols_price_distance_ratio = ['plan_{}_price_distance_ratio'.format(i) for i in range(0,7)]
    cols_price_eta_ratio = ['plan_{}_price_eta_ratio'.format(i) for i in range(0,7)]
    cols_distance_eta_ratio = ['plan_{}_distance_eta_ratio'.format(i) for i in range(0,7)]

    cols_price_distance_prod = ['plan_{}_price_distance_prod'.format(i) for i in range(0,7)]
    cols_price_eta_prod = ['plan_{}_price_eta_prod'.format(i) for i in range(0,7)]
    cols_distance_eta_prod = ['plan_{}_distance_eta_prod'.format(i) for i in range(0,7)]
    cols_price_distance_eta_prod = ['plan_{}_price_distance_eta_prod'.format(i) for i in range(0,7)]

    cols_distance_ratio_0 = ['plan_{}_distance_ratio_0'.format(i) for i in range(1,7)]
    cols_price_ratio_0 = ['plan_{}_price_ratio_0'.format(i) for i in range(1,7)]
    cols_eta_ratio_0 = ['plan_{}_eta_ratio_0'.format(i) for i in range(1,7)]

    cols_price_distance_prod_ratio_0 = ['plan_{}_price_distance_prod_ratio_0'.format(i) for i in range(1,7)]
    cols_price_eta_prod_ratio_0 = ['plan_{}_price_eta_prod_ratio_0'.format(i) for i in range(1,7)]
    cols_distance_eta_prod_ratio_0 = ['plan_{}_distance_eta_prod_ratio_0'.format(i) for i in range(1,7)]
    cols_price_distance_eta_prod_ratio_0 = ['plan_{}_price_distance_eta_prod_ratio_0'.format(i) for i in range(1,7)]

    plans_df['plan_price_distance_ratio_mean'] = plans_df[cols_price_distance_ratio].mean(axis=1)
    plans_df['plan_price_distance_ratio_sum'] = plans_df[cols_price_distance_ratio].sum(axis=1)
    plans_df['plan_price_distance_ratio_max'] = plans_df[cols_price_distance_ratio].max(axis=1)
    plans_df['plan_price_distance_ratio_min'] = plans_df[cols_price_distance_ratio].min(axis=1)
    plans_df['plan_price_distance_ratio_var'] = plans_df[cols_price_distance_ratio].var(axis=1)
    plans_df['plan_price_distance_ratio_skew'] = plans_df[cols_price_distance_ratio].skew(axis=1)

    plans_df['plan_price_eta_ratio_mean'] = plans_df[cols_price_eta_ratio].mean(axis=1)
    plans_df['plan_price_eta_ratio_sum'] = plans_df[cols_price_eta_ratio].sum(axis=1)
    plans_df['plan_price_eta_ratio_max'] = plans_df[cols_price_eta_ratio].max(axis=1)
    plans_df['plan_price_eta_ratio_min'] = plans_df[cols_price_eta_ratio].min(axis=1)
    plans_df['plan_price_eta_ratio_var'] = plans_df[cols_price_eta_ratio].var(axis=1)
    plans_df['plan_price_eta_ratio_skew'] = plans_df[cols_price_eta_ratio].skew(axis=1)

    plans_df['plan_distance_eta_ratio_mean'] = plans_df[cols_distance_eta_ratio].mean(axis=1)
    plans_df['plan_distance_eta_ratio_sum'] = plans_df[cols_distance_eta_ratio].sum(axis=1)
    plans_df['plan_distance_eta_ratio_max'] = plans_df[cols_distance_eta_ratio].max(axis=1)
    plans_df['plan_distance_eta_ratio_min'] = plans_df[cols_distance_eta_ratio].min(axis=1)
    plans_df['plan_distance_eta_ratio_var'] = plans_df[cols_distance_eta_ratio].var(axis=1)
    plans_df['plan_distance_eta_ratio_skew'] = plans_df[cols_distance_eta_ratio].skew(axis=1)

    plans_df['plan_price_distance_prod_mean'] = plans_df[cols_price_distance_prod].mean(axis=1)
    plans_df['plan_price_distance_prod_sum'] = plans_df[cols_price_distance_prod].sum(axis=1)
    plans_df['plan_price_distance_prod_max'] = plans_df[cols_price_distance_prod].max(axis=1)
    plans_df['plan_price_distance_prod_min'] = plans_df[cols_price_distance_prod].min(axis=1)
    plans_df['plan_price_distance_prod_var'] = plans_df[cols_price_distance_prod].var(axis=1)
    plans_df['plan_price_distance_prod_skew'] = plans_df[cols_price_distance_prod].skew(axis=1)

    plans_df['plan_price_eta_prod_mean'] = plans_df[cols_price_eta_prod].mean(axis=1)
    plans_df['plan_price_eta_prod_sum'] = plans_df[cols_price_eta_prod].sum(axis=1)
    plans_df['plan_price_eta_prod_max'] = plans_df[cols_price_eta_prod].max(axis=1)
    plans_df['plan_price_eta_prod_min'] = plans_df[cols_price_eta_prod].min(axis=1)
    plans_df['plan_price_eta_prod_var'] = plans_df[cols_price_eta_prod].var(axis=1)
    plans_df['plan_price_eta_prod_skew'] = plans_df[cols_price_eta_prod].skew(axis=1)

    plans_df['plan_distance_eta_prod_mean'] = plans_df[cols_distance_eta_prod].mean(axis=1)
    plans_df['plan_distance_eta_prod_sum'] = plans_df[cols_distance_eta_prod].sum(axis=1)
    plans_df['plan_distance_eta_prod_max'] = plans_df[cols_distance_eta_prod].max(axis=1)
    plans_df['plan_distance_eta_prod_min'] = plans_df[cols_distance_eta_prod].min(axis=1)
    plans_df['plan_distance_eta_prod_var'] = plans_df[cols_distance_eta_prod].var(axis=1)
    plans_df['plan_distance_eta_prod_skew'] = plans_df[cols_distance_eta_prod].skew(axis=1)

    plans_df['plan_price_distance_eta_prod_mean'] = plans_df[cols_price_distance_eta_prod].mean(axis=1)
    plans_df['plan_price_distance_eta_prod_sum'] = plans_df[cols_price_distance_eta_prod].sum(axis=1)
    plans_df['plan_price_distance_eta_prod_max'] = plans_df[cols_price_distance_eta_prod].max(axis=1)
    plans_df['plan_price_distance_eta_prod_min'] = plans_df[cols_price_distance_eta_prod].min(axis=1)
    plans_df['plan_price_distance_eta_prod_var'] = plans_df[cols_price_distance_eta_prod].var(axis=1)
    plans_df['plan_price_distance_eta_prod_skew'] = plans_df[cols_price_distance_eta_prod].skew(axis=1)

    plans_df['plan_distance_ratio_0_mean'] = plans_df[cols_distance_ratio_0].mean(axis=1)
    plans_df['plan_distance_ratio_0_sum'] = plans_df[cols_distance_ratio_0].sum(axis=1)
    plans_df['plan_distance_ratio_0_max'] = plans_df[cols_distance_ratio_0].max(axis=1)
    plans_df['plan_distance_ratio_0_min'] = plans_df[cols_distance_ratio_0].min(axis=1)
    plans_df['plan_distance_ratio_0_var'] = plans_df[cols_distance_ratio_0].var(axis=1)
    plans_df['plan_distance_ratio_0_skew'] = plans_df[cols_distance_ratio_0].skew(axis=1)

    plans_df['plan_price_ratio_0_mean'] = plans_df[cols_price_ratio_0].mean(axis=1)
    plans_df['plan_price_ratio_0_sum'] = plans_df[cols_price_ratio_0].sum(axis=1)
    plans_df['plan_price_ratio_0_max'] = plans_df[cols_price_ratio_0].max(axis=1)
    plans_df['plan_price_ratio_0_min'] = plans_df[cols_price_ratio_0].min(axis=1)
    plans_df['plan_price_ratio_0_var'] = plans_df[cols_price_ratio_0].var(axis=1)
    plans_df['plan_price_ratio_0_skew'] = plans_df[cols_price_ratio_0].skew(axis=1)

    plans_df['plan_eta_ratio_0_mean'] = plans_df[cols_eta_ratio_0].mean(axis=1)
    plans_df['plan_eta_ratio_0_sum'] = plans_df[cols_eta_ratio_0].sum(axis=1)
    plans_df['plan_eta_ratio_0_max'] = plans_df[cols_eta_ratio_0].max(axis=1)
    plans_df['plan_eta_ratio_0_min'] = plans_df[cols_eta_ratio_0].min(axis=1)
    plans_df['plan_eta_ratio_0_var'] = plans_df[cols_eta_ratio_0].var(axis=1)
    plans_df['plan_eta_ratio_0_skew'] = plans_df[cols_eta_ratio_0].skew(axis=1)

    plans_df['plan_price_distance_prod_ratio_0_mean'] = plans_df[cols_price_distance_prod_ratio_0].mean(axis=1)
    plans_df['plan_price_distance_prod_ratio_0_sum'] = plans_df[cols_price_distance_prod_ratio_0].sum(axis=1)
    plans_df['plan_price_distance_prod_ratio_0_max'] = plans_df[cols_price_distance_prod_ratio_0].max(axis=1)
    plans_df['plan_price_distance_prod_ratio_0_min'] = plans_df[cols_price_distance_prod_ratio_0].min(axis=1)
    plans_df['plan_price_distance_prod_ratio_0_var'] = plans_df[cols_price_distance_prod_ratio_0].var(axis=1)
    plans_df['plan_price_distance_prod_ratio_0_skew'] = plans_df[cols_price_distance_prod_ratio_0].skew(axis=1)

    plans_df['plan_price_eta_prod_ratio_0_mean'] = plans_df[cols_price_eta_prod_ratio_0].mean(axis=1)
    plans_df['plan_price_eta_prod_ratio_0_sum'] = plans_df[cols_price_eta_prod_ratio_0].sum(axis=1)
    plans_df['plan_price_eta_prod_ratio_0_max'] = plans_df[cols_price_eta_prod_ratio_0].max(axis=1)
    plans_df['plan_price_eta_prod_ratio_0_min'] = plans_df[cols_price_eta_prod_ratio_0].min(axis=1)
    plans_df['plan_price_eta_prod_ratio_0_var'] = plans_df[cols_price_eta_prod_ratio_0].var(axis=1)
    plans_df['plan_price_eta_prod_ratio_0_skew'] = plans_df[cols_price_eta_prod_ratio_0].skew(axis=1)

    plans_df['plan_distance_eta_prod_ratio_0_mean'] = plans_df[cols_distance_eta_prod_ratio_0].mean(axis=1)
    plans_df['plan_distance_eta_prod_ratio_0_sum'] = plans_df[cols_distance_eta_prod_ratio_0].sum(axis=1)
    plans_df['plan_distance_eta_prod_ratio_0_max'] = plans_df[cols_distance_eta_prod_ratio_0].max(axis=1)
    plans_df['plan_distance_eta_prod_ratio_0_min'] = plans_df[cols_distance_eta_prod_ratio_0].min(axis=1)
    plans_df['plan_distance_eta_prod_ratio_0_var'] = plans_df[cols_distance_eta_prod_ratio_0].var(axis=1)
    plans_df['plan_distance_eta_prod_ratio_0_skew'] = plans_df[cols_distance_eta_prod_ratio_0].skew(axis=1)

    plans_df['plan_price_distance_eta_prod_ratio_0_mean'] = plans_df[cols_price_distance_eta_prod_ratio_0].mean(axis=1)
    plans_df['plan_price_distance_eta_prod_ratio_0_sum'] = plans_df[cols_price_distance_eta_prod_ratio_0].sum(axis=1)
    plans_df['plan_price_distance_eta_prod_ratio_0_max'] = plans_df[cols_price_distance_eta_prod_ratio_0].max(axis=1)
    plans_df['plan_price_distance_eta_prod_ratio_0_min'] = plans_df[cols_price_distance_eta_prod_ratio_0].min(axis=1)
    plans_df['plan_price_distance_eta_prod_ratio_0_var'] = plans_df[cols_price_distance_eta_prod_ratio_0].var(axis=1)
    plans_df['plan_price_distance_eta_prod_ratio_0_skew'] = plans_df[cols_price_distance_eta_prod_ratio_0].skew(axis=1)

    # rank features
    plans_df[[ c +'_rank' for c in cols_price_distance_ratio]] = plans_df[cols_price_distance_ratio].rank(axis=1)
    plans_df[[ c +'_rank' for c in cols_price_eta_ratio]] = plans_df[cols_price_eta_ratio].rank(axis=1)
    plans_df[[ c +'_rank' for c in cols_distance_eta_ratio]] = plans_df[cols_distance_eta_ratio].rank(axis=1)

    plans_df[[ c +'_rank' for c in cols_price_distance_prod]] = plans_df[cols_price_distance_prod].rank(axis=1)
    plans_df[[ c +'_rank' for c in cols_price_eta_prod]] = plans_df[cols_price_eta_prod].rank(axis=1)
    plans_df[[ c +'_rank' for c in cols_distance_eta_prod]] = plans_df[cols_distance_eta_prod].rank(axis=1)
    plans_df[[ c +'_rank' for c in cols_price_distance_eta_prod]] = plans_df[cols_price_distance_eta_prod].rank(axis=1)

    plans_df[[ c +'_rank' for c in cols_distance_ratio_0]] = plans_df[cols_distance_ratio_0].rank(axis=1)
    plans_df[[ c +'_rank' for c in cols_price_ratio_0]] = plans_df[cols_price_ratio_0].rank(axis=1)
    plans_df[[ c +'_rank' for c in cols_eta_ratio_0]] = plans_df[cols_eta_ratio_0].rank(axis=1)

    plans_df[[ c +'_rank' for c in cols_price_distance_prod_ratio_0]] = plans_df[cols_price_distance_prod_ratio_0].rank(axis=1)
    plans_df[[ c +'_rank' for c in cols_price_eta_prod_ratio_0]] = plans_df[cols_price_eta_prod_ratio_0].rank(axis=1)
    plans_df[[ c +'_rank' for c in cols_distance_eta_prod_ratio_0]] = plans_df[cols_distance_eta_prod_ratio_0].rank(axis=1)
    plans_df[[ c +'_rank' for c in cols_price_distance_eta_prod_ratio_0]] = plans_df[cols_price_distance_eta_prod_ratio_0].rank(axis=1)

    # min-max plan (categorical) for ratio features
    def to_mode_col(x):
        # idxmax/idxmin return a column name such as 'plan_0_price_distance_ratio';
        # keep the 'plan_N' prefix and point at that slot's transport mode column
        # ('plan_0_transport_mode'); all-NaN rows yield NaN and stay NaN
        return x[:6] + '_transport_mode' if isinstance(x, str) else np.nan

    for name, cols in cols_ratio_groups.items():
        plans_df[name + '_max_plan'] = plans_df[cols].idxmax(axis=1).apply(to_mode_col)
        plans_df[name + '_min_plan'] = plans_df[cols].idxmin(axis=1).apply(to_mode_col)

    # map plans
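    # collect every *_max_plan / *_min_plan pointer column built above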
    cols_ratio_plan = [name + suffix for name in cols_ratio_groups
                       for suffix in ('_max_plan', '_min_plan')]

    # replace each pointer (e.g. 'plan_0_transport_mode') with the chosen
    # slot's actual transport mode value; use .loc to avoid chained assignment
    for p in tqdm(cols_ratio_plan):
        for c in cols_transport_mode:
            mask = plans_df[p] == c
            plans_df.loc[mask, p] = plans_df.loc[mask, c]

    # count features
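    # frequency encoding: count how often each selected transport mode occurs
    # across all sessions and attach that count as a feature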
    for p in cols_ratio_plan:
        plans_df[p + '_count'] = plans_df[p].map(plans_df[p].value_counts())

    # save as pkl
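    # to_pickles presumably splits the dataframe into split_size chunks so that
    # no single pickle file gets too large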
    to_pickles(plans_df, '../features/plans', split_size=5)

    line_notify('{} finished.'.format(sys.argv[0]))