Code example #1

import gc
import sys

import numpy as np
import pandas as pd

# read_pickles, loadpkl, make_lags, reduce_mem_usage, to_feature, to_json and
# line_notify are assumed to come from the project's shared utility module

def main():
    # load pkls
    df = read_pickles('../feats/sales_diff')
    df_calendar = loadpkl('../feats/calendar.pkl')
    df_sell_prices = loadpkl('../feats/sell_prices.pkl')

    # merge
    df = df.merge(df_calendar, on='d',how='left')
    df = df.merge(df_sell_prices, on=['store_id','item_id','wm_yr_wk'],how='left')

    del df_calendar, df_sell_prices
    gc.collect()

    # drop pre-release rows
    df = df[df['wm_yr_wk']>=df['release']]

    # make lag features
    df = make_lags(df,28)

    # label encoding
    cols_string = ['item_id','dept_id','cat_id','store_id','state_id']
    for c in cols_string:
        df[c], _ = pd.factorize(df[c])
        df[c] = df[c].replace(-1, np.nan)  # pd.factorize encodes NaN as -1; restore it

    # add price features
    df_grouped = df[['id','sell_price']].groupby('id')['sell_price']
    df['shift_price_t1'] = df_grouped.transform(lambda x: x.shift(1))
    df['price_change_t1'] = (df['shift_price_t1'] - df['sell_price']) / (df['shift_price_t1'])
    df['rolling_price_max_t365'] = df_grouped.transform(lambda x: x.shift(1).rolling(365).max())
    df['price_change_t365'] = (df['rolling_price_max_t365'] - df['sell_price']) / (df['rolling_price_max_t365'])
    df['rolling_price_std_t7'] = df_grouped.transform(lambda x: x.rolling(7).std())
    df['rolling_price_std_t30'] = df_grouped.transform(lambda x: x.rolling(30).std())

    # features release date
    df['release'] = df['release'] - df['release'].min()

    # price momentum by month & year
    df['price_momentum_m'] = df['sell_price']/df.groupby(['store_id','item_id','month'])['sell_price'].transform('mean')
    df['price_momentum_y'] = df['sell_price']/df.groupby(['store_id','item_id','year'])['sell_price'].transform('mean')

    # numeric day index (e.g. 'd_1913' -> 1913) for CustomTimeSeriesSplitter
    df['d_numeric'] = df['d'].apply(lambda x: str(x)[2:]).astype(int)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save as feather
    to_feature(df, '../feats/f105')

    # save feature name list
    features_json = {'features':df.columns.tolist()}
    to_json(features_json,'../configs/105_all_features_diff.json')

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
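
The make_lags() and reduce_mem_usage() helpers called above are project utilities not shown in this snippet. Below is a minimal sketch of what helpers of that shape could look like, assuming the target column is named 'demand' (as in code example #2); the grouping key, window sizes, and generated column names are illustrative assumptions, not the author's implementation.

import pandas as pd

def make_lags(df: pd.DataFrame, days: int = 28) -> pd.DataFrame:
    # hypothetical sketch: shift the target per item id so that only
    # information available `days` steps in the past enters the features
    grouped = df.groupby('id')['demand']
    df[f'lag_{days}'] = grouped.transform(lambda x: x.shift(days))
    # rolling statistics on the shifted series (window sizes are illustrative)
    for window in [7, 30]:
        df[f'rolling_mean_{days}_{window}'] = grouped.transform(
            lambda x: x.shift(days).rolling(window).mean())
        df[f'rolling_std_{days}_{window}'] = grouped.transform(
            lambda x: x.shift(days).rolling(window).std())
    return df

def reduce_mem_usage(df: pd.DataFrame) -> pd.DataFrame:
    # hypothetical sketch: downcast numeric columns to the smallest dtype
    # that still holds their value range
    for col in df.select_dtypes(include='integer').columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in df.select_dtypes(include='float').columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df
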
Code example #2

import gc
from glob import glob

import lightgbm as lgb
import pandas as pd
from tqdm import tqdm

# configs, FEATS_EXCLUDED, COLS_TEST1, COLS_TEST2, submission_file_name and the
# make_lags / submit helpers are assumed to be defined in the project's shared
# configuration and utility modules

def main():
    # load feathers
    files = sorted(glob('../feats/*.feather'))
    df = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)],
                   axis=1)
    df = df[configs['features']]
    feats = [f for f in df.columns if f not in FEATS_EXCLUDED]

    # load model
    reg = lgb.Booster(model_file='../output/lgbm_all_data.txt')

    # Recursive prediction
    print('Recursive prediction...')
    for day in tqdm(range(1914, 1914 + 28)):
        # keep the trailing 28 days so make_lags() can rebuild the lag features
        mask_test = (df['d_numeric'] >= day - 28) & (df['d_numeric'] <= day)
        tmp_df = df[mask_test].copy()
        tmp_df = make_lags(tmp_df)
        df.loc[df['d_numeric'] == day, 'demand'] = reg.predict(
            tmp_df[tmp_df['d_numeric'] == day][feats],
            num_iteration=reg.best_iteration)

        del tmp_df
        gc.collect()

    # split test
    test_df = df[df['date'] >= '2016-04-25']

    del df
    gc.collect()

    # reshape prediction for submit
    preds = test_df[['id', 'd', 'demand']].reset_index()
    preds = preds.pivot(index='id', columns='d', values='demand').reset_index()

    # split test1 / test2
    preds1 = preds[['id'] + COLS_TEST1].copy()
    preds2 = preds[['id'] + COLS_TEST2].copy()

    # change column names
    preds1.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
    preds2.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]

    # replace test2 id
    preds2['id'] = preds2['id'].str.replace('_validation', '_evaluation')

    # merge
    preds = pd.concat([preds1, preds2], axis=0)

    # save csv
    preds.to_csv(submission_file_name, index=False)

    # submission by API
    submit(submission_file_name, comment='model301 recursive prediction')
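
The submit() helper at the end is likewise project-specific. A minimal sketch, assuming it simply shells out to the official Kaggle CLI; the competition slug 'm5-forecasting-accuracy' and the use of subprocess are assumptions, not the author's implementation.

import subprocess

def submit(file_name: str, comment: str = '') -> None:
    # hypothetical sketch: submit the CSV via the Kaggle CLI
    # (requires the `kaggle` package to be installed and authenticated)
    subprocess.run(
        ['kaggle', 'competitions', 'submit',
         '-c', 'm5-forecasting-accuracy',
         '-f', file_name,
         '-m', comment],
        check=True)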