import gc
import json
import sys

from glob import glob

import lightgbm as lgb
import numpy as np
import pandas as pd
from tqdm import tqdm

# project-local helpers; the module name `utils` is an assumption
from utils import (read_pickles, loadpkl, make_lags, reduce_mem_usage,
                   to_feature, to_json, line_notify, submit,
                   FEATS_EXCLUDED, COLS_TEST1, COLS_TEST2)


# ===== feature engineering script (saves ../feats/f105) =====
def main():
    # load pkls
    df = read_pickles('../feats/sales_diff')
    df_calendar = loadpkl('../feats/calendar.pkl')
    df_sell_prices = loadpkl('../feats/sell_prices.pkl')

    # merge calendar and price tables
    df = df.merge(df_calendar, on='d', how='left')
    df = df.merge(df_sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

    del df_calendar, df_sell_prices
    gc.collect()

    # drop pre-release rows
    df = df[df['wm_yr_wk'] >= df['release']]

    # make lag features
    df = make_lags(df, 28)

    # label encoding
    cols_string = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    for c in cols_string:
        df[c], _ = pd.factorize(df[c])
        df[c].replace(-1, np.nan, inplace=True)

    # add price features
    df_grouped = df[['id', 'sell_price']].groupby('id')['sell_price']
    df['shift_price_t1'] = df_grouped.transform(lambda x: x.shift(1))
    df['price_change_t1'] = (df['shift_price_t1'] - df['sell_price']) / df['shift_price_t1']
    df['rolling_price_max_t365'] = df_grouped.transform(lambda x: x.shift(1).rolling(365).max())
    df['price_change_t365'] = (df['rolling_price_max_t365'] - df['sell_price']) / df['rolling_price_max_t365']
    df['rolling_price_std_t7'] = df_grouped.transform(lambda x: x.rolling(7).std())
    df['rolling_price_std_t30'] = df_grouped.transform(lambda x: x.rolling(30).std())

    # release date feature, rescaled so the earliest release week is 0
    df['release'] = df['release'] - df['release'].min()

    # price momentum by month & year
    df['price_momentum_m'] = df['sell_price'] / df.groupby(['store_id', 'item_id', 'month'])['sell_price'].transform('mean')
    df['price_momentum_y'] = df['sell_price'] / df.groupby(['store_id', 'item_id', 'year'])['sell_price'].transform('mean')

    # numeric day index for CustomTimeSeriesSplitter ('d_1914' -> 1914)
    df['d_numeric'] = df['d'].apply(lambda x: str(x)[2:]).astype(int)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save as feather
    to_feature(df, '../feats/f105')

    # save feature name list
    features_json = {'features': df.columns.tolist()}
    to_json(features_json, '../configs/105_all_features_diff.json')

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
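# make_lags() above comes from the project's helper module and is not shown
# in this file. The version below is a minimal illustrative sketch, NOT the
# real implementation: it assumes `demand` is the target column and that
# each series is keyed by `id`, and shifts demand by `days` so that
# predicting day t only uses demand observed up to day t - days.
def make_lags(df, days):
    grouped = df.groupby('id')['demand']
    # lagged demand: the feature that makes recursive prediction possible
    df['demand_lag_{}'.format(days)] = grouped.shift(days)
    # a rolling mean over the lagged demand (the window of 7 is illustrative)
    df['demand_rolling_mean_t7'] = grouped.transform(
        lambda x: x.shift(days).rolling(7).mean())
    return df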
# ===== recursive prediction & submission script (model301) =====

# module-level settings; the exact paths here are assumptions
configs = json.load(open('../configs/105_all_features_diff.json'))
submission_file_name = '../output/submission_model301.csv'  # hypothetical path


def main():
    # load feathers and concatenate the feature blocks column-wise
    files = sorted(glob('../feats/*.feather'))
    df = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)], axis=1)

    # keep the saved feature list and drop excluded columns
    df = df[configs['features']]
    feats = [f for f in df.columns if f not in FEATS_EXCLUDED]

    # load model
    reg = lgb.Booster(model_file='../output/lgbm_all_data.txt')

    # recursive prediction: predict one day at a time and write each
    # prediction back into df so the next day's lag features can see it
    print('Recursive prediction...')
    for day in tqdm(range(1914, 1914 + 28)):
        mask_test = (df['d_numeric'] >= day - 28) & (df['d_numeric'] <= day)
        tmp_df = df[mask_test]
        tmp_df = make_lags(tmp_df, 28)
        df.loc[df['d_numeric'] == day, 'demand'] = reg.predict(
            tmp_df[tmp_df['d_numeric'] == day][feats],
            num_iteration=reg.best_iteration)
        del tmp_df
        gc.collect()

    # keep only the test horizon
    test_df = df[df['date'] >= '2016-04-25']

    del df
    gc.collect()

    # reshape predictions for submission: one row per id, one column per day
    preds = test_df[['id', 'd', 'demand']].reset_index()
    preds = preds.pivot(index='id', columns='d', values='demand').reset_index()

    # split test1 / test2
    preds1 = preds[['id'] + COLS_TEST1]
    preds2 = preds[['id'] + COLS_TEST2]

    # rename day columns to the F1..F28 submission format
    preds1.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
    preds2.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]

    # replace test2 id
    preds2['id'] = preds2['id'].str.replace('_validation', '_evaluation')

    # merge (DataFrame.append is deprecated, so use pd.concat)
    preds = pd.concat([preds1, preds2], axis=0)

    # save csv
    preds.to_csv(submission_file_name, index=False)

    # submission by API
    submit(submission_file_name, comment='model301 recursive prediction')
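# submit() is likewise a project-local helper not shown here. A minimal
# sketch, assuming it wraps the official `kaggle` package; the competition
# slug below is an assumption and is not taken from this script.
from kaggle.api.kaggle_api_extended import KaggleApi

def submit(file_name, comment=''):
    # requires Kaggle credentials in ~/.kaggle/kaggle.json
    api = KaggleApi()
    api.authenticate()
    api.competition_submit(file_name, comment, 'm5-forecasting-accuracy')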