def main():
    # load submission files
    print('load files...')
    sub_weekday = pd.read_csv('../output/submission_lgbm_weekday.csv')
    sub_holiday = pd.read_csv('../output/submission_lgbm_holiday.csv')

    # load oof files
    oof_weekday = pd.read_csv('../output/oof_lgbm_cv_weekday.csv')
    oof_holiday = pd.read_csv('../output/oof_lgbm_cv_holiday.csv')

    # merge
    sub = sub_weekday.append(sub_holiday)
    oof = oof_weekday.append(oof_holiday)

    del sub_weekday, sub_holiday, oof_weekday, oof_holiday
    gc.collect()

    # to pivot
    print('to pivot...')
    sub = sub.pivot(index='id', columns='d', values='demand').reset_index()
    oof = oof.pivot(index='id', columns='d', values='demand').reset_index()

    # split test1 / test2
    sub1 = oof[['id'] + COLS_TEST1]
    sub2 = sub[['id'] + COLS_TEST2]

    # change column names
    sub1.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
    sub2.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]

    # replace test1 id
    sub1['id'] = sub1['id'].str.replace('_evaluation', '_validation')

    # merge
    sub = sub1.append(sub2)

    # postprocessing: clip negative predictions to zero
    cols_f = [f'F{i}' for i in range(1, 29)]
    cols_d = [c for c in oof.columns if 'd_' in c]
    sub.loc[:, cols_f] = sub[cols_f].where(sub[cols_f] > 0, 0)
    oof.loc[:, cols_d] = oof[cols_d].where(oof[cols_d] > 0, 0)

    # save csv
    sub.to_csv(submission_file_name, index=False)
    oof.to_csv(oof_file_name, index=False)

    # calc out of fold WRMSSE score
    print('calc oof cv scores...')
    scores = calc_score_cv(oof)
    score = np.mean(scores)
    print(f'scores: {scores}')

    # submission by API
    # submit(submission_file_name, comment='model410 cv: %.6f' % score)

    # LINE notify
    line_notify('{} done. WRMSSE:{}'.format(sys.argv[0], round(score, 6)))
def main():
    # load pkls
    df = read_pickles('../feats/sales_diff')
    df_calendar = loadpkl('../feats/calendar.pkl')
    df_sell_prices = loadpkl('../feats/sell_prices.pkl')

    # merge
    df = df.merge(df_calendar, on='d', how='left')
    df = df.merge(df_sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

    del df_calendar, df_sell_prices
    gc.collect()

    # drop pre-release rows
    df = df[df['wm_yr_wk'] >= df['release']]

    # make lag features
    df = make_lags(df, 28)

    # label encoding
    cols_string = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    for c in cols_string:
        df[c], _ = pd.factorize(df[c])
        df[c].replace(-1, np.nan, inplace=True)

    # add price features
    df_grouped = df[['id', 'sell_price']].groupby('id')['sell_price']
    df['shift_price_t1'] = df_grouped.transform(lambda x: x.shift(1))
    df['price_change_t1'] = (df['shift_price_t1'] - df['sell_price']) / df['shift_price_t1']
    df['rolling_price_max_t365'] = df_grouped.transform(lambda x: x.shift(1).rolling(365).max())
    df['price_change_t365'] = (df['rolling_price_max_t365'] - df['sell_price']) / df['rolling_price_max_t365']
    df['rolling_price_std_t7'] = df_grouped.transform(lambda x: x.rolling(7).std())
    df['rolling_price_std_t30'] = df_grouped.transform(lambda x: x.rolling(30).std())

    # days from release date
    df['release'] = df['release'] - df['release'].min()

    # price momentum by month & year
    df['price_momentum_m'] = df['sell_price'] / df.groupby(['store_id', 'item_id', 'month'])['sell_price'].transform('mean')
    df['price_momentum_y'] = df['sell_price'] / df.groupby(['store_id', 'item_id', 'year'])['sell_price'].transform('mean')

    # numeric day for CustomTimeSeriesSplitter
    df['d_numeric'] = df['d'].apply(lambda x: str(x)[2:]).astype(int)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save as feather
    to_feature(df, '../feats/f105')

    # save feature name list
    features_json = {'features': df.columns.tolist()}
    to_json(features_json, '../configs/105_all_features_diff.json')

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
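# NOTE: make_lags() is defined elsewhere in this repo. As a rough sketch of the
# assumed behavior (per-id shift and rolling-window demand features, offset by the
# 28-day prediction horizon so they are available at inference time), it might
# look like the following. The lag/window choices here are illustrative
# assumptions only, not the repo's actual implementation.
def make_lags_sketch(df, horizon=28):
    grouped = df.groupby('id')['demand']
    for lag in [0, 7, 14]:
        # lag features, always at least `horizon` days in the past
        df['demand_lag_{}'.format(horizon + lag)] = grouped.transform(
            lambda x: x.shift(horizon + lag))
    for window in [7, 30]:
        # rolling means computed on the horizon-shifted series
        df['demand_rolling_mean_{}'.format(window)] = grouped.transform(
            lambda x: x.shift(horizon).rolling(window).mean())
    return df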
def post_process(lp, img_path):
    if len(lp) == 0:
        msg = "Open Gate: " + AZURE + "\nLP : Not detected"
    else:
        msg = "Open Gate: " + AZURE + "\nLP : " + lp

    try:
        # use a context manager so the image file handle is closed after the request
        with open(img_path, 'rb') as img:
            r = requests.post(TF_SERVING, files={'media': img})
        if r.status_code == 404:
            line_notify(msg, img_path, False)
        else:
            with open('lp.jpg', 'wb') as f:
                f.write(r.content)
            line_notify(msg, 'lp.jpg', False)
    except requests.exceptions.RequestException as e:
        print("Error sending request:", e)
def main(is_eval=False):
    # load csv
    df = pd.read_csv('../input/sell_prices.csv')

    # release week (ref: https://www.kaggle.com/kyakovlev/m5-simple-fe)
    release_df = df.groupby(['store_id', 'item_id'])['wm_yr_wk'].agg(['min']).reset_index()
    release_df.columns = ['store_id', 'item_id', 'release']

    # merge release week
    df = df.merge(release_df, on=['store_id', 'item_id'], how='left')

    # days from release
    df['days_from_release'] = df['wm_yr_wk'] - df['release']

    # basic aggregations
    df['price_max'] = df.groupby(['store_id', 'item_id'])['sell_price'].transform('max')
    df['price_min'] = df.groupby(['store_id', 'item_id'])['sell_price'].transform('min')
    df['price_std'] = df.groupby(['store_id', 'item_id'])['sell_price'].transform('std')
    df['price_mean'] = df.groupby(['store_id', 'item_id'])['sell_price'].transform('mean')

    # normalized price
    df['price_norm'] = df['sell_price'] / df['price_max']

    # nunique features
    df['price_nunique'] = df.groupby(['store_id', 'item_id'])['sell_price'].transform('nunique')
    df['item_nunique'] = df.groupby(['store_id', 'sell_price'])['item_id'].transform('nunique')

    # momentum
    df['price_momentum'] = df['sell_price'] / df.groupby(['store_id', 'item_id'])['sell_price'].transform(lambda x: x.shift(1))

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save pkl
    save2pkl('../feats/sell_prices.pkl', df)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
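# NOTE: reduce_mem_usage() is a helper defined elsewhere in this repo; it is the
# widely used Kaggle utility that downcasts numeric columns to the smallest dtype
# that holds their value range. A condensed sketch (an assumption, not the repo's
# exact code) could be:
def reduce_mem_usage_sketch(df):
    for col in df.columns:
        col_type = df[col].dtype
        if np.issubdtype(col_type, np.integer):
            # downcast e.g. int64 -> int8/int16/int32 where the values allow it
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif np.issubdtype(col_type, np.floating):
            # downcast e.g. float64 -> float32
            df[col] = pd.to_numeric(df[col], downcast='float')
    return df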
def main():
    # load submission files
    sub = pd.read_csv("../input/sample_submit.tsv", sep='\t', header=None)
    sub_lgbm = pd.read_csv("../output/submission_lgbm.tsv", sep='\t', header=None)
    sub_xgb = pd.read_csv("../output/submission_xgb.tsv", sep='\t', header=None)

    # rename columns
    sub.columns = ['index', 'visitors']
    sub_lgbm.columns = ['index', 'visitors']
    sub_xgb.columns = ['index', 'visitors']

    # blend predictions
    sub.loc[:, 'visitors'] = 0.5 * sub_lgbm['visitors'] + 0.5 * sub_xgb['visitors']

    del sub_lgbm, sub_xgb
    gc.collect()

    # load out-of-fold predictions
    oof_lgbm = pd.read_csv("../output/oof_lgbm.csv")
    oof_xgb = pd.read_csv("../output/oof_xgb.csv")
    oof_preds = 0.5 * oof_lgbm['OOF_PRED'] + 0.5 * oof_xgb['OOF_PRED']

    # load train_df
    train_df = loadpkl('../output/train_df.pkl')
    train_df = train_df.sort_values('index')

    # calc local cv score
    local_mae = mean_absolute_error(train_df['visitors'], oof_preds)

    # LINE notify
    line_notify('Blend Local MAE score %.6f' % local_mae)

    del oof_lgbm, oof_xgb
    gc.collect()

    # save submit file
    sub[['index', 'visitors']].sort_values('index').to_csv(submission_file_name,
                                                           index=False, header=False, sep='\t')
def main(num_rows=None):
    # load csv
    profiles = pd.read_csv('../input/data_set_phase2/profiles.csv')

    # change columns name
    profiles.columns = ['pid'] + ['profile_{}'.format(i) for i in range(0, 66)]

    # feature engineering (sum/mean were swapped in the assignments below; fixed
    # so each column matches the statistic in its name)
    feats = [f for f in profiles.columns.to_list() if f not in ['pid']]
    profiles['profile_sum'] = profiles[feats].sum(axis=1)
    profiles['profile_mean'] = profiles[feats].mean(axis=1)
    profiles['profile_std'] = profiles[feats].std(axis=1)
    profiles['profile_sum_count'] = profiles['profile_sum'].map(profiles['profile_sum'].value_counts())

    # SVD features
    svd = TruncatedSVD(n_components=20, n_iter=20, random_state=326)
    svd_x = svd.fit_transform(profiles[feats].values)
    svd_x = pd.DataFrame(svd_x)
    svd_x.columns = ['profile_svd_{}'.format(i) for i in range(20)]
    svd_x['pid'] = profiles['pid']

    # merge
    profiles = profiles.merge(svd_x, on='pid', how='left')

    # NMF features
    nmf = NMF(n_components=20, init='random', random_state=326)
    nmf_x = nmf.fit_transform(profiles[feats].values)
    nmf_x = pd.DataFrame(nmf_x)
    nmf_x.columns = ['profile_nmf_{}'.format(i) for i in range(20)]
    nmf_x['pid'] = profiles['pid']

    # merge
    profiles = profiles.merge(nmf_x, on='pid', how='left')

    # k-means clustering
    kmeans_model = KMeans(n_clusters=10, random_state=326)
    kmeans_model.fit(profiles[feats].values)
    profiles['profile_k_means'] = kmeans_model.labels_

    # save as pkl
    save2pkl('../features/profiles.pkl', profiles)

    line_notify('{} finished.'.format(sys.argv[0]))
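# NOTE (assumption): NMF in main() above requires a non-negative input matrix, so
# the profile_* columns are assumed to be non-negative (they appear to be binary
# attribute flags); otherwise NMF.fit_transform would raise an error.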
def main():
    # reg for bayesian optimization
    reg_bo = BayesianOptimization(
        xgb_eval, {
            'gamma': (0, 1),
            'max_depth': (3, 8),
            'min_child_weight': (0, 45),
            'subsample': (0.001, 1),
            'colsample_bytree': (0.001, 1),
            'colsample_bylevel': (0.001, 1),
            'alpha': (9, 20),
            '_lambda': (0, 10)
        })

    reg_bo.maximize(init_points=15, n_iter=25)

    res = pd.DataFrame(reg_bo.res['max']['max_params'], index=['max_params'])
    res.to_csv('../output/max_params_xgb.csv')

    line_notify('Bayes Opt XGBoost finished.')
def main():
    # clf for bayesian optimization
    clf_bo = BayesianOptimization(
        lgbm_eval, {
            'num_leaves': (16, 64),
            'colsample_bytree': (0.001, 1),
            'subsample': (0.001, 1),
            'max_depth': (8, 16),
            'reg_alpha': (0, 10),
            'reg_lambda': (0, 10),
            'min_split_gain': (0, 1),
            'min_child_weight': (0, 45),
            'min_data_in_leaf': (0, 500),
        })

    clf_bo.maximize(init_points=15, n_iter=25)

    res = pd.DataFrame(clf_bo.res['max']['max_params'], index=['max_params'])
    res.to_csv('../output/max_params_lgbm.csv')

    line_notify('Bayes Opt LightGBM finished.')
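# NOTE: clf_bo.res['max']['max_params'] above is the bayes_opt < 1.0 API, where
# .res was a dict keyed by 'max'. In bayes_opt >= 1.0, .res is a list of trial
# dicts and the best result is exposed via .max, so an equivalent extraction
# (a sketch, assuming a recent bayes_opt) would be:
#
#   best = clf_bo.max  # {'target': ..., 'params': {...}}
#   res = pd.DataFrame(best['params'], index=['max_params'])
#   res.to_csv('../output/max_params_lgbm.csv')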
def main():
    # load submission file
    print('load files...')
    sub = pd.read_csv(submission_file_name)

    # load out of fold file
    oof = pd.read_csv(oof_file_name)

    # to pivot
    print('to pivot...')
    oof = oof.pivot(index='id', columns='d', values='demand').reset_index()

    # fill na
    oof.fillna(0, inplace=True)

    # postprocessing: clip negative predictions to zero
    cols_f = [f'F{i}' for i in range(1, 29)]
    cols_d = [c for c in oof.columns if 'd_' in c]
    sub.loc[:, cols_f] = sub[cols_f].where(sub[cols_f] > 0, 0)
    oof.loc[:, cols_d] = oof[cols_d].where(oof[cols_d] > 0, 0)

    # save csv
    sub.to_csv(submission_file_name, index=False)
    oof.to_csv(oof_file_name_pivot, index=False)

    # calc out of fold WRMSSE score
    print('calc oof cv scores...')
    scores = calc_score_cv(oof)
    score = np.mean(scores)
    print(f'scores: {scores}')

    # submission by API
    # submit(submission_file_name, comment='model401 cv: %.6f' % score)

    # LINE notify
    line_notify('{} done. WRMSSE:{}'.format(sys.argv[0], round(score, 6)))
def main(num_rows=None):
    # load csv
    train_queries = pd.read_csv('../input/data_set_phase2/train_queries_phase2.csv', nrows=num_rows)
    test_queries = pd.read_csv('../input/data_set_phase2/test_queries.csv', nrows=num_rows)
    train_clicks = pd.read_csv('../input/data_set_phase2/train_clicks_phase2.csv')

    # phase 1 csv
    train_queries1 = pd.read_csv('../input/data_set_phase2/train_queries_phase1.csv')
    train_clicks1 = pd.read_csv('../input/data_set_phase2/train_clicks_phase1.csv')

    # merge click
    train_queries = pd.merge(train_queries, train_clicks[['sid', 'click_mode']], on='sid', how='left')
    train_queries1 = pd.merge(train_queries1, train_clicks1[['sid', 'click_mode']], on='sid', how='left')

    # merge phase 1 data
    train_queries = train_queries1.append(train_queries)

    # fill na (no click)
    train_queries['click_mode'].fillna(0, inplace=True)

    # set test target as nan
    test_queries['click_mode'] = np.nan

    # merge train & test
    queries_df = train_queries.append(test_queries)

    del train_queries, test_queries, train_queries1, train_clicks, train_clicks1
    gc.collect()

    # to datetime
    queries_df['req_time'] = pd.to_datetime(queries_df['req_time'])

    # coordinate features (o = origin "x,y", d = destination "x,y")
    queries_df['x_o'] = queries_df['o'].apply(lambda x: x.split(',')[0]).astype(float)
    queries_df['y_o'] = queries_df['o'].apply(lambda x: x.split(',')[1]).astype(float)
    queries_df['x_d'] = queries_df['d'].apply(lambda x: x.split(',')[0]).astype(float)
    queries_df['y_d'] = queries_df['d'].apply(lambda x: x.split(',')[1]).astype(float)

    # count features
    queries_df['queries_o_count'] = queries_df['o'].map(queries_df['o'].value_counts())
    queries_df['queries_d_count'] = queries_df['d'].map(queries_df['d'].value_counts())
    queries_df['queries_x_o_count'] = queries_df['x_o'].map(queries_df['x_o'].value_counts())
    queries_df['queries_y_o_count'] = queries_df['y_o'].map(queries_df['y_o'].value_counts())
    queries_df['queries_x_d_count'] = queries_df['x_d'].map(queries_df['x_d'].value_counts())
    queries_df['queries_y_d_count'] = queries_df['y_d'].map(queries_df['y_d'].value_counts())

    # distance features
    queries_df['queries_distance'] = np.sqrt((queries_df['x_o'] - queries_df['x_d'])**2 +
                                             (queries_df['y_o'] - queries_df['y_d'])**2)
    queries_df['o_d'] = queries_df['o'].astype(str) + '_' + queries_df['d'].astype(str)
    queries_df['queries_o_d_count'] = queries_df['o_d'].map(queries_df['o_d'].value_counts())

    # datetime features
    queries_df['queries_weekday'] = queries_df['req_time'].dt.weekday
    queries_df['queries_hour'] = queries_df['req_time'].dt.hour
    queries_df['queries_is_holiday'] = queries_df['req_time'].apply(lambda x: is_holiday(x)).astype(int)
    queries_df['queries_weekday_count'] = queries_df['queries_weekday'].map(queries_df['queries_weekday'].value_counts())
    queries_df['queries_hour_count'] = queries_df['queries_hour'].map(queries_df['queries_hour'].value_counts())

    # coordinate & datetime features
    queries_df['o_d_is_holiday'] = queries_df['queries_is_holiday'].astype(str) + '_' + queries_df['o_d']
    queries_df['o_d_weekday'] = queries_df['queries_weekday'].astype(str) + '_' + queries_df['o_d']
    queries_df['o_d_hour'] = queries_df['queries_hour'].astype(str) + '_' + queries_df['o_d']
    queries_df['o_is_holiday'] = queries_df['queries_is_holiday'].astype(str) + '_' + queries_df['o']
    queries_df['o_weekday'] = queries_df['queries_weekday'].astype(str) + '_' + queries_df['o']
    queries_df['o_hour'] = queries_df['queries_hour'].astype(str) + '_' + queries_df['o']
    queries_df['d_is_holiday'] = queries_df['queries_is_holiday'].astype(str) + '_' + queries_df['d']
    queries_df['d_weekday'] = queries_df['queries_weekday'].astype(str) + '_' + queries_df['d']
    queries_df['d_hour'] = queries_df['queries_hour'].astype(str) + '_' + queries_df['d']

    queries_df['queries_o_d_is_holiday_count'] = queries_df['o_d_is_holiday'].map(queries_df['o_d_is_holiday'].value_counts())
    queries_df['queries_o_d_weekday_count'] = queries_df['o_d_weekday'].map(queries_df['o_d_weekday'].value_counts())
    queries_df['queries_o_d_hour_count'] = queries_df['o_d_hour'].map(queries_df['o_d_hour'].value_counts())
    queries_df['queries_o_is_holiday_count'] = queries_df['o_is_holiday'].map(queries_df['o_is_holiday'].value_counts())
    queries_df['queries_o_weekday_count'] = queries_df['o_weekday'].map(queries_df['o_weekday'].value_counts())
    queries_df['queries_o_hour_count'] = queries_df['o_hour'].map(queries_df['o_hour'].value_counts())
    queries_df['queries_d_is_holiday_count'] = queries_df['d_is_holiday'].map(queries_df['d_is_holiday'].value_counts())
    queries_df['queries_d_weekday_count'] = queries_df['d_weekday'].map(queries_df['d_weekday'].value_counts())
    queries_df['queries_d_hour_count'] = queries_df['d_hour'].map(queries_df['d_hour'].value_counts())

    # rounded value features
    queries_df['x_o_round'] = queries_df['x_o'].round(1)
    queries_df['y_o_round'] = queries_df['y_o'].round(1)
    queries_df['x_d_round'] = queries_df['x_d'].round(1)
    queries_df['y_d_round'] = queries_df['y_d'].round(1)
    queries_df['queries_distance_round'] = queries_df['queries_distance'].round(1)
    queries_df['o_round'] = queries_df['x_o_round'].astype(str) + '_' + queries_df['y_o_round'].astype(str)
    queries_df['d_round'] = queries_df['x_d_round'].astype(str) + '_' + queries_df['y_d_round'].astype(str)
    queries_df['o_d_round'] = queries_df['o_round'].astype(str) + '_' + queries_df['d_round'].astype(str)
    queries_df['queries_x_o_round_count'] = queries_df['x_o_round'].map(queries_df['x_o_round'].value_counts())
    queries_df['queries_y_o_round_count'] = queries_df['y_o_round'].map(queries_df['y_o_round'].value_counts())
    queries_df['queries_x_d_round_count'] = queries_df['x_d_round'].map(queries_df['x_d_round'].value_counts())
    queries_df['queries_y_d_round_count'] = queries_df['y_d_round'].map(queries_df['y_d_round'].value_counts())
    queries_df['queries_distance_round_count'] = queries_df['queries_distance_round'].map(queries_df['queries_distance_round'].value_counts())
    queries_df['queries_o_round_count'] = queries_df['o_round'].map(queries_df['o_round'].value_counts())
    queries_df['queries_d_round_count'] = queries_df['d_round'].map(queries_df['d_round'].value_counts())
    queries_df['queries_o_d_round_count'] = queries_df['o_d_round'].map(queries_df['o_d_round'].value_counts())

    # factorize
    queries_df['x_o_round'], _ = pd.factorize(queries_df['x_o_round'])
    queries_df['y_o_round'], _ = pd.factorize(queries_df['y_o_round'])
    queries_df['x_d_round'], _ = pd.factorize(queries_df['x_d_round'])
    queries_df['y_d_round'], _ = pd.factorize(queries_df['y_d_round'])
    queries_df['queries_distance_round'], _ = pd.factorize(queries_df['queries_distance_round'])

    # target encoding
    cols_encoding = ['x_o_round', 'y_o_round', 'x_d_round', 'y_d_round', 'queries_distance_round']
    queries_df = targetEncodingMultiClass(queries_df, 'click_mode', cols_encoding)

    # drop string features
    cols_drop = ['o', 'd', 'o_d', 'o_d_is_holiday', 'o_d_weekday', 'o_d_hour',
                 'o_is_holiday', 'o_weekday', 'o_hour', 'd_is_holiday', 'd_weekday',
                 'd_hour', 'o_round', 'd_round', 'o_d_round']
    queries_df.drop(cols_drop, axis=1, inplace=True)

    # reduce memory usage
    queries_df = reduce_mem_usage(queries_df)

    # save as pkl
    save2pkl('../features/queries.pkl', queries_df)

    # save configs
    configs = json.load(open('../configs/101_lgbm_queries.json'))
    configs['features'] = queries_df.columns.to_list()
    to_json(configs, '../configs/101_lgbm_queries.json')

    line_notify('{} finished.'.format(sys.argv[0]))
def main(is_eval=False):
    # load csv
    if is_eval:
        df = pd.read_csv('../input/sales_train_evaluation.csv')
    else:
        df = pd.read_csv('../input/sales_train_validation.csv')

    sub = pd.read_csv('../input/sample_submission.csv')

    # split test data
    sub['is_test1'] = sub['id'].apply(lambda x: '_validation' in x)
    sub['is_test2'] = sub['id'].apply(lambda x: '_evaluation' in x)

    test1 = sub[sub['is_test1']]
    test2 = sub[sub['is_test2']]

    del sub
    gc.collect()

    # drop flags
    test1.drop(['is_test1', 'is_test2'], axis=1, inplace=True)
    test2.drop(['is_test1', 'is_test2'], axis=1, inplace=True)

    # change column name
    test1.columns = ['id'] + COLS_TEST1
    test2.columns = ['id'] + COLS_TEST2

    # change id
    test2['id'] = test2['id'].str.replace('_evaluation', '_validation')

    # merge
    if not is_eval:
        df = df.merge(test1, on='id', how='left')
    df = df.merge(test2, on='id', how='left')

    del test1, test2
    gc.collect()

    # reduce memory usage
    df = reduce_mem_usage(df)

    # melt sales data
    print('Melting sales data...')
    id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    df = pd.melt(df, id_vars=id_vars, var_name='d', value_name='demand')
    print('Melted sales train validation has {} rows and {} columns'.format(df.shape[0], df.shape[1]))

    # add numeric date
    df['d_numeric'] = df['d'].apply(lambda x: int(x[2:]))

    # drop old data (~2012/12/31)
    print('drop old data...')
    df = df[df['d_numeric'] >= 704]

    # drop christmas data (stores are closed, so demand is zero)
    print('drop christmas data...')
    df = df[df['d_numeric'] != 331]   # 2011-12-25
    df = df[df['d_numeric'] != 697]   # 2012-12-25
    df = df[df['d_numeric'] != 1062]  # 2013-12-25
    df = df[df['d_numeric'] != 1427]  # 2014-12-25
    df = df[df['d_numeric'] != 1792]  # 2015-12-25

    # add is zero flag
    df['is_zero'] = (df['demand'] == 0).astype(int)

    # save pkl
    to_pickles(df, '../feats/sales', split_size=3)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
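# A toy illustration of the melt step above (column names abbreviated):
#
#   wide:  id, item_id, ..., d_1, d_2, ..., d_1941   -> one row per series
#   long:  id, item_id, ..., d='d_1', demand=3       -> one row per series-day
#
# pd.melt keeps id_vars fixed and stacks every remaining column into (d, demand)
# pairs, which is the long format the downstream lag/rolling features operate on.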
        verbose_eval=100,
        seed=326,
    )

    gc.collect()

    return eval_dict['multi_logloss-mean'][-1]


if __name__ == '__main__':
    study = optuna.create_study()
    study.optimize(objective, n_trials=100)

    print('Number of finished trials: {}'.format(len(study.trials)))

    print('Best trial:')
    trial = study.best_trial

    print('  Value: {}'.format(trial.value))

    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    # save result
    hist_df = study.trials_dataframe()
    hist_df.to_csv("../output/optuna_result_lgbm_queries.csv")

    # save json
    CONFIGS['params'] = trial.params
    to_json(CONFIGS, '../configs/101_lgbm_queries.json')

    line_notify('{} finished. Value: {}'.format(sys.argv[0], trial.value))
        metrics=['rmse'],
        nfold=NUM_FOLDS,
        # folds=folds.split(TRAIN_DF[FEATS], TRAIN_DF['park_japanese_holiday']),
        num_boost_round=10000,  # set large because early stopping is enabled
        early_stopping_rounds=200,
        verbose_eval=100,
        seed=47)

    gc.collect()

    # xgb.cv returns a DataFrame, so take the last row positionally
    return clf['test-rmse-mean'].iloc[-1]


if __name__ == '__main__':
    study = optuna.create_study()
    study.optimize(objective, n_trials=30)

    print('Number of finished trials: {}'.format(len(study.trials)))

    print('Best trial:')
    trial = study.best_trial

    print('  Value: {}'.format(trial.value))

    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    # save result
    hist_df = study.trials_dataframe()
    hist_df.to_csv("../output/optuna_result_xgb.csv")

    line_notify('optuna XGBoost finished.')
def kfold_xgboost(train_df, test_df, num_folds, stratified=False, debug=False):
    print("Starting XGBoost. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=326)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros((train_df.shape[0], 12))
    sub_preds = np.zeros((test_df.shape[0], 12))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # dmatrix for test_df
    test_df_dmtrx = xgb.DMatrix(test_df[feats])

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['click_mode'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['click_mode'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['click_mode'].iloc[valid_idx]

        # set data structure
        xgb_train = xgb.DMatrix(train_x, label=train_y)
        xgb_test = xgb.DMatrix(valid_x, label=valid_y)

        # params
        params = {
            'device': 'gpu',  # GPU parameter
            'objective': 'multi:softmax',
            'booster': 'gbtree',
            'eval_metric': 'mlogloss',
            'num_class': 12,
            'eta': 0.05,
            'colsample_bytree': 0.3490457769968177,
            'subsample': 0.543646263362097,
            'max_depth': 11,
            'alpha': 4.762312990232561,
            'lambda': 9.98131082276387,
            'gamma': 0.19161156850826594,
            'min_child_weight': 15.042054927368088,
            'tree_method': 'gpu_hist',      # GPU parameter
            'predictor': 'gpu_predictor',   # GPU parameter
            'silent': 1,
            'seed': int(2**n_fold)
        }

        # train model
        clf = xgb.train(params,
                        xgb_train,
                        num_boost_round=10000,
                        evals=[(xgb_train, 'train'), (xgb_test, 'test')],
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        clf.save_model('../output/xgb_' + str(n_fold) + '.txt')

        # out-of-fold & test predictions (raw margins, one column per class)
        oof_preds[valid_idx] = clf.predict(xgb_test, output_margin=True)
        sub_preds += clf.predict(test_df_dmtrx, output_margin=True) / folds.n_splits

        # save feature importances
        fold_importance_df = pd.DataFrame.from_dict(clf.get_score(importance_type='gain'),
                                                    orient='index', columns=['importance'])
        fold_importance_df["feature"] = fold_importance_df.index.tolist()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        print('Fold %2d F1 Score : %.6f' % (n_fold + 1,
              f1_score(valid_y, np.argmax(oof_preds[valid_idx], axis=1), average='weighted')))

        del clf, train_x, train_y, valid_x, valid_y, xgb_train, xgb_test
        gc.collect()

    # Full F1 Score & LINE Notify
    full_f1 = f1_score(train_df['click_mode'], np.argmax(oof_preds, axis=1), average='weighted')
    print('Full F1 Score %.6f' % full_f1)
    line_notify('Full F1 Score %.6f' % full_f1)

    # display importances
    display_importances(feature_importance_df,
                        '../imp/xgb_importances.png',
                        '../imp/feature_importance_xgb.csv')

    if not debug:
        # save prediction for submit
        test_df['recommend_mode'] = np.argmax(sub_preds, axis=1)
        test_df = test_df.reset_index()

        # post processing: if only one plan exists, recommend that plan's mode
        test_df['recommend_mode'][(test_df['plan_num_plans'] == 1) &
                                  (test_df['recommend_mode'] != 0)] = \
            test_df['plan_0_transport_mode'][(test_df['plan_num_plans'] == 1) &
                                             (test_df['recommend_mode'] != 0)]

        test_df[['sid', 'recommend_mode']].to_csv(submission_file_name, index=False)

        # save out of fold prediction
        train_df.loc[:, 'recommend_mode'] = np.argmax(oof_preds, axis=1)
        train_df = train_df.reset_index()
        train_df[['sid', 'click_mode', 'recommend_mode']].to_csv(oof_file_name, index=False)

        # save prediction for submit
        sub_preds = pd.DataFrame(sub_preds)
        sub_preds.columns = ['pred_xgb_plans{}'.format(c) for c in sub_preds.columns]
        sub_preds['sid'] = test_df['sid']
        sub_preds['click_mode'] = test_df['click_mode']

        # save out of fold prediction
        oof_preds = pd.DataFrame(oof_preds)
        oof_preds.columns = ['pred_xgb_plans{}'.format(c) for c in oof_preds.columns]
        oof_preds['sid'] = train_df['sid']
        oof_preds['click_mode'] = train_df['click_mode']

        # merge
        df = oof_preds.append(sub_preds)

        # save as pkl
        save2pkl('../features/xgb_pred.pkl', df)

    line_notify('{} finished.'.format(sys.argv[0]))
def main():
    # load predictions
    pred_lgbm = loadpkl('../features/lgbm_pred.pkl')
    pred_xgb = loadpkl('../features/xgb_pred.pkl')
    plans = loadpkl('../features/plans.pkl')

    # define column name lists
    cols_pred_lgbm = ['pred_lgbm_plans{}'.format(i) for i in range(0, 12)]
    cols_pred_xgb = ['pred_xgb_plans{}'.format(i) for i in range(0, 12)]
    cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(0, 7)]

    # merge plans & pred
    pred = pred_lgbm[['sid', 'click_mode']]
    pred = pd.merge(pred, plans[cols_transport_mode + ['sid', 'plan_num_plans']], on='sid', how='left')

    del plans
    gc.collect()

    # scaling predictions
    pred_lgbm[cols_pred_lgbm] = scalingPredictions(pred_lgbm[cols_pred_lgbm])
    pred_xgb[cols_pred_xgb] = scalingPredictions(pred_xgb[cols_pred_xgb])

    # reset index
    pred_lgbm.reset_index(inplace=True, drop=True)
    pred_xgb.reset_index(inplace=True, drop=True)

    # zero out predictions for modes that do not appear in any plan
    for i in range(1, 12):
        tmp = np.zeros(len(pred))
        for c in cols_transport_mode:
            tmp += (pred[c] == i).astype(int)
        pred_lgbm['pred_lgbm_plans{}'.format(i)] = pred_lgbm['pred_lgbm_plans{}'.format(i)] * (tmp > 0)
        pred_xgb['pred_xgb_plans{}'.format(i)] = pred_xgb['pred_xgb_plans{}'.format(i)] * (tmp > 0)

    # get best weight for lgbm & xgboost
    oof_pred_lgbm = pred_lgbm[pred_lgbm['click_mode'].notnull()]
    oof_pred_xgb = pred_xgb[pred_xgb['click_mode'].notnull()]
    w = getBestWeights(oof_pred_lgbm.click_mode, oof_pred_lgbm, oof_pred_xgb, '../imp/weight.png')

    # calc prediction for each class
    cols_pred = []
    for i in range(0, 12):
        pred['pred_{}'.format(i)] = (w * pred_lgbm['pred_lgbm_plans{}'.format(i)] +
                                     (1.0 - w) * pred_xgb['pred_xgb_plans{}'.format(i)])
        cols_pred.append('pred_{}'.format(i))

    # get out of fold values
    oof_pred = pred[pred['click_mode'].notnull()]

    # get best multiples
    m4 = getBestMultiple(oof_pred, 'pred_4', cols_pred, '../imp/multiple4.png')
    pred['pred_4'] *= m4
    oof_pred['pred_4'] *= m4

    m0 = getBestMultiple(oof_pred, 'pred_0', cols_pred, '../imp/multiple0.png')
    pred['pred_0'] *= m0
    oof_pred['pred_0'] *= m0

    m3 = getBestMultiple(oof_pred, 'pred_3', cols_pred, '../imp/multiple3.png')
    pred['pred_3'] *= m3
    oof_pred['pred_3'] *= m3

    m6 = getBestMultiple(oof_pred, 'pred_6', cols_pred, '../imp/multiple6.png')
    pred['pred_6'] *= m6
    oof_pred['pred_6'] *= m6

    # get recommend mode
    pred['recommend_mode'] = np.argmax(pred[cols_pred].values, axis=1)

    # if number of plans == 1 and recommend mode != 0, set recommend mode to plan 0's mode
    pred['recommend_mode'][(pred['plan_num_plans'] == 1) &
                           (pred['recommend_mode'] != 0)] = \
        pred['plan_0_transport_mode'][(pred['plan_num_plans'] == 1) &
                                      (pred['recommend_mode'] != 0)]

    # split train & test
    sub_pred = pred[pred['click_mode'].isnull()]
    oof_pred = pred[pred['click_mode'].notnull()]

    # out of fold score
    oof_f1_score = f1_score(oof_pred['click_mode'], oof_pred['recommend_mode'], average='weighted')

    # save csv
    oof_pred[['sid', 'click_mode', 'recommend_mode']].to_csv(oof_file_name, index=False)
    sub_pred[['sid', 'recommend_mode']].to_csv(submission_file_name, index=False)

    # LINE notify
    line_notify('{} finished. f1 score: {}'.format(sys.argv[0], oof_f1_score))
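# NOTE: scalingPredictions() is defined elsewhere in this repo. Since the LightGBM
# model outputs class probabilities while the XGBoost model outputs raw margins,
# some common scale is needed before the weighted blend above. One plausible
# sketch is a row-wise softmax (an assumption about the helper, not its actual
# implementation):
def scaling_predictions_sketch(preds_df):
    x = preds_df.values
    x = x - x.max(axis=1, keepdims=True)  # subtract row max for numerical stability
    e = np.exp(x)
    return pd.DataFrame(e / e.sum(axis=1, keepdims=True),
                        columns=preds_df.columns, index=preds_df.index)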
def kfold_lightgbm(train_df, test_df, num_folds):
    print('Starting LightGBM. Train shape: {}'.format(train_df.shape))

    # Cross validation
    folds = CustomTimeSeriesSplitter(end_train=1941)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
    valid_idxs = []
    avg_best_iteration = 0  # average of best iteration

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df)):
        # split train/valid
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['demand'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['demand'].iloc[valid_idx]

        # save validation indexes
        valid_idxs += list(valid_idx)

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

        params = {
            # 'device': 'gpu',
            # 'gpu_use_dp': True,
            'boosting': 'gbdt',
            'metric': ['rmse'],
            'objective': 'tweedie',
            'learning_rate': 0.05,
            'tweedie_variance_power': 1.1,
            'subsample': 0.5,
            'subsample_freq': 1,
            'num_leaves': 2**8 - 1,
            'min_data_in_leaf': 2**8 - 1,
            'feature_fraction': 0.8,
            'verbose': -1,
            'seed': 326,
            'bagging_seed': 326,
            'drop_seed': 326,
            'num_threads': -1
        }

        # train model
        reg = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=10)

        # save model
        reg.save_model(f'../output/lgbm_holiday_{n_fold}.txt')

        # save predictions
        oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)

        # save best iteration
        avg_best_iteration += reg.best_iteration / folds.n_splits

        # save feature importances
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = feats
        fold_importance_df['importance'] = np.log1p(
            reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df['fold'] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))

        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # display importances
    display_importances(feature_importance_df,
                        '../imp/lgbm_importances_cv_holiday.png',
                        '../imp/feature_importance_lgbm_cv_holiday.csv')

    # Full RMSE score and LINE Notify (only over rows that were actually validated)
    full_rmse = rmse(train_df['demand'][valid_idxs], oof_preds[valid_idxs])
    line_notify('Full RMSE score %.6f' % full_rmse)

    # save out of fold prediction
    train_df.loc[:, 'demand'] = oof_preds
    train_df[['id', 'd', 'demand']].to_csv(oof_file_name, index=False)

    # save number of best iteration
    configs['num_boost_round'] = int(avg_best_iteration)
    configs['rmse'] = full_rmse
    to_json(configs, '../configs/310_train_holiday.json')

    # LINE notify
    line_notify('{} done. best iteration:{}'.format(sys.argv[0], int(avg_best_iteration)))
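# NOTE: CustomTimeSeriesSplitter is defined elsewhere in this repo. Given the
# end_train=1941 argument and the d_numeric column built for it in the feature
# scripts, it is assumed to yield expanding-window folds whose validation blocks
# are consecutive 28-day periods ending at end_train. A minimal sketch under those
# assumptions (fold boundaries here are illustrative, not the repo's actual ones):
class CustomTimeSeriesSplitterSketch:
    def __init__(self, end_train=1941, horizon=28, n_splits=3):
        self.end_train = end_train
        self.horizon = horizon
        self.n_splits = n_splits

    def split(self, df):
        d = df['d_numeric'].values
        for i in reversed(range(self.n_splits)):
            # validation block i counted back from end_train in horizon-sized steps
            valid_end = self.end_train - i * self.horizon
            valid_start = valid_end - self.horizon
            train_idx = np.where(d <= valid_start)[0]
            valid_idx = np.where((d > valid_start) & (d <= valid_end))[0]
            yield train_idx, valid_idx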
def kfold_lightgbm(train_df, test_df, num_folds, stratified=False, debug=False):
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=326)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros((train_df.shape[0], 12))
    sub_preds = np.zeros((test_df.shape[0], 12))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['click_mode'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['click_mode'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['click_mode'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, categorical_feature=CAT_COLS, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, categorical_feature=CAT_COLS, free_raw_data=False)

        # params
        params = {
            'device': 'gpu',
            'task': 'train',
            'boosting': 'gbdt',
            'objective': 'multiclass',
            'metric': 'multiclass',
            'learning_rate': 0.01,
            'num_class': 12,
            'num_leaves': 52,
            'colsample_bytree': 0.3490457769968177,
            'subsample': 0.543646263362097,
            'max_depth': 11,
            'reg_alpha': 4.762312990232561,
            'reg_lambda': 9.98131082276387,
            'min_split_gain': 0.19161156850826594,
            'min_child_weight': 15.042054927368088,
            'min_data_in_leaf': 17,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }

        clf = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        # feval=eval_f,
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        clf.save_model('../output/lgbm_3_{}.txt'.format(n_fold))

        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(
            clf.feature_importance(importance_type='gain', iteration=clf.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        print('Fold %2d F1 Score : %.6f' % (n_fold + 1,
              f1_score(valid_y, np.argmax(oof_preds[valid_idx], axis=1), average='weighted')))

        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Full F1 Score & LINE Notify
    full_f1 = f1_score(train_df['click_mode'], np.argmax(oof_preds, axis=1), average='weighted')
    print('Full F1 Score %.6f' % full_f1)
    line_notify('Full F1 Score %.6f' % full_f1)

    # display importances
    display_importances(feature_importance_df,
                        '../imp/lgbm_importances_3.png',
                        '../imp/feature_importance_lgbm_3.csv')

    if not debug:
        # save prediction for submit
        test_df['recommend_mode'] = np.argmax(sub_preds, axis=1)
        test_df = test_df.reset_index()

        # post processing: if only one plan exists, recommend that plan's mode
        test_df['recommend_mode'][(test_df['plan_num_plans'] == 1) &
                                  (test_df['recommend_mode'] != 0)] = \
            test_df['plan_0_transport_mode'][(test_df['plan_num_plans'] == 1) &
                                             (test_df['recommend_mode'] != 0)]

        # save csv
        test_df[['sid', 'recommend_mode']].to_csv(submission_file_name, index=False)

        # save out of fold prediction
        train_df.loc[:, 'recommend_mode'] = np.argmax(oof_preds, axis=1)
        train_df = train_df.reset_index()
        train_df[['sid', 'click_mode', 'recommend_mode']].to_csv(oof_file_name, index=False)

        # save prediction for submit
        sub_preds = pd.DataFrame(sub_preds)
        sub_preds.columns = ['pred_lgbm_plans{}'.format(c) for c in sub_preds.columns]
        sub_preds['sid'] = test_df['sid']
        sub_preds['click_mode'] = test_df['click_mode']

        # save out of fold prediction
        oof_preds = pd.DataFrame(oof_preds)
        oof_preds.columns = ['pred_lgbm_plans{}'.format(c) for c in oof_preds.columns]
        oof_preds['sid'] = train_df['sid']
        oof_preds['click_mode'] = train_df['click_mode']

        # merge
        df = oof_preds.append(sub_preds)

        # save as pkl
        save2pkl('../features/lgbm_pred_3.pkl', df)

    line_notify('{} finished.'.format(sys.argv[0]))
def kfold_lightgbm(train_df, test_df, num_folds):
    print('Starting LightGBM. Train shape: {}'.format(train_df.shape))

    # Cross validation
    folds = GroupKFold(n_splits=num_folds)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # group folds by (month, year)
    group = train_df['month'].astype(str) + '_' + train_df['year'].astype(str)

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], groups=group)):
        # split train/valid
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['demand'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['demand'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

        params = {
            # 'device': 'gpu',
            # 'gpu_use_dp': True,
            'boosting': 'gbdt',
            'metric': ['rmse'],
            'objective': 'tweedie',
            'learning_rate': 0.05,
            'tweedie_variance_power': 1.1,
            'subsample': 0.5,
            'subsample_freq': 1,
            'num_leaves': 2**8 - 1,
            'min_data_in_leaf': 2**8 - 1,
            'feature_fraction': 0.8,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold),
            'num_threads': -1
        }

        # train model
        reg = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model(f'../output/lgbm_group_k_fold_21days_{n_fold}.txt')

        # save predictions
        oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
        sub_preds += reg.predict(test_df[feats], num_iteration=reg.best_iteration) / folds.n_splits

        # save feature importances
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = feats
        fold_importance_df['importance'] = np.log1p(
            reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df['fold'] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))

        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # display importances
    display_importances(feature_importance_df,
                        '../imp/lgbm_importances_group_k_fold_21days.png',
                        '../imp/feature_importance_lgbm_group_k_fold_21days.csv')

    # Full RMSE score and LINE Notify
    full_rmse = rmse(train_df['demand'], oof_preds)
    line_notify('Full RMSE score %.6f' % full_rmse)

    # save out of fold prediction
    train_df.loc[:, 'demand'] = oof_preds
    train_df = train_df.reset_index()
    train_df[['id', 'd', 'demand']].to_csv(oof_file_name, index=False)

    # reshape prediction for submit
    test_df.loc[:, 'demand'] = sub_preds
    test_df = test_df.reset_index()
    preds = test_df[['id', 'd', 'demand']].reset_index()

    # save csv
    preds.to_csv(submission_file_name, index=False)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
def main():
    # load predictions
    pred_lgbm1 = loadpkl('../features/lgbm_pred_1.pkl')
    pred_lgbm2 = loadpkl('../features/lgbm_pred_2.pkl')
    pred_lgbm3 = loadpkl('../features/lgbm_pred_3.pkl')
    plans = read_pickles('../features/plans')

    preds = [pred_lgbm1, pred_lgbm2, pred_lgbm3]

    # define column name lists
    cols_pred_lgbm = ['pred_lgbm_plans{}'.format(i) for i in range(0, 12)]
    cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(0, 7)]

    # remove columns
    cols_drop = [c for c in plans.columns
                 if c not in cols_transport_mode + ['sid', 'plan_num_plans', 'click_mode']]
    plans.drop(cols_drop, axis=1, inplace=True)

    # postprocessing
    sub_preds = []
    oof_preds = []
    for i, pred_lgbm in enumerate(preds):
        # merge plans & pred
        pred = pred_lgbm[['sid', 'click_mode']]
        pred = pd.merge(pred, plans[cols_transport_mode + ['sid', 'plan_num_plans']],
                        on='sid', how='left')

        # scaling predictions
        pred_lgbm[cols_pred_lgbm] = scalingPredictions(pred_lgbm[cols_pred_lgbm])

        # reset index
        pred_lgbm.reset_index(inplace=True, drop=True)

        # zero out predictions for modes that do not appear in any plan
        for j in range(1, 12):
            tmp = np.zeros(len(pred))
            for c in cols_transport_mode:
                tmp += (pred[c] == j).astype(int)
            pred_lgbm['pred_lgbm_plans{}'.format(j)] = pred_lgbm['pred_lgbm_plans{}'.format(j)] * (tmp > 0)

        # calc prediction for each class
        cols_pred = []
        for j in range(0, 12):
            pred['pred_{}'.format(j)] = pred_lgbm['pred_lgbm_plans{}'.format(j)]
            cols_pred.append('pred_{}'.format(j))

        # get out of fold values
        oof_pred = pred[pred['click_mode'].notnull()]

        # get best multiples
        m0 = getBestMultiple(oof_pred, 'pred_0', cols_pred, '../imp/multiple0_{}.png'.format(i + 1))
        pred['pred_0'] *= m0
        oof_pred['pred_0'] *= m0

        m3 = getBestMultiple(oof_pred, 'pred_3', cols_pred, '../imp/multiple3_{}.png'.format(i + 1))
        pred['pred_3'] *= m3
        oof_pred['pred_3'] *= m3

        m4 = getBestMultiple(oof_pred, 'pred_4', cols_pred, '../imp/multiple4_{}.png'.format(i + 1))
        pred['pred_4'] *= m4
        oof_pred['pred_4'] *= m4

        # get recommend mode
        pred['recommend_mode'] = np.argmax(pred[cols_pred].values, axis=1)

        # if number of plans == 1 and recommend mode != 0, fill recommend mode with plan 0's mode
        pred['recommend_mode'][(pred['plan_num_plans'] == 1) &
                               (pred['recommend_mode'] != 0)] = \
            pred['plan_0_transport_mode'][(pred['plan_num_plans'] == 1) &
                                          (pred['recommend_mode'] != 0)]

        # split train & test
        _sub_pred = pred[pred['click_mode'].isnull()]
        _oof_pred = pred[pred['click_mode'].notnull()]

        sub_preds.append(_sub_pred)
        oof_preds.append(_oof_pred)

        del pred, _sub_pred, _oof_pred
        gc.collect()

    # merge preds
    sub_pred = sub_preds[0].append(sub_preds[1])
    sub_pred = sub_pred.append(sub_preds[2])
    sub_pred = pd.merge(plans[plans['click_mode'].isnull()][['sid', 'click_mode']],
                        sub_pred[['sid', 'recommend_mode']], on='sid', how='left')

    oof_pred = oof_preds[0].append(oof_preds[1])
    oof_pred = oof_pred.append(oof_preds[2])
    oof_pred = pd.merge(plans[plans['click_mode'].notnull()][['sid', 'click_mode']],
                        oof_pred[['sid', 'recommend_mode']], on='sid', how='left')

    del sub_preds, oof_preds, plans

    # out of fold score
    oof_f1_score = f1_score(oof_pred['click_mode'], oof_pred['recommend_mode'], average='weighted')

    # save csv
    oof_pred[['sid', 'click_mode', 'recommend_mode']].to_csv(oof_file_name, index=False)
    sub_pred[['sid', 'recommend_mode']].to_csv(submission_file_name, index=False)

    # LINE notify
    line_notify('{} finished. f1 score: {}'.format(sys.argv[0], oof_f1_score))
        folds=folds.split(TRAIN_DF[FEATS], TRAIN_DF['park_japanese_holiday']),
        num_boost_round=10000,  # set large because early stopping is enabled
        early_stopping_rounds=200,
        verbose_eval=100,
        seed=47,
    )

    gc.collect()

    return clf['rmse-mean'][-1]


if __name__ == '__main__':
    study = optuna.create_study()
    study.optimize(objective, n_trials=100)

    print('Number of finished trials: {}'.format(len(study.trials)))

    print('Best trial:')
    trial = study.best_trial

    print('  Value: {}'.format(trial.value))

    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    # save result
    hist_df = study.trials_dataframe()
    hist_df.to_csv("../output/optuna_result_lgbm.csv")

    line_notify('optuna LightGBM finished.')
def main(num_rows=None):
    # load pkls
    df = read_pickles('../features/plans')
    queries = loadpkl('../features/queries.pkl')
    profiles = loadpkl('../features/profiles.pkl')
    queries_pred = loadpkl('../features/queries_pred.pkl')
    queries_profiles_pred = loadpkl('../features/queries_profiles_pred.pkl')

    # merge
    df = pd.merge(df, queries, on=['sid', 'click_mode'], how='left')
    df = pd.merge(df, profiles, on='pid', how='left')
    df = pd.merge(df, queries_pred, on='sid', how='left')
    df = pd.merge(df, queries_profiles_pred, on='sid', how='left')

    del queries, profiles, queries_pred, queries_profiles_pred
    gc.collect()

    # reduce memory usage
    df = reduce_mem_usage(df)

    # count features
    df['pid_count'] = df['pid'].map(df['pid'].value_counts())

    # time diff
    df['plan_req_time_diff'] = (df['plan_time'] - df['req_time']).astype(int)

    # distance ratio
    cols_plan_distance = ['plan_{}_distance'.format(i) for i in range(0, 7)]
    for i, c in enumerate(cols_plan_distance):
        df['plan_queries_distance_ratio{}'.format(i)] = df[c] / df['queries_distance']
        df['plan_queries_distance_diff{}'.format(i)] = df[c] - df['queries_distance']

    # stats features for preds
    cols_pred_queries = ['pred_queries{}'.format(i) for i in range(0, 12)]
    cols_pred_queries_profiles = ['pred_queries_profiles{}'.format(i) for i in range(0, 12)]

    df['pred_queries_mean'] = df[cols_pred_queries].mean(axis=1)
    df['pred_queries_sum'] = df[cols_pred_queries].sum(axis=1)
    df['pred_queries_max'] = df[cols_pred_queries].max(axis=1)
    df['pred_queries_min'] = df[cols_pred_queries].min(axis=1)
    df['pred_queries_var'] = df[cols_pred_queries].var(axis=1)
    df['pred_queries_skew'] = df[cols_pred_queries].skew(axis=1)

    df['pred_queries_profiles_mean'] = df[cols_pred_queries_profiles].mean(axis=1)
    df['pred_queries_profiles_sum'] = df[cols_pred_queries_profiles].sum(axis=1)
    df['pred_queries_profiles_max'] = df[cols_pred_queries_profiles].max(axis=1)
    df['pred_queries_profiles_min'] = df[cols_pred_queries_profiles].min(axis=1)
    df['pred_queries_profiles_var'] = df[cols_pred_queries_profiles].var(axis=1)
    df['pred_queries_profiles_skew'] = df[cols_pred_queries_profiles].skew(axis=1)

    # stats features for each class
    print('stats features...')
    for i in tqdm(range(0, 12)):
        cols = ['pred_queries{}'.format(i), 'pred_queries_profiles{}'.format(i)]
        df['pred_mean{}'.format(i)] = df[cols].mean(axis=1)
        df['pred_sum{}'.format(i)] = df[cols].sum(axis=1)
        df['pred_max{}'.format(i)] = df[cols].max(axis=1)
        df['pred_min{}'.format(i)] = df[cols].min(axis=1)
        df['pred_var{}'.format(i)] = df[cols].var(axis=1)
        df['pred_skew{}'.format(i)] = df[cols].skew(axis=1)

        cols_target = [c for c in df.columns if '_target_{}'.format(i) in c]
        df['target_mean{}'.format(i)] = df[cols_target].mean(axis=1)
        df['target_sum{}'.format(i)] = df[cols_target].sum(axis=1)
        df['target_max{}'.format(i)] = df[cols_target].max(axis=1)
        df['target_min{}'.format(i)] = df[cols_target].min(axis=1)
        df['target_var{}'.format(i)] = df[cols_target].var(axis=1)
        df['target_skew{}'.format(i)] = df[cols_target].skew(axis=1)

    # post processing: zero out features for modes missing from every plan
    cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(0, 7)]
    print('post processing...')
    for i in tqdm(range(1, 12)):
        tmp = np.zeros(len(df))
        for c in cols_transport_mode:
            tmp += (df[c] == i).astype(int)
        cols_target = [c for c in df.columns if '_target_{}'.format(i) in c]
        for c in cols_target + ['pred_queries{}'.format(i), 'pred_queries_profiles{}'.format(i)]:
            df[c] = df[c] * (tmp > 0)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # split data by city
    df1 = df[df['y_o'] > 37.5]
    df2 = df[df['y_o'] < 27.5]
    df3 = df[df['x_o'] > 120.0]

    del df
    gc.collect()

    # cols for target encoding
    cols_target_encoding = [
        'plan_weekday', 'plan_hour', 'plan_is_holiday', 'plan_weekday_hour',
        'plan_is_holiday_hour', 'plan_num_plans', 'plan_num_free_plans',
        'x_o_round', 'y_o_round', 'x_d_round', 'y_d_round',
        'queries_distance_round'
    ]

    cols_ratio_plan = [
        'plan_price_distance_ratio_max_plan', 'plan_price_distance_ratio_min_plan',
        'plan_price_eta_ratio_max_plan', 'plan_price_eta_ratio_min_plan',
        'plan_distance_eta_ratio_max_plan', 'plan_distance_eta_ratio_min_plan',
        'plan_price_distance_prod_max_plan', 'plan_price_eta_prod_max_plan',
        'plan_price_distance_prod_min_plan', 'plan_price_eta_prod_min_plan',
        'plan_distance_eta_prod_max_plan', 'plan_distance_eta_prod_min_plan',
        'plan_price_distance_eta_prod_max_plan', 'plan_price_distance_eta_prod_min_plan',
        'plan_distance_ratio_0_max_plan', 'plan_distance_ratio_0_min_plan',
        'plan_price_ratio_0_max_plan', 'plan_price_ratio_0_min_plan',
        'plan_eta_ratio_0_max_plan', 'plan_eta_ratio_0_min_plan',
        'plan_price_distance_prod_ratio_0_max_plan', 'plan_price_distance_prod_ratio_0_min_plan',
        'plan_price_eta_prod_ratio_0_max_plan', 'plan_price_eta_prod_ratio_0_min_plan',
        'plan_distance_eta_prod_ratio_0_max_plan', 'plan_distance_eta_prod_ratio_0_min_plan',
        'plan_price_distance_eta_prod_ratio_0_max_plan', 'plan_price_distance_eta_prod_ratio_0_min_plan'
    ]

    cols_min_max_plan = [
        'plan_distance_max_plan', 'plan_distance_min_plan',
        'plan_price_max_plan', 'plan_price_min_plan',
        'plan_eta_max_plan', 'plan_eta_min_plan'
    ]

    cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(0, 7)]

    cols_target_encoding = (cols_target_encoding + cols_ratio_plan +
                            cols_min_max_plan + cols_transport_mode +
                            ['profile_k_means'])

    # target encoding for each city
    print('target encoding...')
    for i, df in tqdm(enumerate([df1, df2, df3])):
        # target encoding
        df = targetEncodingMultiClass(df, 'click_mode', cols_target_encoding)

        # change dtype
        for col in df.columns.tolist():
            if df[col].dtypes == 'float16':
                df[col] = df[col].astype(np.float32)

        # remove missing variables
        col_missing = removeMissingVariables(df, 0.75)
        df.drop(col_missing, axis=1, inplace=True)

        # remove correlated variables
        col_drop = removeCorrelatedVariables(df, 0.95)
        df.drop(col_drop, axis=1, inplace=True)

        # save as feather
        to_feature(df, '../features/feats{}'.format(i + 1))

        # save feature name list
        features_json = {'features': df.columns.tolist()}
        to_json(features_json, '../features/00{}_all_features.json'.format(i + 1))

        del df
        gc.collect()

    line_notify('{} finished.'.format(sys.argv[0]))
def train_lightgbm(train_df, test_df, debug=False):
    print("Starting LightGBM. Train shape: {}".format(train_df.shape))

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # set data structure
    lgb_train = lgb.Dataset(train_df[feats], label=train_df['demand'], free_raw_data=False)

    params = {
        # 'device': 'gpu',
        # 'gpu_use_dp': True,
        'task': 'train',
        'boosting': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.1,
        'bagging_fraction': 0.85,
        'bagging_freq': 1,
        'colsample_bytree': 0.85,
        'colsample_bynode': 0.85,
        'min_data_per_leaf': 25,
        'num_leaves': 200,
        'lambda_l1': 0.5,
        'lambda_l2': 0.5,
        'verbose': -1,
        'seed': 326,
        'bagging_seed': 326,
        'drop_seed': 326,
        # 'num_threads': -1
    }

    # train model on the full training set (no hold-out; num_boost_round is
    # taken from the saved configs)
    reg = lgb.train(params,
                    lgb_train,
                    valid_sets=[lgb_train],
                    verbose_eval=10,
                    num_boost_round=configs['num_boost_round'])

    # save model
    reg.save_model('../output/lgbm_diff.txt')

    # save predictions (the train predictions are in-sample here, so the RMSE
    # below is a training score, not an out-of-fold score)
    oof_preds += reg.predict(train_df[feats], num_iteration=reg.best_iteration)
    sub_preds += reg.predict(test_df[feats], num_iteration=reg.best_iteration)

    # save feature importances
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = np.log1p(
        reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
    fold_importance_df["fold"] = 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    del reg
    gc.collect()

    # Full RMSE score and LINE Notify
    full_rmse = rmse(train_df['demand'], oof_preds)
    line_notify('Full RMSE score %.6f' % full_rmse)

    # display importances
    display_importances(feature_importance_df,
                        '../imp/lgbm_importances_diff.png',
                        '../imp/feature_importance_lgbm_diff.csv')

    if not debug:
        # save out of fold prediction
        train_df.loc[:, 'demand'] = oof_preds
        train_df = train_df.reset_index()
        train_df[['id', 'demand']].to_csv(oof_file_name, index=False)

        # reshape prediction for submit
        test_df.loc[:, 'demand'] = sub_preds
        test_df = test_df.reset_index()
        preds = test_df[['id', 'd', 'demand']].reset_index()
        preds = preds.pivot(index='id', columns='d', values='demand').reset_index()

        # split test1 / test2
        preds1 = preds[['id'] + COLS_TEST1]
        preds2 = preds[['id'] + COLS_TEST2]

        # change column names
        preds1.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
        preds2.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]

        # replace test2 id
        preds2['id'] = preds2['id'].str.replace('_validation', '_evaluation')

        # merge
        preds = preds1.append(preds2)

        # save csv
        preds.to_csv(submission_file_name, index=False)

        # submission by API
        # submit(submission_file_name, comment='model301 cv: %.6f' % full_rmse)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
def main():
    print('load files...')

    # load submission files
    sub_28days = pd.read_csv('../output/submission_lgbm_group_k_fold_28days.csv')
    sub_21days = pd.read_csv('../output/submission_lgbm_group_k_fold_21days.csv')
    sub_14days = pd.read_csv('../output/submission_lgbm_group_k_fold_14days.csv')
    sub_7days = pd.read_csv('../output/submission_lgbm_group_k_fold_7days.csv')

    # load out of fold files
    oof_28days = pd.read_csv('../output/oof_lgbm_group_k_fold_28days.csv')
    oof_21days = pd.read_csv('../output/oof_lgbm_group_k_fold_21days.csv')
    oof_14days = pd.read_csv('../output/oof_lgbm_group_k_fold_14days.csv')
    oof_7days = pd.read_csv('../output/oof_lgbm_group_k_fold_7days.csv')

    # to pivot
    print('to pivot...')
    sub_28days = sub_28days.pivot(index='id', columns='d', values='demand').reset_index()
    sub_21days = sub_21days.pivot(index='id', columns='d', values='demand').reset_index()
    sub_14days = sub_14days.pivot(index='id', columns='d', values='demand').reset_index()
    sub_7days = sub_7days.pivot(index='id', columns='d', values='demand').reset_index()

    oof_28days = oof_28days.pivot(index='id', columns='d', values='demand').reset_index()
    oof_21days = oof_21days.pivot(index='id', columns='d', values='demand').reset_index()
    oof_14days = oof_14days.pivot(index='id', columns='d', values='demand').reset_index()
    oof_7days = oof_7days.pivot(index='id', columns='d', values='demand').reset_index()

    # change columns name
    sub_28days.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
    sub_21days.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
    sub_14days.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]
    sub_7days.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]

    # validation columns (each horizon model covers its own 7-day block per fold)
    valid_col_28days_fold1 = [f'd_{i+1}' for i in range(1913 + 21, 1913 + 28)]
    valid_col_21days_fold1 = [f'd_{i+1}' for i in range(1913 + 14, 1913 + 21)]
    valid_col_14days_fold1 = [f'd_{i+1}' for i in range(1913 + 7, 1913 + 14)]
    valid_col_7days_fold1 = [f'd_{i+1}' for i in range(1913, 1913 + 7)]

    valid_col_28days_fold2 = [f'd_{i+1}' for i in range(1885 + 21, 1885 + 28)]
    valid_col_21days_fold2 = [f'd_{i+1}' for i in range(1885 + 14, 1885 + 21)]
    valid_col_14days_fold2 = [f'd_{i+1}' for i in range(1885 + 7, 1885 + 14)]
    valid_col_7days_fold2 = [f'd_{i+1}' for i in range(1885, 1885 + 7)]

    valid_col_28days_fold3 = [f'd_{i+1}' for i in range(1576 + 21, 1576 + 28)]
    valid_col_21days_fold3 = [f'd_{i+1}' for i in range(1576 + 14, 1576 + 21)]
    valid_col_14days_fold3 = [f'd_{i+1}' for i in range(1576 + 7, 1576 + 14)]
    valid_col_7days_fold3 = [f'd_{i+1}' for i in range(1576, 1576 + 7)]

    # merge oof files
    oof = oof_28days[['id'] + valid_col_28days_fold1].merge(
        oof_28days[['id'] + valid_col_28days_fold2], on='id', how='left')
    oof = oof.merge(oof_28days[['id'] + valid_col_28days_fold3], on='id', how='left')
    oof = oof.merge(oof_21days[['id'] + valid_col_21days_fold1], on='id', how='left')
    oof = oof.merge(oof_21days[['id'] + valid_col_21days_fold2], on='id', how='left')
    oof = oof.merge(oof_21days[['id'] + valid_col_21days_fold3], on='id', how='left')
    oof = oof.merge(oof_14days[['id'] + valid_col_14days_fold1], on='id', how='left')
    oof = oof.merge(oof_14days[['id'] + valid_col_14days_fold2], on='id', how='left')
    oof = oof.merge(oof_14days[['id'] + valid_col_14days_fold3], on='id', how='left')
    oof = oof.merge(oof_7days[['id'] + valid_col_7days_fold1], on='id', how='left')
    oof = oof.merge(oof_7days[['id'] + valid_col_7days_fold2], on='id', how='left')
    oof = oof.merge(oof_7days[['id'] + valid_col_7days_fold3], on='id', how='left')

    # split columns
    col_28days = [f'F{i+1}' for i in range(21, 28)]
    col_21days = [f'F{i+1}' for i in range(14, 21)]
    col_14days = [f'F{i+1}' for i in range(7, 14)]
    col_7days = [f'F{i+1}' for i in range(0, 7)]

    # merge
    sub = sub_7days[['id'] + col_7days].merge(sub_14days[['id'] + col_14days], on='id', how='left')
    sub = sub.merge(sub_21days[['id'] + col_21days], on='id', how='left')
    sub = sub.merge(sub_28days[['id'] + col_28days], on='id', how='left')

    # split test1 / test2
    sub1 = oof[['id'] + COLS_TEST1]
    sub2 = sub[['id'] + ['F' + str(d + 1) for d in range(28)]]

    # change column names
    sub1.columns = ['id'] + ['F' + str(d + 1) for d in range(28)]

    # replace test1 id
    sub1['id'] = sub1['id'].str.replace('_evaluation', '_validation')

    # merge
    sub = sub1.append(sub2)

    # postprocessing: clip negative predictions to zero
    cols_f = [f'F{i}' for i in range(1, 29)]
    cols_d = [c for c in oof.columns if 'd_' in c]
    sub.loc[:, cols_f] = sub[cols_f].where(sub[cols_f] > 0, 0)
    oof.loc[:, cols_d] = oof[cols_d].where(oof[cols_d] > 0, 0)

    # calc out of fold WRMSSE score
    print('calc oof cv scores...')
    scores = calc_score_cv(oof)
    score = np.mean(scores)
    print(f'scores: {scores}')

    # save csv
    sub.to_csv(submission_file_name, index=False)
    oof.to_csv(oof_file_name_pivot, index=False)

    # submission by API
    # submit(submission_file_name, comment='model409 cv: %.6f' % score)

    # LINE notify
    line_notify('{} done. WRMSSE:{}'.format(sys.argv[0], round(score, 6)))
def kfold_xgboost(df, num_folds, stratified=False, debug=False):
    # Divide into training/validation and test data
    train_df = df[df['visitors'].notnull()]
    test_df = df[df['visitors'].isnull()]

    print("Starting XGBoost. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    del df
    gc.collect()

    # save pkl
    save2pkl('../output/train_df.pkl', train_df)
    save2pkl('../output/test_df.pkl', test_df)

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=47)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # build a DMatrix of the test set once, for the final predictions
    # (no label is needed for prediction)
    test_df_dmtrx = xgb.DMatrix(test_df[feats])

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['park_japanese_holiday'])):
        train_x, train_y = train_df[feats].iloc[train_idx], np.log1p(train_df['visitors'].iloc[train_idx])
        valid_x, valid_y = train_df[feats].iloc[valid_idx], np.log1p(train_df['visitors'].iloc[valid_idx])

        # set data structure
        xgb_train = xgb.DMatrix(train_x, label=train_y)
        xgb_test = xgb.DMatrix(valid_x, label=valid_y)

        # params
        params = {
            'objective': 'gpu:reg:linear',  # GPU parameter
            'booster': 'gbtree',
            'eval_metric': 'rmse',
            'silent': 1,
            'eta': 0.01,
            'max_depth': 8,
            'min_child_weight': 19,
            'gamma': 0.089444100759612,
            'subsample': 0.91842954303314,
            'colsample_bytree': 0.870658058238432,
            'colsample_bylevel': 0.995353255250289,
            'alpha': 19.9615600411437,
            'lambda': 2.53962270252528,
            'tree_method': 'gpu_hist',      # GPU parameter
            'predictor': 'gpu_predictor',   # GPU parameter
            'seed': int(2**n_fold)
        }

        reg = xgb.train(params,
                        xgb_train,
                        num_boost_round=10000,
                        evals=[(xgb_train, 'train'), (xgb_test, 'test')],
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model('../output/xgb_' + str(n_fold) + '.txt')

        # the model is trained on log1p(visitors), so invert with expm1
        oof_preds[valid_idx] = np.expm1(reg.predict(xgb_test))
        sub_preds += np.expm1(reg.predict(test_df_dmtrx)) / num_folds

        fold_importance_df = pd.DataFrame.from_dict(reg.get_score(importance_type='gain'),
                                                    orient='index', columns=['importance'])
        fold_importance_df["feature"] = fold_importance_df.index.tolist()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        print('Fold %2d MAE : %.6f' % (n_fold + 1,
              mean_absolute_error(np.expm1(valid_y), oof_preds[valid_idx])))

        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    del test_df_dmtrx
    gc.collect()

    # display Full MAE score & LINE notify
    full_mae = mean_absolute_error(train_df['visitors'], oof_preds)
    line_notify('XGBoost Full MAE score %.6f' % full_mae)

    if not debug:
        # save test predictions for submission
        test_df.loc[:, 'visitors'] = sub_preds
        test_df[['index', 'visitors']].sort_values('index').to_csv(submission_file_name,
                                                                   index=False, header=False, sep='\t')

        # save out-of-fold predictions
        train_df.loc[:, 'OOF_PRED'] = oof_preds
        train_df[['index', 'OOF_PRED']].sort_values('index').to_csv(oof_file_name, index=False)

    return feature_importance_df
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    # Divide into training/validation and test data
    train_df = df[df['visitors'].notnull()]
    test_df = df[df['visitors'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    # save pkl
    save2pkl('../output/train_df.pkl', train_df)
    save2pkl('../output/test_df.pkl', test_df)

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=47)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['park_japanese_holiday'])):
        train_x, train_y = train_df[feats].iloc[train_idx], np.log1p(train_df['visitors'].iloc[train_idx])
        valid_x, valid_y = train_df[feats].iloc[valid_idx], np.log1p(train_df['visitors'].iloc[valid_idx])

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

        # parameters are rough, not fully tuned
        # (note: with boosting='goss', the subsample/bagging settings are ignored)
        params = {
            'device': 'gpu',
            'gpu_use_dp': True,
            'task': 'train',
            'boosting': 'goss',
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
            'num_leaves': 64,
            'colsample_bytree': 0.977334338875847,
            'subsample': 0.027687793278932,
            'max_depth': 20,
            'reg_alpha': 9.72886163508719,
            'reg_lambda': 9.9935502633216,
            'min_split_gain': 0.178508066955524,
            'min_child_weight': 43.4750700383884,
            'min_data_in_leaf': 18,
            'other_rate': 0.925113620582013,
            'top_rate': 0.006970683025472,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }

        reg = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        reg.save_model('../output/lgbm_' + str(n_fold) + '.txt')

        # out-of-fold and test predictions (invert the log1p transform)
        oof_preds[valid_idx] = np.expm1(reg.predict(valid_x, num_iteration=reg.best_iteration))
        sub_preds += np.expm1(reg.predict(test_df[feats], num_iteration=reg.best_iteration)) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d MAE : %.6f' % (n_fold + 1, mean_absolute_error(np.expm1(valid_y), oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # display full MAE score & LINE notify
    full_mae = mean_absolute_error(train_df['visitors'], oof_preds)
    line_notify('LightGBM Full MAE score %.6f' % full_mae)

    if not debug:
        # save predictions for submission
        test_df.loc[:, 'visitors'] = sub_preds
        test_df[['index', 'visitors']].sort_values('index').to_csv(submission_file_name, index=False, header=False, sep='\t')

        # save out-of-fold predictions
        train_df.loc[:, 'OOF_PRED'] = oof_preds
        train_df[['index', 'OOF_PRED']].sort_values('index').to_csv(oof_file_name, index=False)

    return feature_importance_df
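# --- hedged sketch (assumed driver, not from the original repo): how the two
# CV trainers above might be invoked. The pickle path and fold count are
# hypothetical; FEATS_EXCLUDED and the output file names are module-level
# constants assumed to be defined elsewhere in this script.
if __name__ == '__main__':
    df = loadpkl('../output/df.pkl')  # hypothetical combined train+test frame
    imp_lgbm = kfold_lightgbm(df.copy(), num_folds=5, stratified=False)
    imp_xgb = kfold_xgboost(df, num_folds=5, stratified=False)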
        test_df,
        target_col=target_col,
        model_loss=loss_type,
        num_folds=folds,
        feats_exclude=feats_exclude,
        stratified=False,
        use_gpu=use_GPU)
    """
    models, model_params, feature_importance_df, train_preds, test_preds, scores, model_name = kfold_xgb(
        train_df,
        test_df,
        target_col=target_col,
        model_loss=loss_type,
        num_folds=folds,
        feats_exclude=feats_exclude,
        stratified=False,
        use_gpu=use_GPU)
    """

    # CV score
    create_score_log(scores)
    score = np.mean(np.array(scores))
    line_notify('Full RMSE score %.6f' % score)


# saves the submission file and related artifacts in one place;
# ideally this would be more loosely coupled and live in a util module...
def output(train_df, test_df, models, model_params, feature_importance_df,
           train_preds, test_preds, scores, now, model_name):
    score = sum(scores) / len(scores)
    folder_path = make_output_dir(score, now, model_name)
    for i, m in enumerate(models):
        save2pkl('{0}/model_{1:0=2}.pkl'.format(folder_path, i), m)
    with open('{0}/model_params.json'.format(folder_path), 'w') as f:
        json.dump(model_params, f, indent=4)
    with open('{0}/model_valid_scores.json'.format(folder_path), 'w') as f:
        json.dump({i: s for i, s in enumerate(scores)}, f, indent=4)
    save_importances(feature_importance_df,
                     '{}/importances.png'.format(folder_path),
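# --- hedged sketch (hypothetical, not from the original repo): make_output_dir
# is called above but not defined in this file. Based on its call site it is
# assumed to create a run directory named from the model, timestamp, and CV
# score, and to return its path.
import os

def make_output_dir(score, now, model_name):
    folder_path = '../output/{}_{}_{:.6f}'.format(model_name, now, score)
    os.makedirs(folder_path, exist_ok=True)  # create the run directory once
    return folder_path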
def train_lightgbm(train_df, test_df):
    print('Starting LightGBM. Train shape: {}'.format(train_df.shape))

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # set data structure
    lgb_train = lgb.Dataset(train_df[feats], label=train_df['demand'], free_raw_data=False)

    params = {
        # 'device': 'gpu',
        # 'gpu_use_dp': True,
        'boosting': 'gbdt',
        'metric': ['rmse'],
        'objective': 'tweedie',
        'learning_rate': 0.05,
        'tweedie_variance_power': 1.1,
        'subsample': 0.5,
        'subsample_freq': 1,
        'num_leaves': 2**8 - 1,
        'min_data_in_leaf': 2**8 - 1,
        'feature_fraction': 0.8,
        'verbose': -1,
        'seed': 326,
        'bagging_seed': 326,
        'drop_seed': 326,
        'num_threads': -1
    }

    # train model on the full training set, with the round count taken from
    # the averaged CV config
    reg = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train],
        verbose_eval=10,
        num_boost_round=int(np.mean(configs['num_boost_round'])),
    )

    # save model
    reg.save_model('../output/lgbm_weekday.txt')

    # save predictions
    # (note: the model is trained on the full data, so these train-set
    # predictions are in-sample, not true out-of-fold)
    oof_preds += reg.predict(train_df[feats], num_iteration=reg.best_iteration)
    sub_preds += reg.predict(test_df[feats], num_iteration=reg.best_iteration)

    # save feature importances
    fold_importance_df = pd.DataFrame()
    fold_importance_df['feature'] = feats
    fold_importance_df['importance'] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
    fold_importance_df['fold'] = 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    del reg
    gc.collect()

    # Full RMSE score and LINE Notify
    full_rmse = rmse(train_df['demand'], oof_preds)
    line_notify('Full RMSE score %.6f' % full_rmse)

    # display importances
    display_importances(feature_importance_df,
                        '../imp/lgbm_importances_weekday.png',
                        '../imp/feature_importance_lgbm_weekday.csv')

    # save out of fold prediction
    train_df.loc[:, 'demand'] = oof_preds
    train_df = train_df.reset_index()
    train_df[['id', 'd', 'demand']].to_csv(oof_file_name, index=False)

    # reshape prediction for submit
    test_df.loc[:, 'demand'] = sub_preds
    test_df = test_df.reset_index()
    preds = test_df[['id', 'd', 'demand']].reset_index()

    # save csv
    preds.to_csv(submission_file_name, index=False)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
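# --- hedged sketch (assumed helper, not from the original repo): rmse() is
# called above but defined elsewhere; a minimal implementation consistent with
# that call site.
import numpy as np

def rmse(y_true, y_pred):
    # root mean squared error on the original demand scale
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))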
def main(is_eval=False):
    # load csv
    df = pd.read_csv('../input/calendar.csv')

    # to datetime
    df['date'] = pd.to_datetime(df['date'])

    # seasonality
    df['seasonality'] = np.cos(np.pi * (df['date'].dt.dayofyear / 366 * 2 - 1))

    # drop string columns
    df.drop('weekday', axis=1, inplace=True)

    # datetime features
    df['day'] = df['date'].dt.day
    df['week'] = df['date'].dt.weekofyear  # deprecated in newer pandas; isocalendar().week is the replacement
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['year'] = (df['year'] - df['year'].min())
    df['weekofmonth'] = df['day'].apply(lambda x: ceil(x / 7))
    df['dayofweek'] = df['date'].dt.dayofweek
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

    # holiday features (federal plus each store state: CA, TX, WI)
    df['date'] = df['date'].apply(lambda x: x.date())  # to date
    holidays_us = []
    for y in range(2011, 2017):
        for ptr in holidays.UnitedStates(years=y).items():
            holidays_us.append(ptr[0])
    holidays_ca = []
    for y in range(2011, 2017):
        for ptr in holidays.UnitedStates(state='CA', years=y).items():
            holidays_ca.append(ptr[0])
    holidays_tx = []
    for y in range(2011, 2017):
        for ptr in holidays.UnitedStates(state='TX', years=y).items():
            holidays_tx.append(ptr[0])
    holidays_wi = []
    for y in range(2011, 2017):
        for ptr in holidays.UnitedStates(state='WI', years=y).items():
            holidays_wi.append(ptr[0])

    df['is_holiday_us'] = df['date'].apply(lambda x: 1 if x in holidays_us else 0)
    df['is_holiday_ca'] = df['date'].apply(lambda x: 1 if x in holidays_ca else 0)
    df['is_holiday_tx'] = df['date'].apply(lambda x: 1 if x in holidays_tx else 0)
    df['is_holiday_wi'] = df['date'].apply(lambda x: 1 if x in holidays_wi else 0)

    # preprocess event_name_1
    # to datetime
    df['date'] = pd.to_datetime(df['date'])

    # Moon Phase
    df['moon'] = df['date'].apply(get_moon_phase)

    # add ramadan end dates
    ramadan_end_dates = [
        '2011-8-29', '2012-8-18', '2013-8-7', '2014-7-27', '2015-7-16', '2016-7-5'
    ]
    for d in ramadan_end_dates:
        df.loc[df['date'] == d, 'event_name_1'] = 'Ramadan ends'

    # add Pesach start dates
    pesach_start_dates = [
        '2011-4-18', '2012-4-6', '2013-3-25', '2014-4-14', '2015-4-3', '2016-4-22'
    ]
    for d in pesach_start_dates:
        df.loc[df['date'] == d, 'event_name_1'] = 'Pesach Start'

    # add Purim start dates
    purim_start_dates = [
        '2011-3-19', '2012-3-7', '2013-2-23', '2014-3-15', '2015-3-4', '2016-3-23'
    ]
    for d in purim_start_dates:
        df.loc[df['date'] == d, 'event_name_1'] = 'Purim Start'

    # add Chanukah start dates
    chanukah_start_dates = [
        '2011-12-21', '2012-12-9', '2013-11-28', '2014-12-17', '2015-12-7', '2016-12-25'
    ]
    for d in chanukah_start_dates:
        df.loc[df['date'] == d, 'event_name_1'] = 'Chanukah Start'

    # add is-in-period flags: raise the flag at each event's start marker,
    # append it for every row, and clear it after the end marker
    is_nba_final = []
    is_lent = []
    is_ramadan = []
    is_pesach = []
    is_purim = []
    is_chanukah = []
    tmp_nba = 0
    tmp_lent = 0
    tmp_ramadan = 0
    tmp_pesach = 0
    tmp_purim = 0
    tmp_chanukah = 0
    for e in df['event_name_1']:
        if e == 'NBAFinalsStart':
            tmp_nba = 1
        is_nba_final.append(tmp_nba)
        if e == 'NBAFinalsEnd':
            tmp_nba = 0
        if e == 'LentStart':
            tmp_lent = 1
        is_lent.append(tmp_lent)
        if e == 'Easter':
            tmp_lent = 0
        if e == 'Ramadan starts':
            tmp_ramadan = 1
        is_ramadan.append(tmp_ramadan)
        if e == 'Ramadan ends':
            tmp_ramadan = 0
        if e == 'Pesach Start':
            tmp_pesach = 1
        is_pesach.append(tmp_pesach)
        if e == 'Pesach End':
            tmp_pesach = 0
        if e == 'Purim Start':
            tmp_purim = 1
        is_purim.append(tmp_purim)
        if e == 'Purim End':
            tmp_purim = 0
        if e == 'Chanukah Start':
            tmp_chanukah = 1
        is_chanukah.append(tmp_chanukah)
        if e == 'Chanukah End':
            tmp_chanukah = 0

    df['is_NBA_final'] = is_nba_final
    df['is_lent'] = is_lent
    df['is_ramadan'] = is_ramadan
    df['is_pesach'] = is_pesach
    df['is_purim'] = is_purim
    df['is_chanukah'] = is_chanukah

    # add blackfriday flag
    blackfriday_dates = [
        '2011-11-25', '2012-11-23', '2013-11-29', '2014-11-28', '2015-11-27'
    ]
    df['is_blackfriday'] = 0
    for d in blackfriday_dates:
        df.loc[df['date'] == d, 'is_blackfriday'] = 1

    # factorize string columns
    cols_string = [
        'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2'
    ]
    for c in cols_string:
        df[c], _ = pd.factorize(df[c])
        df[c].replace(-1, np.nan, inplace=True)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save pkl
    save2pkl('../feats/calendar.pkl', df)

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
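# --- hedged sketch (hypothetical, not from the original repo): get_moon_phase
# is applied above but not defined in this file. A common approximation maps a
# date to its position in the ~29.53-day synodic cycle from a known new moon;
# the repo's actual helper may differ.
from datetime import datetime

def get_moon_phase(d):
    synodic = 29.530588853                  # mean synodic month in days
    days = (d - datetime(2000, 1, 6)).days  # days since a reference new moon
    return (days % synodic) / synodic       # 0 = new moon, 0.5 = full moon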
def kfold_lightgbm(train_df, test_df, num_folds, stratified=False, debug=False):
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=326)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros((train_df.shape[0], 12))
    sub_preds = np.zeros((test_df.shape[0], 12))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['click_mode'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['click_mode'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['click_mode'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, categorical_feature=cat_cols, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, categorical_feature=cat_cols, free_raw_data=False)

        # params
        params = {
            'device': 'gpu',
            'task': 'train',
            'boosting': 'gbdt',
            'objective': 'multiclass',
            'metric': 'multiclass',  # LightGBM alias for multi_logloss
            'learning_rate': 0.1,
            'num_class': 12,
            'colsample_bytree': 0.723387165617351,
            'max_depth': 8,
            'min_child_weight': 42.6805833563236,
            'min_data_in_leaf': 34,
            'min_split_gain': 0.010945157429729,
            'num_leaves': 48,
            'reg_alpha': 1.87287994755334,
            'reg_lambda': 4.8093341415383,
            'subsample': 0.483962708535824,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }

        clf = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        # save model
        clf.save_model('../output/lgbm_queries_{}.txt'.format(n_fold))

        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(clf.feature_importance(importance_type='gain', iteration=clf.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d F1 Score : %.6f' % (n_fold + 1, f1_score(valid_y, np.argmax(oof_preds[valid_idx], axis=1), average='weighted')))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Full F1 Score & LINE Notify
    full_f1 = f1_score(train_df['click_mode'], np.argmax(oof_preds, axis=1), average='weighted')
    line_notify('Full F1 Score %.6f' % full_f1)

    # display importances
    display_importances(feature_importance_df,
                        '../imp/lgbm_importances_queries_profiles.png',
                        '../imp/feature_importance_lgbm_queries_profiles.csv')

    if not debug:
        # save prediction for submit
        sub_preds = pd.DataFrame(sub_preds)
        sub_preds.columns = ['pred_queries_profiles{}'.format(c) for c in sub_preds.columns]
        sub_preds['sid'] = test_df.index

        # save out of fold prediction
        oof_preds = pd.DataFrame(oof_preds)
        oof_preds.columns = ['pred_queries_profiles{}'.format(c) for c in oof_preds.columns]
        oof_preds['sid'] = train_df.index

        # merge
        df = oof_preds.append(sub_preds)

        # save as pkl
        save2pkl('../features/queries_profiles_pred.pkl', df)

    line_notify('{} finished.'.format(sys.argv[0]))
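# --- hedged example (toy data, not from the original repo): the fold scores
# above use weighted F1, which averages per-class F1 weighted by class support,
# so frequent click modes dominate the score.
import numpy as np
from sklearn.metrics import f1_score

y_true = np.array([0, 0, 1, 2, 2, 2])
y_pred = np.array([0, 1, 1, 2, 2, 1])
print(f1_score(y_true, y_pred, average='weighted'))  # support-weighted mean of per-class F1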
def main(num_rows=None):
    # load csv
    train_plans = pd.read_csv('../input/data_set_phase2/train_plans_phase2.csv', nrows=num_rows)
    test_plans = pd.read_csv('../input/data_set_phase2/test_plans.csv', nrows=num_rows)
    train_clicks = pd.read_csv('../input/data_set_phase2/train_clicks_phase2.csv')

    # phase 1 csv
    train_plans1 = pd.read_csv('../input/data_set_phase2/train_plans_phase1.csv')
    train_clicks1 = pd.read_csv('../input/data_set_phase2/train_clicks_phase1.csv')

    # merge click
    train_plans = pd.merge(train_plans, train_clicks[['sid', 'click_mode']], on='sid', how='left')
    train_plans1 = pd.merge(train_plans1, train_clicks1[['sid', 'click_mode']], on='sid', how='left')

    # merge phase 1 data
    train_plans = train_plans1.append(train_plans)

    # fill na (no click)
    train_plans['click_mode'].fillna(0, inplace=True)

    # set test target as nan
    test_plans['click_mode'] = np.nan

    # merge train & test
    plans = train_plans.append(test_plans)

    del train_plans, test_plans, train_plans1, train_clicks, train_clicks1
    gc.collect()

    # reset index
    plans.reset_index(inplace=True, drop=True)

    # convert json
    for key in tqdm(['distance', 'price', 'eta', 'transport_mode']):
        plans[key] = plans.plans.apply(lambda x: loadJSON(x, key))

    # flatten
    plans_df = [FlattenDataSimple(plans, key) for key in tqdm(['distance', 'price', 'eta', 'transport_mode'])]
    plans_df = pd.concat(plans_df, axis=1)

    # merge plan_time & click_mode
    plans_df = pd.merge(plans_df.reset_index(), plans[['sid', 'plan_time', 'click_mode']], on='sid', how='outer')

    del plans
    gc.collect()

    # reduce memory usage
    plans_df = reduce_mem_usage(plans_df)

    # cleaning: empty price strings mean a free plan
    for c in plans_df.columns.to_list():
        if 'price' in c:
            plans_df[c] = plans_df[c].replace('', 0)
    plans_df['plan_time'] = pd.to_datetime(plans_df['plan_time'])

    # datetime features
    plans_df['plan_weekday'] = plans_df['plan_time'].dt.weekday
    plans_df['plan_hour'] = plans_df['plan_time'].dt.hour
    plans_df['plan_is_holiday'] = plans_df['plan_time'].apply(lambda x: is_holiday(x)).astype(int)
    plans_df['plan_weekday_hour'] = plans_df['plan_weekday'].astype(str) + '_' + plans_df['plan_hour'].astype(str)
    plans_df['plan_is_holiday_hour'] = plans_df['plan_is_holiday'].astype(str) + '_' + plans_df['plan_hour'].astype(str)
    # note: .dt.seconds ignores the days component of the gap; kept as in the original
    plans_df['plan_time_diff'] = plans_df.index.map(plans_df.sort_values('plan_time')['plan_time'].diff().dt.seconds)

    # factorize
    plans_df['plan_weekday_hour'], _ = pd.factorize(plans_df['plan_weekday_hour'])
    plans_df['plan_is_holiday_hour'], _ = pd.factorize(plans_df['plan_is_holiday_hour'])

    # count features
    for c in ['plan_weekday', 'plan_hour', 'plan_weekday_hour', 'plan_is_holiday_hour']:
        plans_df[c + '_count'] = plans_df[c].map(plans_df[c].value_counts())

    # column groups for the seven plan slots
    cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(0, 7)]
    cols_distance = ['plan_{}_distance'.format(i) for i in range(0, 7)]
    cols_price = ['plan_{}_price'.format(i) for i in range(0, 7)]
    cols_eta = ['plan_{}_eta'.format(i) for i in range(0, 7)]

    # stats features
    for name, cols in [('distance', cols_distance), ('price', cols_price), ('eta', cols_eta)]:
        plans_df['plan_{}_mean'.format(name)] = plans_df[cols].mean(axis=1)
        plans_df['plan_{}_sum'.format(name)] = plans_df[cols].sum(axis=1)
        plans_df['plan_{}_max'.format(name)] = plans_df[cols].max(axis=1)
        plans_df['plan_{}_min'.format(name)] = plans_df[cols].min(axis=1)
        plans_df['plan_{}_var'.format(name)] = plans_df[cols].var(axis=1)
        plans_df['plan_{}_skew'.format(name)] = plans_df[cols].skew(axis=1)

    # min-max plan (categorical): which slot attains the extreme value,
    # mapped to that slot's transport-mode column name
    def to_mode_col(x):
        # e.g. 'plan_3_distance' -> 'plan_3_transport_mode'; NaN if no valid slot
        return x[:6] + '_transport_mode' if type(x) == str else np.nan

    for name, cols in [('distance', cols_distance), ('price', cols_price), ('eta', cols_eta)]:
        plans_df['plan_{}_max_plan'.format(name)] = plans_df[cols].idxmax(axis=1).apply(to_mode_col)
        plans_df['plan_{}_min_plan'.format(name)] = plans_df[cols].idxmin(axis=1).apply(to_mode_col)

    # map plans: replace the column name with that slot's actual transport mode
    # (.loc instead of the original chained assignment, which is unreliable)
    cols_min_max_plan = ['plan_distance_max_plan', 'plan_distance_min_plan',
                         'plan_price_max_plan', 'plan_price_min_plan',
                         'plan_eta_max_plan', 'plan_eta_min_plan']
    for c in tqdm(cols_transport_mode):
        for p in cols_min_max_plan:
            plans_df.loc[plans_df[p] == c, p] = plans_df.loc[plans_df[p] == c, c]

    # count features
    for p in cols_min_max_plan:
        plans_df[p + '_count'] = plans_df[p].map(plans_df[p].value_counts())

    # count features per transport-mode slot
    cols_mode = cols_transport_mode
    cols_mode_count = []
    for c in cols_mode:
        plans_df[c + '_count'] = plans_df[c].map(plans_df[c].value_counts())
        cols_mode_count.append(c + '_count')

    # number features
    plans_df['plan_num_plans'] = plans_df[cols_mode].notnull().sum(axis=1)
    plans_df['plan_num_free_plans'] = (plans_df[cols_price] == 0).sum(axis=1)

    # rank features
    for cols in [cols_distance, cols_price, cols_eta, cols_mode_count]:
        plans_df[[c + '_rank' for c in cols]] = plans_df[cols].rank(axis=1)

    # ratio features
    for i in range(0, 7):
        plans_df['plan_{}_price_distance_ratio'.format(i)] = plans_df['plan_{}_price'.format(i)] / plans_df['plan_{}_distance'.format(i)]
        plans_df['plan_{}_price_eta_ratio'.format(i)] = plans_df['plan_{}_price'.format(i)] / plans_df['plan_{}_eta'.format(i)]
        plans_df['plan_{}_distance_eta_ratio'.format(i)] = plans_df['plan_{}_distance'.format(i)] / plans_df['plan_{}_eta'.format(i)]

    # prod features
    for i in range(0, 7):
        plans_df['plan_{}_price_distance_prod'.format(i)] = plans_df['plan_{}_price'.format(i)] * plans_df['plan_{}_distance'.format(i)]
        plans_df['plan_{}_price_eta_prod'.format(i)] = plans_df['plan_{}_price'.format(i)] * plans_df['plan_{}_eta'.format(i)]
        plans_df['plan_{}_distance_eta_prod'.format(i)] = plans_df['plan_{}_distance'.format(i)] * plans_df['plan_{}_eta'.format(i)]
        plans_df['plan_{}_price_distance_eta_prod'.format(i)] = plans_df['plan_{}_price'.format(i)] * plans_df['plan_{}_distance'.format(i)] * plans_df['plan_{}_eta'.format(i)]

    # ratio features with plan 0
    for i in range(1, 7):
        for base in ['distance', 'price', 'eta', 'price_distance_prod',
                     'price_eta_prod', 'distance_eta_prod', 'price_distance_eta_prod']:
            plans_df['plan_{}_{}_ratio_0'.format(i, base)] = plans_df['plan_{}_{}'.format(i, base)] / plans_df['plan_0_{}'.format(base)]

    # column groups for the derived features (same names as the original
    # hand-written lists, built in a loop)
    col_groups = {}
    for base in ['price_distance_ratio', 'price_eta_ratio', 'distance_eta_ratio',
                 'price_distance_prod', 'price_eta_prod', 'distance_eta_prod',
                 'price_distance_eta_prod']:
        col_groups[base] = ['plan_{}_{}'.format(i, base) for i in range(0, 7)]
    for base in ['distance_ratio_0', 'price_ratio_0', 'eta_ratio_0',
                 'price_distance_prod_ratio_0', 'price_eta_prod_ratio_0',
                 'distance_eta_prod_ratio_0', 'price_distance_eta_prod_ratio_0']:
        col_groups[base] = ['plan_{}_{}'.format(i, base) for i in range(1, 7)]

    # stats features of the ratio / prod columns
    for base, cols in col_groups.items():
        plans_df['plan_{}_mean'.format(base)] = plans_df[cols].mean(axis=1)
        plans_df['plan_{}_sum'.format(base)] = plans_df[cols].sum(axis=1)
        plans_df['plan_{}_max'.format(base)] = plans_df[cols].max(axis=1)
        plans_df['plan_{}_min'.format(base)] = plans_df[cols].min(axis=1)
        plans_df['plan_{}_var'.format(base)] = plans_df[cols].var(axis=1)
        plans_df['plan_{}_skew'.format(base)] = plans_df[cols].skew(axis=1)

    # rank features
    for cols in col_groups.values():
        plans_df[[c + '_rank' for c in cols]] = plans_df[cols].rank(axis=1)

    # min-max plan (categorical) for ratio features
    # (two of the original idxmax applies were missing the NaN guard; fixed here.
    # the original also derived plan_price_distance_eta_prod_{max,min}_plan from
    # the distance_eta_prod columns, an apparent copy-paste slip; corrected to
    # the price*distance*eta columns)
    for base, cols in col_groups.items():
        plans_df['plan_{}_max_plan'.format(base)] = plans_df[cols].idxmax(axis=1).apply(to_mode_col)
        plans_df['plan_{}_min_plan'.format(base)] = plans_df[cols].idxmin(axis=1).apply(to_mode_col)

    # map plans
    cols_ratio_plan = ['plan_{}_{}_plan'.format(base, mm) for base in col_groups for mm in ['max', 'min']]
    for p in tqdm(cols_ratio_plan):
        for c in cols_transport_mode:
            plans_df.loc[plans_df[p] == c, p] = plans_df.loc[plans_df[p] == c, c]

    # count features
    for p in cols_ratio_plan:
        plans_df[p + '_count'] = plans_df[p].map(plans_df[p].value_counts())

    # save as pkl
    to_pickles(plans_df, '../features/plans', split_size=5)

    line_notify('{} finished.'.format(sys.argv[0]))
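# --- hedged sketches (hypothetical, not from the original repo): loadJSON and
# FlattenDataSimple are called in main() above but defined elsewhere. From the
# call sites, loadJSON is assumed to pull one field out of each plan in the
# JSON-encoded plan list, and FlattenDataSimple to spread those lists into
# plan_0_<key> .. plan_6_<key> columns indexed by sid.
import json
import pandas as pd

def loadJSON(x, key):
    try:
        return [plan.get(key) for plan in json.loads(x)]
    except (TypeError, ValueError):  # missing or malformed plan list
        return []

def FlattenDataSimple(plans, key):
    flat = pd.DataFrame(plans[key].tolist(), index=plans['sid']).iloc[:, :7]
    flat.columns = ['plan_{}_{}'.format(i, key) for i in range(flat.shape[1])]
    return flat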