models = [] for i in range(3): bst = lgb.Booster(model_file=f'lgb{i}.model') models.append(bst) imp = ex.getImp(models) """ # ============================================================================= # test # ============================================================================= sub = pd.read_pickle('../data/sub.p') dtest = utils.read_pickles('../data/dtest') gc.collect() sub['is_attributed'] = 0 for model in models: y_pred = model.predict(dtest) sub['is_attributed'] += pd.Series(y_pred).rank() sub['is_attributed'] /= LOOP sub['is_attributed'] /= sub['is_attributed'].max() sub['click_id'] = sub.click_id.map(int) sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip') # ============================================================================= # submission # =============================================================================
# NOTE(review): line breaks reconstructed — this chunk arrived whitespace-mangled.
import os
import pandas as pd
from tqdm import tqdm
import gc
from glob import glob
from multiprocessing import Pool
proc = 5
from itertools import combinations
import utils
utils.start(__file__)

# remove this feature set's previous outputs before regenerating
os.system('rm -rf ../data/104__*.p')

trte = pd.concat([utils.read_pickles('../data/train'),
                  utils.read_pickles('../data/test_old')])
#trte['day'] = trte.click_time.dt.day
#trte['hour'] = trte.click_time.dt.hour
gc.collect()


def multi(keys):
    """Dense-rank the group-size-of-group-sizes for a (keys1, keys2) pair.

    `keys` is a 2-tuple: group by keys1, count, then group those counts by
    keys2 and rank. Chunk is truncated here in the source.
    """
    gc.collect()
    print(keys)
    keys1, keys2 = keys
    df = trte.groupby(keys1).size().groupby(keys2).size().rank(method='dense')
Created on Fri May 26 17:07:09 2017 @author: konodera """ import pandas as pd import numpy as np from tqdm import tqdm import utils utils.start(__file__) col = [ 'order_id', 'user_id', 'product_id', 'order_number', 'reordered', 'order_number_rev' ] log = utils.read_pickles('../input/mk/log', col).rename(columns={'reordered': 'y'}) test_order = pd.read_pickle('../input/mk/test_user.p') #============================================================================== # train #============================================================================== def make(T): label_t1 = log[log.order_number_rev > T] label_t1.drop_duplicates(['user_id', 'product_id'], keep='last', inplace=True) label_t1.sort_values(['user_id', 'product_id'], inplace=True) label_t0_y1 = log.loc[log.order_number_rev == T].loc[log.y == 1]
""" import gc import pandas as pd import sys sys.path.append('/home/kazuki_onodera/Python') from sklearn.preprocessing import LabelEncoder import lightgbm as lgb import multiprocessing import utils utils.start(__file__) #============================================================================== SEED = 71 X = utils.read_pickles('../data/101_train') y = utils.read_pickles('../data/label').TARGET if X.columns.duplicated().sum() > 0: raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }') print('no dup :) ') print(f'X.shape {X.shape}') param = { 'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.05, 'max_depth': -1, 'num_leaves': 511, 'max_bin': 511, 'colsample_bytree': 0.5,
# NOTE(review): line breaks reconstructed — this chunk arrived whitespace-mangled.
kfold = 10

X_base = pd.read_pickle('../feature/X_base_t3.p')
label_train = pd.read_pickle('../feature/trainT-0/label_reordered.p')
label_test = pd.read_pickle('../feature/test/label_reordered.p')

train = pd.merge(X_base[X_base.is_train == 1], label_train,
                 on='order_id', how='inner')
test = pd.merge(X_base[X_base.is_train == 0], label_test,
                on='order_id', how='inner')

#==============================================================================
# mk train * test log
#==============================================================================
col = ['order_id', 'user_id', 'product_id']
train_log = utils.read_pickles('../input/mk/log', col)
order_tbl = pd.read_pickle('../input/mk/order_tbl.p')\
    [['order_id', 'user_id', 'order_number', 'days_since_first_order']]

# merge user_id -> ['order_id', 'user_id', 'product_id']
train_log = pd.merge(train_log[['order_id', 'product_id']],
                     order_tbl[['order_id', 'user_id']],
                     on='order_id', how='left')[['order_id', 'user_id', 'product_id']]
test_log = pd.merge(test[['order_id', 'product_id']],
                    order_tbl[['order_id', 'user_id']],
                    on='order_id', how='left')[['order_id', 'user_id', 'product_id']]
log = pd.concat([train_log, test_log])
del X_base, train_log, test_log; gc.collect()

log.sort_values(['user_id', 'product_id'], inplace=True)
# NOTE(review): line breaks reconstructed — this chunk arrived whitespace-mangled.
def mk_submit():
    """Train LightGBM with CV over all features, save OOF and a submission.

    Relies on module-level globals: `features`, `SK_ID_CURR`, `drop_ids`,
    `param`, `LOOP`, `NFOLD`, `SUBMIT_FILE_PATH`, `utils`, `utils_cat`, `ex`.
    """
    files_tr = ('../feature/train_' + features + '.f').tolist()
    files_te = ('../feature/test_' + features + '.f').tolist()

    # =========================================================================
    # load
    # =========================================================================
    # train
    X_train = pd.concat([pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)],
                        axis=1)
    y_train = utils.read_pickles('../data/label').TARGET

    # =========================================================================
    # remove old users
    # =========================================================================
    X_train['SK_ID_CURR'] = SK_ID_CURR
    y_train = y_train[~X_train.SK_ID_CURR.isin(drop_ids)]
    X_train = X_train[~X_train.SK_ID_CURR.isin(drop_ids)]
    oof_train = X_train[['SK_ID_CURR']]
    X_train.drop('SK_ID_CURR', axis=1, inplace=True)

    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False, compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')

    gc.collect()

    CAT = list(set(X_train.columns) & set(utils_cat.ALL))
    COL = X_train.columns.tolist()

    # test
    X_test = pd.concat([pd.read_feather(f) for f in tqdm(files_te, mininterval=60)],
                       axis=1)[COL]

    # =========================================================================
    # training with cv
    # =========================================================================
    dtrain = lgb.Dataset(X_train, y_train, categorical_feature=CAT,
                         free_raw_data=False)

    model_all = []
    y_pred = pd.Series(0, index=y_train.index)
    for i in range(LOOP):
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param, dtrain, 9999, nfold=NFOLD,
                             early_stopping_rounds=100, verbose_eval=50,
                             seed=i)
        model_all += models
        # accumulate rank-transformed OOF predictions across seeds
        y_pred += ex.eval_oob(X_train, y_train, models, i).rank()
        auc_mean = roc_auc_score(y_train, y_pred)
        result = f"CV auc-mean(loop {i}): {auc_mean}"
        print(result)
        utils.send_line(result)

    y_pred /= y_pred.max()
    auc_mean = roc_auc_score(y_train, y_pred)
    result = f"CV auc-mean: {auc_mean}"
    print(result)
    utils.send_line(result)

    # save
    oof_train['oof'] = y_pred
    oof_train.to_csv('../output/onodera-last-oof.csv', index=False)

    # =========================================================================
    # predict
    # =========================================================================
    sub = pd.read_pickle('../data/sub.p')
    gc.collect()

    label_name = 'TARGET'
    # rank-average the fold models' predictions, then normalize to [0, 1]
    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()

    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)
    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =========================================================================
    # submission
    # =========================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
PREF = 'f106_' KEY = 'SK_ID_CURR' col_binary = [ 'NAME_CONTRACT_TYPE', 'NAME_CONTRACT_STATUS', 'CODE_REJECT_REASON', 'NAME_YIELD_GROUP', 'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'NAME_SELLER_INDUSTRY', 'CHANNEL_TYPE', 'NAME_PAYMENT_TYPE' ] os.system(f'rm ../feature/t*_{PREF}*') # ============================================================================= # # ============================================================================= prev = utils.read_pickles('../data/previous_application', ['SK_ID_CURR', 'DAYS_DECISION'] + col_binary) #base = prev[[KEY]].drop_duplicates().set_index(KEY) prev.sort_values(['SK_ID_CURR', 'DAYS_DECISION'], inplace=True) # top is latest col_binary_di = {} for c in col_binary: col_binary_di[c] = list(prev[c].unique()) ids = prev.SK_ID_CURR.unique() def to_decimal(x): if len(x) == 0:
userのそのitemのcycle """ import pandas as pd import numpy as np from tqdm import tqdm import utils utils.start(__file__) #============================================================================== # load #============================================================================== usecols = [ 'order_id', 'user_id', 'product_id', 'order_number', 'reordered', 'order_number_rev'] log = pd.merge(utils.read_pickles('../input/mk/log', usecols), utils.read_pickles('../input/mk/days_since_last_order'), on=['order_id','product_id'], how='left') #============================================================================== # def #============================================================================== def make(T): """ T = 0 folder = 'trainT-0' """ if T==-1: folder = 'test' else:
""" import os import pandas as pd from tqdm import tqdm import gc from glob import glob from multiprocessing import Pool import utils utils.start(__file__) # ============================================================================= # load # ============================================================================= train = pd.concat([utils.read_pickles('../data/train', ['ip', 'app', 'device', 'os', 'channel']), utils.read_pickles('../data/004_train', ['timestamp'])], axis=1) test = pd.concat([utils.read_pickles('../data/test_old', ['ip', 'app', 'device', 'os', 'channel']), utils.read_pickles('../data/004_test', ['timestamp'])], axis=1) gc.collect() trte = pd.concat([train, test]) del train, test; gc.collect() # ============================================================================= # features # =============================================================================
# NOTE(review): line breaks reconstructed — this chunk arrived whitespace-mangled.
def main(num_rows=None):
    """Build plan/query features, target-encode per city, save as feather.

    Relies on module-level helpers: read_pickles, loadpkl, reduce_mem_usage,
    targetEncodingMultiClass, removeMissingVariables, removeCorrelatedVariables,
    to_feature, to_json, line_notify.
    """
    # load pkls
    df = read_pickles('../features/plans')
    queries = loadpkl('../features/queries.pkl')
    profiles = loadpkl('../features/profiles.pkl')
    queries_pred = loadpkl('../features/queries_pred.pkl')
    queries_profiles_pred = loadpkl('../features/queries_profiles_pred.pkl')

    # merge
    df = pd.merge(df, queries, on=['sid', 'click_mode'], how='left')
    df = pd.merge(df, profiles, on='pid', how='left')
    df = pd.merge(df, queries_pred, on='sid', how='left')
    df = pd.merge(df, queries_profiles_pred, on='sid', how='left')

    del queries, profiles, queries_pred, queries_profiles_pred
    gc.collect()

    # reduce memory usage
    df = reduce_mem_usage(df)

    # count features
    df['pid_count'] = df['pid'].map(df['pid'].value_counts())

    # time diff
    df['plan_req_time_diff'] = (df['plan_time'] - df['req_time']).astype(int)

    # distance ratio
    cols_plan_distance = ['plan_{}_distance'.format(i) for i in range(0, 7)]
    for i, c in enumerate(cols_plan_distance):
        df['plan_queries_distance_ratio{}'.format(i)] = df[c] / df['queries_distance']
        df['plan_queries_distance_diff{}'.format(i)] = df[c] - df['queries_distance']

    # stats features for preds
    cols_pred_queries = ['pred_queries{}'.format(i) for i in range(0, 12)]
    cols_pred_queries_profiles = ['pred_queries_profiles{}'.format(i)
                                  for i in range(0, 12)]

    df['pred_queries_mean'] = df[cols_pred_queries].mean(axis=1)
    df['pred_queries_sum'] = df[cols_pred_queries].sum(axis=1)
    df['pred_queries_max'] = df[cols_pred_queries].max(axis=1)
    df['pred_queries_min'] = df[cols_pred_queries].min(axis=1)
    df['pred_queries_var'] = df[cols_pred_queries].var(axis=1)
    df['pred_queries_skew'] = df[cols_pred_queries].skew(axis=1)

    df['pred_queries_profiles_mean'] = df[cols_pred_queries_profiles].mean(axis=1)
    df['pred_queries_profiles_sum'] = df[cols_pred_queries_profiles].sum(axis=1)
    df['pred_queries_profiles_max'] = df[cols_pred_queries_profiles].max(axis=1)
    df['pred_queries_profiles_min'] = df[cols_pred_queries_profiles].min(axis=1)
    df['pred_queries_profiles_var'] = df[cols_pred_queries_profiles].var(axis=1)
    df['pred_queries_profiles_skew'] = df[cols_pred_queries_profiles].skew(axis=1)

    # stats features for each classes
    print('stats features...')
    for i in tqdm(range(0, 12)):
        cols = ['pred_queries{}'.format(i), 'pred_queries_profiles{}'.format(i)]
        df['pred_mean{}'.format(i)] = df[cols].mean(axis=1)
        df['pred_sum{}'.format(i)] = df[cols].sum(axis=1)
        df['pred_max{}'.format(i)] = df[cols].max(axis=1)
        df['pred_min{}'.format(i)] = df[cols].min(axis=1)
        df['pred_var{}'.format(i)] = df[cols].var(axis=1)
        df['pred_skew{}'.format(i)] = df[cols].skew(axis=1)

        cols_target = [c for c in df.columns if '_target_{}'.format(i) in c]
        df['target_mean{}'.format(i)] = df[cols_target].mean(axis=1)
        df['target_sum{}'.format(i)] = df[cols_target].sum(axis=1)
        df['target_max{}'.format(i)] = df[cols_target].max(axis=1)
        df['target_min{}'.format(i)] = df[cols_target].min(axis=1)
        df['target_var{}'.format(i)] = df[cols_target].var(axis=1)
        df['target_skew{}'.format(i)] = df[cols_target].skew(axis=1)

    # post processing: zero out predictions for transport modes absent
    # from the session's plans
    cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(0, 7)]
    print('post processing...')
    for i in tqdm(range(1, 12)):
        tmp = np.zeros(len(df))
        for c in cols_transport_mode:
            tmp += (df[c] == i).astype(int)
        cols_target = [c for c in df.columns if '_target_{}'.format(i) in c]
        for c in cols_target + ['pred_queries{}'.format(i),
                                'pred_queries_profiles{}'.format(i)]:
            df[c] = df[c] * (tmp > 0)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # split data by city (origin coordinates; thresholds look hand-tuned —
    # TODO confirm city boundaries)
    df1 = df[df['y_o'] > 37.5]
    df2 = df[df['y_o'] < 27.5]
    df3 = df[df['x_o'] > 120.0]

    del df
    gc.collect()

    # cols for target encoding
    cols_target_encoding = [
        'plan_weekday', 'plan_hour', 'plan_is_holiday', 'plan_weekday_hour',
        'plan_is_holiday_hour', 'plan_num_plans', 'plan_num_free_plans',
        'x_o_round', 'y_o_round', 'x_d_round', 'y_d_round',
        'queries_distance_round'
    ]
    cols_ratio_plan = [
        'plan_price_distance_ratio_max_plan',
        'plan_price_distance_ratio_min_plan',
        'plan_price_eta_ratio_max_plan',
        'plan_price_eta_ratio_min_plan',
        'plan_distance_eta_ratio_max_plan',
        'plan_distance_eta_ratio_min_plan',
        'plan_price_distance_prod_max_plan',
        'plan_price_eta_prod_max_plan',
        'plan_price_distance_prod_min_plan',
        'plan_price_eta_prod_min_plan',
        'plan_distance_eta_prod_max_plan',
        'plan_distance_eta_prod_min_plan',
        'plan_price_distance_eta_prod_max_plan',
        'plan_price_distance_eta_prod_min_plan',
        'plan_distance_ratio_0_max_plan',
        'plan_distance_ratio_0_min_plan',
        'plan_price_ratio_0_max_plan',
        'plan_price_ratio_0_min_plan',
        'plan_eta_ratio_0_max_plan',
        'plan_eta_ratio_0_min_plan',
        'plan_price_distance_prod_ratio_0_max_plan',
        'plan_price_distance_prod_ratio_0_min_plan',
        'plan_price_eta_prod_ratio_0_max_plan',
        'plan_price_eta_prod_ratio_0_min_plan',
        'plan_distance_eta_prod_ratio_0_max_plan',
        'plan_distance_eta_prod_ratio_0_min_plan',
        'plan_price_distance_eta_prod_ratio_0_max_plan',
        'plan_price_distance_eta_prod_ratio_0_min_plan'
    ]
    cols_min_max_plan = [
        'plan_distance_max_plan', 'plan_distance_min_plan',
        'plan_price_max_plan', 'plan_price_min_plan',
        'plan_eta_max_plan', 'plan_eta_min_plan'
    ]
    cols_transport_mode = ['plan_{}_transport_mode'.format(i) for i in range(0, 7)]

    cols_target_encoding = (cols_target_encoding + cols_ratio_plan
                            + cols_min_max_plan + cols_transport_mode
                            + ['profile_k_means'])

    # target encoding for each cities
    print('traget encoding...')
    for i, df in tqdm(enumerate([df1, df2, df3])):
        # target encoding
        df = targetEncodingMultiClass(df, 'click_mode', cols_target_encoding)

        # change dtype (feather cannot hold float16)
        for col in df.columns.tolist():
            if df[col].dtypes == 'float16':
                df[col] = df[col].astype(np.float32)

        # remove missing variables
        col_missing = removeMissingVariables(df, 0.75)
        df.drop(col_missing, axis=1, inplace=True)

        # remove correlated variables
        col_drop = removeCorrelatedVariables(df, 0.95)
        df.drop(col_drop, axis=1, inplace=True)

        # save as feather
        to_feature(df, '../features/feats{}'.format(i + 1))

        # save feature name list
        features_json = {'features': df.columns.tolist()}
        to_json(features_json, '../features/00{}_all_features.json'.format(i + 1))

        del df
        gc.collect()

    line_notify('{} finished.'.format(sys.argv[0]))
# NOTE(review): line breaks reconstructed — this chunk arrived whitespace-mangled.
import utils_agg
import utils
utils.start(__file__)
#==============================================================================
PREF = 'f204_'
KEY = 'SK_ID_PREV'

# POS_CASH window: months -36 to -24 relative to application
month_start = -12 * 3  # -96
month_end = -12 * 2  # -96

os.system(f'rm ../feature_prev/t*_{PREF}*')
# =============================================================================
#
# =============================================================================
pos = utils.read_pickles('../data/POS_CASH_balance')
pos = pos[pos['MONTHS_BALANCE'].between(month_start, month_end)]
col_cat = ['NAME_CONTRACT_STATUS']

train = utils.read_pickles('../data/prev_train', [KEY])
test = utils.read_pickles('../data/prev_test', [KEY])

# =============================================================================
#
# =============================================================================
def aggregate():
    # one-hot encode categoricals before aggregating (truncated in source)
    df = utils.get_dummies(pos)
# NOTE(review): line breaks reconstructed — this chunk arrived whitespace-mangled.
import os
import utils
utils.start(__file__)
#==============================================================================
PREF = 'f110_'
KEY = 'SK_ID_CURR'

os.system(f'rm ../feature/t*_{PREF}*')
# =============================================================================
# load
# =============================================================================
train = utils.load_train(['SK_ID_CURR']).set_index('SK_ID_CURR')
test = utils.load_test(['SK_ID_CURR']).set_index('SK_ID_CURR')

prev = utils.read_pickles('../data/previous_application',
                          ['SK_ID_CURR', 'SK_ID_PREV'])

# =============================================================================
# prev
# =============================================================================
# summary statistics of the SK_ID_PREV identifier per applicant
gr = prev.groupby('SK_ID_CURR')
train['SK_ID_PREV_min'] = gr.SK_ID_PREV.min()
train['SK_ID_PREV_mean'] = gr.SK_ID_PREV.mean()
train['SK_ID_PREV_max'] = gr.SK_ID_PREV.max()
train['SK_ID_PREV_median'] = gr.SK_ID_PREV.median()
train['SK_ID_PREV_std'] = gr.SK_ID_PREV.std()
train['SK_ID_PREV_std-d-mean'] = train['SK_ID_PREV_std'] / train['SK_ID_PREV_mean']
train['SK_ID_PREV_max-m-min'] = train['SK_ID_PREV_max'] - train['SK_ID_PREV_min']
# NOTE(review): line breaks reconstructed — this chunk arrived whitespace-mangled.
import utils_agg
import utils
utils.start(__file__)
#==============================================================================
PREF = 'f506_'
KEY = 'SK_ID_CURR'

# bureau window: last half year of DAYS_CREDIT
day_start = -365 * 0.5  # -2922
day_end = -365 * 0  # -2922

os.system(f'rm ../feature/t*_{PREF}*')
# =============================================================================
#
# =============================================================================
bure = utils.read_pickles('../data/bureau')
bure = bure[bure['DAYS_CREDIT'].between(day_start, day_end)]
col_cat = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']

train = utils.load_train([KEY])
test = utils.load_test([KEY])

# =============================================================================
#
# =============================================================================
def aggregate():
    # one-hot encode categoricals before aggregating (truncated in source)
    df = utils.get_dummies(bure)
    li = []
# NOTE(review): line breaks reconstructed — this chunk arrived whitespace-mangled.
import pandas as pd
import gc
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from itertools import product
import utils
utils.start(__file__)

#==============================================================================
# load
#==============================================================================
usecols = ['user_id', 'order_number', 'product_id', 'product_name',
           'order_id', 'order_number_rev']
log = utils.read_pickles('../input/mk/log', usecols).sort_values(usecols[:3])

order_tbl = log[['order_id', 'user_id', 'order_number',
                 'order_number_rev']].drop_duplicates().reset_index(drop=True)

# previous 1..3 order ids per user
for i in range(1, 4):
    order_tbl['t-{}_order_id'.format(i)] = \
        order_tbl.groupby('user_id')['order_id'].shift(i)
order_tbl.dropna(inplace=True)

#order_pids = log.head(999999).groupby('order_id').product_id.apply(set).reset_index()
order_pids = log.groupby('order_id').product_id.apply(set).reset_index()

# attach the previous orders' product-id sets
order_tbl = pd.merge(order_tbl, order_pids.add_prefix('t-1_'),
                     on='t-1_order_id', how='inner')
order_tbl = pd.merge(order_tbl, order_pids.add_prefix('t-2_'),
                     on='t-2_order_id', how='inner')
# NOTE(review): line breaks reconstructed — this chunk arrived whitespace-mangled.
# setting
day_start = -365 * 4  # min: -2922
day_end = -365 * 3  # min: -2922
month_round = 1

PREF = 'ins_304_'
KEY = 'SK_ID_CURR'

os.system(f'rm ../feature/t*_{PREF}*')
# =============================================================================
# load
# =============================================================================
#ins = pd.read_csv('/Users/Kazuki/Home-Credit-Default-Risk/sample/sample_ins_0.csv')
ins = utils.read_pickles('../data/installments_payments')
#ins.drop('SK_ID_PREV', axis=1, inplace=True)
ins = ins[ins['DAYS_INSTALMENT'].between(day_start, day_end)]

# binary flags for "payment delayed more than i days" at thresholds 0,5,...,45,
# plus the delayed amount per threshold
col_delayed_day = []
col_delayed_money = []
col_delayed_money_ratio = []
for i in range(0, 50, 5):
    c1 = f'delayed_day_{i}'
    ins[c1] = (ins['days_delayed_payment'] > i) * 1
    col_delayed_day.append(c1)

    c2 = f'delayed_money_{i}'
    ins[c2] = ins[c1] * ins.AMT_PAYMENT
    col_delayed_money.append(c2)
*リークじゃない """ import pandas as pd import numpy as np from tqdm import tqdm import gc import utils utils.start(__file__) #============================================================================== # mk train * test log #============================================================================== tbl = utils.read_pickles('../input/mk/days_since_last_order') #============================================================================== # def #============================================================================== def make(T): """ T = 0 folder = 'trainT-0' """ if T==-1: folder = 'test' else: folder = 'trainT-'+str(T) label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder))
""" import pandas as pd import numpy as np from tqdm import tqdm import utils utils.start(__file__) #============================================================================== # load #============================================================================== col = ['order_id', 'user_id', 'product_id', 'order_number', 'order_number_rev'] log = utils.read_pickles('../input/mk/log', col).sort_values('user_id') #============================================================================== # def #============================================================================== def make(T): """ T = 0 folder = 'trainT-0' """ if T == -1: folder = 'test' else: folder = 'trainT-' + str(T)
""" import pandas as pd import numpy as np from tqdm import tqdm import utils utils.start(__file__) #============================================================================== # load #============================================================================== X_base = pd.read_pickle('../feature/X_base_t3.p') col = ['order_id', 'user_id', 'product_id', 'order_dow', 'order_hour_of_day', 'order_number_rev'] log = utils.read_pickles('../input/mk/log', col) log = pd.merge(log, pd.read_pickle('../input/mk/timezone.p'), on='order_hour_of_day', how='left') log['dow_tz'] = log.order_dow.map(str) + '_' + log.timezone #============================================================================== # train #============================================================================== def make(T): log_tr = log[log.order_number_rev>T] # dow dow = pd.crosstab(log_tr.user_id, log_tr.order_dow).add_prefix('user_dow_freq_') dow_ = pd.crosstab(log_tr.user_id, log_tr.order_dow, normalize='index').add_prefix('user_dow_norm_') # timezone
# NOTE(review): line breaks reconstructed — this chunk arrived whitespace-mangled.
import utils_agg
import utils
utils.start(__file__)
#==============================================================================
PREF = 'f205_'
KEY = 'SK_ID_CURR'

# POS_CASH window: last half year of MONTHS_BALANCE
month_start = -12 * 0.5  # -96
month_end = -12 * 0  # -96

os.system(f'rm ../feature/t*_{PREF}*')
# =============================================================================
#
# =============================================================================
pos = utils.read_pickles('../data/POS_CASH_balance')
pos = pos[pos['MONTHS_BALANCE'].between(month_start, month_end)]
col_cat = ['NAME_CONTRACT_STATUS']

train = utils.load_train([KEY])
test = utils.load_test([KEY])

# =============================================================================
#
# =============================================================================
def aggregate():
    # one-hot encode categoricals before aggregating (truncated in source)
    df = utils.get_dummies(pos)
Created on Wed Jun 14 00:00:43 2017 @author: konodera oder_num - last_order_num """ import pandas as pd import numpy as np from tqdm import tqdm import utils utils.start(__file__) col = ['order_id', 'user_id', 'product_id', 'order_number', 'order_number_rev'] log = utils.read_pickles('../input/mk/log', col).sort_values(['user_id', 'order_number']) orders = pd.read_csv('../input/orders.csv.gz', usecols=['order_id', 'order_number']) X_base = pd.read_pickle('../feature/X_base_t3.p') X_base = pd.merge(X_base, orders, on='order_id', how='left') #============================================================================== # def #============================================================================== def make(T): """ T = 0 folder = 'trainT-0'
# NOTE(review): line breaks reconstructed — this chunk arrived whitespace-mangled.
import utils_agg
import utils
utils.start(__file__)
#==============================================================================
PREF = 'f401_'
KEY = 'SK_ID_PREV'

# credit-card window: last 10 years of MONTHS_BALANCE
month_start = -12 * 10  # -96
month_end = -12 * 0  # -96

os.system(f'rm ../feature_prev/t*_{PREF}*')
# =============================================================================
#
# =============================================================================
cre = utils.read_pickles('../data/credit_card_balance')
cre = cre[cre['MONTHS_BALANCE'].between(month_start, month_end)]
col_cat = ['NAME_CONTRACT_STATUS']

train = utils.read_pickles('../data/prev_train', [KEY])
test = utils.read_pickles('../data/prev_test', [KEY])

# =============================================================================
#
# =============================================================================
def aggregate():
    # one-hot encode categoricals before aggregating (truncated in source)
    df = utils.get_dummies(cre)
# NOTE(review): line breaks reconstructed — this chunk arrived whitespace-mangled.
def main():
    """Build M5 sales features (lags, prices, momentum), save as feather.

    Relies on module-level helpers: read_pickles, loadpkl, make_lags,
    reduce_mem_usage, to_feature, to_json, line_notify.
    """
    # load pkls
    df = read_pickles('../feats/sales')
    df_calendar = loadpkl('../feats/calendar.pkl')
    df_sell_prices = loadpkl('../feats/sell_prices.pkl')

    # merge
    df = df.merge(df_calendar, on='d', how='left')
    df = df.merge(df_sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'],
                  how='left')

    del df_calendar, df_sell_prices
    gc.collect()

    # drop pre-release rows
    df = df[df['wm_yr_wk'] >= df['release']]

    # make lag features
    df = make_lags(df, 14)

    # add categorical features (id crosses)
    df['item_id_store_id'] = df['item_id'] + '_' + df['store_id']
    df['item_id_state_id'] = df['item_id'] + '_' + df['state_id']
    df['dept_id_store_id'] = df['dept_id'] + '_' + df['store_id']
    df['dept_id_state_id'] = df['dept_id'] + '_' + df['state_id']

    # label encoding (factorize; -1 for NaN is mapped back to NaN)
    cols_string = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
                   'item_id_store_id', 'item_id_state_id',
                   'dept_id_store_id', 'dept_id_state_id']
    for c in cols_string:
        df[c], _ = pd.factorize(df[c])
        df[c].replace(-1, np.nan, inplace=True)

    # add price features
    df_grouped = df[['id', 'sell_price']].groupby('id')['sell_price']
    df['shift_price_t1'] = df_grouped.transform(lambda x: x.shift(1))
    df['price_change_t1'] = (df['shift_price_t1'] - df['sell_price']) \
        / (df['shift_price_t1'])
    df['rolling_price_max_t365'] = df_grouped.transform(
        lambda x: x.shift(1).rolling(365).max())
    df['price_change_t365'] = (df['rolling_price_max_t365'] - df['sell_price']) \
        / (df['rolling_price_max_t365'])
    df['rolling_price_std_t7'] = df_grouped.transform(
        lambda x: x.rolling(7).std())
    df['rolling_price_std_t30'] = df_grouped.transform(
        lambda x: x.rolling(30).std())

    # features release date
    df['release'] = df['release'] - df['release'].min()

    # price momentum by month & year
    df['price_momentum_m'] = df['sell_price'] / df.groupby(
        ['store_id', 'item_id', 'month'])['sell_price'].transform('mean')
    df['price_momentum_y'] = df['sell_price'] / df.groupby(
        ['store_id', 'item_id', 'year'])['sell_price'].transform('mean')

    # days for CustomTimeSeriesSplitter: numeric part of 'd_###'
    df['d_numeric'] = df['d'].apply(lambda x: str(x)[2:]).astype(int)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save as feather
    to_feature(df, '../feats/f103')

    # save feature name list
    features_json = {'features': df.columns.tolist()}
    to_json(features_json, '../configs/103_all_features_14days.json')

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
# NOTE(review): line breaks reconstructed — this chunk arrived whitespace-mangled.
def mk_submit(HEAD):
    """Train with the top-HEAD features (by importance) and write a submission.

    Relies on module-level globals: `imp`, `param`, `LOOP`, `NFOLD`,
    `SUBMIT_FILE_PATH`, `utils`, `utils_cat`, `ex`.
    """
    SUBMIT_FILE_PATH_ = SUBMIT_FILE_PATH.replace('feature', str(HEAD))
    files_tr = ('../feature/train_' + imp.head(HEAD).feature + '.f').tolist()
    files_te = ('../feature/test_' + imp.head(HEAD).feature + '.f').tolist()

    # =========================================================================
    # load
    # =========================================================================
    # train
    X_train = pd.concat([pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)],
                        axis=1)
    y_train = utils.read_pickles('../data/label').TARGET

    X_train.head().to_csv(SUBMIT_FILE_PATH_.replace('.csv', '_X.csv'),
                          index=False, compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')

    gc.collect()

    CAT = list(set(X_train.columns) & set(utils_cat.ALL))
    COL = X_train.columns.tolist()

    # test
    X_test = pd.concat([pd.read_feather(f) for f in tqdm(files_te, mininterval=60)],
                       axis=1)[COL]

    # =========================================================================
    # training with cv
    # =========================================================================
    dtrain = lgb.Dataset(X_train, y_train, categorical_feature=CAT,
                         free_raw_data=False)

    model_all = []
    y_pred = pd.Series(0, index=y_train.index)
    for i in range(LOOP):
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param, dtrain, 9999, nfold=NFOLD,
                             early_stopping_rounds=100, verbose_eval=50,
                             seed=i)
        model_all += models
        # accumulate rank-transformed OOF predictions across seeds
        y_pred += ex.eval_oob(X_train, y_train, models, i).rank()

    y_pred /= y_pred.max()
    auc_mean = roc_auc_score(y_train, y_pred)
    result = f"CV auc-mean(feature {HEAD}): {auc_mean}"
    print(result)
    utils.send_line(result)

    # =========================================================================
    # predict
    # =========================================================================
    sub = pd.read_pickle('../data/sub.p')
    gc.collect()

    label_name = 'TARGET'
    # rank-average the fold models' predictions, then normalize to [0, 1]
    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()

    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)
    sub.to_csv(SUBMIT_FILE_PATH_, index=False, compression='gzip')
# NOTE(review): line breaks reconstructed — this chunk arrived whitespace-mangled.
import utils
utils.start(__file__)
#==============================================================================
PREF = 'f305_'
KEY = 'SK_ID_CURR'

# installments window: last 10 years of DAYS_INSTALMENT
day_start = -365 * 10  # min: -2922
day_end = -365 * 0  # min: -2922

os.system(f'rm ../feature/t*_{PREF}*')
# =============================================================================
#
# =============================================================================
prev = utils.read_pickles('../data/previous_application',
                          ['SK_ID_PREV', 'NAME_CONTRACT_TYPE'])

train = utils.load_train([KEY])
test = utils.load_test([KEY])

# =============================================================================
#
# =============================================================================
def multi_agg(args):
    """Aggregate one installments pickle filtered to the day window and to
    one contract type (truncated in source)."""
    path, pref, cont_type, cont_type_pref = args
    print(args)
    ins = utils.read_pickles(path)
    ins = ins[ins['DAYS_INSTALMENT'].between(day_start, day_end)]
    ins = pd.merge(ins, prev, on='SK_ID_PREV', how='left')
import os from sklearn.preprocessing import LabelEncoder import utils utils.start(__file__) #============================================================================== PREF = 'f159_' KEY = 'SK_ID_CURR' os.system(f'rm ../feature/t*_{PREF}*') # ============================================================================= # load # ============================================================================= prev = utils.read_pickles('../data/future_application') #base = prev[[KEY]].drop_duplicates().set_index(KEY) # latest prev_l = prev.sort_values([KEY, 'DAYS_DECISION'], ascending=[True, False]).drop_duplicates( KEY, keep='last').reset_index(drop=True) # ============================================================================= # label encoding # ============================================================================= categorical_features = prev_l.select_dtypes('O').columns.tolist() """ ['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START',
LOOP = 20 #============================================================================== # load #============================================================================== order_tbl = pd.read_pickle('../input/mk/order_tbl.p')[[ 'order_id', 'user_id', 'order_number' ]].sort_values(['user_id', 'order_number', 'order_id']) for i in range(1, LOOP): order_tbl['t-{}_order_id'.format(i)] = order_tbl.groupby( 'user_id')['order_id'].shift(i) col = [c for c in order_tbl.columns if 'order_id' in c] order_tbl = order_tbl[col] col = ['order_id', 'user_id', 'order_number', 'product_id', 'reordered'] log = utils.read_pickles('../input/mk/log', col) log.sort_values(['user_id', 'order_number', 'product_id'], inplace=True) #============================================================================== # def #============================================================================== def multi(T): """ T = 0 folder = 'trainT-0' """ if T == -1: folder = 'test' else: folder = 'trainT-' + str(T)
import pandas as pd import gc from glob import glob from tqdm import tqdm from multiprocessing import Pool import utils utils.start(__file__) #============================================================================== KEY = 'SK_ID_CURR' PREF = 'prev_104' # ============================================================================= # feature # ============================================================================= prev = utils.read_pickles('../data/previous_application') prev = prev[prev['NAME_CONTRACT_STATUS'] == 'Unused offer'] prev = utils.get_dummies(prev) prev.columns = [c.replace('/', '') for c in prev.columns] prev.drop('SK_ID_PREV', axis=1, inplace=True) base = prev[[KEY]].drop_duplicates().set_index(KEY) gr = prev.groupby(KEY) train = utils.load_train([KEY]) test = utils.load_test([KEY])
files = sorted(glob('../feature/train*.f')) remove_files = [] if len(remove_names) > 0: for i in files: for j in remove_names: if i.endswith(j + '.f'): remove_files.append(i) break print(f'remove {len(remove_files)} files') files = sorted(list(set(files) - set(remove_files))) print(f'read {len(files)} files') X_train = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=100)], axis=1) y_train = utils.read_pickles('../data/label').TARGET if X_train.columns.duplicated().sum() > 0: raise Exception( f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }') print('no dup :) ') print(f'X_train.shape {X_train.shape}') dtrain = lgb.Dataset(X_train, y_train, categorical_feature=categorical_feature) del X_train, y_train gc.collect() models = [] for i in range(LOOP): gc.collect()
import pandas as pd import sys sys.path.append('/home/kazuki_onodera/Python') import lgbmextension as ex import lightgbm as lgb import multiprocessing from glob import glob import utils utils.start(__file__) #============================================================================== SEED = 71 folders = sorted(glob('../data/*_train')) X = pd.concat([utils.read_pickles(f) for f in (folders)], axis=1) y = utils.read_pickles('../data/label').TARGET if X.columns.duplicated().sum() > 0: raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }') print('no dup :) ') print(f'X.shape {X.shape}') print(f'folders: {folders}') param = { 'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.01, 'max_depth': -1, 'num_leaves': 255, 'max_bin': 255,
@author: konodera """ import pandas as pd import numpy as np from tqdm import tqdm from collections import defaultdict import utils #utils.start(__file__) #============================================================================== # load #============================================================================== col = ['user_id', 'order_number', 'order_id'] log = utils.read_pickles('../input/mk/log', col).drop_duplicates().sort_values(col) ai_dep = pd.read_pickle('../input/mk/order_aisle-department.p') log = pd.merge(log, ai_dep, on='order_id', how='left') #============================================================================== # calc #============================================================================== col = [c for c in log.columns if 'aisle_' in c or 'dep' in c] di = defaultdict(int) uid_bk = None li1 = [] for args in tqdm(log[['user_id']+col].values): uid = args[0]
X_head = X.head().drop('is_attributed', axis=1) X_head.to_pickle('X_head.p') del X gc.collect() system('rm ../data/803_tmp*.p') """ X_head = pd.read_pickle('X_head.p') """ # ============================================================================= # # test # ============================================================================= sub = utils.read_pickles('../data/test_old', ['click_id']) load_folders = sorted(glob('../data/*_test/')) + ['../data/test_old/'] args = list(zip(load_folders, range(len(load_folders)))) pool = Pool(15) pool.map(multi_test, args) pool.close() print('concat test') load_files = sorted(glob('../data/803_tmp*.p')) X = pd.concat([pd.read_pickle(f) for f in load_files], axis=1) print('test.shape should be 18790469:', X[X_head.columns].shape) print('X.isnull().sum().sum():', X.isnull().sum().sum()) system('rm -rf ../data/dtest')
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Fri May 26 17:07:09 2017 @author: konodera """ import pandas as pd import numpy as np from tqdm import tqdm import utils utils.start(__file__) col = ['order_id', 'user_id', 'product_id', 'order_number', 'reordered', 'order_number_rev'] log = utils.read_pickles('../input/mk/log', col).rename(columns={'reordered':'y'}) test_order = pd.read_pickle('../input/mk/test_user.p') #============================================================================== # train #============================================================================== def make(T): label_t1 = log[log.order_number_rev>T] label_t1.drop_duplicates(['user_id','product_id'], keep='last', inplace=True) label_t1.sort_values(['user_id','product_id'], inplace=True) label_t0_y1 = log.loc[log.order_number_rev==T].loc[log.y==1] label_t0_y1.sort_values(['user_id','product_id'], inplace=True) label_t1['key'] = label_t1.user_id.map(str) + ' ' + label_t1.product_id.map(str)
#import numpy as np import pandas as pd #from tqdm import tqdm import gc import os from glob import glob from multiprocessing import Pool nthread = 12 #from collections import defaultdict import utils utils.start(__file__) os.system('rm -rf ../data/004*') trte = pd.concat([ utils.read_pickles('../data/train', ['ip', 'app', 'device', 'os', 'channel', 'click_time']), utils.read_pickles('../data/test_old', ['ip', 'app', 'device', 'os', 'channel', 'click_time']) ], ignore_index=True) def multi(count_keys): """ ex: count_keys = ('app', 'device') """ gc.collect() print(count_keys)
import multiprocessing as mp import utils utils.start(__file__) #============================================================================== # load #============================================================================== col = ['order_id', 'user_id', 'product_name', 't-1_product_name', 'order_number', 'order_number_rev'] order_tbl = pd.read_pickle('../input/mk/order_tbl.p')[col] order_tbl.sort_values(['user_id','order_number'], inplace=1) order_tbl['t-1_order_id'] = order_tbl.groupby('user_id')['order_id'].shift(1) order_tbl.reset_index(drop=True, inplace=True) prods = pd.read_pickle('../input/mk/goods.p')[['product_id','product_name']] log = utils.read_pickles('../input/mk/log', ['order_id', 'product_id', 'order_number_rev']) order_item_array = log.groupby('order_id').product_id.apply(np.array).reset_index() del log; gc.collect() #============================================================================== # def #============================================================================== #def multi1(i): # try: # item_prior, item_now = order_tbl.loc[i,['t-1_product_name', 'product_name']] # item2item = [i1+' -> '+i2 for i1, i2 in list(product(item_prior, item_now))] # return Counter(item2item) # except: # return # #def multi2(i): # try:
import pandas as pd import numpy as np from collections import defaultdict from tqdm import tqdm import utils utils.start(__file__) #============================================================================== # load #============================================================================== usecols = [ 'user_id', 'product_id', 'order_number', 'reordered', 'order_number_rev' ] log = utils.read_pickles('../input/mk/log', usecols).sort_values(usecols[:3]) #============================================================================== # def #============================================================================== def make(T): """ T = 0 folder = 'trainT-0' """ if T == -1: folder = 'test' else: folder = 'trainT-' + str(T)
Created on Mon Aug 7 13:58:58 2017 @author: konodera """ import pandas as pd import numpy as np import utils #utils.start(__file__) #============================================================================== # load #============================================================================== usecols = ['product_id', 'order_dow', 'order_number_rev'] log = utils.read_pickles('../input/mk/log', usecols) #============================================================================== # def #============================================================================== def make(T): """ T = 0 folder = 'trainT-0' """ if T==-1: folder = 'test' else: folder = 'trainT-'+str(T)
# for r in REMOVE_FEATURES: # if r in f: # sw = True # break # if not sw: # tmp.append(f) #files = tmp print('features:', len(files)) X = pd.concat([ pd.read_feather(f).head(HEAD) for f in tqdm(files, mininterval=60) ], axis=1) y = utils.read_pickles('../data/label').head(HEAD).TARGET if X.columns.duplicated().sum()>0: raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }') print('no dup :) ') print(f'X.shape {X.shape}') #X = X.rank(method='dense') gc.collect() CAT = list( set(X.columns)&set(utils_cat.ALL)) # ============================================================================= # imp # =============================================================================
@author: konodera """ import pandas as pd import numpy as np from tqdm import tqdm import utils utils.start(__file__) #============================================================================== # load #============================================================================== col = ['order_id', 'user_id', 'product_id', 'order_number', 'order_dow', 'order_hour_of_day', 'order_number_rev'] log = utils.read_pickles('../input/mk/log', col).sort_values(['user_id', 'product_id', 'order_number']) #============================================================================== # def #============================================================================== def make(T): """ T = 0 folder = 'trainT-0' """ if T==-1: folder = 'test' else: folder = 'trainT-'+str(T)
def mk_submit():
    """Train LightGBM with user-grouped CV and write a gzip submission.

    Relies on module-level globals: loader, features, SUBMIT_FILE_PATH,
    NFOLD, LOOP, param, COMMENT, EXE_SUBMIT, GroupKFold, utils, lgb.
    Side effects: writes a head-sample CSV and the submission CSV, sends
    the CV result via utils.send_line, and optionally submits.
    """
    files_tr = ('../feature/train_' + features + '.f').tolist()
    files_te = ('../feature/test_' + features + '.f').tolist()
    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = loader.train()
    X_train_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)],
        axis=1)
    X_train = pd.concat([X_train, X_train_], axis=1)
    y_train = utils.read_pickles('../data/label').TARGET
    # Dump the first rows so the exact feature set used can be inspected later.
    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False, compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')
    gc.collect()

    # Categorical columns present in this feature set.
    CAT = list(set(X_train.columns) & set(loader.category()))
    COL = X_train.columns.tolist()

    # test -- indexing with [COL] enforces the same column order as X_train
    X_test = loader.test()
    X_test_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_te, mininterval=60)],
        axis=1)
    X_test = pd.concat([X_test, X_test_], axis=1)[COL]

    # =============================================================================
    # groupKfold
    # =============================================================================
    # SK_ID_CURR -> user_id mapping; folds are grouped by user so the same
    # user never appears in both a training and a validation fold.
    sk_tbl = pd.read_csv('../data/user_id_v8.csv.gz')  # TODO: check
    user_tbl = sk_tbl.user_id.drop_duplicates().reset_index(
        drop=True).to_frame()

    sub_train = pd.read_csv('../input/application_train.csv.zip',
                            usecols=['SK_ID_CURR']).set_index('SK_ID_CURR')
    sub_train['y'] = y_train.values

    group_kfold = GroupKFold(n_splits=NFOLD)

    # =============================================================================
    # training with cv
    # =============================================================================
    model_all = []
    auc_mean = 0
    for i in range(LOOP):
        dtrain = lgb.Dataset(X_train, y_train,
                             categorical_feature=CAT,
                             free_raw_data=False)
        # shuffle fold ids -- users are re-assigned to folds every iteration
        ids = list(range(user_tbl.shape[0]))
        np.random.shuffle(ids)
        user_tbl['g'] = np.array(ids) % NFOLD
        sk_tbl_ = pd.merge(sk_tbl, user_tbl, on='user_id',
                           how='left').set_index('SK_ID_CURR')
        sub_train['g'] = sk_tbl_.g
        folds = group_kfold.split(X_train, sub_train['y'], sub_train['g'])

        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param, dtrain, 9999,
                             folds=folds,
                             early_stopping_rounds=100,
                             verbose_eval=50,
                             seed=i)
        model_all += models
        # Track the final CV AUC of each seed; averaged after the loop.
        auc_mean += ret['auc-mean'][-1]
    auc_mean /= LOOP

    result = f"CV auc-mean({COMMENT}): {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')
    gc.collect()

    label_name = 'TARGET'
    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        # Rank-average ensemble across all fold models.
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()  # normalize to (0, 1]

    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
@author: konodera """ import pandas as pd import numpy as np from tqdm import tqdm import utils utils.start(__file__) #============================================================================== # load #============================================================================== col = ['order_id', 'user_id', 'product_id', 'order_number_rev'] log = utils.read_pickles('../input/mk/log', col).sort_values('user_id') streak = pd.read_pickle('../input/mk/streak_order-product.p') #============================================================================== # def #============================================================================== def make(T): """ T = 0 folder = 'trainT-0' """ if T==-1: folder = 'test' else: folder = 'trainT-'+str(T)
from sklearn.preprocessing import LabelEncoder import utils utils.start(__file__) #============================================================================== PREF = 'f402_' KEY = 'SK_ID_PREV' month_start = -12*1 # -96 month_end = -12*0 # -96 os.system(f'rm ../feature_prev/t*_{PREF}*') # ============================================================================= # load # ============================================================================= cre = utils.read_pickles('../data/credit_card_balance').drop('SK_ID_CURR', axis=1) cre = cre[cre['MONTHS_BALANCE'].between(month_start, month_end)] train = utils.read_pickles('../data/prev_train', [KEY]) test = utils.read_pickles('../data/prev_test', [KEY]) le = LabelEncoder() cre['NAME_CONTRACT_STATUS'] = le.fit_transform( cre['NAME_CONTRACT_STATUS'] ) col = [c for c in cre.columns if c.startswith('app_')] cre.drop(col, axis=1, inplace=True) # ============================================================================= # feature # ============================================================================= df = pd.pivot_table(cre, index=KEY, columns='MONTHS_BALANCE') df.columns = pd.Index([f'{e[0]}_{e[1]}' for e in df.columns.tolist()])