def doit(self):
    """Run a two-stage check over the ``subset_exp_*`` prediction files.

    Stage 1 loads ``subset_exp_0`` .. ``subset_exp_99``, prints summary
    statistics, correlations and scores, then calls ``self.do_preds`` and
    uses its return value as the list of indices to keep.

    Stage 2 reloads those files together with ``subset_exp_0`` ..
    ``subset_exp_9`` (the latter under ``"bin"``-prefixed column names),
    prints the correlations again and calls ``self.do_cv_pred``.
    """
    stopwatch = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())

    def as_pairs(indices, col_prefix=""):
        # Every entry is (file_name, col_name).
        names = ["subset_exp_{}".format(i) for i in indices]
        return [(name, col_prefix + name) for name in names]

    # Stage 1: all 100 subset experiments.
    files = as_pairs(range(100))
    train, test = self.make_files(files)
    stopwatch.time("load csv in ")

    print(train.describe())
    self.print_corr(train, test, files)
    stopwatch.time("corr check")
    self.print_score(train, files)
    stopwatch.time("score check")

    sig_idx = self.do_preds(train, files)

    # Stage 2: selected experiments plus the first ten as "bin" columns.
    base_files = as_pairs(sig_idx)
    bin_files = as_pairs(range(10), col_prefix="bin")
    train, test = self.make_files(base_files, bin_files)
    files = base_files + bin_files
    self.print_corr(train, test, files)
    self.do_cv_pred(train, test, files)
def doit(self):
    """Check and blend a hand-picked subset of the ``subset_exp_*`` files.

    Loads the files whose indices are listed in ``sig_idx``, prints summary
    statistics, correlations and scores, then calls ``self.do_cv_pred``.
    """
    stopwatch = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())

    # Index subsets. Only ``sig_idx`` is read below; the other two lists are
    # not used by this method.
    sig_idx = [
        0, 9, 12, 13, 18, 19, 22, 24, 28, 29, 33, 35, 36, 37, 43, 45, 52, 54,
        55, 56, 58, 59, 61, 65, 67, 69, 71, 72, 74, 75, 76, 77, 78, 81, 83,
        84, 88, 90, 91, 97, 98
    ]
    pos_sig_idx = [
        9, 12, 19, 24, 33, 36, 45, 52, 54, 55, 58, 59, 65, 67, 69, 72, 74,
        75, 77, 78, 83, 88, 91, 97
    ]
    sig2_idx = [12, 24, 36, 58, 65, 67, 69, 72, 75, 91, 97]

    # Every entry is (file_name, col_name).
    # files = ["subset_exp_" + str(idx) for idx in range(100)]
    names = ["subset_exp_{}".format(i) for i in sig_idx]
    files = [(name, name) for name in names]

    train, test = self.make_files(files)
    stopwatch.time("load csv in ")

    print(train.describe())
    self.print_corr(train, test, files)
    stopwatch.time("corr check")
    self.print_score(train, files)
    stopwatch.time("score check")

    self.do_cv_pred(train, test, files)
def doit(self):
    """Check and blend two own models with a list of team model files.

    Builds the ``(file_name, col_name)`` list, loads the files through
    ``self.make_files``, prints summary statistics, correlations and scores,
    then calls ``self.do_cv_pred``.
    """
    stopwatch = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())

    # Every entry is (file_name, col_name).
    own_models = [
        ("team_v63", "lgb"),
        ("big_mlp", "mlp"),
    ]
    # NOTE(review): which entries are disabled was reconstructed assuming one
    # entry per line in the original layout - confirm against history.
    team_files = [
        'select_v44_ridge',
        'tune_stack_57_v1',
        'select_v51_ridge',
        # 'tune_stack_57_2_v1',
        'tune_stack_cgb_v1',
        # 'elo_rnd_feat_bridge',
        'outlier_lgb_v3_kh_time_feature2_pocket',
        # 'delete_outlier_kh_pocket_stack_correct_ridge',
        # 'outlier_lgb_pocket_logistic',
        'delete_outlier_kh_pocket_stack_correct2_ridge',
    ]
    # Team files use their file name as the column name.
    files = own_models + [(name, name) for name in team_files]

    train, test = self.make_files(files)
    stopwatch.time("load csv in ")

    print(train.describe())
    self.print_corr(train, test, files)
    stopwatch.time("corr check")
    self.print_score(train, files)
    stopwatch.time("score check")

    self.do_cv_pred(train, test, files)
def doit(self):
    """Check and blend the regression, ``bin`` and ``no_out`` model outputs.

    After loading, each ``no_out*`` training column is multiplied by
    ``1 - <matching bin column>``.  Summary statistics, correlations and
    scores are printed, then ``self.do_preds`` is called.
    """
    stopwatch = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())

    # Every entry is (file_name, col_name).
    files = [
        ("org_param", "big"),
        ("medium", "medium"),
        ("mlp3", "mlp"),
        # ("mlp_rank", "mlp_rank"),
        ("bin", "bin"),
        ("no_out2", "no_out2"),
        ("bin_large", "bin_large"),
        ("no_out_large", "no_out_large"),
        ("tune_param", "tune_param"),
    ]
    train, test = self.make_files(files)
    stopwatch.time("load csv in ")

    # NOTE(review): the gating below is applied to ``train`` only; ``test``
    # keeps the raw ``no_out*`` columns - confirm this is intended.
    for gate_col, value_col in (("bin", "no_out2"),
                                ("bin_large", "no_out_large")):
        train[value_col] = (1 - train[gate_col]) * train[value_col]

    print(train.describe())
    self.print_corr(train, test, files)
    stopwatch.time("corr check")
    self.print_score(train, files)
    stopwatch.time("score check")

    self.do_preds(train, test, files)
def load_whole_input(self):
    """Load train/test, merge the feature files and run ``JitFe.do_fe``.

    Steps, in order:
      1. read ``TRAIN1`` / ``TEST1``;
      2. pass each path in the feature list to ``self.load_file_and_merge``;
      3. left-join the ``NEW_DAY_PRED_OOF`` file onto train and the
         ``NEW_DAY_PRED_SUB`` file onto test, on ``card_id``;
      4. apply ``jit_fe.JitFe().do_fe`` to both frames.

    Returns:
        The ``(train, test)`` pair produced by step 4.
    """
    stopwatch = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())
    reader = pocket_file_io.GoldenCsv()

    train = reader.read_file(path_const.TRAIN1)
    test = reader.read_file(path_const.TEST1)

    feature_paths = [
        path_const.RE_NEW_TRANS1,
        path_const.RE_OLD_TRANS1,
        path_const.OLD_TRANS3,
        path_const.NEW_TRANS6,
        path_const.OLD_TRANS6,
        path_const.OLD_TRANS9,
        # path_const.NEW_TRANS11,
        # path_const.OLD_TRANS11,
    ]
    for feature_path in feature_paths:
        train, test = self.load_file_and_merge(
            train, test, feature_path, reader)

    oof_pred = reader.read_file(path_const.NEW_DAY_PRED_OOF)
    sub_pred = reader.read_file(path_const.NEW_DAY_PRED_SUB)
    train = pd.merge(train, oof_pred, on="card_id", how="left")
    test = pd.merge(test, sub_pred, on="card_id", how="left")
    # train, test = self.load_lda(train, test, csv_io)

    for frame in (train, test):
        print(frame.shape)
    stopwatch.time("load csv in ")

    feature_engineer = jit_fe.JitFe()
    train = feature_engineer.do_fe(train)
    test = feature_engineer.do_fe(test)
    return train, test
def load_ts():
    """Read the ``NEW_NUM``, ``NEW_CAT`` and ``NEW_KEY`` files and time it.

    The three loaded objects are neither returned nor stored by the code
    visible here; the function returns ``None``.
    """
    stopwatch = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())
    reader = pocket_file_io.GoldenCsv()

    num = reader.read_file(path_const.NEW_NUM)
    cat = reader.read_file(path_const.NEW_CAT)
    key = reader.read_file(path_const.NEW_KEY)

    stopwatch.time("load ts")
def __init__(self):
    """Set up the feature-name lists, the logger and the timer.

    ``self.small_col`` is a list of feature names and ``self.medium_col``
    is that list plus ``"pred_diff"``.  ``lda_col`` is built but not
    attached to the instance (the line that would extend ``small_col``
    with it is disabled).
    """
    # NOTE(review): the live/disabled split of the entries below was
    # reconstructed from the trailing-number annotation pattern - confirm.
    # The trailing numbers are the original author's annotations.
    self.small_col = [
        # "new_trans_elapsed_days_max", "new_trans_elapsed_days_min", "new_trans_elapsed_days_mean",  # 0.001
        "old_trans_elapsed_days_max",
        "old_trans_elapsed_days_min",
        "old_trans_elapsed_days_mean",  # 0.025 mean001
        # "new_last_day",  # 0.005
        "new_to_last_day",
        "old_installments_sum",
        "old_installments_mean",  # 0.005
        "old_month_nunique",
        "old_woy_nunique",  # 0.010
        "old_merchant_id_nunique",  # 0.002
        "new_month_lag_mean",
        "old_month_lag_mean",
        "elapsed_days",  # 0.010
        "new_purchase_amount_max",
        "new_purchase_amount_count",
        "new_purchase_amount_mean",  # 0.020
        "old_purchase_amount_max",
        "old_purchase_amount_count",
        "old_purchase_amount_mean",  # 0.002
        "old_category_1_mean",
        "new_category_1_mean",  # 0.006
        "old_authorized_flag_sum",  # "old_authorized_flag_mean", bad?
        "old_no_city_purchase_amount_min",  # 0.003
        "old_no_city_purchase_amount_max",
        "old_no_city_purchase_amount_mean",  # 0.002
        "rec1_purchase_amount_count",  # 0.005
        "old_month_lag_max",  # 0.002
        "new_time_diff_mean",
        "new_trans_elapsed_days_std",  # 0.002
        "old_month_diff_mean",
        "old_pa2_month_diff_min",  # 0.004
        # "old_mer_cnt_whole_mean",  # 0.001
    ]

    # Five LDA columns for each of the two source fields.
    lda_col = [
        'lda-merchant_category_id-0',
        'lda-merchant_category_id-1',
        'lda-merchant_category_id-2',
        'lda-merchant_category_id-3',
        'lda-merchant_category_id-4',
        'lda-month_lag-0',
        'lda-month_lag-1',
        'lda-month_lag-2',
        'lda-month_lag-3',
        'lda-month_lag-4',
    ]
    # self.small_col += lda_col

    self.medium_col = self.small_col + ["pred_diff"]

    self.logger = pocket_logger.get_my_logger()
    self.timer = pocket_timer.GoldenTimer(self.logger)
def doit(self):
    """Check and blend the files listed by ``ens_loader.get_team_ens()``.

    Loads the files through ``self.make_files``, prints summary statistics,
    correlations and scores, then calls ``self.do_preds``.
    """
    stopwatch = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())

    # Every entry is (file_name, col_name).
    files = ens_loader.get_team_ens()
    train, test = self.make_files(files)
    stopwatch.time("load csv in ")

    print(train.describe())
    self.print_corr(train, test, files)
    stopwatch.time("corr check")
    self.print_score(train, files)
    stopwatch.time("score check")

    self.do_preds(train, test, files)
def doit(self):
    """Check ``subset_exp_40`` .. ``subset_exp_69`` and run ``do_preds``.

    Loads the files through ``self.make_files``, prints summary statistics,
    correlations and scores, then calls ``self.do_preds`` on the training
    frame.
    """
    stopwatch = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())

    # Every entry is (file_name, col_name); both are the same here.
    names = ["subset_exp_{}".format(i) for i in range(40, 70)]
    files = [(name, name) for name in names]

    train, test = self.make_files(files)
    stopwatch.time("load csv in ")

    print(train.describe())
    self.print_corr(train, test, files)
    stopwatch.time("corr check")
    self.print_score(train, files)
    stopwatch.time("score check")

    self.do_preds(train, files)
def doit(self):
    """Check and blend five team-level model outputs.

    Loads the files through ``self.make_files``, prints summary statistics,
    correlations and scores, then calls ``self.do_cv_pred``.
    """
    stopwatch = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())

    # Every entry is (file_name, col_name).
    files = [
        ("team_v63", "lgb"),
        ("bin_team", "bin"),
        ("no_out_team", "no_out"),
        ("rnd_feat_bridge", "rnd_feat"),
        ("small_team", "small"),
    ]
    train, test = self.make_files(files)
    stopwatch.time("load csv in ")

    print(train.describe())
    self.print_corr(train, test, files)
    stopwatch.time("corr check")
    self.print_score(train, files)
    stopwatch.time("score check")

    self.do_cv_pred(train, test, files)
def get_cv_score(param):
    """Return the out-of-fold RMSE of a model trained with ``param``.

    Reads ``logger``, ``skf``, ``train``, ``outliers``, ``train_x`` and
    ``train_y`` from the enclosing scope.  For every split produced by
    ``skf.split(train, outliers)`` a model is trained through
    ``pocket_lgb.OptLgb(param).do_train_direct`` and its predictions on the
    held-out rows are collected; the collected predictions are joined back
    onto ``train`` by ``card_id`` and scored with ``evaluator.rmse``.

    Args:
        param: passed unchanged to ``pocket_lgb.OptLgb``.

    Returns:
        The value of ``evaluator.rmse(target, cv_pred)`` over all folds.
    """
    local_timer = pocket_timer.GoldenTimer(logger)
    lgb = pocket_lgb.OptLgb(param)

    # Keep the fold predictions in a list that belongs to this call.  The
    # previous version appended to a list from the enclosing scope, so a
    # second call scored the predictions of the first call as well.
    fold_preds = []
    for train_index, test_index in skf.split(train, outliers):
        X_train, X_test = train_x.iloc[train_index], train_x.iloc[test_index]
        y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]

        model = lgb.do_train_direct(X_train, X_test, y_train, y_test)
        valid_set_pred = model.predict(X_test)

        # Pair each held-out prediction with the card_id of its row.
        train_id = train.iloc[test_index]
        train_cv_prediction = pd.DataFrame()
        train_cv_prediction["card_id"] = train_id["card_id"]
        train_cv_prediction["cv_pred"] = valid_set_pred
        fold_preds.append(train_cv_prediction)

    train_output = pd.concat(fold_preds, axis=0)
    local_timer.time("end train in ")

    # Bring the true target next to each prediction before scoring.
    train_output = pd.merge(train_output, train, on="card_id", how="left")
    score = evaluator.rmse(train_output["target"], train_output["cv_pred"])
    return score
def __init__(self, prefix="", split_num=32):
    """Store the split count and create the feature engineer and timer.

    Args:
        prefix: passed unchanged to ``agg_fe.AggFe``.
        split_num: stored as ``self._SPLIT_NUM``.
    """
    self._SPLIT_NUM = split_num
    # The feature engineer is built here from ``prefix``.
    self.fer = agg_fe.AggFe(prefix)
    # The timer is created without a logger argument.
    self.timer = pocket_timer.GoldenTimer()
def __init__(self, fer, split_num=32):
    """Store the split count and the supplied feature engineer.

    Args:
        fer: stored unchanged as ``self.fer``.
        split_num: stored as ``self._SPLIT_NUM``.
    """
    self._SPLIT_NUM = split_num
    # The caller supplies the feature engineer; it is not built here.
    self.fer = fer
    # The timer is created without a logger argument.
    self.timer = pocket_timer.GoldenTimer()
# Constructor that fills in the column-name lists kept on the instance and
# then creates the logger and timer:
#   * self.small_col      - list of feature names
#   * self.medium_col     - self.small_col plus "pred_diff"
#   * self.drop_col       - list of column names, starting with "card_id" and
#                           "target" (presumably the columns to exclude from
#                           model input - TODO confirm against the callers)
#   * self.team_small_col - a further list of feature names
#   * self.logger         - pocket_logger.get_my_logger()
#   * self.timer          - pocket_timer.GoldenTimer(self.logger)
# NOTE(review): this definition has lost its line breaks.  Inside the list
# literals live entries and commented-out entries alternate, so it is not
# possible to tell from this text alone which names are active.  Restore the
# original layout from version control before editing the lists; the code
# below is deliberately left untouched.
def __init__(self): self.small_col = [ # "new_trans_elapsed_days_max", "new_trans_elapsed_days_min", "new_trans_elapsed_days_mean", # 0.001 "old_trans_elapsed_days_max", "old_trans_elapsed_days_min", "old_trans_elapsed_days_mean", # 0.025 mean001 # "new_last_day", # 0.005 "new_to_last_day", "old_installments_sum", "old_installments_mean", # 0.005 "old_month_nunique", "old_woy_nunique", # 0.010 "old_merchant_id_nunique", # 0.002 "new_month_lag_mean", "old_month_lag_mean", "elapsed_days", # 0.010 "new_purchase_amount_max", "new_purchase_amount_count", "new_purchase_amount_mean", # 0.020 "old_purchase_amount_max", "old_purchase_amount_count", "old_purchase_amount_mean", # 0.002 "old_category_1_mean", "new_category_1_mean", # 0.006 "old_authorized_flag_sum", # "old_authorized_flag_mean", bad? "old_no_city_purchase_amount_min", # 0.003 "old_no_city_purchase_amount_max", "old_no_city_purchase_amount_mean", # 0.002 "rec1_purchase_amount_count", # 0.005 "old_month_lag_max", # 0.002 "new_time_diff_mean", "new_trans_elapsed_days_std", # 0.002 "old_month_diff_mean", "old_pa2_month_diff_min", # 0.004 "old_mer_cnt_whole_mean", # 0.001 ] self.medium_col = self.small_col + ["pred_diff"] self.drop_col = [ "card_id", "target", # "feature_1", "feature_2", "feature_3", "old_weekend_mean", "new_weekend_mean", "new_authorized_flag_mean", "old_null_state", "new_null_state", "new_null_install", # "old_null_install", "old_cat3_pur_mean", "new_cat3_pur_mean", "old_cat2_pur_mean", "new_cat2_pur_mean", "new_category_4_mean", # "new_merchant_group_id_nunique", "old_merchant_group_id_nunique" "new_mon_nunique_mean", "new_woy_nunique_mean", # "new_month_lag_ptp", "new_month_lag_min", "new_purchase_amount_skew", # "new_purchase_amount_std", "old_purchase_amount_skew", # "old_purchase_amount_std", # "new_category_2_nunique", "old_category_2_nunique", # "old_null_merchant", "new_null_merchant", # "old_ym_target_encode_mean", "new_ym_target_encode_mean", # "old_hour_target_encode_mean", 
"new_hour_target_encode_mean", # "old_subsector_id_target_encode_mean", # "new_merchant_id_target_encode_mean", "old_merchant_id_target_encode_mean", "pred_new", "old_same_buy_count", "old_purchase_amount_nunique", "new_purchase_amount_nunique", "old_installments_nunique", "new_installments_nunique", # "pred_new_pur_max", "new_trans_elapsed_days_max", "new_trans_elapsed_days_min", "new_trans_elapsed_days_mean", # +0.001 ] self.team_small_col = [ "first_mer_old_woy_nunique", "kh_hist_kh__purchase_date_seconds_diff_std", "merchant_id_most", "new_category_3_mean", "old_time_diff_std", "old_hour_0_count", "old_time_diff_min", "authorized_flag_y_ratio", "hist_merchant_id_nunique", "kh_ratio_kh__purchase_days_diff_min", "new_subsector_id_nunique", ] self.logger = pocket_logger.get_my_logger() self.timer = pocket_timer.GoldenTimer(self.logger)
def load_small_input():
    """Load train/test with four feature files and select a small column set.

    Reads ``TRAIN1`` / ``TEST1`` and four transaction feature files,
    left-joins the feature files onto both frames by ``card_id``, applies
    ``jit_fe.JitFe().do_fe`` and then slices out the columns in ``use_col``.

    Returns:
        A 5-tuple ``(train_ids, test_ids, train_x, train_y, test_x)`` where
        ``train_ids`` is ``train[["card_id", "target"]]``, ``test_ids`` is
        ``test[["card_id"]]``, ``train_y`` is ``train["target"]`` and the
        ``*_x`` frames hold the ``use_col`` columns.
    """
    stopwatch = pocket_timer.GoldenTimer(pocket_logger.get_my_logger())
    reader = pocket_file_io.GoldenCsv()

    train = reader.read_file(path_const.TRAIN1)
    test = reader.read_file(path_const.TEST1)
    new_trans = reader.read_file(path_const.RE_NEW_TRANS1)
    old_trans = reader.read_file(path_const.RE_OLD_TRANS1)
    new_trans6 = reader.read_file(path_const.NEW_TRANS6)
    old_trans6 = reader.read_file(path_const.OLD_TRANS6)
    print(train.shape)
    print(test.shape)
    stopwatch.time("load csv in ")

    # Same four feature frames, same order, for train and then for test.
    feature_frames = (new_trans, old_trans, new_trans6, old_trans6)
    for frame in feature_frames:
        train = pd.merge(train, frame, on="card_id", how="left")
    #
    for frame in feature_frames:
        test = pd.merge(test, frame, on="card_id", how="left")
    # NOTE(review): the two prints below were reconstructed as disabled;
    # the original layout is ambiguous - confirm.
    # print(train.shape)
    # print(test.shape)
    #
    feature_engineer = jit_fe.JitFe()
    train = feature_engineer.do_fe(train)
    test = feature_engineer.do_fe(test)
    train_y = train["target"]

    # 3.660 - 3.658
    # The trailing numbers are the original author's annotations.
    use_col = [
        "new_trans_elapsed_days_max",
        "new_trans_elapsed_days_min",
        "new_trans_elapsed_days_mean",  # 0.001
        "old_trans_elapsed_days_max",
        "old_trans_elapsed_days_min",
        "old_trans_elapsed_days_mean",  # 0.025 mean001
        "new_last_day",  # 0.005
        "old_installments_sum",
        "old_installments_mean",  # 0.005
        "old_month_nunique",
        "old_woy_nunique",  # 0.010
        "old_merchant_id_nunique",  # 0.002
        "new_month_lag_mean",
        "old_month_lag_mean",
        "elapsed_days",  # 0.010
        "new_purchase_amount_max",
        "new_purchase_amount_count",
        "new_purchase_amount_mean",  # 0.020
        "old_purchase_amount_max",
        "old_purchase_amount_count",
        "old_purchase_amount_mean",  # 0.002
        "old_category_1_mean",
        "new_category_1_mean",  # 0.006
        "old_authorized_flag_sum",  # "old_authorized_flag_mean", bad?
        "old_no_city_purchase_amount_min",  # 0.003
        "old_no_city_purchase_amount_max",
        "old_no_city_purchase_amount_mean",  # 0.002
        "rec1_purchase_amount_count",  # 0.005
        "old_month_lag_max",  # 0.002
        "new_time_diff_mean",
        "new_trans_elapsed_days_std",  # 0.002
        "old_month_diff_mean",
        "old_pa2_month_diff_min",  # 0.004
    ]
    train_x = train[use_col]
    test_x = test[use_col]

    for part in (train_x, train_y, test_x):
        print(part.shape)
    stopwatch.time("prepare train in ")

    train_ids = train[["card_id", "target"]]
    test_ids = test[["card_id"]]
    return train_ids, test_ids, train_x, train_y, test_x
import os
import sys

# Put the directory two levels above this file on sys.path before the
# project imports below run.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.common import pocket_lgb, evaluator
from elo.loader import input_loader
from sklearn import model_selection

# Module-level setup: the input is loaded once at import time.
logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()
loader = input_loader.GoldenLoader()
_train, _test = loader.load_whole_input()
timer.time("load csv")
pred_col = loader.small_col


def try_some(train, test):
    """Slice out the ``pred_col`` features and start a submission frame.

    Prints the shapes of the feature/target slices and creates a frame
    holding the test ``card_id`` column.

    NOTE(review): as visible here the function returns nothing and the
    submission frame is not used further - the body may be truncated.
    """
    train_x = train[pred_col]
    test_x = test[pred_col]
    train_y = train["target"]
    for part in (train_x, train_y, test_x):
        print(part.shape)

    submission = pd.DataFrame()
    submission["card_id"] = test["card_id"]
def do_cv(self, data):
    """Train the MLP with stratified K-fold CV and write submission/OOF files.

    For every fold a fresh network is built and trained, the best weights
    are reloaded from ``path_const.get_weight_file(<fold index>)``, and
    predictions are made for both the test set and the held-out rows.
    Test predictions are averaged over all folds and written to
    ``path_const.OUTPUT_SUB``; held-out predictions are written to
    ``path_const.OUTPUT_OOF``.  The overall RMSE is logged.

    Args:
        data: 5-item sequence ``(train, test, train_x, train_y, test_x)``;
            every item must expose ``.shape``.  ``train`` needs ``card_id``
            and ``target`` columns, ``test`` needs ``card_id``.

    Returns:
        None.
    """
    for d in data:
        print(d.shape)
    train, test, train_x, train_y, test_x = data

    timer = pocket_timer.GoldenTimer(self.logger)

    # Accumulators: test predictions are summed here and divided at the end.
    submission = pd.DataFrame()
    submission["card_id"] = test["card_id"]
    submission["target"] = 0
    train_cv = pd.DataFrame()
    train_cv["card_id"] = train["card_id"]
    train_cv["cv_pred"] = 0

    # Folds are stratified on a 0/1 flag for rows with target below -30.
    outliers = (train["target"] < -30).astype(int).values
    bagging_num = 1
    split_num = 5
    for bagging_index in range(bagging_num):
        # NOTE(review): random_state is the same constant on every bagging
        # round, so raising bagging_num would repeat identical fold splits.
        skf = model_selection.StratifiedKFold(
            n_splits=split_num, shuffle=True, random_state=4590)
        total_score = 0
        train_preds = []
        for idx, (train_index, test_index) in enumerate(
                skf.split(train, outliers)):
            lr_schedule = learning_rate.GoldenLearningRate(
                0.1, 50).cosine_annealing_scheduler()
            mlp = pocket_network2.GoldenMlp2(
                self.epochs, self.batch_size, lr_schedule)
            network = mlp.build_model(train_x.shape[1])

            X_train, X_test = train_x.iloc[train_index], train_x.iloc[test_index]
            y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]

            print("start train")
            model, history = mlp.do_train_direct(
                str(idx), network, X_train, X_test, y_train, y_test)
            mlp.save_history(history, str(idx))
            print('Loading Best Model')
            model.load_weights(path_const.get_weight_file(str(idx)))

            y_pred = model.predict(test_x, batch_size=self.batch_size)
            y_pred = np.reshape(y_pred, -1)
            y_pred = np.clip(y_pred, -33.219281, 18.0)

            valid_set_pred = model.predict(X_test, batch_size=self.batch_size)
            # Flatten exactly like the test predictions above so that the
            # score and the stored column always get a 1-D array.
            valid_set_pred = np.reshape(valid_set_pred, -1)
            score = evaluator.rmse(y_test, valid_set_pred)
            print(score)
            total_score += score

            submission["target"] = submission["target"] + y_pred

            # Pair each held-out prediction with the card_id of its row.
            train_id = train.iloc[test_index]
            train_cv_prediction = pd.DataFrame()
            train_cv_prediction["card_id"] = train_id["card_id"]
            train_cv_prediction["cv_pred"] = valid_set_pred
            train_preds.append(train_cv_prediction)
            timer.time("done one set in")

        # The fold frames keep the row labels of ``train``, so the addition
        # below lines up by index.
        train_output = pd.concat(train_preds, axis=0)
        train_cv["cv_pred"] += train_output["cv_pred"]

        avg_score = str(total_score / split_num)
        self.logger.print("average score= " + avg_score)

    timer.time("end train in ")

    submission["target"] = submission["target"] / (bagging_num * split_num)
    # submission["target"] = np.clip(submission["target"], -33.219281, 18.0)
    submission.to_csv(path_const.OUTPUT_SUB, index=False)

    train_cv["cv_pred"] = train_cv["cv_pred"] / bagging_num
    train_cv["cv_pred"] = np.clip(train_cv["cv_pred"], -33.219281, 18.0)
    train_cv.to_csv(path_const.OUTPUT_OOF, index=False)

    y_true = train_y
    y_pred = train_cv["cv_pred"]
    rmse_score = evaluator.rmse(y_true, y_pred)
    self.logger.print("evaluator rmse score= " + str(rmse_score))

    print(train["target"].describe())
    self.logger.print(train_cv.describe())
    self.logger.print(submission.describe())
    timer.time("done submission in ")