def doit(self):
    """Run the stacking pipeline over subset-experiment files 40-69.

    Loads the (file_name, col_name) inputs, reports correlations and
    per-column scores, then produces predictions via ``do_preds``.
    """
    logger = pocket_logger.get_my_logger()
    timer = pocket_timer.GoldenTimer(logger)
    # (file_name, col_name) pairs; for subset files both entries are the
    # same string.
    names = ["subset_exp_" + str(idx) for idx in range(40, 70)]
    files = [(name, name) for name in names]
    train, test = self.make_files(files)
    timer.time("load csv in ")
    print(train.describe())
    self.print_corr(train, test, files)
    timer.time("corr check")
    self.print_score(train, files)
    timer.time("score check")
    self.do_preds(train, files)
def doit(self):
    """Run the stacking pipeline on the single ``bin_team`` input file.

    Loads the file, reports correlations and per-column scores, then
    produces predictions via ``do_preds``.
    """
    logger = pocket_logger.get_my_logger()
    timer = pocket_timer.GoldenTimer(logger)
    # (file_name, col_name)
    files = [
        ("bin_team", "bin"),
    ]
    train, test = self.make_files(files)
    timer.time("load csv in ")
    print(train.describe())
    self.print_corr(train, test, files)
    timer.time("corr check")
    self.print_score(train, files)
    timer.time("score check")
    self.do_preds(train, test, files)
def doit(self):
    """Run cross-validated stacking over the five team model outputs.

    Loads the (file_name, col_name) inputs, reports correlations and
    per-column scores, then runs ``do_cv_pred``.
    """
    logger = pocket_logger.get_my_logger()
    timer = pocket_timer.GoldenTimer(logger)
    # (file_name, col_name)
    files = [
        ("team_v63", "lgb"),
        ("bin_team", "bin"),
        ("no_out_team", "no_out"),
        ("rnd_feat_bridge", "rnd_feat"),
        ("small_team", "small"),
    ]
    train, test = self.make_files(files)
    timer.time("load csv in ")
    print(train.describe())
    self.print_corr(train, test, files)
    timer.time("corr check")
    self.print_score(train, files)
    timer.time("score check")
    self.do_cv_pred(train, test, files)
def doit(self):
    """Run cross-validated stacking over team model outputs plus extra
    teammate prediction files.

    Base inputs come from ``make_files``; each entry in ``team_files``
    is then merged in via ``add_team_file`` before scoring/prediction.
    """
    logger = pocket_logger.get_my_logger()
    timer = pocket_timer.GoldenTimer(logger)
    # (file_name, col_name)
    files = [
        ("team_v63", "lgb"),
        ("bin_team", "bin"),
        ("no_out_team", "no_out"),
        # ("rnd_feat_bridge", "rnd_feat"),
        ("small_team", "small")
    ]
    # Teammate prediction files; file name doubles as column name.
    team_files = [
        'select_v44_ridge',
        'tune_stack_57_v1',
        'select_v51_ridge',
        # 'tune_stack_57_2_v1',
        'tune_stack_cgb_v1',
        # 'elo_rnd_feat_bridge',
        'outlier_lgb_v3_kh_time_feature2_pocket',
        # 'delete_outlier_kh_pocket_stack_correct_ridge',
        # 'outlier_lgb_pocket_logistic',
        'delete_outlier_kh_pocket_stack_correct2_ridge'
    ]
    team_files = [(name, name) for name in team_files]
    train, test = self.make_files(files)
    for file_name, col_name in team_files:
        train, test = self.add_team_file(file_name, col_name, train, test)
    timer.time("load csv in ")
    print(train.describe())
    self.print_corr(train, test, files)
    timer.time("corr check")
    self.print_score(train, files)
    timer.time("score check")
    self.do_cv_pred(train, test, files)
def doit(self):
    """Run cross-validated stacking over a hand-picked ("significant")
    subset of the subset-experiment files.

    ``sig_idx`` is the curated list of experiment indices; the commented
    alternative used all 100 files.
    """
    logger = pocket_logger.get_my_logger()
    timer = pocket_timer.GoldenTimer(logger)
    # (file_name, col_name)
    sig_idx = [
        0, 9,
        12, 13, 18, 19,
        22, 24, 28, 29,
        33, 35, 36, 37,
        43, 45,
        52, 54, 55, 56, 58, 59,
        61, 65, 67, 69,
        71, 72, 74, 75, 76, 77, 78,
        81, 83, 84, 88,
        90, 91, 97, 98
    ]
    # files = ["subset_exp_" + str(idx) for idx in range(100)]
    names = ["subset_exp_" + str(idx) for idx in sig_idx]
    files = [(name, name) for name in names]
    train, test = self.make_files(files)
    timer.time("load csv in ")
    print(train.describe())
    self.print_corr(train, test, files)
    timer.time("corr check")
    self.print_score(train, files)
    timer.time("score check")
    self.do_cv_pred(train, test, files)
# Script preamble: make the project root importable before pulling in
# the elo.* packages.
import os, sys
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)
import pandas as pd
import numpy as np
from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.common import pocket_lgb, evaluator
from elo.loader import input_loader
from sklearn import model_selection

# Module-level setup: load the whole input once at import time so
# experiment functions below can share it.
logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()
loader = input_loader.GoldenLoader()
_train, _test = loader.load_whole_input()
timer.time("load csv")
# Column subset used for prediction (defined on GoldenLoader).
pred_col = loader.small_col


def try_some(train, test):
    # Slice the shared prediction columns and the target, and start a
    # submission frame keyed by card_id.
    # NOTE(review): this function appears truncated by chunking — the
    # submission frame is created but never filled or written here;
    # confirm the remainder against the full file.
    train_x, test_x = train[pred_col], test[pred_col]
    train_y = train["target"]
    print(train_x.shape)
    print(train_y.shape)
    print(test_x.shape)
    submission = pd.DataFrame()
    submission["card_id"] = test["card_id"]
def load_small_input():
    """Load train/test, merge transaction aggregate tables, and build the
    small hand-picked feature matrix.

    Returns a 5-tuple:
    (train[["card_id", "target"]], test[["card_id"]],
     train_x, train_y, test_x).

    NOTE(review): reconstructed from a whitespace-mangled source; the
    bare ``#`` lines below mark spots where the original comment/code
    boundary is ambiguous. ``fer = jit_fe.JitFe()`` and the first test
    merge are kept live because ``fer.do_fe`` and ``test[use_col]``
    require them — confirm against the original file.
    """
    logger = pocket_logger.get_my_logger()
    timer = pocket_timer.GoldenTimer(logger)
    csv_io = pocket_file_io.GoldenCsv()
    train = csv_io.read_file(path_const.TRAIN1)
    test = csv_io.read_file(path_const.TEST1)
    # Aggregated transaction tables, keyed by card_id.
    new_trans = csv_io.read_file(path_const.RE_NEW_TRANS1)
    old_trans = csv_io.read_file(path_const.RE_OLD_TRANS1)
    new_trans6 = csv_io.read_file(path_const.NEW_TRANS6)
    old_trans6 = csv_io.read_file(path_const.OLD_TRANS6)
    print(train.shape)
    print(test.shape)
    timer.time("load csv in ")
    # Left-join every aggregate table onto train and test by card_id.
    train = pd.merge(train, new_trans, on="card_id", how="left")
    train = pd.merge(train, old_trans, on="card_id", how="left")
    train = pd.merge(train, new_trans6, on="card_id", how="left")
    train = pd.merge(train, old_trans6, on="card_id", how="left")
    #
    test = pd.merge(test, new_trans, on="card_id", how="left")
    test = pd.merge(test, old_trans, on="card_id", how="left")
    test = pd.merge(test, new_trans6, on="card_id", how="left")
    test = pd.merge(test, old_trans6, on="card_id", how="left")
    #
    # print(train.shape)
    # print(test.shape)
    #
    # Apply the shared feature-engineering step to both frames.
    fer = jit_fe.JitFe()
    train = fer.do_fe(train)
    test = fer.do_fe(test)
    train_y = train["target"]
    # 3.660 - 3.658
    # Hand-picked feature list; the inline numbers record each group's
    # observed CV score contribution.
    use_col = [
        "new_trans_elapsed_days_max", "new_trans_elapsed_days_min",
        "new_trans_elapsed_days_mean",  # 0.001
        "old_trans_elapsed_days_max", "old_trans_elapsed_days_min",
        "old_trans_elapsed_days_mean",  # 0.025 mean001
        "new_last_day",  # 0.005
        "old_installments_sum", "old_installments_mean",  # 0.005
        "old_month_nunique", "old_woy_nunique",  # 0.010
        "old_merchant_id_nunique",  # 0.002
        "new_month_lag_mean", "old_month_lag_mean", "elapsed_days",  # 0.010
        "new_purchase_amount_max", "new_purchase_amount_count",
        "new_purchase_amount_mean",  # 0.020
        "old_purchase_amount_max", "old_purchase_amount_count",
        "old_purchase_amount_mean",  # 0.002
        "old_category_1_mean", "new_category_1_mean",  # 0.006
        "old_authorized_flag_sum",
        # "old_authorized_flag_mean", bad?
        "old_no_city_purchase_amount_min",  # 0.003
        "old_no_city_purchase_amount_max",
        "old_no_city_purchase_amount_mean",  # 0.002
        "rec1_purchase_amount_count",  # 0.005
        "old_month_lag_max",  # 0.002
        "new_time_diff_mean", "new_trans_elapsed_days_std",  # 0.002
        "old_month_diff_mean", "old_pa2_month_diff_min",  # 0.004
    ]
    train_x = train[use_col]
    test_x = test[use_col]
    print(train_x.shape)
    print(train_y.shape)
    print(test_x.shape)
    timer.time("prepare train in ")
    return train[["card_id", "target"]], test[["card_id"]], train_x, train_y, test_x
def __init__(self):
    """Define the curated feature-column lists used by the loader.

    Commented-out entries are features that were tried and dropped; the
    inline numbers record each group's observed CV score contribution.
    """
    # Small hand-picked feature set.
    self.small_col = [
        # "new_trans_elapsed_days_max",
        "new_trans_elapsed_days_min", "new_trans_elapsed_days_mean",  # 0.001
        "old_trans_elapsed_days_max", "old_trans_elapsed_days_min",
        "old_trans_elapsed_days_mean",  # 0.025 mean001
        # "new_last_day",  # 0.005
        "new_to_last_day",
        "old_installments_sum", "old_installments_mean",  # 0.005
        "old_month_nunique", "old_woy_nunique",  # 0.010
        "old_merchant_id_nunique",  # 0.002
        "new_month_lag_mean", "old_month_lag_mean", "elapsed_days",  # 0.010
        "new_purchase_amount_max", "new_purchase_amount_count",
        "new_purchase_amount_mean",  # 0.020
        "old_purchase_amount_max", "old_purchase_amount_count",
        "old_purchase_amount_mean",  # 0.002
        "old_category_1_mean", "new_category_1_mean",  # 0.006
        "old_authorized_flag_sum",
        # "old_authorized_flag_mean", bad?
        "old_no_city_purchase_amount_min",  # 0.003
        "old_no_city_purchase_amount_max",
        "old_no_city_purchase_amount_mean",  # 0.002
        "rec1_purchase_amount_count",  # 0.005
        "old_month_lag_max",  # 0.002
        "new_time_diff_mean", "new_trans_elapsed_days_std",  # 0.002
        "old_month_diff_mean", "old_pa2_month_diff_min",  # 0.004
        "old_mer_cnt_whole_mean",  # 0.001
    ]
    # Medium set = small set plus the prediction-difference feature.
    self.medium_col = self.small_col + ["pred_diff"]
    # Columns to drop from the full frame before training.
    self.drop_col = [
        "card_id", "target",
        # "feature_1", "feature_2", "feature_3",
        "old_weekend_mean", "new_weekend_mean", "new_authorized_flag_mean",
        "old_null_state", "new_null_state", "new_null_install",
        # "old_null_install",
        "old_cat3_pur_mean", "new_cat3_pur_mean",
        "old_cat2_pur_mean", "new_cat2_pur_mean",
        "new_category_4_mean",
        # "new_merchant_group_id_nunique", "old_merchant_group_id_nunique"
        "new_mon_nunique_mean", "new_woy_nunique_mean",
        # "new_month_lag_ptp",
        "new_month_lag_min",
        "new_purchase_amount_skew",
        # "new_purchase_amount_std",
        "old_purchase_amount_skew",
        # "old_purchase_amount_std",
        # "new_category_2_nunique", "old_category_2_nunique",
        # "old_null_merchant", "new_null_merchant",
        # "old_ym_target_encode_mean", "new_ym_target_encode_mean",
        # "old_hour_target_encode_mean",
        "new_hour_target_encode_mean",
        # "old_subsector_id_target_encode_mean",
        # "new_merchant_id_target_encode_mean",
        "old_merchant_id_target_encode_mean",
        "pred_new",
        "old_same_buy_count",
        "old_purchase_amount_nunique", "new_purchase_amount_nunique",
        "old_installments_nunique", "new_installments_nunique",
        # "pred_new_pur_max",
        "new_trans_elapsed_days_max", "new_trans_elapsed_days_min",
        "new_trans_elapsed_days_mean",  # +0.001
    ]
    # Small feature set drawn from teammate-provided files.
    self.team_small_col = [
        "first_mer_old_woy_nunique",
        "kh_hist_kh__purchase_date_seconds_diff_std",
        "merchant_id_most",
        "new_category_3_mean",
        "old_time_diff_std",
        "old_hour_0_count",
        "old_time_diff_min",
        "authorized_flag_y_ratio",
        "hist_merchant_id_nunique",
        "kh_ratio_kh__purchase_days_diff_min",
        "new_subsector_id_nunique",
    ]
    self.logger = pocket_logger.get_my_logger()
    self.timer = pocket_timer.GoldenTimer(self.logger)
def __init__(self, epochs, batch_size):
    """Store training hyper-parameters and attach the shared logger.

    :param epochs: number of training epochs to run.
    :param batch_size: mini-batch size used per training step.
    """
    self.epochs = epochs
    self.batch_size = batch_size
    self.logger = pocket_logger.get_my_logger()