def make_files(self, files):
    """Build the base train/test frames and fold every entry of ``files`` in.

    The team v63 input is loaded, a binary ``has_new`` flag is derived from
    ``new_purchase_amount_count`` (1 when the count is >= 1, otherwise 0),
    and both frames are narrowed to the id / target / flag columns before
    each entry of ``files`` is passed through ``self.add_file``.

    Args:
        files: iterable of indexable entries; items ``[0]`` and ``[1]`` of
            each entry are forwarded to ``self.add_file`` as its first two
            arguments.

    Returns:
        The ``(train, test)`` pair produced by the last ``add_file`` call,
        or the narrowed base frames when ``files`` is empty.
    """
    loader = input_loader.GoldenLoader()
    train, test = loader.load_team_input_v63()
    # An earlier revision read path_const.ORG_TRAIN / path_const.ORG_TEST
    # through self.csv_io.read_file(...) instead of the team loader.

    count_col = "new_purchase_amount_count"
    train["has_new"] = np.where(train[count_col] >= 1, 1, 0)
    test["has_new"] = np.where(test[count_col] >= 1, 1, 0)

    # Only train carries the "target" column.
    train = train[["card_id", "target", "has_new"]]
    test = test[["card_id", "has_new"]]

    for entry in files:
        train, test = self.add_file(entry[0], entry[1], train, test)
    return train, test
# Make the repository root (two directories above this file) importable
# before pulling in the project-local ``elo`` package.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import (
    pocket_timer, pocket_logger, pocket_file_io, path_const,
    pocket_lgb, evaluator,
)
from elo.loader import input_loader
from sklearn import model_selection

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# Load the whole input once at module level; try_some() receives frames
# as arguments instead of reading these globals directly.
loader = input_loader.GoldenLoader()
_train, _test = loader.load_whole_input()
timer.time("load csv")
pred_col = loader.small_col


def try_some(train, test):
    """Select the ``pred_col`` features and start a submission frame.

    NOTE(review): only the opening of this function is visible here; the
    remainder of the body (and its return value) should be checked against
    the full file.
    """
    train_x = train[pred_col]
    test_x = test[pred_col]
    train_y = train["target"]
    for frame in (train_x, train_y, test_x):
        print(frame.shape)

    # Submission skeleton: one row per test card, target initialised to 0.
    submission = pd.DataFrame()
    submission["card_id"] = test["card_id"]
    submission["target"] = 0
# Make the repository root (two directories above this file) importable
# before pulling in the project-local ``elo`` package.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import (
    pocket_timer, pocket_logger, pocket_file_io, path_const,
    pocket_lgb, evaluator,
)
from elo.loader import input_loader
from sklearn import model_selection

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# Loader returns (train, test, train_x, train_y, test_x).
data = input_loader.GoldenLoader().load_small_pred_new()
train, test, train_x, train_y, test_x = data
for _frame in (train_x, train_y, test_x):
    print(_frame.shape)
print(train_y.describe())

# Missing targets are replaced with the constant 60.
# NOTE(review): the rationale for 60 is not visible here -- confirm.
train_y = train_y.fillna(60)
print(train_x.columns)
timer.time("load csv")

# Constant-prediction baseline: RMSE of always predicting ``mean_val``.
# The constant is stored as an extra column on train_x.
mean_val = 6.040527
train_x["mean_val"] = mean_val
rmse_score = evaluator.rmse(train_y, train_x["mean_val"])
print(rmse_score)

pred_col_name = "pred_new"
import os
import sys

# Make the repository root (two directories above this file) importable
# before pulling in the project-local ``elo`` package.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.loader import input_loader

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# Team v63 features plus the card_id column of the original train/test files.
train, test = input_loader.GoldenLoader().load_team_input_v63()
org_train = csv_io.read_file(path_const.ORG_TRAIN)[["card_id"]]
org_test = csv_io.read_file(path_const.ORG_TEST)[["card_id"]]
timer.time("load csv")

# Print the leading card_ids of both sources side by side.
print(train["card_id"].head())
print(org_train["card_id"].head())

# Categorical columns that are actually present in train, and how many
# distinct values each one takes.
cat_cols = input_loader.GoldenLoader().get_team_cat_col()
has_col = [col for col in cat_cols if col in train.columns]
print(has_col)
print(train[has_col].nunique())
# Make the repository root (two directories above this file) importable
# before pulling in the project-local ``elo`` package.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import (
    pocket_timer, pocket_logger, pocket_file_io, path_const,
    pocket_lgb, evaluator,
)
from elo.loader import input_loader
from sklearn import model_selection

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

train, test = input_loader.GoldenLoader().load_team_input_v63()
timer.time("load csv")

# Every train column except the id / label columns is used as a feature.
drop_col = ["card_id", "target", "outliers"]
pred_col = [col for col in train.columns if col not in drop_col]

train_x = train[pred_col]
train_y = train["target"]
test_x = test[pred_col]
for _frame in (train_x, train_y, test_x):
    print(_frame.shape)

# Submission skeleton: one row per test card, target initialised to 0.
submission = pd.DataFrame()
submission["card_id"] = test["card_id"]
submission["target"] = 0

train_cv = pd.DataFrame()
import os
import sys

# Make the repository root (two directories above this file) importable
# before pulling in the project-local ``elo`` package.
ROOT = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import (
    pocket_timer, pocket_logger, pocket_file_io, path_const,
    pocket_lgb, evaluator,
)
from elo.loader import input_loader
from sklearn import model_selection

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# Loader returns (train, test, train_x, train_y, test_x); the argument is
# passed through to load_small_pred_new unchanged.
data = input_loader.GoldenLoader().load_small_pred_new("new_purchase_amount_max")
train, test, train_x, train_y, test_x = data
for _frame in (train_x, train_y, test_x):
    print(_frame.shape)
print(train_y.describe())

# Missing targets are replaced with the constant -0.746893.
# NOTE(review): the origin of this constant is not visible here -- confirm.
train_y = train_y.fillna(-0.746893)
print(train_x.columns)
timer.time("load csv")

# Constant-prediction baseline: RMSE of always predicting ``mean_val``.
# The constant is stored as an extra column on train_x.
mean_val = -0.198848
train_x["mean_val"] = mean_val
rmse_score = evaluator.rmse(train_y, train_x["mean_val"])
print(rmse_score)

pred_col_name = "pred_new_pur_max"
import os
import sys

# Make the repository root (two directories above this file) importable
# before pulling in the project-local ``elo`` package.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import pocket_timer, pocket_logger, pocket_file_io
from elo.loader import input_loader

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

train, test = input_loader.GoldenLoader().load_small_for_share()
timer.time("load csv")

# Dump both frames as pickles. The paths are relative, so they resolve
# against the current working directory rather than ROOT.
train.to_pickle("../output/pocket_train_small_feats.pkl")
test.to_pickle("../output/pocket_test_small_feats.pkl")
os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../')) sys.path.append(ROOT) import pandas as pd import numpy as np from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const from elo.common import pocket_lgb, evaluator from elo.loader import input_loader from sklearn import model_selection import optuna logger = pocket_logger.get_my_logger() timer = pocket_timer.GoldenTimer(logger) csv_io = pocket_file_io.GoldenCsv() data = input_loader.GoldenLoader().load_small_input() timer.time("load csv") train, test, train_x, train_y, test_x = data print(train_x.shape) print(train_y.shape) print(test_x.shape) outliers = (train["target"] < -30).astype(int).values split_num = 5 random_state = 4590 skf = model_selection.StratifiedKFold(n_splits=split_num, shuffle=True, random_state=random_state) total_score = 0 models = []
# Make the repository root (two directories above this file) importable
# before pulling in the project-local ``elo`` package.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import (
    pocket_timer, pocket_logger, pocket_file_io, path_const,
    pocket_lgb, evaluator,
)
from elo.loader import input_loader
from sklearn import model_selection

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# Loader returns (train, test, train_x, train_y, test_x).
data = input_loader.GoldenLoader().load_large_input()
timer.time("load csv")
train, test, train_x, train_y, test_x = data
for _frame in (train_x, train_y, test_x):
    print(_frame.shape)

# Submission skeleton: one row per test card, target initialised to 0.
submission = pd.DataFrame()
submission["card_id"] = test["card_id"]
submission["target"] = 0

# Per-card holder for cross-validation predictions, initialised to 0.
train_cv = pd.DataFrame()
train_cv["card_id"] = train["card_id"]
train_cv["cv_pred"] = 0

# 1 where target is below -30, else 0 (as a NumPy array).
outliers = (train["target"] < -30).astype(int).values
# Make the repository root (two directories above this file) importable
# before pulling in the project-local ``elo`` package.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import (
    pocket_timer, pocket_logger, pocket_file_io, path_const,
    pocket_lgb, evaluator,
)
from elo.loader import input_loader
from sklearn import model_selection

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# Loader returns (train, test, train_x, train_y, test_x).
data = input_loader.GoldenLoader().load_medium_with_team()
timer.time("load csv")
train, test, train_x, train_y, test_x = data
for _frame in (train_x, train_y, test_x):
    print(_frame.shape)

# Submission skeleton: one row per test card, target initialised to 0.
submission = pd.DataFrame()
submission["card_id"] = test["card_id"]
submission["target"] = 0

# Per-card holder for cross-validation predictions, initialised to 0.
train_cv = pd.DataFrame()
train_cv["card_id"] = train["card_id"]
train_cv["cv_pred"] = 0

# 1 where target is below -30, else 0 (as a NumPy array).
outliers = (train["target"] < -30).astype(int).values
import os
import sys

# Make the repository root (two directories above this file) importable
# before pulling in the project-local ``elo`` package.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import numpy as np
import pandas as pd
import datetime
from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.loader import input_loader

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# NOTE(review): other code in this project unpacks five values
# (train, test, train_x, train_y, test_x) from load_small_input();
# confirm the loader's return arity matches this two-name unpack.
train, test = input_loader.GoldenLoader().load_small_input()
# Make the repository root (two directories above this file) importable
# before pulling in the project-local ``elo`` package.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import (
    pocket_timer, pocket_logger, pocket_file_io, path_const,
    pocket_lgb, evaluator,
)
from elo.loader import input_loader
from sklearn import model_selection

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# Loader returns (train, test, train_x, train_y, test_x).
data = input_loader.GoldenLoader().load_medium_input()
timer.time("load csv")
train, test, train_x, train_y, test_x = data
for _frame in (train_x, train_y, test_x):
    print(_frame.shape)

# Submission skeleton: one row per test card, target initialised to 0.
submission = pd.DataFrame()
submission["card_id"] = test["card_id"]
submission["target"] = 0

# Per-card holder for cross-validation predictions, initialised to 0.
train_cv = pd.DataFrame()
train_cv["card_id"] = train["card_id"]
train_cv["cv_pred"] = 0

# 1 where target is below -30, else 0 (as a NumPy array).
outliers = (train["target"] < -30).astype(int).values
# ROOT is defined before this fragment; appending it makes the
# project-local ``elo`` package importable.
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import (
    pocket_timer, pocket_logger, pocket_file_io, path_const,
    pocket_lgb, evaluator,
)
from elo.loader import input_loader
from sklearn import model_selection

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# base score=3.636285831929972
# pl in a cv way
train, test = input_loader.GoldenLoader().load_team_input_v63()
test_pl = input_loader.GoldenLoader().load_pseudo_labels()

# Accumulator for test predictions: one row per test card, target 0.
whole_sub = pd.DataFrame()
whole_sub["card_id"] = test["card_id"]
whole_sub["target"] = 0

# Accumulator for cross-validation predictions: one row per train card.
whole_cv = pd.DataFrame()
whole_cv["card_id"] = train["card_id"]
whole_cv["cv_pred"] = 0

# 5-fold shuffled split over the pseudo-label rows. Each iteration takes
# the rows at the first index set as ``use_pl`` and the rows at the second
# index set as ``sub_pl``, then left-joins the test features onto each by
# card_id.
skf = model_selection.KFold(n_splits=5, shuffle=True, random_state=4590)
for pl_use_idx, pl_sub_idx in skf.split(test_pl):
    use_pl = pd.merge(test_pl.iloc[pl_use_idx], test, on="card_id", how="left")
    sub_pl = pd.merge(test_pl.iloc[pl_sub_idx], test, on="card_id", how="left")