Ejemplo n.º 1
0
    def make_files(self, files):
        """Build the base train/test frames and apply every entry of ``files``.

        The team v63 input is loaded, a 0/1 ``has_new`` column is derived from
        ``new_purchase_amount_count`` on both frames, and each frame is cut
        down to its id column, the flag and (train only) ``target``. The
        frames are then passed through ``self.add_file`` once per entry.

        Args:
            files: Iterable of indexable entries; items 0 and 1 of each entry
                are forwarded to ``self.add_file`` ahead of the two frames.

        Returns:
            The ``(train, test)`` pair as returned by the last ``add_file``
            call, or the trimmed base frames when ``files`` is empty.
        """

        def flag_new(frame):
            # 1 where the new-purchase count is at least 1, otherwise 0.
            frame["has_new"] = np.where(
                frame["new_purchase_amount_count"] >= 1, 1, 0)

        loaded_train, loaded_test = (
            input_loader.GoldenLoader().load_team_input_v63())

        flag_new(loaded_train)
        train = loaded_train[["card_id", "target", "has_new"]]

        flag_new(loaded_test)
        test = loaded_test[["card_id", "has_new"]]

        for entry in files:
            train, test = self.add_file(entry[0], entry[1], train, test)
        return train, test
Ejemplo n.º 2
0
# Resolve the directory two levels above this file and append it to the import
# path; presumably this is what lets the `elo` package imports below resolve.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.common import pocket_lgb, evaluator
from elo.loader import input_loader
from sklearn import model_selection

# Module-level helpers: a logger, a timer that is handed that logger, and a
# GoldenCsv file helper.
logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# A single loader instance is kept because both the data and the column list
# below are read from it.
loader = input_loader.GoldenLoader()
_train, _test = loader.load_whole_input()
# Timing checkpoint labelled "load csv" (GoldenTimer behaviour is defined elsewhere).
timer.time("load csv")
# Column list exposed by the loader as `small_col`; try_some() uses it to
# select the feature columns from both frames.
pred_col = loader.small_col


def try_some(train, test):
    train_x, test_x = train[pred_col], test[pred_col]
    train_y = train["target"]
    print(train_x.shape)
    print(train_y.shape)
    print(test_x.shape)

    submission = pd.DataFrame()
    submission["card_id"] = test["card_id"]
    submission["target"] = 0
Ejemplo n.º 3
0
# Resolve the directory two levels above this file and append it to the import
# path; presumably this is what lets the `elo` package imports below resolve.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.common import pocket_lgb, evaluator
from elo.loader import input_loader
from sklearn import model_selection

# Module-level helpers: a logger, a timer that is handed that logger, and a
# GoldenCsv file helper.
logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# The loader result is unpacked into five objects: the two full frames plus
# the train features, train target and test features.
data = input_loader.GoldenLoader().load_small_pred_new()
train, test, train_x, train_y, test_x = data
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
# describe() is printed before the fill below, so it reflects the target with
# its missing values still in place.
print(train_y.describe())
# Missing targets are replaced by the constant 60.
# NOTE(review): the reason for choosing 60 is not visible here — confirm.
train_y = train_y.fillna(60)
print(train_x.columns)
# Timing checkpoint labelled "load csv" (GoldenTimer behaviour is defined elsewhere).
timer.time("load csv")

# Constant-prediction baseline. The value is hard-coded; presumably it is a
# mean of the target — TODO confirm its origin.
mean_val = 6.040527
# NOTE(review): this writes the constant into train_x itself, so the feature
# frame carries a `mean_val` column from here on — confirm that is intended.
train_x["mean_val"] = mean_val
# Score of predicting `mean_val` for every row (evaluator.rmse is defined elsewhere).
rmse_score = evaluator.rmse(train_y, train_x["mean_val"])
print(rmse_score)

# Column name for the predictions; its use lies outside this fragment.
pred_col_name = "pred_new"
Ejemplo n.º 4
0
import os, sys

# Resolve the directory two levels above this file and append it to the import
# path; presumably this is what lets the `elo` package imports below resolve.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)
from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.loader import input_loader

# Module-level helpers: a logger, a timer that is handed that logger, and a
# GoldenCsv file helper.
logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# Team v63 frames, plus only the `card_id` column of the files at
# path_const.ORG_TRAIN / path_const.ORG_TEST.
train, test = input_loader.GoldenLoader().load_team_input_v63()
org_train = csv_io.read_file(path_const.ORG_TRAIN)[["card_id"]]
org_test = csv_io.read_file(path_const.ORG_TEST)[["card_id"]]
# Timing checkpoint labelled "load csv" (GoldenTimer behaviour is defined elsewhere).
timer.time("load csv")
# Print the first ids of both sources side by side for a visual comparison.
print(train["card_id"].head())
print(org_train["card_id"].head())

# A second loader instance is created just to fetch the categorical column list.
cat_cols = input_loader.GoldenLoader().get_team_cat_col()

# Keep only the listed columns that actually exist in the train frame.
has_col = [c for c in cat_cols if c in train.columns]
print(has_col)

# Number of distinct values per retained column.
print(train[has_col].nunique())
Ejemplo n.º 5
0
# Resolve the directory two levels above this file and append it to the import
# path; presumably this is what lets the `elo` package imports below resolve.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.common import pocket_lgb, evaluator
from elo.loader import input_loader
from sklearn import model_selection

# Module-level helpers: a logger, a timer that is handed that logger, and a
# GoldenCsv file helper.
logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

train, test = input_loader.GoldenLoader().load_team_input_v63()
# Timing checkpoint labelled "load csv" (GoldenTimer behaviour is defined elsewhere).
timer.time("load csv")

# Every train column except the id, the target and `outliers` is a feature.
drop_col = ["card_id", "target", "outliers"]
pred_col = [c for c in train.columns if c not in drop_col]
train_x = train[pred_col]
train_y = train["target"]
# The same column list is applied to test, so test must contain every column
# in pred_col.
test_x = test[pred_col]
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)

# Output frame keyed by the test ids, with `target` initialised to 0.
submission = pd.DataFrame()
submission["card_id"] = test["card_id"]
submission["target"] = 0
# Empty frame; it is filled by code outside this fragment.
train_cv = pd.DataFrame()
Ejemplo n.º 6
0
import os, sys
# Resolve the directory two levels above this file and append it to the import
# path; presumably this is what lets the `elo` package imports below resolve.
ROOT = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.common import pocket_lgb, evaluator
from elo.loader import input_loader
from sklearn import model_selection

# Module-level helpers: a logger, a timer that is handed that logger, and a
# GoldenCsv file helper.
logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# The loader is called with the column name "new_purchase_amount_max"
# (presumably the column to be predicted — confirm against the loader). Its
# result is unpacked into the two full frames plus the train features, train
# target and test features.
data = input_loader.GoldenLoader().load_small_pred_new("new_purchase_amount_max")
train, test, train_x, train_y, test_x = data
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
# describe() is printed before the fill below, so it reflects the target with
# its missing values still in place.
print(train_y.describe())
# Missing targets are replaced by a hard-coded constant.
# NOTE(review): the origin of -0.746893 is not visible here — confirm.
train_y = train_y.fillna(-0.746893)
print(train_x.columns)
# Timing checkpoint labelled "load csv" (GoldenTimer behaviour is defined elsewhere).
timer.time("load csv")

# Constant-prediction baseline. The value is hard-coded; presumably it is a
# mean of the target — TODO confirm its origin.
mean_val = -0.198848
# NOTE(review): this writes the constant into train_x itself, so the feature
# frame carries a `mean_val` column from here on — confirm that is intended.
train_x["mean_val"] = mean_val
# Score of predicting `mean_val` for every row (evaluator.rmse is defined elsewhere).
rmse_score = evaluator.rmse(train_y, train_x["mean_val"])
print(rmse_score)

# Column name for the predictions; its use lies outside this fragment.
pred_col_name = "pred_new_pur_max"
Ejemplo n.º 7
0
import os, sys
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import pocket_timer, pocket_logger, pocket_file_io
from elo.loader import input_loader

# Module-level helpers: a logger, a timer that is handed that logger, and a
# GoldenCsv file helper.
logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# Load the frames meant for sharing and record a "load csv" timing checkpoint.
train, test = input_loader.GoldenLoader().load_small_for_share()
timer.time("load csv")

# Pickle train first, then test; the paths are relative to the current
# working directory.
for _frame, _pickle_path in (
    (train, "../output/pocket_train_small_feats.pkl"),
    (test, "../output/pocket_test_small_feats.pkl"),
):
    _frame.to_pickle(_pickle_path)
Ejemplo n.º 8
0
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.common import pocket_lgb, evaluator
from elo.loader import input_loader
from sklearn import model_selection
import optuna

# Module-level helpers: a logger, a timer that is handed that logger, and a
# GoldenCsv file helper.
logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

data = input_loader.GoldenLoader().load_small_input()
# Timing checkpoint labelled "load csv" (GoldenTimer behaviour is defined elsewhere).
timer.time("load csv")

# The loader result is unpacked into five objects: the two full frames plus
# the train features, train target and test features.
train, test, train_x, train_y, test_x = data
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)

# 0/1 array marking the rows whose target is below -30.
outliers = (train["target"] < -30).astype(int).values
split_num = 5
random_state = 4590
# Shuffled, seeded stratified splitter. It is presumably stratified on
# `outliers`, but the split() call lies outside this fragment.
skf = model_selection.StratifiedKFold(n_splits=split_num,
                                      shuffle=True,
                                      random_state=random_state)
# Accumulators; they are filled by code outside this fragment.
total_score = 0
models = []
Ejemplo n.º 9
0
# Resolve the directory two levels above this file and append it to the import
# path; presumably this is what lets the `elo` package imports below resolve.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.common import pocket_lgb, evaluator
from elo.loader import input_loader
from sklearn import model_selection

# Module-level helpers: a logger, a timer that is handed that logger, and a
# GoldenCsv file helper.
logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

data = input_loader.GoldenLoader().load_large_input()
# Timing checkpoint labelled "load csv" (GoldenTimer behaviour is defined elsewhere).
timer.time("load csv")

# The loader result is unpacked into five objects: the two full frames plus
# the train features, train target and test features.
train, test, train_x, train_y, test_x = data
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)

# Output frame keyed by the test ids, with `target` initialised to 0.
submission = pd.DataFrame()
submission["card_id"] = test["card_id"]
submission["target"] = 0
# Companion frame keyed by the train ids, with `cv_pred` initialised to 0.
train_cv = pd.DataFrame()
train_cv["card_id"] = train["card_id"]
train_cv["cv_pred"] = 0

# 0/1 array marking the rows whose target is below -30.
outliers = (train["target"] < -30).astype(int).values
Ejemplo n.º 10
0
# Resolve the directory two levels above this file and append it to the import
# path; presumably this is what lets the `elo` package imports below resolve.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.common import pocket_lgb, evaluator
from elo.loader import input_loader
from sklearn import model_selection

# Module-level helpers: a logger, a timer that is handed that logger, and a
# GoldenCsv file helper.
logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

data = input_loader.GoldenLoader().load_medium_with_team()
# Timing checkpoint labelled "load csv" (GoldenTimer behaviour is defined elsewhere).
timer.time("load csv")

# The loader result is unpacked into five objects: the two full frames plus
# the train features, train target and test features.
train, test, train_x, train_y, test_x = data
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)

# Output frame keyed by the test ids, with `target` initialised to 0.
submission = pd.DataFrame()
submission["card_id"] = test["card_id"]
submission["target"] = 0
# Companion frame keyed by the train ids, with `cv_pred` initialised to 0.
train_cv = pd.DataFrame()
train_cv["card_id"] = train["card_id"]
train_cv["cv_pred"] = 0

# 0/1 array marking the rows whose target is below -30.
outliers = (train["target"] < -30).astype(int).values
Ejemplo n.º 11
0
import os, sys
# Resolve the directory two levels above this file and append it to the import
# path; presumably this is what lets the `elo` package imports below resolve.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)
import numpy as np
import pandas as pd
import datetime
from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.loader import input_loader

# Module-level helpers: a logger, a timer that is handed that logger, and a
# GoldenCsv file helper.
logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# NOTE(review): the result of load_small_input() is unpacked into two values
# here, while another script unpacks it into five (train, test, train_x,
# train_y, test_x). Confirm the arity against the loader version in use;
# a mismatch raises ValueError at this line.
train, test = input_loader.GoldenLoader().load_small_input()
Ejemplo n.º 12
0
# Resolve the directory two levels above this file and append it to the import
# path; presumably this is what lets the `elo` package imports below resolve.
ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.common import pocket_lgb, evaluator
from elo.loader import input_loader
from sklearn import model_selection

# Module-level helpers: a logger, a timer that is handed that logger, and a
# GoldenCsv file helper.
logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

data = input_loader.GoldenLoader().load_medium_input()
# Timing checkpoint labelled "load csv" (GoldenTimer behaviour is defined elsewhere).
timer.time("load csv")

# The loader result is unpacked into five objects: the two full frames plus
# the train features, train target and test features.
train, test, train_x, train_y, test_x = data
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)

# Output frame keyed by the test ids, with `target` initialised to 0.
submission = pd.DataFrame()
submission["card_id"] = test["card_id"]
submission["target"] = 0
# Companion frame keyed by the train ids, with `cv_pred` initialised to 0.
train_cv = pd.DataFrame()
train_cv["card_id"] = train["card_id"]
train_cv["cv_pred"] = 0

# 0/1 array marking the rows whose target is below -30.
outliers = (train["target"] < -30).astype(int).values
Ejemplo n.º 13
0
sys.path.append(ROOT)

import pandas as pd
import numpy as np
from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.common import pocket_lgb, evaluator
from elo.loader import input_loader
from sklearn import model_selection

# Module-level helpers: a logger, a timer that is handed that logger, and a
# GoldenCsv file helper.
logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()

# Reference score recorded by the author: base score=3.636285831929972
# The pseudo labels ("pl") are applied fold by fold: each pass of the loop
# below splits them into a "use" part and a held-out "sub" part.
train, test = input_loader.GoldenLoader().load_team_input_v63()
test_pl = input_loader.GoldenLoader().load_pseudo_labels()

# Output frame keyed by the test ids, with `target` initialised to 0.
whole_sub = pd.DataFrame()
whole_sub["card_id"] = test["card_id"]
whole_sub["target"] = 0
# Companion frame keyed by the train ids, with `cv_pred` initialised to 0.
whole_cv = pd.DataFrame()
whole_cv["card_id"] = train["card_id"]
whole_cv["cv_pred"] = 0

# Despite the name `skf`, this is a plain (non-stratified) shuffled KFold.
skf = model_selection.KFold(n_splits=5, shuffle=True, random_state=4590)
# split() yields positional index arrays, hence the .iloc lookups: the first
# array of each pair selects the "use" rows and the second the held-out "sub"
# rows.
for pl_use_idx, pl_sub_idx in skf.split(test_pl):
    use_pl = test_pl.iloc[pl_use_idx]
    sub_pl = test_pl.iloc[pl_sub_idx]
    # Attach the test columns to each pseudo-label subset by card_id; the left
    # join keeps every pseudo-label row.
    use_pl = pd.merge(use_pl, test, on="card_id", how="left")
    sub_pl = pd.merge(sub_pl, test, on="card_id", how="left")