""" This file is for preparing data as below. - Translate Japanese strings into English. - Replace hash strings with ids. - Add period to coupons. """ import pandas as pd import numpy as np from util_logger import get_logger LOG = get_logger() LOG.info("a00") # 0. load files ---------------------------------------- user_list = pd.read_csv("../input/user_list.csv") coupon_area_train = pd.read_csv("../input/coupon_area_train.csv") coupon_area_test = pd.read_csv("../input/coupon_area_test.csv") coupon_list_train = pd.read_csv("../input/coupon_list_train.csv") coupon_list_test = pd.read_csv("../input/coupon_list_test.csv") coupon_detail_train = pd.read_csv("../input/coupon_detail_train.csv") coupon_visit_train = pd.read_csv("../input/coupon_visit_train.csv") def csv_to_dict(path): df = pd.read_csv(path) return dict([(r.jp, r.en) for i, r in df.iterrows()]) dict_SMALLAREA = csv_to_dict("../input/SMALLAREA.csv") dict_PREF = csv_to_dict("../input/PREF.csv") dict_GENRE = csv_to_dict("../input/GENRE.csv") dict_LARGEAREA = csv_to_dict("../input/LARGEAREA.csv")
""" This file runs xgboost, train with train data and predict test data. """ import pandas as pd import numpy as np import gc import xgboost as xgb from sklearn.externals import joblib from sklearn.cross_validation import StratifiedKFold from util_logger import get_logger import sys argvs = sys.argv _, runtype, version = argvs LOG = get_logger() LOG.info("start e01") def run_xgboost(labels, weights, data): # convert data into xgb.DMatrix. # train using 80% of the data, 20% of the data is used for watchlist skf = StratifiedKFold(labels, 5, random_state=123) idx_train, idx_test = list(skf)[0] dtrain = xgb.DMatrix(data[idx_train, :], weight=weights[idx_train], label=labels[idx_train], missing=np.nan) dvalid = xgb.DMatrix(data[idx_test, :],