コード例 #1
0
"""
    This file prepares the data as follows.
    - Translate Japanese strings into English.
    - Replace hash strings with ids.
    - Add period to coupons.
"""

import pandas as pd
import numpy as np
from util_logger import get_logger
# Module-level logger obtained from the project's util_logger helper.
LOG = get_logger()
# Log the marker "a00" (presumably this script's step id — TODO confirm).
LOG.info("a00")

# 0. load files ----------------------------------------

# Read each input CSV into its own DataFrame. The paths are relative, so the
# script must be run from a directory where "../input/" resolves.
user_list = pd.read_csv("../input/user_list.csv")
coupon_area_train = pd.read_csv("../input/coupon_area_train.csv")
coupon_area_test = pd.read_csv("../input/coupon_area_test.csv")
coupon_list_train  = pd.read_csv("../input/coupon_list_train.csv")
coupon_list_test = pd.read_csv("../input/coupon_list_test.csv")
coupon_detail_train = pd.read_csv("../input/coupon_detail_train.csv")
coupon_visit_train = pd.read_csv("../input/coupon_visit_train.csv")

def csv_to_dict(path):
    """Build a Japanese-to-English lookup table from a CSV file.

    Args:
        path: Location of a CSV file that has a ``jp`` column (source
            strings) and an ``en`` column (their translations).

    Returns:
        dict: Maps each ``jp`` value to the ``en`` value on the same row.
        If a ``jp`` value occurs more than once, the last row wins. A file
        with a header but no rows yields an empty dict.

    Raises:
        KeyError: If the file lacks a ``jp`` or an ``en`` column.
    """
    df = pd.read_csv(path)
    # Pair the two columns directly instead of using iterrows(), which
    # materializes a Series for every row just to read two fields.
    return dict(zip(df["jp"], df["en"]))

# Lookup tables built with csv_to_dict (jp column -> en column), one per
# translation CSV in ../input/. Judging by the file names these cover small
# areas, prefectures, genres and large areas — confirm against the CSVs.
dict_SMALLAREA = csv_to_dict("../input/SMALLAREA.csv")
dict_PREF = csv_to_dict("../input/PREF.csv")
dict_GENRE = csv_to_dict("../input/GENRE.csv")
dict_LARGEAREA = csv_to_dict("../input/LARGEAREA.csv")
コード例 #2
0
"""
    This file runs xgboost: it trains on the training data and predicts on the test data.
"""

import pandas as pd
import numpy as np
import gc
import xgboost as xgb
from sklearn.externals import joblib
from sklearn.cross_validation import StratifiedKFold
from util_logger import get_logger

import sys
# Raw command-line arguments, including the script name at index 0.
argvs = sys.argv
# Exactly two arguments are required after the script name; any other count
# makes this unpacking raise ValueError. The script name is discarded into "_".
# NOTE(review): the accepted values of runtype and version are not visible in
# this part of the file — confirm where they are consumed.
_, runtype, version = argvs
# Module-level logger obtained from the project's util_logger helper.
LOG = get_logger()
# Log the start marker "start e01".
LOG.info("start e01")


def run_xgboost(labels, weights, data):

    # convert data into xgb.DMatrix.
    # train using 80% of the data, 20% of the data is used for watchlist
    skf = StratifiedKFold(labels, 5, random_state=123)
    idx_train, idx_test = list(skf)[0]

    dtrain = xgb.DMatrix(data[idx_train, :],
                         weight=weights[idx_train],
                         label=labels[idx_train],
                         missing=np.nan)
    dvalid = xgb.DMatrix(data[idx_test, :],