Example 1
def tune_num_round(num_round=5):
    """Use cross validation to find the optimal num_round
       num_round=462 gives the minimum test-mlogloss-mean
    """
    #history = pd.DataFrame(history, columns=['test-mlogloss-mean',  'test-mlogloss-std',
    #                                         'train-mlogloss-mean', 'train-mlogloss-std'])    # the columns of history
    X1, target, v_train, v_test = feature_extraction(useUpc=True)
    y = pd.get_dummies(target).values.argmax(1)
    N = X1.shape[0]
    seed = 137
    xgb_params = {
        'objective': 'multi:softprob',
        'num_class': 38,
        'eta': .2,
        'max_depth': 5,
        'colsample_bytree': .4,
        'subsample': .8,
        'silent': 1,
        'eval_metric': 'mlogloss',
        'seed': seed
    }
    dtrain = xgb.DMatrix(X1[v_train - 1], label=y)
    dtest = xgb.DMatrix(X1[v_test - 1])
    history = xgb.cv(xgb_params,
                     dtrain,
                     num_round,
                     nfold=3,
                     stratified=True,
                     metrics='mlogloss',
                     verbose_eval=True,
                     early_stopping_rounds=50)
    np.save(log_path + 'num_round_tuning.npy', history)
    plt.errorbar(range(len(history)),  # history can be shorter than num_round if early stopping triggers
                 history['train-mlogloss-mean'],
                 history['train-mlogloss-std'],
                 linestyle='None',
                 marker='s',
                 label='train',
                 mfc=None,
                 ms=2)
    plt.errorbar(range(len(history)),
                 history['test-mlogloss-mean'],
                 history['test-mlogloss-std'],
                 linestyle='None',
                 marker='o',
                 label='test',
                 mfc=None,
                 ms=2)
    plt.legend()
    plt.xlabel('Num_round')
    plt.ylabel('mlogloss')
    plt.savefig(log_path + 'cv.eps', format='eps', dpi=1000)
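# Not part of the original example: a minimal sketch of how the saved cv history
# could be loaded back to pick num_round. np.save stores the DataFrame as a plain
# 2D float array, so the columns are restored using the order given in the
# commented-out lines at the top of tune_num_round; check history.columns against
# your xgboost version if in doubt.
def best_num_round():
    arr = np.load(log_path + 'num_round_tuning.npy')
    history = pd.DataFrame(arr, columns=['test-mlogloss-mean', 'test-mlogloss-std',
                                         'train-mlogloss-mean', 'train-mlogloss-std'])
    best = int(history['test-mlogloss-mean'].idxmin())
    return best + 1, history.loc[best, 'test-mlogloss-mean']   # rounds are 1-based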
Example 2
def tune_params(params):
    """Use cross validation to find the optimal parameter
    
    Args:
         params: dict mapping parameter names to lists of candidate values to try
    Returns:
         best_params: best parameters found by cross validation
         cv_result: detailed result of cross validation
    """

    X1, target, v_train, v_test = feature_extraction(useUpc=True)
    y = pd.get_dummies(target).values.argmax(1)
    N = X1.shape[0]
    seed = 157
    xgb_params = {
        'learning_rate': [.2],
        'n_estimators': [3],
        'gamma': [0],
        'max_depth': [5],
        'min_child_weight': [1],
        'subsample': [1],
        'colsample_bytree': [.4],
        'colsample_bylevel': [.8],
        'reg_alpha': [0],
        'reg_lambda': [1]
    }
    xgb_params.update(params)
    clf = xgb.XGBClassifier(silent=True, objective='multi:softprob', seed=seed)
    bst = GridSearchCV(
        clf, xgb_params, scoring='neg_log_loss', cv=3, refit=False).fit(
            X1[v_train - 1],
            y)  # don't specify n_jobs in GridSearchCV: launching several xgb
    # processes in parallel can make xgb crash, and xgb is already multithreaded internally
    best_params = bst.best_params_
    cv_result = bst.cv_results_
    return best_params, cv_result
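# A usage sketch, not from the source: tuning max_depth and min_child_weight with
# the helper above. Keys must be sklearn-style XGBClassifier parameter names and
# every value must be a list of candidates.
best_params, cv_result = tune_params({'max_depth': [4, 5, 6],
                                      'min_child_weight': [1, 3, 5]})
print(best_params)                     # e.g. {'max_depth': 5, 'min_child_weight': 1}
print(cv_result['mean_test_score'])    # mean neg_log_loss for every grid point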
Example 3
import scipy as sp
import pandas as pd
from datetime import datetime
import xgboost as xgb
from sklearn.metrics import log_loss

from utility_common import feature_extraction, data_path

# r087
# 2015/12/16 14h20m
# Ensemble
# XGB
# params: nt (=num_round)

# ncol: 138610
X1, target, v_train, v_test = feature_extraction(useUpc=True)
y = pd.get_dummies(target).values.argmax(1)

X1 = X1[v_train-1]

nModels = 10

sh = .2
cs = .4
bf = .8
xgb_params = {'eta':sh, 'silent':1, 'objective':'multi:softprob', 'num_class':38,
              'colsample_bytree':cs, 'subsample':bf,
              'eval_metric':'mlogloss', 'nthread':8}

nt_dict = {4:range(500, 951, 50), 5:range(300, 701, 50)}
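# The training loop of this script is not included in the snippet. As a rough
# sketch only (the split, the seeds and the (max_depth, nt) pair below are
# illustrative assumptions), xgb_params, nModels and nt_dict could be combined
# into a bagged prediction scored on a held-out part of the training data:
from sklearn.model_selection import train_test_split

X_tr, X_va, y_tr, y_va = train_test_split(X1, y, test_size=.2,
                                          random_state=0, stratify=y)
dtr = xgb.DMatrix(X_tr, label=y_tr)
dva = xgb.DMatrix(X_va)

depth, nt = 5, 550                          # one candidate pair from nt_dict
params = dict(xgb_params, max_depth=depth)
pred = 0.
for j in range(nModels):
    params['seed'] = 131 + j                # a different seed for each bagged model
    bst = xgb.train(params, dtr, nt)
    pred += bst.predict(dva) / nModels      # average the multi:softprob outputs
print(log_loss(y_va, pred))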
Example 4
import pandas as pd
import xgboost as xgb
from datetime import datetime
from sklearn.metrics import log_loss

from lasagne.layers import InputLayer, DropoutLayer, DenseLayer

from utility_common import feature_extraction, data_path
from utility_nn import build_net_sparse_input
from utility_xgb import feature_selection

# NN [6000+, 60, 100, 38], [6000+, 70, 90, 38]
# 2015/12/25-26 21h 

# X2.shape[1]:13916
X2, target, v_train, v_test = feature_extraction(useUpc=False)
y = pd.get_dummies(target).values.argmax(1)
N = X2.shape[0]

# X2[v_train-1]: training
# X2[v_test-1]: test

# Parameters
# r096, r104
nModels = 50
lr = .02
mm = .2
p = .1
bs = 256
params_lst = [{'h1':60, 'h2':100, 'max_epochs':390}, {'h1':70, 'h2':90, 'max_epochs':410}]
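# build_net_sparse_input (imported above) is project-specific and not shown in this
# snippet; among other things it presumably handles the sparse input matrix. Purely
# as an illustration of the [n_features, h1, h2, 38] architecture hinted at in the
# comments, a lasagne stack built from the imported layer classes might look like
# this, with lr, mm, bs and max_epochs driving the (not shown) training loop:
from lasagne.nonlinearities import rectify, softmax

def build_layers(n_features, h1, h2, dropout_p=p):
    net = InputLayer(shape=(None, n_features))
    net = DropoutLayer(net, p=dropout_p)
    net = DenseLayer(net, num_units=h1, nonlinearity=rectify)
    net = DenseLayer(net, num_units=h2, nonlinearity=rectify)
    return DenseLayer(net, num_units=38, nonlinearity=softmax)   # 38 output classes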
"""
Example 5
import numpy as np
import scipy as sp
import pandas as pd
from datetime import datetime
import xgboost as xgb
from sklearn.metrics import log_loss

from utility_common import feature_extraction, data_path

# XGB
# 2015/12/25 7h22m
nModels = 50

# X1.shape[1]: 138610
X1, target, v_train, v_test = feature_extraction(useUpc=True)
y = pd.get_dummies(target).values.argmax(1)
N = X1.shape[0]

# Parameters
# r087

num_round = 550
xgb_params = {
    'objective': 'multi:softprob',
    'num_class': 38,
    'eta': .2,
    'max_depth': 5,
    'colsample_bytree': .4,
    'subsample': .8,
    'silent': 1,
}
Example 6
from utility_common import feature_extraction, data_path
from utility_nn import build_net_sparse_input
from utility_xgb import feature_selection

# r096
# 2015/12/23-24 1 day, 1:47:32.535909
# (h1,h2):(60, 100), (70, 90)
# CV pred
# Feature selection by XGB
# Shuffle data, No scaling, No normalizing
# NN 2 hidden layers
# params: epochs (=max_epochs)

# ncol:13916
X4, target, v_train, v_test = feature_extraction(training, test, useUpc=False)
N = X4.shape[0]
X4 = X4[v_train-1]

# params for xgb
nt = 400
tc = 6
sh = .2
cs = .4
bf = .8
xgb_params = {'eta':sh, 'silent':1, 'objective':'multi:softprob', 'num_class':38,
              'max_depth':tc, 'colsample_bytree':cs, 'subsample':bf,
              'eval_metric':'mlogloss', 'nthread':8}

# params for nn
nModels = 20
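# feature_selection (imported from utility_xgb) is not shown here. As a rough
# sketch of the "Feature selection by XGB" idea mentioned in the comments above,
# one could train a booster with xgb_params/nt and keep only the columns it
# actually splits on. y is assumed to be the usual label vector
# (pd.get_dummies(target).values.argmax(1)), as in the other snippets.
def xgb_used_columns(X, y, params, num_round):
    bst = xgb.train(params, xgb.DMatrix(X, label=y), num_round)
    # get_fscore() keys look like 'f123'; convert them back to column indices
    return sorted(int(f[1:]) for f in bst.get_fscore())

# cols = xgb_used_columns(X4, y, xgb_params, nt)
# X4 = X4[:, cols]   # keep only the selected features before training the NN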