# 5.2.hyperopt.py — hyperopt TPE search over XGBoost parameters (5-fold CV, AUC).
import logging
import pandas as pd
import xgboost as xgb
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from params import col_nuls,col_less_10, RANDOM_STATE, params_xgb
# Hyperopt search space. Fixed training settings are kept apart from the
# parameters the TPE sampler actually explores, then merged into one dict.
_fixed_params = {
    'eta': 0.01,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'booster': 'gbtree',
    'seed': RANDOM_STATE,
}
# hp.quniform(label, low, high, q) draws uniformly on [low, high] rounded
# to multiples of q; tree-depth style values are cast to int later in score().
_search_params = {
    'max_depth': hp.quniform('max_depth', 3, 20, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 7, 1),
    'subsample': hp.quniform('subsample', 0.6, 1, 0.1),
    'gamma': hp.quniform('gamma', 0, 0.1, 0.01),
    'alpha': hp.quniform('alpha', 0, 1, 0.1),
    'lambda': hp.quniform('lambda', 0, 1, 0.1),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 1, 0.1),
}
space = {**_fixed_params, **_search_params}
def score(params):
    """Hyperopt objective: 5-fold stratified CV of XGBoost on ``xgb_train``.

    Parameters
    ----------
    params : dict
        One sample from ``space``; mutated in place to cast the integer
        tree parameters (hp.quniform yields floats, xgb.cv needs ints).

    Returns
    -------
    dict
        ``{'loss': -best_auc, 'status': STATUS_OK}`` — hyperopt minimizes
        loss, so the AUC is negated.
    """
    global xgb_train
    params['max_depth'] = int(params['max_depth'])
    params['min_child_weight'] = int(params['min_child_weight'])
    # Lazy %-args: formatting only happens if the record is emitted.
    logger.info('Training with params:')
    logger.info(params)
    cv_res = xgb.cv(params, xgb_train, num_boost_round=2000, nfold=5,
                    stratified=True, early_stopping_rounds=30)
    # .max()/.idxmax() on the Series instead of np.argmax: np.argmax on a
    # pandas Series is deprecated/ambiguous (label vs. position) across
    # pandas versions; idxmax is explicit. Avoid shadowing the function
    # name with a local called `score`.
    best_auc = cv_res['test-auc-mean'].max()
    best_rounds = int(cv_res['test-auc-mean'].idxmax())
    logger.info('score = %f', best_auc)
    logger.info('best_rounds = %d', best_rounds)
    return {'loss': -best_auc, 'status': STATUS_OK}
if __name__ == "__main__":
    # --- logging: mirror INFO records to console and to server_xgb.log ---
    logger = logging.getLogger('server_logger')
    logger.setLevel(logging.INFO)
    # create file handler which logs even debug messages
    fh = logging.FileHandler('server_xgb.log')
    fh.setLevel(logging.INFO)
    # create console handler with a higher log level
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    # create formatter and add it to the handlers
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
    ch.setFormatter(formatter)
    fh.setFormatter(formatter)
    # add the handlers to logger
    logger.addHandler(ch)
    logger.addHandler(fh)

    # --- data: three pickled feature sets concatenated column-wise ---
    # (test-set loading kept for reference, not needed for the CV search)
    # test = pd.concat([pd.read_pickle('data/aggregation/test.pkl', compression='gzip'),
    #                   pd.read_pickle('data/cross/test.pkl', compression='gzip'),
    #                   pd.read_pickle('data/double_cross/test.pkl', compression='gzip')], axis=1)
    # ids_test = pd.read_pickle('data/drop_duplicates/ids_test.pkl', compression='gzip')
    train = pd.concat([pd.read_pickle('data/aggregation/train.pkl', compression='gzip'),
                       pd.read_pickle('data/cross/train.pkl', compression='gzip'),
                       pd.read_pickle('data/double_cross/train.pkl', compression='gzip')], axis=1)
    ids_train = pd.read_pickle('data/drop_duplicates/ids_train.pkl', compression='gzip')
    target = pd.read_pickle('data/drop_duplicates/target.pkl', compression='gzip')
    # Drop columns flagged as all-null or near-constant (lists come from params.py).
    train = train[list(set(train.columns.values) - set(col_nuls) - set(col_less_10))]
    xgb_train = xgb.DMatrix(train, label=target)

    # --- search: 150 TPE evaluations of score() over `space` ---
    trials = Trials()
    best = fmin(fn=score,
                space=space,
                algo=tpe.suggest,
                trials=trials,
                max_evals=150
                )
    # BUGFIX: `best` was previously discarded — after ~150 long CV runs the
    # winning parameter set was never recorded anywhere. Log it.
    logger.info('Best params found: %s', best)
'''
2017-08-31 15:22:56 - INFO - Training with params:
2017-08-31 15:22:56 - INFO - {'colsample_bytree': 1.0, 'eval_metric': 'auc', 'min_child_weight': 6, 'subsample': 0.6000000000000001, 'eta': 0.01, 'objective': 'binary:logistic', 'alpha': 0.9, 'booster': 'gbtree', 'seed': 2017, 'max_depth': 4, 'gamma': 0.05, 'lambda': 0.6000000000000001}
2017-08-31 15:42:43 - INFO - score = 0.701148
2017-08-31 15:42:43 - INFO - best_rounds = 1296.000000
2017-08-31 16:06:34 - INFO - Training with params:
2017-08-31 16:06:34 - INFO - {'colsample_bytree': 1.0, 'eval_metric': 'auc', 'min_child_weight': 6, 'subsample': 0.6000000000000001, 'eta': 0.01, 'objective': 'binary:logistic', 'alpha': 0.5, 'booster': 'gbtree', 'seed': 2017, 'max_depth': 7, 'gamma': 0.01, 'lambda': 0.6000000000000001}
2017-08-31 16:32:25 - INFO - score = 0.700087
2017-08-31 16:32:25 - INFO - best_rounds = 970.000000
2017-08-31 16:32:25 - INFO - Training with params:
2017-08-31 16:32:25 - INFO - {'colsample_bytree': 0.9, 'eval_metric': 'auc', 'min_child_weight': 6, 'subsample': 0.6000000000000001, 'eta': 0.01, 'objective': 'binary:logistic', 'alpha': 0.4, 'booster': 'gbtree', 'seed': 2017, 'max_depth': 4, 'gamma': 0.03, 'lambda': 0.7000000000000001}
2017-08-31 16:50:31 - INFO - score = 0.700914
2017-08-31 16:50:31 - INFO - best_rounds = 1294.000000
2017-08-31 17:44:39 - INFO - Training with params:
2017-08-31 17:44:39 - INFO - {'colsample_bytree': 1.0, 'eval_metric': 'auc', 'min_child_weight': 7, 'subsample': 0.7000000000000001, 'eta': 0.01, 'objective': 'binary:logistic', 'alpha': 1.0, 'booster': 'gbtree', 'seed': 2017, 'max_depth': 5, 'gamma': 0.01, 'lambda': 0.4}
2017-08-31 18:06:31 - INFO - score = 0.700996
2017-08-31 18:06:31 - INFO - best_rounds = 1160.000000
'''