xgboost_experiment.py
from extraction import prepare_data, load_extra_features
import xgboost as xgb
import numpy as np
import pandas as pd
from time import time

if __name__ == "__main__":
    # Load the base dataset and the extra feature blocks for train and holdout.
    X, y, X_holdout, ids = prepare_data("./data/", drop_categorical=False)
    X_extra, X_holdout_extra = load_extra_features()

    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "eta": 0.005,  # 0.01
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "min_child_weight": 1,
        "max_depth": 10,
    }

    # Stack the extra features next to the base features for both matrices.
    xg_train = xgb.DMatrix(np.hstack((X, X_extra)), label=y)
    xg_test = xgb.DMatrix(np.hstack((X_holdout, X_holdout_extra)))
    # xg_train = xgb.DMatrix(X, label=y)

    # Low learning rate paired with a large, fixed number of boosting rounds.
    xgb_clf = xgb.train(params, xg_train, num_boost_round=2500,
                        verbose_eval=True, maximize=False)
    y_pred = xgb_clf.predict(xg_test)  # , ntree_limit=xgb_clf.best_iteration

    # cv_scores = xgb.cv(params, xg_train, num_boost_round=100, nfold=5,
    #                    metrics="logloss", seed=42, early_stopping_rounds=5)
    # print(cv_scores)

    # Write the submission file, timestamped so repeated runs don't overwrite it.
    df = pd.DataFrame({"ID": ids, "PredictedProb": y_pred})
    df.to_csv("submission_xgb_ensemble_{}.csv".format(time()), index=False)
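
    # Possible refinement (a sketch, not part of the original run, kept commented
    # out like the cv call above): instead of hard-coding 2500 rounds, xgb.cv with
    # early stopping can suggest the number of boosting rounds. With
    # early_stopping_rounds set, the returned frame is truncated near the best
    # iteration, so its length is a usable round count for the final xgb.train call.
    # cv_results = xgb.cv(params, xg_train, num_boost_round=5000, nfold=5,
    #                     metrics="logloss", seed=42, early_stopping_rounds=50)
    # print(cv_results["test-logloss-mean"].min())
    # num_boost_round = len(cv_results)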