from time import time

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

from extraction import prepare_data, load_extra_features


def build_rf_features():
    # Load the base features plus the extra feature blocks for train/holdout.
    X, y, X_holdout, ids = prepare_data("./data/", drop_categorical=False)
    X_extra, X_holdout_extra = load_extra_features()
    X = np.hstack((X, X_extra))
    X_holdout = np.hstack((X_holdout, X_holdout_extra))

    # Hold out 20% of the training data to score each depth setting.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Sweep the tree depth and report fit time and validation log loss.
    for max_depth in range(1, 15):
        print("max depth {}".format(max_depth))
        t0 = time()
        rf_clf = RandomForestClassifier(n_estimators=200,
                                        max_depth=max_depth,
                                        criterion="entropy",
                                        n_jobs=-1)
        rf_clf.fit(X_train, y_train)
        print("Done in %0.3fs" % (time() - t0))
        print(log_loss(y_test, rf_clf.predict_proba(X_test)))
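
# A minimal alternative sketch, not part of the original script: the same
# depth sweep done with GridSearchCV over 3 folds instead of a single 80/20
# split, scoring by negative log loss. The function name sweep_rf_depth_cv is
# hypothetical; X and y are assumed to be the stacked feature matrix and
# labels built as in build_rf_features above.
from sklearn.model_selection import GridSearchCV


def sweep_rf_depth_cv(X, y):
    grid = GridSearchCV(
        RandomForestClassifier(n_estimators=200, criterion="entropy",
                               n_jobs=-1),
        param_grid={"max_depth": list(range(1, 15))},
        scoring="neg_log_loss",
        cv=3,
    )
    grid.fit(X, y)
    # best_score_ is the negated log loss, so flip the sign for reporting.
    print("best depth: {}".format(grid.best_params_["max_depth"]))
    print("best log loss: {:.5f}".format(-grid.best_score_))
    return grid.best_estimator_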
import numpy as np
import xgboost as xgb

from extraction import prepare_data, load_extra_features

if __name__ == "__main__":
    # Load the base features plus the extra feature blocks for train/holdout.
    X, y, X_holdout, ids = prepare_data("./data/", drop_categorical=False)
    X_extra, X_holdout_extra = load_extra_features()

    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "eta": 0.005,  # 0.01
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "min_child_weight": 1,
        "max_depth": 10,
    }

    xg_train = xgb.DMatrix(np.hstack((X, X_extra)), label=y)
    xg_test = xgb.DMatrix(np.hstack((X_holdout, X_holdout_extra)))
    # xg_train = xgb.DMatrix(X, label=y)  # alternative: base features only

    # Low learning rate paired with a correspondingly large round count.
    xgb_clf = xgb.train(params, xg_train, num_boost_round=2500,
                        verbose_eval=True, maximize=False)
    y_pred = xgb_clf.predict(xg_test)  # ntree_limit=xgb_clf.best_iteration

    # cv_scores = xgb.cv(params, xg_train, num_boost_round=100, nfold=5,
    #                    metrics="logloss", seed=42, early_stopping_rounds=5)
    # print(cv_scores)
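
    # A minimal sketch, adapted from the commented-out cv call above and not
    # part of the original run: use xgb.cv with early stopping to pick the
    # boosting-round count instead of the fixed num_boost_round=2500.
    # xgb.cv returns one row per completed round, so when early stopping
    # triggers, the number of rows is the best round count.
    cv_scores = xgb.cv(params, xg_train, num_boost_round=2500, nfold=5,
                       metrics="logloss", seed=42, early_stopping_rounds=50)
    best_rounds = cv_scores.shape[0]
    print("best num_boost_round: {}".format(best_rounds))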