from time import time

import numpy as np
from extraction import prepare_data, load_extra_features
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split


def build_rf_features():
    X, y, X_holdout, ids = prepare_data("./data/", drop_categorical=False)

    # Append the extra engineered features to both the training and holdout sets.
    X1, X2 = load_extra_features()
    X = np.hstack((X, X1))
    X_holdout = np.hstack((X_holdout, X2))

    X, X_test, y, y_test = train_test_split(X, y, test_size=0.2)

    # Sweep over tree depth and report the validation log loss for each setting.
    for max_depth in range(1, 15):
        print("max depth {}".format(max_depth))
        rf_clf = RandomForestClassifier(n_estimators=200, max_depth=max_depth,
                                        criterion="entropy", n_jobs=-1)
        t0 = time()
        rf_clf.fit(X, y)

        print("Done in %0.3fs" % (time() - t0))
        print(log_loss(y_test, rf_clf.predict_proba(X_test)))
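
build_rf_features only prints each depth's validation log loss. A minimal sketch of a variant that also returns the best depth; best_rf_depth is a hypothetical helper, not part of the original code:

def best_rf_depth(X, X_test, y, y_test, depths=range(1, 15)):
    # Hypothetical helper: the same sweep as build_rf_features, but it keeps
    # the depth with the lowest validation log loss instead of only printing.
    best_depth, best_loss = None, float("inf")
    for max_depth in depths:
        rf_clf = RandomForestClassifier(n_estimators=200, max_depth=max_depth,
                                        criterion="entropy", n_jobs=-1)
        rf_clf.fit(X, y)
        loss = log_loss(y_test, rf_clf.predict_proba(X_test))
        if loss < best_loss:
            best_depth, best_loss = max_depth, loss
    return best_depth, best_loss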
Example #2
from extraction import prepare_data, load_extra_features
import xgboost as xgb
import numpy as np
import pandas as pd

if __name__ == "__main__":
    X, y, X_holdout, ids = prepare_data("./data/", drop_categorical=False)
    X_extra, X_holdout_extra = load_extra_features()

    params = {
        "objective": "binary:logistic",   # binary classification, probability output
        "eval_metric": "logloss",
        "eta": 0.005,                     # 0.01
        "subsample": 0.8,                 # row subsampling per tree
        "colsample_bytree": 0.8,          # column subsampling per tree
        "min_child_weight": 1,
        "max_depth": 10,
    }

    # Stack the base features with the extra engineered features, as in the
    # random forest example above.
    xg_train = xgb.DMatrix(np.hstack((X, X_extra)), label=y)
    xg_test = xgb.DMatrix(np.hstack((X_holdout, X_holdout_extra)))

    # xg_train = xgb.DMatrix(X, label=y)  # variant without the extra features

    # Train for a fixed 2500 rounds; verbose_eval only prints per-round scores
    # when an evals list is passed, so this runs silently.
    xgb_clf = xgb.train(params, xg_train, num_boost_round=2500, verbose_eval=True, maximize=False)

    y_pred = xgb_clf.predict(xg_test)  # ntree_limit=xgb_clf.best_iteration requires early stopping

    # cv_scores = xgb.cv(params, xg_train, num_boost_round=100, nfold=5,
    #                    metrics="logloss", seed=42, early_stopping_rounds=5)
    # print(cv_scores)
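
    # A minimal sketch of writing out the holdout predictions (not in the
    # original script); the file name and the "ID"/"PredictedProb" column
    # names are assumptions, so adjust them to the actual submission format.
    submission = pd.DataFrame({"ID": ids, "PredictedProb": y_pred})
    submission.to_csv("submission.csv", index=False)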