def build_rf_submission():
    """Search for a model on a training split, report metrics, build a submission.

    Data comes from ``prepare_data("./data/", drop_categorical=False)``.
    70% of it (fixed seed 42) is passed to ``find_cv_rf_model``; the
    remaining 30% is only used to print log-loss and accuracy next to the
    cross-validation score. The best estimator of the search is then handed
    to ``util.note_submission_info`` and ``util.build_submission`` together
    with the holdout matrix and ids, under a timestamped file name.

    Takes no arguments and returns nothing; all results are printed or
    passed on to ``util``.
    """
    features, labels, holdout_features, holdout_ids = prepare_data(
        "./data/", drop_categorical=False)

    # The extra 30% split exists to sanity-check the CV estimates against
    # data the search never saw.
    fit_X, eval_X, fit_y, eval_y = train_test_split(
        features, labels, random_state=42, test_size=0.3)

    n_samples, n_features = fit_X.shape[0], fit_X.shape[1]
    print("Run Random Forest with {} data points and {} features.".format(n_samples, n_features))

    started = time()
    # grid=False: per the original author's note this selects the stochastic
    # search variant -- confirm against find_cv_rf_model.
    search = find_cv_rf_model(fit_X, fit_y, grid=False)
    model = search.best_estimator_
    eval_proba = model.predict_proba(eval_X)
    # The reported duration covers the search plus the prediction above.
    elapsed = time() - started
    print("Done in %0.3fs" % elapsed)

    print("Best params {}: ".format(search.best_params_))
    print("Best CV score {}: ".format(search.best_score_))

    train_loss = log_loss(fit_y, model.predict_proba(fit_X))
    print("Training log-loss: {}".format(train_loss))
    train_accuracy = model.score(fit_X, fit_y)
    print("Training accuracy: {}".format(train_accuracy))

    eval_loss = log_loss(eval_y, eval_proba)
    print("Test log-loss: {}".format(eval_loss))
    eval_accuracy = model.score(eval_X, eval_y)
    print("Test accuracy: {}".format(eval_accuracy))

    # The current timestamp keeps file names from successive runs distinct.
    stamp = time()
    submission_name = "submission_{}.csv".format(stamp)
    model_note = "Model: {}".format(model)
    util.note_submission_info(model_note, submission_name)
    util.build_submission(model, holdout_features, holdout_ids, submission_name)
Beispiel #2
0
    # One-hot encode only the columns listed in cat_idx, producing a sparse
    # matrix; handle_unknown="ignore" lets transform() accept category values
    # that were not present when the encoder was fitted.
    encoder = OneHotEncoder(categorical_features=cat_idx, sparse=True, handle_unknown="ignore")
    # NOTE(review): n_rows is not used anywhere in the visible part of this
    # function.
    n_rows = X.shape[0]

    # Shift the categorical codes up by one in both matrices (in place).
    # NOTE(review): presumably this maps a -1 "missing" code to 0 so that all
    # codes are non-negative for the encoder -- confirm against the data prep.
    X[:, cat_idx] = X[:, cat_idx] + 1
    X_holdout[:, cat_idx] = X_holdout[:, cat_idx] + 1

    # The encoder is fitted on X only and then re-used for the holdout matrix.
    X = encoder.fit_transform(X)
    X_holdout = encoder.transform(X_holdout)

    print "Run LogReg with {} data points and {} features.".format(X.shape[0], X.shape[1])
    t0 = time()
    #grid_cv = find_cv_rf_model(X, y)
    #best_clf = grid_cv.best_estimator_

    # Fixed hyper-parameters instead of a search: of the three CV results
    # recorded below, C = 0.25 has the score closest to zero.
    best_clf = LogisticRegression(C=0.25, penalty='l1')
    best_clf.fit(X, y)

    # CV log-loss -0.483927951479 (+- 0.00121618173757) for C = 0.1
    # CV log-loss -0.484415686138 (+- 0.00105883337937) for C = 0.5
    # CV log-loss -0.483759409049 (+- 0.00121414467795) for C = 0.25
    # The elapsed time printed here covers only the fit above; the
    # cross-validation below runs after the timer is reported.
    print "Done in %0.3fs" % (time() - t0)
    # 7-fold cross-validation with 7 parallel jobs, scored with 'log_loss'
    # (the recorded results above show this scorer yields negative values).
    cv_scores = cross_validation.cross_val_score(best_clf, X, y, scoring='log_loss',  n_jobs=7, cv=7, verbose=1)
    print "CV log-loss {} (+- {})".format(cv_scores.mean(), cv_scores.std())

    # Column 1 of predict_proba for X followed by the same column for
    # X_holdout, concatenated into one 1-D array (training rows first).
    # NOTE(review): column 1 is presumably the positive-class probability --
    # confirm against best_clf.classes_.
    M = np.hstack((best_clf.predict_proba(X)[:, 1], best_clf.predict_proba(X_holdout)[:, 1]))
    print "Probability output shape: {}".format(M.shape) # just to be sure we do not mess up here
    # NOTE(review): ndarray.tofile writes raw array bytes without the .npy
    # header, so despite the extension this file has to be read back with
    # np.fromfile (with a matching dtype), not np.load.
    M.tofile("./features/logreg_features.npy")

    # The current timestamp keeps file names from successive runs distinct.
    submission_name = "submission_lr_{}.csv".format(time())
    util.note_submission_info("Model: {}\n".format(best_clf), submission_name)
    util.build_submission(best_clf, X_holdout, ids, submission_name)