def build_rf_submission(): X, y, X_holdout, ids = prepare_data("./data/", drop_categorical=False) # Right now we look at an extra y_train, y_test to assess the quality of our cv-estimates. X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=42) print "Run Random Forest with {} data points and {} features.".format(X_train.shape[0], X_train.shape[1]) t0 = time() grid_cv = find_cv_rf_model(X_train, y_train, grid=False) # stochastic search now best_clf = grid_cv.best_estimator_ y_pred = best_clf.predict_proba(X_test) print "Done in %0.3fs" % (time() - t0) print "Best params {}: ".format(grid_cv.best_params_) print "Best CV score {}: ".format(grid_cv.best_score_) print "Training log-loss: {}".format(log_loss(y_train, best_clf.predict_proba(X_train))) print "Training accuracy: {}".format(best_clf.score(X_train, y_train)) print "Test log-loss: {}".format(log_loss(y_test, y_pred)) print "Test accuracy: {}".format(best_clf.score(X_test, y_test)) submission_name = "submission_{}.csv".format(time()) util.note_submission_info("Model: {}".format(best_clf), submission_name) util.build_submission(best_clf, X_holdout, ids, submission_name)
# NOTE(review): this is the tail of a function whose header lies above the
# visible chunk; it assumes X, y, X_holdout, ids and cat_idx (indices of
# the categorical columns) were prepared earlier — confirm against the
# missing lines.
encoder = OneHotEncoder(categorical_features=cat_idx, sparse=True, handle_unknown="ignore")
n_rows = X.shape[0]  # NOTE(review): unused in the visible remainder of the function
# Shift the categorical codes up by one — presumably so that a -1
# "missing" marker becomes 0 and the (old-API) OneHotEncoder only sees
# non-negative values; TODO confirm against how the data was encoded.
X[:, cat_idx] = X[:, cat_idx] + 1
X_holdout[:, cat_idx] = X_holdout[:, cat_idx] + 1
# Fit the encoder on the training matrix only; categories that appear
# only in the holdout set are ignored via handle_unknown="ignore".
X = encoder.fit_transform(X)
X_holdout = encoder.transform(X_holdout)
print "Run LogReg with {} data points and {} features.".format(X.shape[0], X.shape[1])
t0 = time()
#grid_cv = find_cv_rf_model(X, y)
#best_clf = grid_cv.best_estimator_
# C was picked by hand from the CV runs recorded in the comments below.
best_clf = LogisticRegression(C=0.25, penalty='l1')
best_clf.fit(X, y)
# CV log-loss -0.483927951479 (+- 0.00121618173757) for C = 0.1
# CV log-loss -0.484415686138 (+- 0.00105883337937) for C = 0.5
# CV log-loss -0.483759409049 (+- 0.00121414467795) for C = 0.25
print "Done in %0.3fs" % (time() - t0)
# 7-fold CV on the full training set, parallelized over 7 jobs.
cv_scores = cross_validation.cross_val_score(best_clf, X, y, scoring='log_loss', n_jobs=7, cv=7, verbose=1)
print "CV log-loss {} (+- {})".format(cv_scores.mean(), cv_scores.std())
# np.hstack of two 1-D arrays concatenates them: class-1 probabilities for
# the training rows followed by those for the holdout rows, in one flat
# vector — saved below as a meta-feature for downstream models.
M = np.hstack((best_clf.predict_proba(X)[:, 1], best_clf.predict_proba(X_holdout)[:, 1]))
print "Probability output shape: {}".format(M.shape)  # just to be sure we do not mess up here
# Raw binary dump (no .npy header despite the extension — read back with
# np.fromfile, not np.load).
M.tofile("./features/logreg_features.npy")
# Timestamp (epoch float) keeps successive submission files distinct.
submission_name = "submission_lr_{}.csv".format(time())
util.note_submission_info("Model: {}\n".format(best_clf), submission_name)
util.build_submission(best_clf, X_holdout, ids, submission_name)