import numpy as np
import matplotlib.pyplot as plt

# NOTE: this script targets the pre-0.14 scikit-learn API (ShuffleSplit taking
# n_iterations=/indices=, metrics.auc_score, GridSearchCV taking score_func=);
# newer releases renamed or moved all of these.
from sklearn.cross_validation import ShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import auc_score

from IPython.core.debugger import Tracer
tracer = Tracer()  # interactive breakpoint used in grid_search() below

# load_extended_data, load_data, load_test, write_test and the build_*_model
# factories are project-local helpers defined elsewhere in this repository.


def eval_model():
    """Score each model and their probability average with shuffle-split CV."""
    comments, labels = load_extended_data()
    clf1 = build_base_model()
    clf2 = build_elasticnet_model()
    clf3 = build_stacked_model()
    clf4 = build_nltk_model()
    models = [clf1, clf2, clf3, clf4]
    #models = [clf1]
    cv = ShuffleSplit(len(comments), n_iterations=5, test_size=0.2,
                      indices=True)
    scores = []
    for train, test in cv:
        X_train, y_train = comments[train], labels[train]
        X_test, y_test = comments[test], labels[test]
        probs_common = np.zeros((len(test), 2))
        for clf in models:
            clf.fit(X_train, y_train)
            probs = clf.predict_proba(X_test)
            print("score: %f" % auc_score(y_test, probs[:, 1]))
            probs_common += probs
        # average the predicted probabilities over the ensemble
        probs_common /= len(models)
        scores.append(auc_score(y_test, probs_common[:, 1]))
        print("combined score: %f" % scores[-1])
    print(np.mean(scores), np.std(scores))
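
# For reference, a minimal sketch of the same ensemble evaluation against the
# modern scikit-learn API (model_selection.ShuffleSplit, metrics.roc_auc_score).
# The function name and explicit arguments are illustrative additions, not part
# of the original script; the imports are local so they do not clash with the
# legacy names used above.
def eval_model_modern(models, comments, labels):
    from sklearn.model_selection import ShuffleSplit
    from sklearn.metrics import roc_auc_score

    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    scores = []
    for train, test in cv.split(comments):
        probs_common = np.zeros((len(test), 2))
        for clf in models:
            clf.fit(comments[train], labels[train])
            probs_common += clf.predict_proba(comments[test])
        probs_common /= len(models)
        scores.append(roc_auc_score(labels[test], probs_common[:, 1]))
    return np.mean(scores), np.std(scores)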
def grid_search():
    """Grid-search the logistic regression regularizer C of the nltk model."""
    comments, labels = load_data()
    # "logr__C" addresses the "logr" step of the pipeline built below
    param_grid = dict(logr__C=np.arange(1, 20, 5))
    clf = build_nltk_model()
    cv = ShuffleSplit(len(comments), n_iterations=10, test_size=0.2)
    grid = GridSearchCV(clf, cv=cv, param_grid=param_grid, verbose=4,
                        n_jobs=12, score_func=auc_score)
    grid.fit(comments, labels)
    print(grid.best_score_)
    print(grid.best_params_)
    tracer()  # drop into the debugger to inspect the fitted grid

    # NOTE: grid.scores_ and its accumulated() helper are not part of stock
    # scikit-learn; they assume a patched GridSearchCV that records per-value
    # score statistics.
    cv_scores = grid.scores_
    for param in cv_scores.params:
        means, errors = cv_scores.accumulated(param, 'max')
        plt.errorbar(cv_scores.values[param], means, yerr=errors)
        plt.xlabel(param)
        plt.ylim((0.85, 0.93))
        plt.savefig("grid_plot_%s.png" % param)
        plt.close()

    # predict the test set with the best estimator and write a submission
    comments_test, dates_test = load_test()
    prob_pred = grid.best_estimator_.predict_proba(comments_test)
    write_test(prob_pred[:, 1])
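
# The "logr__C" key above implies that build_nltk_model() returns a Pipeline
# whose final step is named "logr" (a logistic regression). A hypothetical
# sketch of such a pipeline; the TfidfVectorizer features and every parameter
# shown are assumptions, not the original implementation.
def build_nltk_model_sketch():
    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression

    return Pipeline([
        ("vect", TfidfVectorizer(ngram_range=(1, 2), min_df=2)),
        ("logr", LogisticRegression(C=1.0)),
    ])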
def apply_models():
    """Fit all four models on the full training data and write predictions
    for the verification set, both per model and averaged."""
    comments, labels = load_extended_data()
    comments_test = load_test("impermium_verification_set_.csv")
    clf1 = build_base_model()
    clf2 = build_elasticnet_model()
    clf3 = build_stacked_model()
    clf4 = build_nltk_model()
    models = [clf1, clf2, clf3, clf4]
    probs_common = np.zeros((len(comments_test), 2))
    for i, clf in enumerate(models):
        clf.fit(comments, labels)
        probs = clf.predict_proba(comments_test)
        # no labels are available for the verification set, so no AUC here
        probs_common += probs
        write_test(probs[:, 1], "test_prediction_model_%d.csv" % i,
                   ds="impermium_verification_set_.csv")
    probs_common /= len(models)
    write_test(probs_common[:, 1], "test_prediction_combined.csv",
               ds="impermium_verification_set_.csv")
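
# A hypothetical entry point tying the steps together; the original script's
# actual invocation (if any) is not shown in this excerpt.
if __name__ == "__main__":
    grid_search()   # tune C and write grid_plot_*.png
    eval_model()    # cross-validated scores for the four-model ensemble
    apply_models()  # final per-model and combined verification predictions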