def explore_features():
    comments, labels = load_extended_data()
    ft = TextFeatureTransformer()
    features, flat_words_lower, filtered_words, comments_filtered = \
        ft._preprocess(comments)
    # Dump the filtered token lists, one comment per line, for manual
    # inspection of what the preprocessing keeps.
    filtered_joined = [" ".join(words) for words in filtered_words]
    np.savetxt("filtered.txt", filtered_joined, fmt="%s")
def analyze_output():
    comments, labels = load_data()
    y_train, y_test, comments_train, comments_test = \
        train_test_split(labels, comments, random_state=1)
    #from sklearn.tree import DecisionTreeClassifier
    #bad = BadWordCounter()
    #custom = bad.transform(comments_train)
    clf = LogisticRegression(tol=1e-8, penalty='l2', C=1.5)
    #clf = DecisionTreeClassifier(compute_importances=True, min_samples_leaf=10)
    ft = TextFeatureTransformer().fit(comments_train, y_train)
    X_train = ft.transform(comments_train)
    # Optional feature selection, disabled for now:
    #select = SelectPercentile(score_func=chi2, percentile=7)
    #X_train_s = select.fit_transform(X_train, y_train)
    X_test = ft.transform(comments_test)
    clf.fit(X_train, y_train)
    #from sklearn.tree import export_graphviz
    #export_graphviz(clf, "tree3.dot", ft.get_feature_names())
    #tracer()
    #X_test_s = select.transform(X_test)

    # Predictions and probabilities on both splits; auc_score is the old
    # sklearn name for what is now roc_auc_score.
    probs = clf.predict_proba(X_test)
    pred = clf.predict(X_test)
    pred_train = clf.predict(X_train)
    probs_train = clf.predict_proba(X_train)
    print("auc: %f" % auc_score(y_test, probs[:, 1]))
    print("auc train: %f" % auc_score(y_train, probs_train[:, 1]))

    # Collect false positives / false negatives on the training set. Each
    # row holds index, bad-word count (column -2 of the feature matrix),
    # predicted probability and the comment itself, for inspection in the
    # debugger session at the end.
    fp_train = np.where(pred_train > y_train)[0]
    fn_train = np.where(pred_train < y_train)[0]
    fn_comments_train = comments_train[fn_train]
    fp_comments_train = comments_train[fp_train]
    n_bad_train = X_train[:, -2].toarray().ravel()
    fn_comments_train = np.vstack([fn_train, n_bad_train[fn_train],
                                   probs_train[fn_train][:, 1],
                                   fn_comments_train]).T
    fp_comments_train = np.vstack([fp_train, n_bad_train[fp_train],
                                   probs_train[fp_train][:, 1],
                                   fp_comments_train]).T

    # Same bookkeeping on the test set.
    fp = np.where(pred > y_test)[0]
    fn = np.where(pred < y_test)[0]
    fn_comments = comments_test[fn]
    fp_comments = comments_test[fp]
    n_bad = X_test[:, -2].toarray().ravel()
    fn_comments = np.vstack([fn, n_bad[fn], probs[fn][:, 1], fn_comments]).T
    fp_comments = np.vstack([fp, n_bad[fp], probs[fp][:, 1], fp_comments]).T

    # Visualize important features: plot the 100 coefficients with the
    # largest absolute value, sorted, labeled with their feature names.
    #important = np.abs(clf.coef_.ravel()) > 0.001
    #coef_ = select.inverse_transform(clf.coef_)
    coef_ = clf.coef_
    important = np.argsort(np.abs(coef_.ravel()))[-100:]
    feature_names = ft.get_feature_names()
    f_imp = feature_names[important]
    coef = coef_.ravel()[important]
    inds = np.argsort(coef)
    f_imp = f_imp[inds]
    coef = coef[inds]
    plt.plot(coef, label="l2")
    ax = plt.gca()
    ax.set_xticks(np.arange(len(coef)))
    labels = ax.set_xticklabels(f_imp)
    for label in labels:
        label.set_rotation(90)
    plt.savefig("ana.png", bbox_inches="tight")
    plt.show()

    def about(comment_num):
        # Explain one test comment: list its nonzero features together with
        # their values and learned coefficients, sorted by contribution.
        print(comments_test[comment_num])
        inds = np.where(X_test[comment_num].toarray())[1]
        coef_com = coef_.ravel()[inds]
        feat_entries = X_test[comment_num, inds].toarray().ravel()
        sorting = np.argsort(coef_com * feat_entries)
        contributions = np.vstack([feature_names[inds][sorting],
                                   feat_entries[sorting],
                                   coef_com[sorting]]).T
        print(contributions)

    # Drop into an interactive debugger so the arrays above and the
    # about() helper can be explored.
    tracer()
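
# ``tracer()`` above drops into an interactive debugger and is provided
# elsewhere in the project. If it were missing, a minimal stand-in (an
# assumption, not the project's own helper) could look like this:
try:
    tracer
except NameError:
    try:
        # Older IPython versions ship a Tracer class for this purpose.
        from IPython.core.debugger import Tracer
        tracer = Tracer()
    except ImportError:
        # Fall back to the standard-library debugger.
        from pdb import set_trace as tracer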
def build_nltk_model():
    # Full model: text features -> chi2 feature selection -> logistic
    # regression, wrapped in a single sklearn Pipeline.
    select = SelectPercentile(score_func=chi2, percentile=36)
    clf = LogisticRegression(tol=1e-8, penalty='l2', C=2)
    ft = TextFeatureTransformer()
    pipeline = Pipeline([('vect', ft), ('select', select), ('logr', clf)])
    return pipeline
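
# A minimal usage sketch for the pipeline above (an assumption about how it
# is driven, not the project's actual entry point): train on the same split
# used in analyze_output() and report the held-out AUC.
if __name__ == "__main__":
    comments, labels = load_data()
    y_train, y_test, comments_train, comments_test = \
        train_test_split(labels, comments, random_state=1)
    pipeline = build_nltk_model()
    pipeline.fit(comments_train, y_train)
    probs = pipeline.predict_proba(comments_test)
    print("pipeline auc: %f" % auc_score(y_test, probs[:, 1]))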