Esempio n. 1
0
def explore_features():
    """Preprocess the extended comment set and dump the filtered word
    lists (space-joined, one comment per line) to 'filtered.txt'."""
    comments, labels = load_extended_data()
    transformer = TextFeatureTransformer()
    (features, flat_words_lower, filtered_words,
     comments_filtered) = transformer._preprocess(comments)
    joined_lines = [" ".join(words) for words in filtered_words]
    np.savetxt("filtered.txt", joined_lines, fmt="%s")
Esempio n. 2
0
def explore_features():
    """Run the feature transformer's preprocessing over the extended data
    and write each comment's filtered words to 'filtered.txt'."""
    comments, labels = load_extended_data()
    ft = TextFeatureTransformer()
    preprocessed = ft._preprocess(comments)
    features, flat_words_lower, filtered_words, comments_filtered = preprocessed
    lines = []
    for word_list in filtered_words:
        lines.append(" ".join(word_list))
    np.savetxt("filtered.txt", lines, fmt="%s")
Esempio n. 3
0
def analyze_output():
    """Fit a logistic-regression model on text features and inspect its errors.

    Prints train/test AUC, builds arrays of false positives/negatives
    (index, bad-word count, predicted probability, comment text), plots the
    100 largest-magnitude feature coefficients to "ana.png", and finally
    drops into a debugger via tracer() for interactive inspection with the
    local helper about().
    """
    comments, labels = load_data()
    y_train, y_test, comments_train, comments_test = \
            train_test_split(labels, comments, random_state=1)
    #from sklearn.tree import DecisionTreeClassifier
    #bad = BadWordCounter()
    #custom = bad.transform(comments_train)

    clf = LogisticRegression(tol=1e-8, penalty='l2', C=1.5)
    #clf = DecisionTreeClassifier(compute_importances=True,min_samples_leaf=10)
    # Fit the feature extractor on the training split only, then transform both.
    ft = TextFeatureTransformer().fit(comments_train, y_train)
    X_train = ft.transform(comments_train)
    #select = SelectPercentile(score_func=chi2, percentile=7)
    #X_train_s = select.fit_transform(X_train, y_train)
    X_test = ft.transform(comments_test)
    clf.fit(X_train, y_train)
    #from sklearn.tree import export_graphviz
    #export_graphviz(clf, "tree3.dot", ft.get_feature_names())
    #tracer()
    #X_test_s = select.transform(X_test)
    probs = clf.predict_proba(X_test)
    pred = clf.predict(X_test)
    pred_train = clf.predict(X_train)
    probs_train = clf.predict_proba(X_train)
    # Column 1 of predict_proba is the probability of the positive class.
    print("auc: %f" % auc_score(y_test, probs[:, 1]))
    print("auc train: %f" % auc_score(y_train, probs_train[:, 1]))

    # False positives: predicted 1 where the label is 0; false negatives the
    # reverse (relies on labels/predictions being 0/1).
    fp_train = np.where(pred_train > y_train)[0]
    fn_train = np.where(pred_train < y_train)[0]
    fn_comments_train = comments_train[fn_train]
    fp_comments_train = comments_train[fp_train]
    # NOTE(review): the train block reads column -22 here, but the test block
    # below reads column -2 for the same quantity — confirm which index
    # actually holds the bad-word-count feature.
    n_bad_train = X_train[:, -22].toarray().ravel()
    fn_comments_train = np.vstack([
        fn_train, n_bad_train[fn_train], probs_train[fn_train][:, 1],
        fn_comments_train
    ]).T
    fp_comments_train = np.vstack([
        fp_train, n_bad_train[fp_train], probs_train[fp_train][:, 1],
        fp_comments_train
    ]).T

    fp = np.where(pred > y_test)[0]
    fn = np.where(pred < y_test)[0]
    fn_comments = comments_test[fn]
    fp_comments = comments_test[fp]
    n_bad = X_test[:, -2].toarray().ravel()
    fn_comments = np.vstack([fn, n_bad[fn], probs[fn][:, 1], fn_comments]).T
    fp_comments = np.vstack([fp, n_bad[fp], probs[fp][:, 1], fp_comments]).T

    # visualize important features
    #important = np.abs(clf.coef_.ravel()) > 0.001
    #coef_ = select.inverse_transform(clf.coef_)
    coef_ = clf.coef_
    # Indices of the 100 coefficients with the largest absolute value.
    important = np.argsort(np.abs(coef_.ravel()))[-100:]
    feature_names = ft.get_feature_names()
    f_imp = feature_names[important]
    coef = coef_.ravel()[important]
    # Re-sort the selected features by signed coefficient for plotting.
    inds = np.argsort(coef)
    f_imp = f_imp[inds]
    coef = coef[inds]
    plt.plot(coef, label="l1")
    ax = plt.gca()
    ax.set_xticks(np.arange(len(coef)))
    # NOTE(review): this rebinds `labels`, shadowing the dataset labels loaded
    # above (apparently no longer needed at this point).
    labels = ax.set_xticklabels(f_imp)
    for label in labels:
        label.set_rotation(90)
    plt.savefig("ana.png", bbox_inches="tight")
    plt.show()

    def about(comment_num):
        """Print a test comment and a per-feature breakdown (name, value,
        coefficient), sorted by each feature's contribution to the score."""
        print(comments_test[comment_num])
        # Column indices of the nonzero features in this sparse row.
        inds = np.where(X_test[comment_num].toarray())[1]
        coef_com = coef_.ravel()[inds]
        feat_entries = X_test[comment_num, inds].toarray().ravel()
        sorting = np.argsort(coef_com * feat_entries)
        blub = np.vstack([
            feature_names[inds][sorting], feat_entries[sorting],
            coef_com[sorting]
        ]).T
        print(blub)

    tracer()
Esempio n. 4
0
def build_nltk_model():
    """Assemble the text-classification pipeline: custom feature
    extraction, chi2 percentile selection, then L2 logistic regression."""
    steps = [
        ('vect', TextFeatureTransformer()),
        ('select', SelectPercentile(score_func=chi2, percentile=36)),
        ('logr', LogisticRegression(tol=1e-8, penalty='l2', C=2)),
    ]
    return Pipeline(steps)
Esempio n. 5
0
def analyze_output():
    """Fit a logistic-regression model on text features and inspect its errors.

    Prints train/test AUC, builds arrays of false positives/negatives
    (index, bad-word count, predicted probability, comment text), plots the
    100 largest-magnitude feature coefficients to "ana.png", and finally
    drops into a debugger via tracer() for interactive inspection with the
    local helper about().
    """
    comments, labels = load_data()
    y_train, y_test, comments_train, comments_test = \
            train_test_split(labels, comments, random_state=1)
    #from sklearn.tree import DecisionTreeClassifier
    #bad = BadWordCounter()
    #custom = bad.transform(comments_train)

    clf = LogisticRegression(tol=1e-8, penalty='l2', C=1.5)
    #clf = DecisionTreeClassifier(compute_importances=True,min_samples_leaf=10)
    # Fit the feature extractor on the training split only, then transform both.
    ft = TextFeatureTransformer().fit(comments_train, y_train)
    X_train = ft.transform(comments_train)
    #select = SelectPercentile(score_func=chi2, percentile=7)
    #X_train_s = select.fit_transform(X_train, y_train)
    X_test = ft.transform(comments_test)
    clf.fit(X_train, y_train)
    #from sklearn.tree import export_graphviz
    #export_graphviz(clf, "tree3.dot", ft.get_feature_names())
    #tracer()
    #X_test_s = select.transform(X_test)
    probs = clf.predict_proba(X_test)
    pred = clf.predict(X_test)
    pred_train = clf.predict(X_train)
    probs_train = clf.predict_proba(X_train)
    # Column 1 of predict_proba is the probability of the positive class.
    print("auc: %f" % auc_score(y_test, probs[:, 1]))
    print("auc train: %f" % auc_score(y_train, probs_train[:, 1]))

    # False positives: predicted 1 where the label is 0; false negatives the
    # reverse (relies on labels/predictions being 0/1).
    fp_train = np.where(pred_train > y_train)[0]
    fn_train = np.where(pred_train < y_train)[0]
    fn_comments_train = comments_train[fn_train]
    fp_comments_train = comments_train[fp_train]
    # NOTE(review): the train block reads column -22 here, but the test block
    # below reads column -2 for the same quantity — confirm which index
    # actually holds the bad-word-count feature.
    n_bad_train = X_train[:, -22].toarray().ravel()
    fn_comments_train = np.vstack([fn_train, n_bad_train[fn_train],
        probs_train[fn_train][:, 1], fn_comments_train]).T
    fp_comments_train = np.vstack([fp_train, n_bad_train[fp_train],
        probs_train[fp_train][:, 1], fp_comments_train]).T

    fp = np.where(pred > y_test)[0]
    fn = np.where(pred < y_test)[0]
    fn_comments = comments_test[fn]
    fp_comments = comments_test[fp]
    n_bad = X_test[:, -2].toarray().ravel()
    fn_comments = np.vstack([fn, n_bad[fn], probs[fn][:, 1], fn_comments]).T
    fp_comments = np.vstack([fp, n_bad[fp], probs[fp][:, 1], fp_comments]).T

    # visualize important features
    #important = np.abs(clf.coef_.ravel()) > 0.001
    #coef_ = select.inverse_transform(clf.coef_)
    coef_ = clf.coef_
    # Indices of the 100 coefficients with the largest absolute value.
    important = np.argsort(np.abs(coef_.ravel()))[-100:]
    feature_names = ft.get_feature_names()
    f_imp = feature_names[important]
    coef = coef_.ravel()[important]
    # Re-sort the selected features by signed coefficient for plotting.
    inds = np.argsort(coef)
    f_imp = f_imp[inds]
    coef = coef[inds]
    plt.plot(coef, label="l1")
    ax = plt.gca()
    ax.set_xticks(np.arange(len(coef)))
    # NOTE(review): this rebinds `labels`, shadowing the dataset labels loaded
    # above (apparently no longer needed at this point).
    labels = ax.set_xticklabels(f_imp)
    for label in labels:
        label.set_rotation(90)
    plt.savefig("ana.png", bbox_inches="tight")
    plt.show()

    def about(comment_num):
        """Print a test comment and a per-feature breakdown (name, value,
        coefficient), sorted by each feature's contribution to the score."""
        print(comments_test[comment_num])
        # Column indices of the nonzero features in this sparse row.
        inds = np.where(X_test[comment_num].toarray())[1]
        coef_com = coef_.ravel()[inds]
        feat_entries = X_test[comment_num, inds].toarray().ravel()
        sorting = np.argsort(coef_com * feat_entries)
        blub = np.vstack([feature_names[inds][sorting], feat_entries[sorting],
            coef_com[sorting]]).T
        print(blub)

    tracer()