Example 1
def posVsRest(X, Y):
    # Y now contains 1 for tweets that were positive
    # and 0 for negative, neutral or irrelevant
    Y = u.tweak_labels(Y, ["positive"])
    classes = np.unique(Y)
    for c in classes:
        print("#%s: %i" % (c, sum(Y == c)))
    train_model(Models.create_ngram_model, X, Y, "posVsRest", True)
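The helper u.tweak_labels is not shown in this snippet; from the comments above, it collapses the original sentiment labels into a binary 0/1 target. A minimal sketch of what such a helper might look like, assuming the labels arrive as a NumPy array of strings (the actual implementation in the utils module may differ):

import numpy as np

def tweak_labels(Y, pos_sent_list):
    # Mark a tweet 1 if its label is in pos_sent_list, 0 otherwise.
    pos = Y == pos_sent_list[0]
    for sent in pos_sent_list[1:]:
        pos |= Y == sent
    return pos.astype(int)

# tweak_labels(np.array(["positive", "neutral"]), ["positive"]) -> array([1, 0])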
Example 2
def sentimentAndNoSentiment(X, Y):
    # Y now contains 1 for tweets that were positive or negative
    # and 0 for neutral or irrelevant
    Y = u.tweak_labels(Y, ["positive", "negative"])
    classes = np.unique(Y)
    for c in classes:
        print("#%s: %i" % (c, sum(Y == c)))
    train_model(Models.get_best_union_model, X, Y, "sentimentVsNoSentiment", True)
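Models.get_best_union_model is not defined in this listing either. Judging by its name, it likely stacks several feature extractors with scikit-learn's FeatureUnion in front of a single classifier; a hedged sketch of that pattern follows (the concrete extractors and parameters are assumptions, not the project's actual settings):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion, Pipeline

def create_union_model():
    # Concatenate word-level and character-level TF-IDF features.
    union = FeatureUnion([
        ("words", TfidfVectorizer(analyzer="word", ngram_range=(1, 3))),
        ("chars", TfidfVectorizer(analyzer="char", ngram_range=(2, 5))),
    ])
    return Pipeline([("union", union), ("clf", MultinomialNB())])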
Example 3
    best_clf = create_ngram_model(best_params)

    return best_clf


if __name__ == "__main__":
    X_orig, Y_orig = load_sanders_data()
    classes = np.unique(Y_orig)
    for c in classes:
        print("#%s: %i" % (c, sum(Y_orig == c)))

    print("== Pos vs. neg ==")
    pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
    X = X_orig[pos_neg]
    Y = Y_orig[pos_neg]
    Y = tweak_labels(Y, ["positive"])
    train_model(get_best_model(), X, Y, name="pos vs neg", plot=True)

    print("== Pos/neg vs. irrelevant/neutral ==")
    X = X_orig
    Y = tweak_labels(Y_orig, ["positive", "negative"])

    # best_clf = grid_search_model(create_ngram_model, X, Y,
    #                              name="sent vs rest", plot=True)
    train_model(get_best_model(), X, Y, name="sent vs rest", plot=True)

    print("== Pos vs. rest ==")
    X = X_orig
    Y = tweak_labels(Y_orig, ["positive"])
    train_model(get_best_model(), X, Y, name="pos vs rest", plot=True)
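The commented-out grid_search_model call above is not defined in this listing. A plausible sketch of what it might do with scikit-learn's GridSearchCV, reusing the 'vect'/'clf' step names from create_ngram_model in Example 6 and the clf__alpha parameter visible in Example 5 (the grid values themselves are assumptions):

from sklearn.model_selection import GridSearchCV, ShuffleSplit

def grid_search_model(clf_factory, X, Y):
    # Cross-validated search over vectorizer and classifier parameters
    # of the Pipeline returned by clf_factory().
    cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
    param_grid = {
        "vect__ngram_range": [(1, 1), (1, 2), (1, 3)],
        "vect__use_idf": [True, False],
        "clf__alpha": [0.1, 0.01],
    }
    grid = GridSearchCV(clf_factory(), param_grid, cv=cv, scoring="f1")
    grid.fit(X, Y)
    return grid.best_estimator_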
Example 4
    best_clf = create_ngram_model(best_params)

    return best_clf

if __name__ == "__main__":
    X_orig, Y_orig = load_sanders_data()
    classes = np.unique(Y_orig)
    for c in classes:
        print "#%s: %i" % (c, sum(Y_orig == c))

    print "== Pos vs. neg =="
    pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
    X = X_orig[pos_neg]
    Y = Y_orig[pos_neg]
    Y = tweak_labels(Y, ["positive"])
    train_model(get_best_model(), X, Y, name="pos vs neg", plot=True)

    print "== Pos/neg vs. irrelevant/neutral =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["positive", "negative"])

    # best_clf = grid_search_model(create_ngram_model, X, Y,
    #                              name="sent vs rest", plot=True)
    train_model(get_best_model(), X, Y, name="sent vs rest", plot=True)

    print "== Pos vs. rest =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["positive"])
    train_model(get_best_model(), X, Y, name="pos vs rest", plot=True)
Example 5
                       clf__alpha=0.01,
                       )
    best_clf = create_ngram_model(best_params)
    return best_clf

if __name__ == "__main__":
    X_orig, Y_orig = load_data()
    classes = np.unique(Y_orig)
    for c in classes:
        print "#%s: %i" % (c, sum(Y_orig == c))

    print "== Pos vs. neg =="
    pos_neg = np.logical_or(Y_orig == "Pos", Y_orig == "Neg")
    X = X_orig[pos_neg]
    Y = Y_orig[pos_neg]
    Y = tweak_labels(Y, ["Pos"])
    train_model(get_best_model(), X, Y, name="pos vs neg", plot=True)
    
    print "== Pos/neg vs. irrelevant/neutral =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["Pos", "Neg"])

    #best_clf = grid_search_model(create_ngram_model, X, Y, name="sent vs rest", plot=True)
    train_model(get_best_model(), X, Y, name="sent vs rest", plot=True)

    print "== Pos vs. rest =="
    X = X_orig
    Y = tweak_labels(Y_orig, ["Pos"])
    train_model(get_best_model(), X, Y, name="pos vs rest",
                plot=True)
Example 6
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Naive Bayes proves to be quite robust to irrelevant features, learns fast,
# and doesn't need much storage. So why "naive"? Because the features are
# assumed to be independent of one another.

if __name__ == "__main__":
    X_orig, Y_orig = load_sanders_data()
    classes = np.unique(Y_orig)
    for c in classes:
        print "#%s: %i" % (c, sum(Y_orig == c))

    print "== Pos vs. neg =="
    pos_neg = np.logical or(Y_orig == "positive", Y_orig == "negative")
    X = X_orig[pos_neg]
    Y = Y_orig[pos_neg]
    Y = tweak_labels(Y, ["positive"])

def create_ngram_model():
    tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3),
                                   analyzer="word",
                                   binary=False)
    clf = MultinomialNB()
    pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])
    return pipeline
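To see the pipeline in action end to end, it can be smoke-tested on a couple of made-up tweets (the texts and labels below are toy data, not drawn from the Sanders corpus):

pipeline = create_ngram_model()
pipeline.fit(["I love this phone", "horrible battery, total waste"], [1, 0])
print(pipeline.predict(["what a waste"]))  # likely [0]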
Example 7
    for idx in range(len(X_wrong)):
        print("clf.predict('%s')=%i instead of %i" %
              (X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx]))


if __name__ == "__main__":
    X_orig, Y_orig = load_sanders_data()
    classes = np.unique(Y_orig)
    for c in classes:
        print "#%s: %i" % (c, sum(Y_orig == c))

    print "== pos vs. neg =="
    pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
    x = X_orig[pos_neg]
    y = Y_orig[pos_neg]
    y = tweak_labels(y, ["positive"])

    train_model(create_ngram_model, x, y, name="pos vs neg", plot=True)

    print "== pos/neg vs. irrelevant/neutral =="
    x = X_orig
    y = tweak_labels(Y_orig, ["positive", "negative"])
    train_model(create_ngram_model, x, y, name="sent vs rest", plot=True)

    print "== pos vs. rest =="
    x = X_orig
    y = tweak_labels(Y_orig, ["positive"])
    train_model(create_ngram_model, x, y, name="pos vs rest", plot=True)

    print "== neg vs. rest =="
    x = X_orig