Ejemplo n.º 1
0
def perform_grid_search_on_featureset_SA_and_PA():
    """Run text-feature grid searches for feature sets "SA" and "PA".

    Tweets are obtained from ``utils.get_pickles(3)`` and sentiment values
    from ``feat_utils.get_sentiment_values(3)``.  The tweets are passed
    through ``remove_link_classes``, ``lower_case`` and
    ``remove_specialchars_round2`` (in that order).

    For the subjectivity split and then the polarity split, an ``SVM``, an
    ``NB`` and an ``ME`` classifier (in that order) are built from the
    training tweets/targets, configured with the matching feature set, and
    asked to run ``grid_search_on_text_features`` with the file postfix
    ``"subjectivity"`` or ``"polarity"`` respectively.
    """
    dataset_id = 3
    corpus = utils.get_pickles(dataset_id)
    sentiment_scores = feat_utils.get_sentiment_values(dataset_id)

    cleaning_steps = (
        preprocessing.remove_link_classes,
        preprocessing.lower_case,
        preprocessing.remove_specialchars_round2,
    )
    for step in cleaning_steps:
        corpus = step(corpus)

    # (function producing the train/test split, feature set, output postfix)
    tasks = (
        (utils.make_subjectivity_train_and_test_and_targets, "SA", "subjectivity"),
        (utils.make_polarity_train_and_test_and_targets, "PA", "polarity"),
    )
    for make_split, feature_set, postfix in tasks:
        # The split yields six values; only the training tweets and targets
        # are needed for the grid search.
        train_tweets, train_targets, _, _, _, _ = make_split(corpus, sentiment_scores)

        for classifier_cls in (SVM, NB, ME):
            model = classifier_cls(train_tweets, train_targets, None)
            model.set_feature_set(feature_set, None)
            model.grid_search_on_text_features(file_postfix=postfix)
Ejemplo n.º 2
0
def train_and_test_subjectivity_and_polarity():
    """Train and test subjectivity classification on feature sets "SB" and "SC".

    Tweets are obtained from ``utils.get_pickles(3)`` and sentiment values
    from ``feat_utils.get_sentiment_values(3)``; the tweets are run through
    ``remove_link_classes``, ``lower_case`` and ``remove_specialchars_round2``
    before training.

    NOTE(review): despite the function name, only
    ``train_subjectivity_and_test_on_feature_set`` is called here -- there is
    no polarity run, and the "SA" feature set is currently disabled.
    """
    dataset_id = 3
    raw_tweets = utils.get_pickles(dataset_id)
    sentimentvalues = feat_utils.get_sentiment_values(dataset_id)

    # Preprocessing chain, innermost call runs first.
    tweets = preprocessing.remove_specialchars_round2(
        preprocessing.lower_case(
            preprocessing.remove_link_classes(raw_tweets)
        )
    )

    for feature_set in ("SB", "SC"):
        train_subjectivity_and_test_on_feature_set(tweets, feature_set, sentimentvalues)
Ejemplo n.º 3
0
def train_and_test_dataset_increase():
    """Measure classifier scores while the dataset fraction grows from 5% to 100%.

    Tweets are obtained from ``utils.get_pickles(3)`` and sentiment values
    from ``feat_utils.get_sentiment_values(3)``; the tweets are run through
    ``remove_link_classes``, ``lower_case`` and ``remove_specialchars_round2``.

    For every ``i`` in 5, 10, ..., 100 the subjectivity routine is run on
    feature sets "SA", "SB", "SC" and the polarity routine on "PA", "PB",
    "PC", each with ``reduce_dataset=i * 0.01``.  From each returned result
    dict, element 0 and element 3 of the "Naive Bayes", "SVM" and
    "Maximum Entropy" entries are appended to ``accuracy_data`` and
    ``f1_data`` under keys such as ``"NB(SA)"`` or ``"MaxEnt(PC)"``
    (presumably index 0 is accuracy and index 3 is F1 -- confirm against the
    training routines).

    Side effects:
        * After every data point the two accumulated dicts are pickled to
          ``incremental_acc<i>`` and ``incremental_f1<i>`` in the current
          working directory.
        * After the last data point both dicts are handed to
          ``plotting.plot_temporal_sentiment``.
    """
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)

    # (label prefix used in the series name, key in the returned result dict)
    classifiers = (
        ("NB", "Naive Bayes"),
        ("SVM", "SVM"),
        ("MaxEnt", "Maximum Entropy"),
    )
    # (training/testing routine, feature sets it is evaluated on)
    experiments = (
        (train_subjectivity_and_test_on_feature_set, ("SA", "SB", "SC")),
        (train_polarity_and_test_on_feature_set, ("PA", "PB", "PC")),
    )

    # One empty series per classifier/feature-set pair, e.g. "SVM(PB)".
    accuracy_data = {}
    f1_data = {}
    for _, feature_sets in experiments:
        for label, _ in classifiers:
            for feature_set in feature_sets:
                series = "%s(%s)" % (label, feature_set)
                accuracy_data[series] = []
                f1_data[series] = []

    for i in range(5, 101, 5):
        # Single-argument print(...) yields the same text whether print is
        # the Python 2 statement or the Python 3 function.
        print(" ".join(["=============================DATAPOINT NR. ", str(i), "========================================"]))
        for train_and_test, feature_sets in experiments:
            for feature_set in feature_sets:
                data = train_and_test(tweets, feature_set, sentimentvalues, reduce_dataset=i * 0.01)
                print(" ".join(["DATA -- ", str(data)]))
                for label, result_key in classifiers:
                    series = "%s(%s)" % (label, feature_set)
                    accuracy_data[series].append(data[result_key][0])
                    f1_data[series].append(data[result_key][3])

        # Checkpoint the results gathered so far; ``with`` guarantees the
        # files are flushed and closed even if pickling fails.
        with open("incremental_acc" + str(i), "wb") as out:
            pickle.dump(accuracy_data, out)
        with open("incremental_f1" + str(i), "wb") as out:
            pickle.dump(f1_data, out)

    plotting.plot_temporal_sentiment(accuracy_data, filename="incremental_accuracy")
    plotting.plot_temporal_sentiment(f1_data, filename="incremental_f1")
Ejemplo n.º 4
0
    pickle.dump(data, open("topically_aggregated_polarity", "wb"))


def preprocess_temporal_dataset():
    """Preprocess dataset ``utils.complete_datasets[3]`` and pickle the results.

    Every line returned by ``utils.get_dataset`` that is longer than one
    character is converted with ``tweet.to_tweet``; the resulting tweets are
    passed through ``preprocessing.preprocess_tweets`` and then looked up via
    ``lexicon.perform_google_sentiment_lexicon_lookup``.

    Side effects:
        Writes the lookup result to ``temporal_sentiments`` and the
        preprocessed tweets to ``temporal_tweets2`` (pickle format, current
        working directory).
    """
    tweetlines = utils.get_dataset(utils.complete_datasets[3])
    # Lines with at most one character are skipped.
    tweets = [tweet.to_tweet(line) for line in tweetlines if len(line) > 1]
    tweets = preprocessing.preprocess_tweets(tweets)
    sentiments = lexicon.perform_google_sentiment_lexicon_lookup(tweets)

    # ``with`` ensures each output file is flushed and closed, which the
    # inline ``open(...)`` calls did not guarantee.
    with open("temporal_sentiments", "wb") as out:
        pickle.dump(sentiments, out)
    with open("temporal_tweets2", "wb") as out:
        pickle.dump(tweets, out)


if __name__ == "__main__":
    # Script entry point: obtain the tweets and sentiment values for dataset
    # number 3, preprocess the tweets, then train and test the subjectivity
    # and polarity classifiers on every feature set.
    datasetnr = 3
    tweets = utils.get_pickles(datasetnr)
    sentimentvalues = feat_utils.get_sentiment_values(datasetnr)
    tweets = preprocessing.remove_link_classes(tweets)
    tweets = preprocessing.lower_case(tweets)
    tweets = preprocessing.remove_specialchars_round2(tweets)

    # The third argument is the sentiment values for every feature set, as in
    # the other callers of these routines in this file; the A and B runs
    # previously received the integer dataset number instead.
    for feature_set in ("SA", "SB", "SC"):
        train_subjectivity_and_test_on_feature_set(tweets, feature_set, sentimentvalues)

    for feature_set in ("PA", "PB", "PC"):
        train_polarity_and_test_on_feature_set(tweets, feature_set, sentimentvalues)