# Example #1
#python script to clean incoming Sanders data-- gets rid of first three words and quotation marks, only includes if not irrelevant
import svc_emotions

# Load the raw Sanders corpus: one tweet per array entry.
tweets = svc_emotions.file_to_array("newercorpus.txt")

outfile_name = "quoteless.txt"

# Context manager guarantees the output file is flushed and closed even if
# a write fails partway through the loop (the original left it open on error).
with open(outfile_name, 'w') as outfile:
    for tweet in tweets:
        # Strip the fixed-width leading metadata (first 4 chars) and the
        # trailing quote characters (last 3 chars) in a single slice;
        # equivalent to the original tweet[4:] followed by [:-3].
        newTweet = tweet[4:-3]

        # NOTE(review): each cleaned tweet is written preceded by "\n",
        # so the output file starts with a blank line — preserved as-is
        # because downstream readers may rely on this exact layout.
        outfile.write("\n" + newTweet)
# coding=utf-8
import svc_emotions
import scipy
import numpy
from sklearn.pipeline import FeatureUnion

# Emotion categories for which feature lists will be generated.
emotion_list = "anger disgust fear joy love sadness surprise".split()

# Read the full tweet corpus and its matching label file into parallel arrays.
tweets = svc_emotions.file_to_array("tweets_full.txt")
labels = svc_emotions.file_to_array("labels_full.txt")
import tweet_processing
import svc_emotions
from nltk.tokenize import word_tokenize


def output_list_from_csv(emotion):
    """Return the synonym list stored in the first row of '<emotion>.csv'.

    The CSV file is expected to contain a single row of comma-separated
    synonyms for the given emotion.

    Raises:
        FileNotFoundError: if '<emotion>.csv' does not exist.
        IndexError: if the CSV file contains no rows.
    """
    # Local import: `csv` is not imported at module level anywhere in this
    # file, so the original code raised NameError at call time.
    import csv

    filename = emotion + '.csv'

    # Text mode with newline='' is what csv.reader requires on Python 3;
    # the original 'rb' (bytes) mode made csv.reader fail.
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        synonym_list = list(reader)[0]
    return synonym_list


# Legacy inputs: cleaned tweet text and its label file, loaded in parallel.
tweets_legacy = tweet_processing.clean("tweets_full.txt")
labels_legacy = svc_emotions.file_to_array("labels_full.txt")

# The seven target emotion categories (same set as above).
emotion_list = ['anger', 'disgust', 'fear', 'joy', 'love', 'sadness', 'surprise']

# Accumulates every processed tweet across all emotion passes.
all_processed_tweets = []

count_TP = {
    'anger': 0,