# Python script to clean incoming Sanders data: strips the 4-character
# prefix and the trailing 3 characters (quotation marks) from each tweet
# line and writes the result to quoteless.txt.
import svc_emotions

tweets = svc_emotions.file_to_array("newercorpus.txt")

# Output file for the cleaned tweets.
outfile_name = "quoteless.txt"
# Context manager guarantees the file is closed even if a write fails
# (the original opened/closed manually and carried a slab of commented-out
# dead code, removed here).
with open(outfile_name, 'w') as outfile:
    for tweet in tweets:
        # tweet[4:][:-3] is equivalent to a single slice: drop the 4-char
        # prefix and the 3-char quote suffix in one step.
        cleaned = tweet[4:-3]
        outfile.write("\n" + cleaned)
# coding=utf-8 import svc_emotions import scipy import numpy from sklearn.pipeline import FeatureUnion # list of emotions to get feature lists for emotion_list = ['anger', 'disgust', 'fear', 'joy', 'love', 'sadness', 'surprise'] # load lists of tweets and corresponding labels into arrays tweets = svc_emotions.file_to_array("tweets_full.txt") labels = svc_emotions.file_to_array("labels_full.txt")
import tweet_processing import svc_emotions from nltk.tokenize import word_tokenize def output_list_from_csv(emotion): filename = emotion + '.csv' with open(filename, 'rb') as f: reader = csv.reader(f) synonym_list = list(reader)[0] return synonym_list tweets_legacy = tweet_processing.clean("tweets_full.txt") labels_legacy = svc_emotions.file_to_array("labels_full.txt") emotion_list = [ 'anger', 'disgust', 'fear', 'joy', 'love', 'sadness', 'surprise', ] all_processed_tweets = [] count_TP = { 'anger': 0,