import pickle

# Assumed helpers defined elsewhere in the project: w (word tokenizer),
# negation (negation-handling helper) and emotions (list of the eight
# emotion names matching the files in SVM/EmotionLexicons/).
def createTestSet(userName):
    print('Creating test vector set')

    # Load the user's most recently collected tweets.
    with open('SVM/UserData/%s_latest_tweets' % userName, 'rb') as file:
        tweets = pickle.load(file)

    data = []
    for tweet in tweets:
        tweet = tweet.lower()
        text = w(tweet)          # tokenize
        text = negation(text)    # apply negation handling (project helper)

        # One counter per emotion lexicon.
        vector = [0, 0, 0, 0, 0, 0, 0, 0]
        for index, emotion in enumerate(emotions):
            with open('SVM/EmotionLexicons/%s.txt' % emotion, 'r') as file:
                vocab = file.read().split()
            # Count how many tokens of the tweet appear in this emotion's lexicon.
            for word in text:
                if word in vocab:
                    vector[index] += 1
        data.append(vector)

    # Persist the feature vectors for later classification.
    with open('SVM/UserData/%s_test_vectors' % userName, 'wb') as file:
        pickle.dump(data, file)
    print('Created test vector set\n')
from bs4 import BeautifulSoup
import itertools
import re

def preprocessTweets(tweet):
    # 1 Remove HTML tags
    soup = BeautifulSoup(tweet, 'html.parser')
    cleaned_tweet = soup.get_text()

    # 2 Expand contractions and common slang via the apostrophe lookup table
    appostrophes = {"'s": " is", "'re": " are", "n't": "not", "'d": " had",
                    "'m": " am", "'ve": " have", "lol": "laugh out loud", "luv": "love"}
    slist = w(cleaned_tweet)   # tokenize (w is the project's word tokenizer)
    newsen = []
    for word in slist:
        for candid in appostrophes:
            if candid in word:
                word = word.replace(candid, appostrophes[candid])
        newsen.append(word)
    cleaned_tweet = " ".join(newsen)

    # 3 Split attached words, e.g. "GoodMorning" -> "Good Morning"
    cleaned_tweet = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', cleaned_tweet)

    # 4 Standardize elongated words, e.g. "happpppyyy" -> "happyy"
    #   (keep at most two consecutive occurrences of any character)
    cleaned_tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(cleaned_tweet))

    # 5 Remove URLs
    cleaned_tweet = re.sub(r"http\S+", " ", cleaned_tweet)

    # 6 Slang lookup is already covered by the appostrophes table in step 2.

    return cleaned_tweet
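# Example usage, as a sketch: assumes w is a word tokenizer such as
# nltk.tokenize.word_tokenize and that the imports above are in place.
# sample = "I'm sooooo happpyyy!! <b>GoodMorning</b> http://t.co/xyz"
# print(preprocessTweets(sample))
# The HTML tag is stripped, "'m" is expanded to " am", repeated letters are
# capped at two, "GoodMorning" is split, and the URL is removed.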
import pickle

# The training tweets were originally read from CSV and cached with pickle:
# file = pd.read_csv('new_dataset.csv')
# tweets = file.values.tolist()
# with open('./training_tweets', 'wb') as f:
#     pickle.dump(tweets, f)

# Load the cached training tweets.
with open('./training_tweets', 'rb') as f:
    tweets = pickle.load(f)

data = []
for row in tweets:
    raw_tweet = w(row[1].lower())   # column 1 holds the tweet text; lowercase and tokenize

    # One counter per emotion lexicon.
    vector = [0, 0, 0, 0, 0, 0, 0, 0]
    for index, emotion in enumerate(emotions):
        with open('EmotionLexicons/%s.txt' % emotion, 'r') as file:
            vocab = file.read().split()
        # Count how many tokens of the tweet appear in this emotion's lexicon.
        for word in raw_tweet:
            if word in vocab:
                vector[index] += 1
    data.append(vector)
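# A minimal sketch of how these count vectors could feed an SVM classifier.
# Assumptions not taken from this script: the emotion label sits in column 0
# of each row of `tweets`, and scikit-learn is installed; the project's actual
# training code may differ.
# from sklearn.svm import SVC
# labels = [row[0] for row in tweets]   # assumed label column
# clf = SVC(kernel='linear')            # linear-kernel SVM over the 8-dim vectors
# clf.fit(data, labels)
# print(clf.predict([data[0]]))         # predicted emotion for the first training tweet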