# Assumed imports/definitions (not shown in the original snippets): the project
# is expected to also define w (a tokenizer, e.g. nltk.word_tokenize),
# negation() and emotions (a list of eight emotion-lexicon names) elsewhere.
import itertools
import pickle
import re

from bs4 import BeautifulSoup


def createTestSet(userName):
    """Build one emotion-count feature vector per tweet for the given user."""

    print('Creating test vector set')

    with open('SVM/UserData/%s_latest_tweets' % userName, 'rb') as f:
        tweets = pickle.load(f)

    data = []

    for tweet in tweets:
        # Lowercase, tokenise and apply the project's negation handling.
        text = negation(w(tweet.lower()))

        # One count per emotion lexicon (eight emotions assumed).
        vector = [0] * len(emotions)

        for index, emotion in enumerate(emotions):
            with open('SVM/EmotionLexicons/%s.txt' % emotion, 'r') as lex:
                vocab = set(lex.read().split())

            # Count how many tweet tokens appear in this emotion's lexicon.
            for word in text:
                if word in vocab:
                    vector[index] += 1

        data.append(vector)

    with open('SVM/UserData/%s_test_vectors' % userName, 'wb') as f:
        pickle.dump(data, f)
    print('Created test vector set\n')
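
# Hedged usage sketch (not part of the original snippet; 'some_user' is a
# placeholder): build and then reload one user's test vectors.
#     createTestSet('some_user')
#     with open('SVM/UserData/some_user_test_vectors', 'rb') as f:
#         test_vectors = pickle.load(f)   # one 8-entry count vector per tweet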
def preprocessTweets(tweet):
    """Clean a raw tweet string before tokenisation and vectorisation."""

    # 1 Remove HTML tags
    soup = BeautifulSoup(tweet, 'html.parser')
    cleaned_tweet = soup.get_text()

    # 2 Expand contractions (plus a couple of slang terms)
    apostrophes = {"'s": " is", "'re": " are", "n't": "not", "'d": " had",
                   "'m": " am", "'ve": " have", "lol": "laugh out loud",
                   "luv": "love"}

    slist = w(cleaned_tweet)
    newsen = []

    for word in slist:
        # Replace every contraction/slang fragment found in the token.
        for candid in apostrophes:
            if candid in word:
                word = word.replace(candid, apostrophes[candid])
        newsen.append(word)

    cleaned_tweet = " ".join(newsen)

    # 3 Split attached (CamelCase) words; note that the regex keeps only the
    # segments that start with a capital letter.
    ans = ""
    for a in re.findall('[A-Z][^A-Z]*', cleaned_tweet):
        ans += a.strip() + ' '
    cleaned_tweet = ans

    # 4 Standardise elongated words, e.g. "happpppyyy": collapse every run of
    # repeated characters to at most two.
    cleaned_tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(cleaned_tweet))

    # 5 Remove URLs
    cleaned_tweet = re.sub(r"http\S+.*", " ", cleaned_tweet)
    cleaned_tweet = re.sub(r"http.*", " ", cleaned_tweet)

    # 6 Slang lookup: covered by the same replacement table as step 2
    # ("lol" and "luv" are expanded there).

    return cleaned_tweet
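
# Hedged usage sketch (illustration only, not from the original code): cleaning
# one made-up tweet. The sample text is a placeholder and the exact output
# depends on the tokenizer bound to w (assumed to be nltk.word_tokenize).
sample_tweet = "I luv this <b>movie</b> soooo much!!! http://t.co/example"
print(preprocessTweets(sample_tweet))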
# file = pd.read_csv('new_dataset.csv')
# tweets = file.values.tolist()
#
# with open('./training_tweets','wb') as f:
#     pickle.dump(tweets,f)
#     f.close()

# Load the pickled training tweets (created by the commented-out block above).
with open('./training_tweets', 'rb') as f:
    tweets = pickle.load(f)

data = []

for row in tweets:
    # The second column of each row holds the tweet text.
    raw_tweet = w(row[1].lower())

    # One count per emotion lexicon (eight emotions assumed).
    vector = [0] * len(emotions)

    for index, emotion in enumerate(emotions):
        with open('EmotionLexicons/%s.txt' % emotion, 'r') as lex:
            vocab = set(lex.read().split())

        # Count how many tweet tokens appear in this emotion's lexicon.
        for word in raw_tweet:
            if word in vocab:
                vector[index] += 1

    data.append(vector)
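
# Hedged sketch (assumption, not shown in the original snippet): train an SVM on
# the emotion-count vectors built above. It guesses that the first column of each
# row in `tweets` is the class label and that scikit-learn is installed; swap in
# the real label column if it differs.
from sklearn.svm import SVC

labels = [row[0] for row in tweets]   # assumed label column
clf = SVC(kernel='linear')            # linear kernel as a simple default
clf.fit(data, labels)                 # fit the classifier on the 8-d features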