Example #1
from tokenizers import twokenize

#tokenize a list of messages with Noah's Ark Simple tokenizer
def tokenize(l):
    tokens = []

    for item in l:
        tokens.append(twokenize.simpleTokenize(item))

    return tokens
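A short usage sketch, assuming the same tsvreader and dataset path that appear in the last snippet of this section:

#example usage: tokenize every message of the dataset
from tsvfiles import tsvreader

labels, messages = tsvreader.opentsv("datasets/tweets#2015.tsv")
message_tokens = tokenize(messages)
print message_tokens[0]   #tokens of the first message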
from nltk import pos_tag
from tokenizers import twokenize

def nltkTags(messages):
    data = []
    for message in messages:

        #tokenization with nltk tokenizer
        #tokens = word_tokenize(message)

        #tokenization with Noah's Ark Simple tokenizer
        tokens = twokenize.simpleTokenize(message)
        tags = pos_tag(tokens)
        tags = [x[1] for x in tags]

        data.append(tags)

    return data
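The scoring loop further below reads POS tags from a list called tags; a natural way to produce it with the function above (assuming messages has already been read from the dataset, as in the last snippet of this section):

#example usage: POS-tag all messages, feeding the pos tags used by SentiWordNet below
tags = nltkTags(messages)
print tags[0]   #tag sequence of the first message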
#NRC Lexicon - 5 different versions
nrc1 = NRCLexicon.NRCLexicon(0)
nrc2 = NRCLexicon.NRCLexicon(1)
nrc3 = NRCLexicon.NRCLexicon(2)
nrc4 = NRCLexicon.NRCLexicon(3)
nrc5 = NRCLexicon.NRCLexicon(4)
#MPQA Lexicon
mpqa = MPQALexicon.MPQALexicon()
#SentiWordNet Lexicon
swn = SentiWordNetLexicon.SentiWordNetLexicon()
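The scoring loop below appends to one score list per lexicon; those lists are not initialised in this excerpt, so here is a minimal sketch of the missing setup. The socal, minqinghu and afinn lexicon objects used in the loop are assumed to be constructed elsewhere, in the same way as the lexicons above.

#one score list per lexicon (socal, minqinghu and afinn objects are assumed to be built elsewhere)
socal_scores, minqinghu_scores, afinn_scores = [], [], []
nrc1_scores, nrc2_scores, nrc3_scores = [], [], []
nrc4_scores, nrc5_scores = [], []
mpqa_scores, swn_scores = [], []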

#compute sentiment score for all messages
for i in range(0,len(messages)):
    print i
    #tokens of message, used in counting sentiment score
    tokens = twokenize.simpleTokenize(messages[i])

    #compute pos tags of message
    pos_tags = tags[i]

    #update scores
    socal_scores.append(socal.score(tokens))
    minqinghu_scores.append(minqinghu.score(tokens))
    afinn_scores.append(afinn.score(messages[i]))   #Afinn : input message instead of message's tokens
    nrc1_scores.append(nrc1.score(tokens))
    nrc2_scores.append(nrc2.score(tokens))
    nrc3_scores.append(nrc3.score(tokens))
    nrc4_scores.append(nrc4.score(tokens))
    nrc5_scores.append(nrc5.score(tokens))
    mpqa_scores.append(mpqa.score(tokens))
    swn_scores.append(swn.score(tokens,pos_tags))   #SentiWordNet : input message's tokens and pos tags
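The statistics below use neutral, positive and negative subsets of the messages that are not built in this excerpt; a minimal sketch, assuming the label strings are "neutral", "positive" and "negative":

#split messages by label (assumption: the label strings are "neutral"/"positive"/"negative")
neutral_messages = [m for l, m in zip(labels, messages) if l == "neutral"]
positive_messages = [m for l, m in zip(labels, messages) if l == "positive"]
negative_messages = [m for l, m in zip(labels, messages) if l == "negative"]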
#average length of messages (float division so the averages are not truncated)
average_length_all = sum([len(x) for x in messages])/float(len(messages))
average_length_neutral = sum([len(x) for x in neutral_messages])/float(len(neutral_messages))
average_length_positive = sum([len(x) for x in positive_messages])/float(len(positive_messages))
average_length_negative = sum([len(x) for x in negative_messages])/float(len(negative_messages))

#get the text of messages

#tokenization with nltk tokenizer
#text_all = Text(word_tokenize(str(messages)))
#text_neutral = Text(word_tokenize(str(neutral_messages)))
#text_positive = Text(word_tokenize(str(positive_messages)))
#text_negative = Text(word_tokenize(str(negative_messages)))

#tokenization with Noah's Ark Simple tokenizer
text_all = Text(twokenize.simpleTokenize(str(messages)))
text_neutral = Text(twokenize.simpleTokenize(str(neutral_messages)))
text_positive = Text(twokenize.simpleTokenize(str(positive_messages)))
text_negative = Text(twokenize.simpleTokenize(str(negative_messages)))
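The removeStopwords helper called just below is not part of this excerpt; judging from the commented "Top N stopwords removed" print, it drops the number_of_stopwords most frequent tokens. One possible sketch of such a helper:

from nltk import FreqDist
from nltk.text import Text

#possible shape of removeStopwords (assumption): drop the n most frequent tokens from the text
def removeStopwords(text, n, label):
    top = [w for w, _ in FreqDist(text).most_common(n)]
    print label + ": removed top " + str(n) + " words " + str(top)
    return Text([w for w in text if w not in top])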

#remove stopwords
if remove_stopwords:
    text_all = removeStopwords(text_all, number_of_stopwords,"All")
    text_neutral = removeStopwords(text_neutral, number_of_stopwords,"Neutral")
    text_positive = removeStopwords(text_positive, number_of_stopwords,"Positive")
    text_negative = removeStopwords(text_negative, number_of_stopwords,"Negative")
    
    #print "***Top "+str(number_of_stopwords)+" stopwords removed***"
    print "Stopwords removed"

#total words of messages
from tsvfiles import tsvreader
from nltk import word_tokenize
from tokenizers import twokenize
import matplotlib.pyplot as plt

#read labels and messages from dataset
dataset = "datasets/tweets#2015.tsv"
labels, messages = tsvreader.opentsv(dataset)

total = 0
noahs_total = 0

for message in messages:
    nltk_tok = word_tokenize(message)
    noahs_simple = twokenize.simpleTokenize(message)
    noahs_nohmlt = twokenize.tokenize(message)
    noahs_raw = twokenize.tokenizeRawTweetText(message)

    #check when the nltk and noah's ark tokenizers "agree"
    if(nltk_tok==noahs_simple or nltk_tok==noahs_nohmlt or nltk_tok==noahs_raw):
        total+=1

    #check when the 3 noah's ark tokenizers "agree"
    if(noahs_simple==noahs_nohmlt and noahs_simple==noahs_raw and noahs_nohmlt==noahs_raw):
        noahs_total+=1
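Before plotting, a quick summary of the two counters can be printed (this print is not in the original excerpt):

#simple summary of tokenizer agreement
print "nltk agrees with a Noah's Ark tokenizer on", total, "of", len(messages), "messages"
print "the 3 Noah's Ark tokenizers agree on", noahs_total, "of", len(messages), "messages"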
            
#plot pie
slices = [total,len(messages)-total]
fig = plt.figure(figsize=[10,10])
ax = fig.add_subplot(111)
cmap = plt.cm.prism
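The plotting code is truncated at this point; a minimal sketch of how the pie could be finished from the slices, ax and cmap already set up (the labels, colours and title are assumptions):

import numpy as np

#draw the pie for "agree" vs "disagree" messages (labels and colours are assumptions)
colors = cmap(np.linspace(0.0, 1.0, len(slices)))
ax.pie(slices, labels=["tokenizers agree", "tokenizers disagree"], colors=colors, autopct="%1.1f%%")
ax.set_title("nltk vs Noah's Ark tokenizer agreement")
plt.show()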