# tokenize a list of messages with the Noah's Ark simple tokenizer,
# returning one token list per message
def tokenize(l):
    tokens = []
    for item in l:
        tokens.append(twokenize.simpleTokenize(item))
    return tokens
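# A minimal usage sketch for tokenize() above (an addition, not part of the
# original): assumes the twokenize module is importable the same way it is
# in the imports used later in this file.
from tokenizers import twokenize

sample_messages = ["I love this phone :)", "worst service ever ..."]
print tokenize(sample_messages)  # one token list per message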
# POS-tag each message and keep only the tag sequence
def nltkTags(messages):
    data = []
    for message in messages:
        # tokenization with nltk tokenizer
        # tokens = word_tokenize(message)
        # tokenization with Noah's Ark Simple tokenizer
        tokens = twokenize.simpleTokenize(message)
        tags = pos_tag(tokens)
        # keep only the tag of each (token, tag) pair
        tags = [x[1] for x in tags]
        data.append(tags)
    return data
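# Hedged usage sketch for nltkTags() (an addition, not in the original):
# assumes nltk and its tagger models are installed; pos_tag comes from nltk
# as in the function above.
from nltk import pos_tag

print nltkTags(["good morning world"])  # e.g. [['JJ', 'NN', 'NN']]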
#NRC Lexicon - 5 different versions
nrc1 = NRCLexicon.NRCLexicon(0)
nrc2 = NRCLexicon.NRCLexicon(1)
nrc3 = NRCLexicon.NRCLexicon(2)
nrc4 = NRCLexicon.NRCLexicon(3)
nrc5 = NRCLexicon.NRCLexicon(4)

#MPQA Lexicon
mpqa = MPQALexicon.MPQALexicon()

#SentiWordNet Lexicon
swn = SentiWordNetLexicon.SentiWordNetLexicon()

#compute sentiment scores for all messages
for i in range(0, len(messages)):
    print i

    # tokens of the message, used to compute the sentiment scores
    tokens = twokenize.simpleTokenize(messages[i])

    # pos tags of the message (precomputed)
    pos_tags = tags[i]

    # update scores
    socal_scores.append(socal.score(tokens))
    minqinghu_scores.append(minqinghu.score(tokens))
    # AFINN takes the raw message instead of the message's tokens
    afinn_scores.append(afinn.score(messages[i]))
    nrc1_scores.append(nrc1.score(tokens))
    nrc2_scores.append(nrc2.score(tokens))
    nrc3_scores.append(nrc3.score(tokens))
    nrc4_scores.append(nrc4.score(tokens))
    nrc5_scores.append(nrc5.score(tokens))
    mpqa_scores.append(mpqa.score(tokens))
    # SentiWordNet takes both the message's tokens and its pos tags
    swn_scores.append(swn.score(tokens, pos_tags))
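# One plausible follow-up (an addition, not in the original): stack the
# per-lexicon scores filled above into one feature row per message, e.g.
# for a downstream classifier. Only names defined above are used.
features = [[socal_scores[i], minqinghu_scores[i], afinn_scores[i],
             nrc1_scores[i], nrc2_scores[i], nrc3_scores[i],
             nrc4_scores[i], nrc5_scores[i], mpqa_scores[i], swn_scores[i]]
            for i in range(len(messages))]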
#average length of messages (note: integer division under Python 2)
average_length_all = sum([len(x) for x in messages]) / len(messages)
average_length_neutral = sum([len(x) for x in neutral_messages]) / len(neutral_messages)
average_length_positive = sum([len(x) for x in positive_messages]) / len(positive_messages)
average_length_negative = sum([len(x) for x in negative_messages]) / len(negative_messages)

#get the text of messages
#tokenization with nltk tokenizer
#text_all = Text(word_tokenize(str(messages)))
#text_neutral = Text(word_tokenize(str(neutral_messages)))
#text_positive = Text(word_tokenize(str(positive_messages)))
#text_negative = Text(word_tokenize(str(negative_messages)))
#tokenization with Noah's Ark Simple tokenizer
text_all = Text(twokenize.simpleTokenize(str(messages)))
text_neutral = Text(twokenize.simpleTokenize(str(neutral_messages)))
text_positive = Text(twokenize.simpleTokenize(str(positive_messages)))
text_negative = Text(twokenize.simpleTokenize(str(negative_messages)))

#remove stopwords
if remove_stopwords:
    text_all = removeStopwords(text_all, number_of_stopwords, "All")
    text_neutral = removeStopwords(text_neutral, number_of_stopwords, "Neutral")
    text_positive = removeStopwords(text_positive, number_of_stopwords, "Positive")
    text_negative = removeStopwords(text_negative, number_of_stopwords, "Negative")
    #print "***Top " + str(number_of_stopwords) + " stopwords removed***"
    print "Stopwords removed"

#total words of messages
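# Hedged sketch of the "total words" counts announced by the comment above
# (the original snippet ends there, so this is an assumption): nltk.Text
# wraps a token list, so len() gives the number of tokens.
total_words_all = len(text_all)
total_words_neutral = len(text_neutral)
total_words_positive = len(text_positive)
total_words_negative = len(text_negative)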
from tsvfiles import tsvreader
from nltk import word_tokenize
from tokenizers import twokenize
import matplotlib.pyplot as plt

#read labels and messages from dataset
dataset = "datasets/tweets#2015.tsv"
labels, messages = tsvreader.opentsv(dataset)

total = 0
noahs_total = 0
for message in messages:
    nlkt_tok = word_tokenize(message)
    noahs_simple = twokenize.simpleTokenize(message)
    noahs_nohmlt = twokenize.tokenize(message)
    noahs_raw = twokenize.tokenizeRawTweetText(message)

    #check when the nltk and Noah's Ark tokenizers "agree"
    if nlkt_tok == noahs_simple or nlkt_tok == noahs_nohmlt or nlkt_tok == noahs_raw:
        total += 1

    #check when the 3 Noah's Ark tokenizers "agree"
    if noahs_simple == noahs_nohmlt and noahs_simple == noahs_raw and noahs_nohmlt == noahs_raw:
        noahs_total += 1

#plot pie
slices = [total, len(messages) - total]
fig = plt.figure(figsize=[10, 10])
ax = fig.add_subplot(111)
cmap = plt.cm.prism
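# Hedged sketch of how the pie chart might be completed (the original
# snippet stops at the colormap); the slice labels, title, and color
# choices here are assumptions, not the author's code.
import numpy as np

pie_labels = ["tokenizers agree", "tokenizers disagree"]
ax.pie(slices, labels=pie_labels, autopct='%1.1f%%',
       colors=cmap(np.linspace(0.0, 1.0, len(slices))))
ax.set_title("nltk vs Noah's Ark tokenizer agreement on " + dataset)
plt.show()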