def processTweets(targetsFile,sentiTokensFile,exceptSentiTokens,multiWordsFile,tweets): """ Processes a list of tweets, for each: 1. Identifies the target 2. If the message contains a target of interest infer the polarity targetsFile -> path to the politicians list file sentiTokensFile -> path to the sentiTokens list file exceptSentiTokens -> path to the list of sentiTokens that cannot lose their accents without causing ambiguity for ex: más -> mas multiWordsFile -> path to a file that contains the words that should be considered as a unit, e.g. "primeiro ministro" tweets -> list of tweets """ print "hell yeah!" print "Loading resources...\nTargets: " + targetsFile targets = Utils.getFromCache(PERSONS_CACHE) if targets != None: print "Target list found on cache!" else: targets = Persons.loadPoliticians(targetsFile) Utils.putInCache(targets, PERSONS_CACHE) print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " + exceptSentiTokens sentiTokens = Utils.getFromCache(SENTI_CACHE) if sentiTokens != None: print "SentiTokens found on cache!" else: sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,exceptSentiTokens) Utils.putInCache(sentiTokens, SENTI_CACHE) print "Multiword Tokenizer: " + multiWordsFile multiWordTokenizer = Utils.getFromCache(MULTIWORD_CACHE) if multiWordTokenizer != None: print "Multiword Tokenizer found on cache" else: multiWordTokenizer = MultiWordHelper(multiWordsFile) multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets)) multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens)) Utils.putInCache(multiWordTokenizer, MULTIWORD_CACHE) print "Resources loaded! Starting analysis..." targetDetector = TargetDetector(targets) #TODO:Estes senhores já não precisam de receber os targets naive = Naive(sentiTokens) rules = Rules(None,sentiTokens) analyzedTweets = [] rejectedTweets = [] for tweet in tweets: t0 = datetime.now() tweetsWithTarget = targetDetector.inferTarget(tweet) if tweetsWithTarget != None : #a tweet can have multiple targets (in that case the message is replicated) for tweet in tweetsWithTarget: #try to classify with rules... analyzedTweet = rules.inferPolarity(tweet,False) #if not possible use the naive classifier if analyzedTweet.polarity == 0: analyzedTweet = naive.inferPolarity(analyzedTweet,False) #If the polarity is still 0 it can mean: #1) The sum of the polarities of the sentiTokens is 0, #2) There was no evidence usable to assess the sentiment if analyzedTweet.polarity == 0: regex = ur'(\W|^)sentiTokens:(.*?);(\W|$)' #Try to find if there are any evidence of matched sentiTokens match = re.search(regex,analyzedTweet.metadata).group(2) if debug: print "match: ", match if len(match.strip(' ')) == 0: rejectedTweets.append(analyzedTweet) else: analyzedTweets.append(analyzedTweet) else: analyzedTweets.append(analyzedTweet) t1 = datetime.now() print tweet.id + " ("+ str(t1-t0) + ")" logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv") return analyzedTweets
def processComments(sentiTokensFile, exceptSentiTokens, multiWordsFile, messages): """ Processes a list of tweets, for each: 1. Identifies the target 2. If the message contains a target of interest infer the polarity targetsFile -> path to the politicians list file sentiTokensFile -> path to the sentiTokens list file exceptSentiTokens -> path to the list of sentiTokens that cannot lose their accents without causing ambiguity for ex: más -> mas multiWordsFile -> path to a file that contains the words that should be considered as a unit, e.g. "primeiro ministro" tweets -> list of tweets """ if debug: print "DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG \n\n" print "Loading resources..." print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " + exceptSentiTokens sentiTokens = Utils.getFromCache(SENTI_CACHE) if sentiTokens != None: print "SentiTokens found on cache!" else: sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile, exceptSentiTokens) Utils.putInCache(sentiTokens, SENTI_CACHE) print "Multiword Tokenizer: " + multiWordsFile multiWordTokenizer = Utils.getFromCache(MULTIWORD_CACHE) if multiWordTokenizer != None: print "Multiword Tokenizer found on cache" else: multiWordTokenizer = MultiWordHelper(multiWordsFile) multiWordTokenizer.addMultiWords( SentiTokens.getMultiWords(sentiTokens)) Utils.putInCache(multiWordTokenizer, MULTIWORD_CACHE) print "Resources loaded! Starting analysis..." naive = Naive(sentiTokens) #rules = Rules(None,sentiTokens) rows = 0 positiveTokens = {} negativeTokens = {} for message in messages: rows += 1 t0 = datetime.now() tokens = naive.tokenizeSentiTokens(message, True) for token in tokens[0]: if token not in positiveTokens: positiveTokens[token] = 1 else: positiveTokens[token] += 1 for token in tokens[1]: if token not in negativeTokens: negativeTokens[token] = 1 else: negativeTokens[token] += 1 if rows % 1000 == 0 and rows != 0: writeResults(positiveTokens, "./positive" + str(rows) + ".csv") writeResults(negativeTokens, "./negative" + str(rows) + ".csv") if debug: t1 = datetime.now() print "Time: " + str(t1 - t0) print message.sentence print "positive: ", tokens[0] print "negative: ", tokens[1] print "\n------------------\n" writeResults(positiveTokens, "./positive.csv") writeResults(negativeTokens, "./negative.csv") print "done!"