Ejemplo n.º 1
0
def processTweets(targetsFile,sentiTokensFile,exceptSentiTokens,multiWordsFile,tweets):
    
    """ 
        Processes a list of tweets, for each:
        1. Identifies the target
        2. If the message contains a target of interest infer the polarity
        
        targetsFile -> path to the politicians list file
        sentiTokensFile -> path to the sentiTokens list file
        exceptSentiTokens -> path to the list of sentiTokens that cannot lose their accents without
                             causing ambiguity for ex: más -> mas
        multiWordsFile -> path to a file that contains the words that should be considered as a unit, e.g. "primeiro ministro"
         tweets -> list of tweets
    """
    
    print "hell yeah!"
    print "Loading resources...\nTargets: " + targetsFile
        
    targets = Utils.getFromCache(PERSONS_CACHE)
    
    if targets != None:
        print "Target list found on cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        Utils.putInCache(targets, PERSONS_CACHE) 
    
    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " +  exceptSentiTokens
    
    sentiTokens = Utils.getFromCache(SENTI_CACHE)  
    
    if sentiTokens != None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,exceptSentiTokens)
        Utils.putInCache(sentiTokens, SENTI_CACHE)
    
    print "Multiword Tokenizer: " + multiWordsFile
    
    multiWordTokenizer = Utils.getFromCache(MULTIWORD_CACHE)
    
    if multiWordTokenizer != None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHelper(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        Utils.putInCache(multiWordTokenizer, MULTIWORD_CACHE)
    
    print "Resources loaded! Starting analysis..."
    
    
    targetDetector = TargetDetector(targets)
    #TODO:Estes senhores já não precisam de receber os targets
    naive = Naive(sentiTokens)
    rules = Rules(None,sentiTokens)   
    
    analyzedTweets = []
    rejectedTweets = []
    
    for tweet in tweets:
        
        t0 = datetime.now()
        
        tweetsWithTarget = targetDetector.inferTarget(tweet)
        
        if tweetsWithTarget != None :
            
            #a tweet can have multiple targets (in that case the message is replicated)
            for tweet in tweetsWithTarget:       
        
                #try to classify with rules...
                analyzedTweet = rules.inferPolarity(tweet,False)
                
                #if not possible use the naive classifier
                if analyzedTweet.polarity == 0:
                    analyzedTweet = naive.inferPolarity(analyzedTweet,False)
                
                #If the polarity is still 0 it can mean:
                #1) The sum of the polarities of the sentiTokens is 0,
                #2) There was no evidence usable to assess the sentiment                
                if analyzedTweet.polarity == 0:
                    
                    regex = ur'(\W|^)sentiTokens:(.*?);(\W|$)'            
                    
                    #Try to find if there are any evidence of matched sentiTokens
                    match = re.search(regex,analyzedTweet.metadata).group(2)
                    
                    if debug:
                        print "match: ", match
                    
                    if len(match.strip(' ')) == 0:
        
                        rejectedTweets.append(analyzedTweet)
                    else:
                        analyzedTweets.append(analyzedTweet)
                else:
                    analyzedTweets.append(analyzedTweet)
                
                t1 = datetime.now()
            
                print tweet.id + " ("+ str(t1-t0) + ")"
            
    logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")    
    
    return analyzedTweets    
def processComments(sentiTokensFile, exceptSentiTokens, multiWordsFile,
                    messages):
    """ 
        Processes a list of tweets, for each:
        1. Identifies the target
        2. If the message contains a target of interest infer the polarity
        
        targetsFile -> path to the politicians list file
        sentiTokensFile -> path to the sentiTokens list file
        exceptSentiTokens -> path to the list of sentiTokens that cannot lose their accents without
                             causing ambiguity for ex: más -> mas
        multiWordsFile -> path to a file that contains the words that should be considered as a unit, e.g. "primeiro ministro"
         tweets -> list of tweets
    """

    if debug:
        print "DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG \n\n"

    print "Loading resources..."

    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " + exceptSentiTokens

    sentiTokens = Utils.getFromCache(SENTI_CACHE)

    if sentiTokens != None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,
                                                  exceptSentiTokens)
        Utils.putInCache(sentiTokens, SENTI_CACHE)

    print "Multiword Tokenizer: " + multiWordsFile

    multiWordTokenizer = Utils.getFromCache(MULTIWORD_CACHE)

    if multiWordTokenizer != None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHelper(multiWordsFile)
        multiWordTokenizer.addMultiWords(
            SentiTokens.getMultiWords(sentiTokens))
        Utils.putInCache(multiWordTokenizer, MULTIWORD_CACHE)

    print "Resources loaded! Starting analysis..."

    naive = Naive(sentiTokens)
    #rules = Rules(None,sentiTokens)
    rows = 0

    positiveTokens = {}
    negativeTokens = {}

    for message in messages:

        rows += 1

        t0 = datetime.now()

        tokens = naive.tokenizeSentiTokens(message, True)

        for token in tokens[0]:

            if token not in positiveTokens:
                positiveTokens[token] = 1
            else:
                positiveTokens[token] += 1

        for token in tokens[1]:

            if token not in negativeTokens:
                negativeTokens[token] = 1
            else:
                negativeTokens[token] += 1

        if rows % 1000 == 0 and rows != 0:

            writeResults(positiveTokens, "./positive" + str(rows) + ".csv")
            writeResults(negativeTokens, "./negative" + str(rows) + ".csv")

        if debug:
            t1 = datetime.now()
            print "Time: " + str(t1 - t0)
            print message.sentence
            print "positive: ", tokens[0]
            print "negative: ", tokens[1]
            print "\n------------------\n"

    writeResults(positiveTokens, "./positive.csv")
    writeResults(negativeTokens, "./negative.csv")
    print "done!"