Example #1
def processTweets(targetsFile, sentiTokensFile, exceptSentiTokens, multiWordsFile, tweets):
    
    """ 
        Processes a list of tweets:
        1. Identify target
        2. If target is one of the politicians infer the comment's polarity
        
        politiciansFile -> path to the politicians list file
        sentiTokensFile -> path to the sentiTokens list file
        exceptSentiTokens -> path to the list of sentiTokens that cannot lose their accents without
                             causing ambiguity for ex: más -> mas
         tweets -> list of tweets
    """
    
    print "Loading resources...\nTargets: " + targetsFile
        
    targets = None  # getFromCache(WIN_PERSONS_CACHE) -- cache lookup disabled, always reload

    if targets is not None:
        print "Target list found in cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        putInCache(targets, WIN_PERSONS_CACHE)
    
    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " +  exceptSentiTokens
    
    sentiTokens = None  # getFromCache(WIN_SENTI_CACHE) -- cache lookup disabled, always reload

    if sentiTokens is not None:
        print "SentiTokens found in cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile, exceptSentiTokens)
        putInCache(sentiTokens, WIN_SENTI_CACHE)
    
    print "Multiword Tokenizer: " + multiWordsFile
    
    multiWordTokenizer = None  # getFromCache(WIN_MULTIWORD_CACHE) -- cache lookup disabled, always rebuild

    if multiWordTokenizer is not None:
        print "Multiword Tokenizer found in cache"
    else:
        multiWordTokenizer = MultiWordHandler(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        putInCache(multiWordTokenizer, WIN_MULTIWORD_CACHE)
    
    print  "Inferring polarity..."
    
    naive = Naive(targets,sentiTokens)
    rules = Rules(targets,sentiTokens)   
    
    analyzedTweets = []
    rejectedTweets = []
    
    for tweet in tweets:
        
        t0 = datetime.now()
        
        # Combine the three partial scores (rules, clues and sentiment lexicon)
        # into a single polarity score for the tweet.
        rulesScore, rulesInfo = rules.getRulesScore(tweet, True)
        cluesScore, clueInfo = rules.getCluesScore(tweet, True)
        sentiScore, sentiInfo = naive.getSentiScore(tweet, True)

        tweetScore = int(sentiScore) + int(rulesScore) + int(cluesScore)
        
        if tweetScore > 0:
            tweet.polarity = 1
        elif tweetScore < 0:
            tweet.polarity = -1
        else:
            tweet.polarity = 0
        
        tweet.metadata = sentiInfo + ";" + clueInfo + ";" + rulesInfo
        
        if tweet.polarity == 0:

            # A neutral score may simply mean "no evidence found": check whether any
            # sentiTokens were matched; if none were, reject the tweet.
            regex = ur'(\W|^)sentiTokens:(.*?);(\W|$)'

            match = re.search(regex, tweet.metadata).group(2)

            if len(match.strip(' ')) == 0:
                rejectedTweets.append(tweet)
            else:
                analyzedTweets.append(tweet)
        else:
            analyzedTweets.append(tweet)
        
        t1 = datetime.now()
        
        print tweet.id + " ("+ str(t1-t0) + ")"
        
    logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")    
    
    return analyzedTweets            
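
A minimal call sketch for the function above. The resource paths, the tweet loader and the tweet list are placeholders (they are not part of the original listing); the result is the list of classified tweets with polarity set to -1, 0 or 1:

# Hypothetical usage -- paths, loader and tweet list are placeholders, not part of the original code.
tweets = loadTweetsFromCsv("./corpus/tweets.csv")   # assumed loader, not defined in this listing
classified = processTweets("./resources/politicians.txt",
                           "./resources/sentiTokens.txt",
                           "./resources/exceptSentiTokens.txt",
                           "./resources/multiWords.txt",
                           tweets)
for t in classified:
    print t.id, t.polarity, t.metadata
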
Example #2
def processTweets_old(targetsFile, sentiTokensFile, exceptSentiTokens, multiWordsFile, tweets):
    
    """ 
        Processes a list of tweets:
        1. Identify target
        2. If target is one of the politicians infer the comment's polarity
        
        politiciansFile -> path to the politicians list file
        sentiTokensFile -> path to the sentiTokens list file
        exceptSentiTokens -> path to the list of sentiTokens that cannot lose their accents without
                             causing ambiguity for ex: más -> mas
         tweets -> list of tweets
    """
    
    print "hell yeah!"
    print "Loading resources...\nTargets: " + targetsFile
        
    targets = getFromCache(PERSONS_CACHE)
    
    if targets is not None:
        print "Target list found in cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        putInCache(targets, PERSONS_CACHE) 
    
    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " +  exceptSentiTokens
    
    sentiTokens = getFromCache(SENTI_CACHE)  
    
    if sentiTokens is not None:
        print "SentiTokens found in cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,exceptSentiTokens)
        putInCache(sentiTokens, SENTI_CACHE)
    
    print "Multiword Tokenizer: " + multiWordsFile
    
    multiWordTokenizer = getFromCache(MULTIWORD_CACHE)
    
    if multiWordTokenizer is not None:
        print "Multiword Tokenizer found in cache"
    else:
        multiWordTokenizer = MultiWordHandler(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        putInCache(multiWordTokenizer, MULTIWORD_CACHE)
    
    print  "Inferring polarity..."
    
    naive = Naive(targets,sentiTokens)
    rules = Rules(targets,sentiTokens)   
    
    analyzedTweets = []
    rejectedTweets = []
    
    for tweet in tweets:
        
        t0 = datetime.now()
        
        #print "antes"
        
        tweetsWithTarget = naive.inferTarget(tweet)
        
        if tweetsWithTarget is not None:
            
            # a tweet can have multiple targets (in that case the message is replicated)
            for tweet in tweetsWithTarget:
        
                # try to classify with rules first...
                analyzedTweet = rules.inferPolarity(tweet, False)

                # ...and if that is inconclusive, fall back to the naive classifier
                if analyzedTweet.polarity == 0:
                    analyzedTweet = naive.inferPolarity(analyzedTweet, False)
                
                if analyzedTweet.polarity == 0:

                    # A neutral result may simply mean "no evidence found": check whether
                    # any sentiTokens were matched; if none were, reject the tweet.
                    regex = ur'(\W|^)sentiTokens:(.*?);(\W|$)'

                    match = re.search(regex, analyzedTweet.metadata).group(2)

                    if len(match.strip(' ')) == 0:
        
                        rejectedTweets.append(analyzedTweet)
                    else:
                        analyzedTweets.append(analyzedTweet)
                else:
                    analyzedTweets.append(analyzedTweet)
                
                t1 = datetime.now()
            
                print tweet.id + " ("+ str(t1-t0) + ")"
            
    logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")    
    
    return analyzedTweets    
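
A call sketch for this older variant. The signature is the same as processTweets, but since this version actually reads from the cache, the module-level constants PERSONS_CACHE, SENTI_CACHE and MULTIWORD_CACHE and the getFromCache/putInCache helpers must be available. Also note that a tweet mentioning several politicians is replicated once per detected target, so the result may contain more entries than the input:

# Hypothetical usage -- same placeholder paths and tweet list as in the sketch above.
classified = processTweets_old("./resources/politicians.txt",
                               "./resources/sentiTokens.txt",
                               "./resources/exceptSentiTokens.txt",
                               "./resources/multiWords.txt",
                               tweets)
print len(tweets), "tweets in,", len(classified), "classified tweets out"
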
Example #3
def getFeatures(targetsFile, sentiTokensFile, exceptSentiTokens, multiWordsFile, listOfTweets):

    """
        Builds a feature row (rule, clue and sentiment features plus the irony flag)
        for each tweet in listOfTweets; the resource file parameters are the same
        as in processTweets above.
    """

    print "Loading resources...\nTargets: " + targetsFile
        
    targets = None  # getFromCache(PERSONS_CACHE) -- cache lookup disabled, always reload

    if targets is not None:
        print "Target list found in cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        putInCache(targets, PERSONS_CACHE)
    
    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " +  exceptSentiTokens
    
    sentiTokens = None  # getFromCache(SENTI_CACHE) -- cache lookup disabled, always reload

    if sentiTokens is not None:
        print "SentiTokens found in cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile, exceptSentiTokens)
        putInCache(sentiTokens, SENTI_CACHE)
    
    print "Multiword Tokenizer: " + multiWordsFile
    
    multiWordTokenizer = None  # getFromCache(MULTIWORD_CACHE) -- cache lookup disabled, always rebuild

    if multiWordTokenizer is not None:
        print "Multiword Tokenizer found in cache"
    else:
        multiWordTokenizer = MultiWordHandler(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        putInCache(multiWordTokenizer, MULTIWORD_CACHE)
    
    print  "Calculating features..."
    
    naive = Naive(targets,sentiTokens)
    rules = Rules(targets,sentiTokens)   
    
    analyzedTweets = []
    rejectedTweets = []
    
    for tweet in listOfTweets:
        
        feats = []
        
        t0 = datetime.now()
        
        rulesFeats = rules.getRulesFeats(tweet, True)
        cluesFeats = rules.getCluesFeats(tweet, True)
        sentiFeats = naive.getSentiFeats(tweet, True)

        # Sum of all partial features: zero means no classifier found any evidence.
        x = (int(rulesFeats[0]) + int(rulesFeats[1]) +
             int(cluesFeats[0]) + int(cluesFeats[1]) +
             int(sentiFeats[0]) + int(sentiFeats[1]) + int(sentiFeats[2]))
        
        if x == 0:
            # No feature fired: run the naive classifier once and check its metadata
            # for matched sentiTokens; if there are none, reject the tweet.
            tweetTest = naive.inferPolarity(tweet, True)
            regex = ur'(\W|^)sentiTokens:(.*?);(\W|$)'

            match = re.search(regex, tweetTest.metadata).group(2)

            if len(match.strip(' ')) == 0:
                
                rejectedTweets.append(tweet)
                print "rejected: "
                print tweet.tostring()
            else:
                feats.append(tweet.id)
                feats.append(rulesFeats[0])
                feats.append(rulesFeats[1])
                feats.append(cluesFeats[0])
                feats.append(cluesFeats[1])
                feats.append(sentiFeats[0])
                feats.append(sentiFeats[1])
                feats.append(sentiFeats[2])
                feats.append(int(tweet.irony))
                
                analyzedTweets.append(feats)
        else:
            feats.append(tweet.id)
            feats.append(rulesFeats[0])
            feats.append(rulesFeats[1])
            feats.append(cluesFeats[0])
            feats.append(cluesFeats[1])
            feats.append(sentiFeats[0])
            feats.append(sentiFeats[1])
            feats.append(sentiFeats[2])
            feats.append(int(tweet.irony))
            
            analyzedTweets.append(feats)
        
        t1 = datetime.now()
        
        print tweet.id + " ("+ str(t1-t0) + ")"
        
    #logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")    
    
    return analyzedTweets
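
Both branches of the loop above build the same nine-element feature row. A small hypothetical helper (not part of the original module) that mirrors those appends and could replace the duplicated block:

def buildFeatureRow(tweet, rulesFeats, cluesFeats, sentiFeats):
    # Flat feature row, in the same order as the appends in getFeatures:
    # id, two rule features, two clue features, three sentiment features, irony flag.
    return [tweet.id,
            rulesFeats[0], rulesFeats[1],
            cluesFeats[0], cluesFeats[1],
            sentiFeats[0], sentiFeats[1], sentiFeats[2],
            int(tweet.irony)]

With this helper, both branches reduce to analyzedTweets.append(buildFeatureRow(tweet, rulesFeats, cluesFeats, sentiFeats)).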