Example #1
0
def getFeatures(targetsFile,sentiTokensFile,exceptSentiTokens,multiWordsFile,listOfTweets):
    
    print "Loading resources...\nTargets: " + targetsFile
        
    targets = None#getFromCache(PERSONS_CACHE)
    
    if targets != None:
        print "Target list found on cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        putInCache(targets, PERSONS_CACHE) 
    
    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " +  exceptSentiTokens
    
    sentiTokens = None#getFromCache(SENTI_CACHE)  
    
    if sentiTokens != None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,exceptSentiTokens)
        putInCache(sentiTokens, SENTI_CACHE)
    
    print "Multiword Tokenizer: " + multiWordsFile
    
    multiWordTokenizer = None#getFromCache(MULTIWORD_CACHE)
    
    if multiWordTokenizer != None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHandler(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        putInCache(multiWordTokenizer, MULTIWORD_CACHE)
    
    print  "Calculating features..."
    
    naive = Naive(targets,sentiTokens)
    rules = Rules(targets,sentiTokens)   
    
    analyzedTweets = []
    rejectedTweets = []
    
    for tweet in listOfTweets:
        
        feats = []
        
        t0 = datetime.now()
        
        rulesFeats = rules.getRulesFeats(tweet,True)
        cluesFeats = rules.getCluesFeats(tweet,True)        
        sentiFeats = naive.getSentiFeats(tweet,True)
        
        x = int(rulesFeats[0])+int(rulesFeats[1])+int(cluesFeats[0])+int(cluesFeats[1])+int(sentiFeats[0])+int(sentiFeats[1])+int(sentiFeats[2])
        
        if x == 0:
            tweetTest = naive.inferPolarity(tweet, True)
            regex = ur'(\W|^)sentiTokens:(.*?);(\W|$)'            
            
            match = re.search(regex,tweetTest.metadata).group(2)
            
            if len(match.strip(' ')) == 0:
                
                rejectedTweets.append(tweet)
                print "rejected: "
                print tweet.tostring()
            else:
                feats.append(tweet.id)
                feats.append(rulesFeats[0])
                feats.append(rulesFeats[1])
                feats.append(cluesFeats[0])
                feats.append(cluesFeats[1])
                feats.append(sentiFeats[0])
                feats.append(sentiFeats[1])
                feats.append(sentiFeats[2])
                feats.append(int(tweet.irony))
                
                analyzedTweets.append(feats)
        else:
            feats.append(tweet.id)
            feats.append(rulesFeats[0])
            feats.append(rulesFeats[1])
            feats.append(cluesFeats[0])
            feats.append(cluesFeats[1])
            feats.append(sentiFeats[0])
            feats.append(sentiFeats[1])
            feats.append(sentiFeats[2])
            feats.append(int(tweet.irony))
            
            analyzedTweets.append(feats)
        
        t1 = datetime.now()
        
        print tweet.id + " ("+ str(t1-t0) + ")"
        
    #logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")    
    
    return analyzedTweets