Python SentiTokens.loadSentiTokens 예제들

예제 #1

0

파일 보기

파일: Tweetonizer_older.py 프로젝트: craighagerman/reaction-tweetonizer

def processSingleSentence(politiciansFile, sentiTokensFile, exceptSentiTokens,
                          sentence, webOutput):

    print "<br>Loading resources...<br>Politicians: " + politiciansFile
    politicians = Persons.loadPoliticians(politiciansFile)

    print "<br>SentiTokens: " + sentiTokensFile + "<br>ExceptTokens: " + exceptSentiTokens
    sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,
                                              exceptSentiTokens)

    naive = Naive(politicians, sentiTokens)

    singleSentence = Opinion(1, sentence=sentence)
    targets = naive.inferTarget(singleSentence)
    results = []

    print "<br>Inferring targets...<br>"

    if targets != None:

        print "<br>Inferring polarity...<br>"

        for target in targets:

            rules = Rules(politicians, sentiTokens)
            results.append(rules.inferPolarity(target, False))
    else:
        print "<br>No targets were identified...<br>"
    if webOutput:
        return printResultsWeb(results, sentence)
    else:
        return printResultsConsole(results)

예제 #2

0

파일 보기

파일: Tweetonizer_older.py 프로젝트: craighagerman/reaction-tweetonizer

def processSingleSentence(politiciansFile,sentiTokensFile,exceptSentiTokens,sentence,webOutput):
            
    print "<br>Loading resources...<br>Politicians: " + politiciansFile    
    politicians = Persons.loadPoliticians(politiciansFile)
    
    print "<br>SentiTokens: " + sentiTokensFile + "<br>ExceptTokens: " +  exceptSentiTokens
    sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,exceptSentiTokens)
    
    naive = Naive(politicians,sentiTokens)
    
    singleSentence = Opinion(1, sentence=sentence)
    targets = naive.inferTarget(singleSentence)
    results = []
    
    print "<br>Inferring targets...<br>"
        
    if targets != None:    
        
        print "<br>Inferring polarity...<br>"
        
        for target in targets:
            
            rules = Rules(politicians,sentiTokens)
            results.append(rules.inferPolarity(target, False))
    else:
        print "<br>No targets were identified...<br>"
    if webOutput:
        return printResultsWeb(results,sentence)
    else:
        return printResultsConsole(results)

예제 #3

0

파일 보기

파일: Tests.py 프로젝트: craighagerman/reaction-tweetonizer

def getSentiTokens():
    
    sentiTokensFile = "../Resources/sentiTokens-2011-05-30.txt"
    exceptTokensFile = "../Resources/SentiLexAccentExcpt.txt"
    
    sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,exceptTokensFile)
    
    return sentiTokens

예제 #4

0

파일 보기

파일: sentimentFeaturesExtractor.py 프로젝트: craighagerman/reaction-tweetonizer

def getSentiTokens():
    
    sentiTokensFile = "../Resources/SentiLex-flex-PT03.txt"
    exceptTokensFile = "../Resources/SentiLexAccentExcpt.txt"
    
    sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,exceptTokensFile)
    
    return sentiTokens

예제 #5

0

파일 보기

파일: Tests.py 프로젝트: craighagerman/reaction-tweetonizer

def getSentiTokens():

    sentiTokensFile = "../Resources/sentiTokens-2011-05-30.txt"
    exceptTokensFile = "../Resources/SentiLexAccentExcpt.txt"

    sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,
                                              exceptTokensFile)

    return sentiTokens

예제 #6

0

파일 보기

파일: sentimentFeaturesExtractor.py 프로젝트: craighagerman/reaction-tweetonizer

def getSentiTokens():

    sentiTokensFile = "../Resources/SentiLex-flex-PT03.txt"
    exceptTokensFile = "../Resources/SentiLexAccentExcpt.txt"

    sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,
                                              exceptTokensFile)

    return sentiTokens

예제 #7

0

파일 보기

파일: Tests.py 프로젝트: craighagerman/reaction-tweetonizer

def getNaiveClassifier(politicians, sentiTokens):

    politiciansFile = "../Resources/politicians.txt"
    sentiTokensFile = "../Resources/sentiTokens-2011-05-30.txt"
    exceptTokensFile = "../Resources/SentiLexAccentExcpt.txt"
    
    politicians = Persons.loadPoliticians(politiciansFile)
    sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,exceptTokensFile)
    
    return Opinionizers.Naive(politicians,sentiTokens)

예제 #8

0

파일 보기

파일: Tests.py 프로젝트: craighagerman/reaction-tweetonizer

def getNaiveClassifier(politicians, sentiTokens):

    politiciansFile = "../Resources/politicians.txt"
    sentiTokensFile = "../Resources/sentiTokens-2011-05-30.txt"
    exceptTokensFile = "../Resources/SentiLexAccentExcpt.txt"

    politicians = Persons.loadPoliticians(politiciansFile)
    sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,
                                              exceptTokensFile)

    return Opinionizers.Naive(politicians, sentiTokens)

예제 #9

0

파일 보기

def processSingleSentence(politiciansFile, sentiTokensFile, exceptSentiTokens,
                          sentence, webOutput):

    print "Loading resources..."
    print "Politicians: " + politiciansFile
    politicians = Persons.loadPoliticians(politiciansFile)

    print "SentiTokens: " + sentiTokensFile
    print "ExceptTokens: " + exceptSentiTokens
    sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,
                                              exceptSentiTokens)

    naive = Naive(politicians, sentiTokens)

    singleSentence = Opinion(1, sentence=sentence)

    print "Inferring targets..."
    targets = naive.inferTarget(singleSentence)

    results = []

    if targets != None:

        print "Inferring polarity..."

        for target in targets:

            rules = Rules(politicians, sentiTokens)

            #if not possible to classify with rules use the naive classifier
            classifiedTweet = rules.inferPolarity(target, False)

            if classifiedTweet.polarity == 0:
                classifiedTweet = naive.inferPolarity(classifiedTweet, True)

            results.append(classifiedTweet)
    else:
        print "No targets were identified..."
    if webOutput:
        return printResultsWeb(results, sentence)
    else:
        return printResultsConsole(results)

예제 #10

0

파일 보기

파일: Tweetonizer_old.py 프로젝트: craighagerman/reaction-tweetonizer

def processSingleSentence(politiciansFile,sentiTokensFile,exceptSentiTokens,sentence,webOutput):
            
    print "Loading resources..."
    print "Politicians: " + politiciansFile    
    politicians = Persons.loadPoliticians(politiciansFile)
    
    print "SentiTokens: " + sentiTokensFile 
    print "ExceptTokens: " +  exceptSentiTokens
    sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,exceptSentiTokens)
    
    naive = Naive(politicians,sentiTokens)
    
    singleSentence = Opinion(1, sentence=sentence)
    
    print "Inferring targets..."
    targets = naive.inferTarget(singleSentence)
            
    results = []
        
    if targets != None:    
        
        print "Inferring polarity..."
        
        for target in targets:
            
            rules = Rules(politicians,sentiTokens)
            
            #if not possible to classify with rules use the naive classifier
            classifiedTweet = rules.inferPolarity(target, False)
            
            if classifiedTweet.polarity == 0:
                classifiedTweet = naive.inferPolarity(classifiedTweet,True)
            
            results.append(classifiedTweet)
    else:
        print "No targets were identified..."
    if webOutput:
        return printResultsWeb(results,sentence)
    else:
        return printResultsConsole(results)

예제 #11

0

파일 보기

파일: Tweetonizer_older.py 프로젝트: craighagerman/reaction-tweetonizer

def processTweets(politiciansFile,sentiTokensFile,exceptSentiTokens,multiWordsFile,tweets):
    
    """ 
        Processes a list of tweets:
        1. Identify target
        2. If target is one of the politicians infer the comment's polarity
        
        politiciansFile -> path to the politicians list file
        sentiTokensFile -> path to the sentiTokens list file
        exceptSentiTokens -> path to the list of sentiTokens that cannot lose their accents without
                             causing ambiguity for ex: más -> mas
         tweets -> list of tweets
    """
    
    print "Loading resources...\nPoliticians: " + politiciansFile   
    politicians = Persons.loadPoliticians(politiciansFile)
    
    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " +  exceptSentiTokens
    sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,exceptSentiTokens)
    
    print "Multiword Tokenizer " + multiWordsFile    
    multiWordTokenizer = MultiWordHandler(multiWordsFile)
    multiWordTokenizer.addMultiWords(Persons.getMultiWords(politicians))
    multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
    
    naive = Naive(politicians,sentiTokens)    
    
    targetedTweets = {}    
    classifiedTweets = {}
    
    #Process tweets...
    #First step: infer targets and create a dictionary {target,listOfTweets}    
    print "Identifying targets..."
     
    for tweet in tweets:        
        
        tweetsWithTarget = naive.inferTarget(tweet)
        
        if tweetsWithTarget != None :
            
            #a tweet can have multiple targets (in that case the message is replicated)
            for tweet in tweetsWithTarget:
            
                if tweet.target not in targetedTweets:                    
                
                    targetedTweets[tweet.target] = []
                
                tweet.taggedSentence = multiWordTokenizer.tokenizeMultiWords(tweet.sentence)  
                targetedTweets[tweet.target].append(tweet)
    
    print  len(targetedTweets), " targets Identified! Inferring polarity..."
    
    rules = Rules(politicians,sentiTokens)
    
    #Second step infer polarity 
    for target,tweets in targetedTweets.items():
        
        for tweet in tweets: 
            
            if target not in classifiedTweets:
                classifiedTweets[target]  = []
            
            #try to classify with rules...
            classifiedTweet = rules.inferPolarity(tweet,True)
            
            #if not possible use the naive classifier
            if classifiedTweet.polarity == 0:
                classifiedTweet = naive.inferPolarity(classifiedTweet,True)
            
            classifiedTweets[target].append(classifiedTweet)
            
    return classifiedTweets

예제 #12

0

파일 보기

def processTweets_old(targetsFile,sentiTokensFile,exceptSentiTokens,multiWordsFile,tweets):
    
    """ 
        Processes a list of tweets:
        1. Identify target
        2. If target is one of the politicians infer the comment's polarity
        
        politiciansFile -> path to the politicians list file
        sentiTokensFile -> path to the sentiTokens list file
        exceptSentiTokens -> path to the list of sentiTokens that cannot lose their accents without
                             causing ambiguity for ex: más -> mas
         tweets -> list of tweets
    """
    
    print "hell yeah!"
    print "Loading resources...\nTargets: " + targetsFile
        
    targets = getFromCache(PERSONS_CACHE)
    
    if targets != None:
        print "Target list found on cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        putInCache(targets, PERSONS_CACHE) 
    
    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " +  exceptSentiTokens
    
    sentiTokens = getFromCache(SENTI_CACHE)  
    
    if sentiTokens != None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,exceptSentiTokens)
        putInCache(sentiTokens, SENTI_CACHE)
    
    print "Multiword Tokenizer: " + multiWordsFile
    
    multiWordTokenizer = getFromCache(MULTIWORD_CACHE)
    
    if multiWordTokenizer != None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHandler(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        putInCache(multiWordTokenizer, MULTIWORD_CACHE)
    
    print  "Inferring polarity..."
    
    naive = Naive(targets,sentiTokens)
    rules = Rules(targets,sentiTokens)   
    
    analyzedTweets = []
    rejectedTweets = []
    
    for tweet in tweets:
        
        t0 = datetime.now()
        
        #print "antes"
        
        tweetsWithTarget = naive.inferTarget(tweet)
        
        if tweetsWithTarget != None :
            
            #a tweet can have multiple targets (in that case the message is replicated)
            for tweet in tweetsWithTarget:       
        
                #try to classify with rules...
                analyzedTweet = rules.inferPolarity(tweet,False)
                print "depois"
                #if not possible use the naive classifier
                if analyzedTweet.polarity == 0:
                    analyzedTweet = naive.inferPolarity(analyzedTweet,False)
                
                if analyzedTweet.polarity == 0:
                    
                    regex = ur'(\W|^)sentiTokens:(.*?);(\W|$)'            
                    
                    match = re.search(regex,analyzedTweet.metadata).group(2)
                    print "match: ", match
                    if len(match.strip(' ')) == 0:
        
                        rejectedTweets.append(analyzedTweet)
                    else:
                        analyzedTweets.append(analyzedTweet)
                else:
                    analyzedTweets.append(analyzedTweet)
                
                t1 = datetime.now()
            
                print tweet.id + " ("+ str(t1-t0) + ")"
            
    logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")    
    
    return analyzedTweets

예제 #13

0

파일 보기

def processTweets(targetsFile,sentiTokensFile,exceptSentiTokens,multiWordsFile,tweets):
    
    """ 
        Processes a list of tweets:
        1. Identify target
        2. If target is one of the politicians infer the comment's polarity
        
        politiciansFile -> path to the politicians list file
        sentiTokensFile -> path to the sentiTokens list file
        exceptSentiTokens -> path to the list of sentiTokens that cannot lose their accents without
                             causing ambiguity for ex: más -> mas
         tweets -> list of tweets
    """
    
    print "Loading resources...\nTargets: " + targetsFile
        
    targets = None#getFromCache(WIN_PERSONS_CACHE)
    
    if targets != None:
        print "Target list found on cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        putInCache(targets, WIN_PERSONS_CACHE) 
    
    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " +  exceptSentiTokens
    
    sentiTokens = None#getFromCache(WIN_SENTI_CACHE)  
    
    if sentiTokens != None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,exceptSentiTokens)
       
        putInCache(sentiTokens, WIN_SENTI_CACHE)
    
    print "Multiword Tokenizer: " + multiWordsFile
    
    multiWordTokenizer = None#getFromCache(WIN_MULTIWORD_CACHE)
    
    if multiWordTokenizer != None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHandler(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        putInCache(multiWordTokenizer, WIN_MULTIWORD_CACHE)
    
    print  "Inferring polarity..."
    
    naive = Naive(targets,sentiTokens)
    rules = Rules(targets,sentiTokens)   
    
    analyzedTweets = []
    rejectedTweets = []
    
    for tweet in tweets:
        
        t0 = datetime.now()
        
        rulesScore,rulesInfo = rules.getRulesScore(tweet,True)
        cluesScore,clueInfo = rules.getCluesScore(tweet,True)        
        sentiScore,sentiInfo = naive.getSentiScore(tweet,True)
        
        tweetScore = int(sentiScore) + int(rulesScore) + int(cluesScore)
        
        if tweetScore > 0:
            tweet.polarity = 1
        elif tweetScore < 0:
            tweet.polarity = -1
        else:
            tweet.polarity = 0
        
        tweet.metadata = sentiInfo+";"+clueInfo+";"+rulesInfo 
        
        if tweet.polarity == 0:
            
            regex = ur'(\W|^)sentiTokens:(.*?);(\W|$)'            
            
            match = re.search(regex,tweet.metadata).group(2)
            
            if len(match.strip(' ')) == 0:

                rejectedTweets.append(tweet)
            else:
                analyzedTweets.append(tweet)
        else:
            analyzedTweets.append(tweet)
        
        t1 = datetime.now()
        
        print tweet.id + " ("+ str(t1-t0) + ")"
        
    logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")    
    
    return analyzedTweets

예제 #14

0

파일 보기

def getFeatures(targetsFile,sentiTokensFile,exceptSentiTokens,multiWordsFile,listOfTweets):
    
    print "Loading resources...\nTargets: " + targetsFile
        
    targets = None#getFromCache(PERSONS_CACHE)
    
    if targets != None:
        print "Target list found on cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        putInCache(targets, PERSONS_CACHE) 
    
    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " +  exceptSentiTokens
    
    sentiTokens = None#getFromCache(SENTI_CACHE)  
    
    if sentiTokens != None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,exceptSentiTokens)
        putInCache(sentiTokens, SENTI_CACHE)
    
    print "Multiword Tokenizer: " + multiWordsFile
    
    multiWordTokenizer = None#getFromCache(MULTIWORD_CACHE)
    
    if multiWordTokenizer != None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHandler(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        putInCache(multiWordTokenizer, MULTIWORD_CACHE)
    
    print  "Calculating features..."
    
    naive = Naive(targets,sentiTokens)
    rules = Rules(targets,sentiTokens)   
    
    analyzedTweets = []
    rejectedTweets = []
    
    for tweet in listOfTweets:
        
        feats = []
        
        t0 = datetime.now()
        
        rulesFeats = rules.getRulesFeats(tweet,True)
        cluesFeats = rules.getCluesFeats(tweet,True)        
        sentiFeats = naive.getSentiFeats(tweet,True)
        
        x = int(rulesFeats[0])+int(rulesFeats[1])+int(cluesFeats[0])+int(cluesFeats[1])+int(sentiFeats[0])+int(sentiFeats[1])+int(sentiFeats[2])
        
        if x == 0:
            tweetTest = naive.inferPolarity(tweet, True)
            regex = ur'(\W|^)sentiTokens:(.*?);(\W|$)'            
            
            match = re.search(regex,tweetTest.metadata).group(2)
            
            if len(match.strip(' ')) == 0:
                
                rejectedTweets.append(tweet)
                print "rejected: "
                print tweet.tostring()
            else:
                feats.append(tweet.id)
                feats.append(rulesFeats[0])
                feats.append(rulesFeats[1])
                feats.append(cluesFeats[0])
                feats.append(cluesFeats[1])
                feats.append(sentiFeats[0])
                feats.append(sentiFeats[1])
                feats.append(sentiFeats[2])
                feats.append(int(tweet.irony))
                
                analyzedTweets.append(feats)
        else:
            feats.append(tweet.id)
            feats.append(rulesFeats[0])
            feats.append(rulesFeats[1])
            feats.append(cluesFeats[0])
            feats.append(cluesFeats[1])
            feats.append(sentiFeats[0])
            feats.append(sentiFeats[1])
            feats.append(sentiFeats[2])
            feats.append(int(tweet.irony))
            
            analyzedTweets.append(feats)
        
        t1 = datetime.now()
        
        print tweet.id + " ("+ str(t1-t0) + ")"
        
    #logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")    
    
    return analyzedTweets

예제 #15

0

파일 보기

파일: Tweetonizer_older.py 프로젝트: craighagerman/reaction-tweetonizer

def processTweets(politiciansFile, sentiTokensFile, exceptSentiTokens,
                  multiWordsFile, tweets):
    """ 
        Processes a list of tweets:
        1. Identify target
        2. If target is one of the politicians infer the comment's polarity
        
        politiciansFile -> path to the politicians list file
        sentiTokensFile -> path to the sentiTokens list file
        exceptSentiTokens -> path to the list of sentiTokens that cannot lose their accents without
                             causing ambiguity for ex: más -> mas
         tweets -> list of tweets
    """

    print "Loading resources...\nPoliticians: " + politiciansFile
    politicians = Persons.loadPoliticians(politiciansFile)

    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " + exceptSentiTokens
    sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,
                                              exceptSentiTokens)

    print "Multiword Tokenizer " + multiWordsFile
    multiWordTokenizer = MultiWordHandler(multiWordsFile)
    multiWordTokenizer.addMultiWords(Persons.getMultiWords(politicians))
    multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))

    naive = Naive(politicians, sentiTokens)

    targetedTweets = {}
    classifiedTweets = {}

    #Process tweets...
    #First step: infer targets and create a dictionary {target,listOfTweets}
    print "Identifying targets..."

    for tweet in tweets:

        tweetsWithTarget = naive.inferTarget(tweet)

        if tweetsWithTarget != None:

            #a tweet can have multiple targets (in that case the message is replicated)
            for tweet in tweetsWithTarget:

                if tweet.target not in targetedTweets:

                    targetedTweets[tweet.target] = []

                tweet.taggedSentence = multiWordTokenizer.tokenizeMultiWords(
                    tweet.sentence)
                targetedTweets[tweet.target].append(tweet)

    print len(targetedTweets), " targets Identified! Inferring polarity..."

    rules = Rules(politicians, sentiTokens)

    #Second step infer polarity
    for target, tweets in targetedTweets.items():

        for tweet in tweets:

            if target not in classifiedTweets:
                classifiedTweets[target] = []

            #try to classify with rules...
            classifiedTweet = rules.inferPolarity(tweet, True)

            #if not possible use the naive classifier
            if classifiedTweet.polarity == 0:
                classifiedTweet = naive.inferPolarity(classifiedTweet, True)

            classifiedTweets[target].append(classifiedTweet)

    return classifiedTweets

예제 #16

0

파일 보기

def processTweets(targetsFile,sentiTokensFile,exceptSentiTokens,multiWordsFile,tweets):
    
    """ 
        Processes a list of tweets, for each:
        1. Identifies the target
        2. If the message contains a target of interest infer the polarity
        
        targetsFile -> path to the politicians list file
        sentiTokensFile -> path to the sentiTokens list file
        exceptSentiTokens -> path to the list of sentiTokens that cannot lose their accents without
                             causing ambiguity for ex: más -> mas
        multiWordsFile -> path to a file that contains the words that should be considered as a unit, e.g. "primeiro ministro"
         tweets -> list of tweets
    """
    
    print "hell yeah!"
    print "Loading resources...\nTargets: " + targetsFile
        
    targets = Utils.getFromCache(PERSONS_CACHE)
    
    if targets != None:
        print "Target list found on cache!"
    else:
        targets = Persons.loadPoliticians(targetsFile)
        Utils.putInCache(targets, PERSONS_CACHE) 
    
    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " +  exceptSentiTokens
    
    sentiTokens = Utils.getFromCache(SENTI_CACHE)  
    
    if sentiTokens != None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,exceptSentiTokens)
        Utils.putInCache(sentiTokens, SENTI_CACHE)
    
    print "Multiword Tokenizer: " + multiWordsFile
    
    multiWordTokenizer = Utils.getFromCache(MULTIWORD_CACHE)
    
    if multiWordTokenizer != None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHelper(multiWordsFile)
        multiWordTokenizer.addMultiWords(Persons.getMultiWords(targets))
        multiWordTokenizer.addMultiWords(SentiTokens.getMultiWords(sentiTokens))
        Utils.putInCache(multiWordTokenizer, MULTIWORD_CACHE)
    
    print "Resources loaded! Starting analysis..."
    
    
    targetDetector = TargetDetector(targets)
    #TODO:Estes senhores já não precisam de receber os targets
    naive = Naive(sentiTokens)
    rules = Rules(None,sentiTokens)   
    
    analyzedTweets = []
    rejectedTweets = []
    
    for tweet in tweets:
        
        t0 = datetime.now()
        
        tweetsWithTarget = targetDetector.inferTarget(tweet)
        
        if tweetsWithTarget != None :
            
            #a tweet can have multiple targets (in that case the message is replicated)
            for tweet in tweetsWithTarget:       
        
                #try to classify with rules...
                analyzedTweet = rules.inferPolarity(tweet,False)
                
                #if not possible use the naive classifier
                if analyzedTweet.polarity == 0:
                    analyzedTweet = naive.inferPolarity(analyzedTweet,False)
                
                #If the polarity is still 0 it can mean:
                #1) The sum of the polarities of the sentiTokens is 0,
                #2) There was no evidence usable to assess the sentiment                
                if analyzedTweet.polarity == 0:
                    
                    regex = ur'(\W|^)sentiTokens:(.*?);(\W|$)'            
                    
                    #Try to find if there are any evidence of matched sentiTokens
                    match = re.search(regex,analyzedTweet.metadata).group(2)
                    
                    if debug:
                        print "match: ", match
                    
                    if len(match.strip(' ')) == 0:
        
                        rejectedTweets.append(analyzedTweet)
                    else:
                        analyzedTweets.append(analyzedTweet)
                else:
                    analyzedTweets.append(analyzedTweet)
                
                t1 = datetime.now()
            
                print tweet.id + " ("+ str(t1-t0) + ")"
            
    logClassifiedTweets(rejectedTweets, "./rejectedTweets.csv")    
    
    return analyzedTweets

예제 #17

0

파일 보기

파일: PPCTopSentiTokens.py 프로젝트: craighagerman/reaction-tweetonizer

def processComments(sentiTokensFile, exceptSentiTokens, multiWordsFile,
                    messages):
    """ 
        Processes a list of tweets, for each:
        1. Identifies the target
        2. If the message contains a target of interest infer the polarity
        
        targetsFile -> path to the politicians list file
        sentiTokensFile -> path to the sentiTokens list file
        exceptSentiTokens -> path to the list of sentiTokens that cannot lose their accents without
                             causing ambiguity for ex: más -> mas
        multiWordsFile -> path to a file that contains the words that should be considered as a unit, e.g. "primeiro ministro"
         tweets -> list of tweets
    """

    if debug:
        print "DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG \n\n"

    print "Loading resources..."

    print "SentiTokens: " + sentiTokensFile + "\nExceptTokens: " + exceptSentiTokens

    sentiTokens = Utils.getFromCache(SENTI_CACHE)

    if sentiTokens != None:
        print "SentiTokens found on cache!"
    else:
        sentiTokens = SentiTokens.loadSentiTokens(sentiTokensFile,
                                                  exceptSentiTokens)
        Utils.putInCache(sentiTokens, SENTI_CACHE)

    print "Multiword Tokenizer: " + multiWordsFile

    multiWordTokenizer = Utils.getFromCache(MULTIWORD_CACHE)

    if multiWordTokenizer != None:
        print "Multiword Tokenizer found on cache"
    else:
        multiWordTokenizer = MultiWordHelper(multiWordsFile)
        multiWordTokenizer.addMultiWords(
            SentiTokens.getMultiWords(sentiTokens))
        Utils.putInCache(multiWordTokenizer, MULTIWORD_CACHE)

    print "Resources loaded! Starting analysis..."

    naive = Naive(sentiTokens)
    #rules = Rules(None,sentiTokens)
    rows = 0

    positiveTokens = {}
    negativeTokens = {}

    for message in messages:

        rows += 1

        t0 = datetime.now()

        tokens = naive.tokenizeSentiTokens(message, True)

        for token in tokens[0]:

            if token not in positiveTokens:
                positiveTokens[token] = 1
            else:
                positiveTokens[token] += 1

        for token in tokens[1]:

            if token not in negativeTokens:
                negativeTokens[token] = 1
            else:
                negativeTokens[token] += 1

        if rows % 1000 == 0 and rows != 0:

            writeResults(positiveTokens, "./positive" + str(rows) + ".csv")
            writeResults(negativeTokens, "./negative" + str(rows) + ".csv")

        if debug:
            t1 = datetime.now()
            print "Time: " + str(t1 - t0)
            print message.sentence
            print "positive: ", tokens[0]
            print "negative: ", tokens[1]
            print "\n------------------\n"

    writeResults(positiveTokens, "./positive.csv")
    writeResults(negativeTokens, "./negative.csv")
    print "done!"