# Example 1
 def test_english_recognize(self):
     """Check pattern.isEnglish() on tweets from a known non-English user
     and a known English user collected out of the raw jsonarr dump."""
     non_eng_user = '******'
     eng_user = '******'

     target_users = [non_eng_user, eng_user]

     dataroot = "/Users/yongjoo/workspace/tweets_process/data/"
     inputfile = dataroot + "tweetsRetrieved-May10-AlmostVerified.jsonarr"

     # Accumulate each target user's tweets into a single text blob,
     # keyed by screen_name.
     collectedTweets = {}

     for raw_line in open(inputfile):
         tweet = json.loads(raw_line)

         name = tweet['user']['screen_name']
         body = tweet['text']

         if name not in target_users:
             continue

         if name in collectedTweets:
             collectedTweets[name] = collectedTweets[name] + " " + body
         else:
             collectedTweets[name] = body

     # Collection finished; now run the language checks.
     self.assertFalse(pattern.isEnglish(collectedTweets[non_eng_user]),
                       "non eng user fail")
     self.assertTrue(pattern.isEnglish(collectedTweets[eng_user]),
                       "eng user fail")
def run():
    print "current directory: " + os.getcwd()
    
    dataroot = "../../../data/"
    
    # Collect tweets for each screen_name
    inputfile = dataroot + "retrievedTweets-Jun19-2.0.jsonarr"
    collectedTweets = collectTweets(inputfile)
    
    print "Completed collecting tweets: " + str(len(collectedTweets))
    
    # Collect ages for each screen_name
    inputfile = dataroot + "ageEmbededTweets-Jun19-sampled2.0.json"
    collectedAges = collectAges(inputfile)
    
    print "Completed collecting age data: " + str(len(collectedAges))
    
    # main data file to be written
    outputfile = dataroot + "age-tweets-Jun19-2.0.libsvm"
    out = open(outputfile, 'w')
    
    # related prior probabilities are written here
    prob_outputfile = dataroot + "age-tweets-prob-Jun19-2.0.csv"
    prob_out = open(prob_outputfile, 'w')
    
    # print out screen names finally used
    screen_name_file = dataroot + "screenname-matched-Jun19-2.0.txt"
    screenname_out = open(screen_name_file, 'w')
    
    #
    # Convert tweets into feature arrays
    #
    
    pruner = feature.Pruner()
    
    manager = feature.FeatureManager()
    non_english_count = 0
    
    # object to get prob array from screen_name
    screenNameToProbArray = ScreenNameToProbArray()
    
    
    for screen_name, tweets in collectedTweets.iteritems():   
        if not pattern.isEnglish(tweets):
            non_english_count += 1
            continue
        
        probArray = screenNameToProbArray.getProbArrayFor(screen_name)
        
        farr_text = manager.convertTextIntoFeatureArray(tweets)
#        farr_name = manager.convertFirstNameProbIntoFeatureArray(probArray)
        
#        farr_text.appendFeatureArray(farr_name)
        
        pruner.recordFeatureArray(screen_name, farr_text)
        pruner.recordProbArray(screen_name, probArray)
        
    print str(non_english_count) + " number of non-English tweets have been dropped."
        
    # instead of feature pruning based on number of counts, we use feature
    # selection based on information gain supported by mallet.
#    pruner.prune(5)
    
    print "Completed converting text into feature arrays"
    
    
    # print out a set of indexed words and their labels
    indexoutfile = dataroot + "age-tweets-indexedWords-Jun19-2.0.txt"
    indexout = open(indexoutfile, 'w')
    indexout.write(str(manager.indexer))
    indexout.close()

    
    
    # Record the features in libsvm format
    # utilize the fact that pruner is made iterable
    age_group_count = {0:0, 1:0, 2:0, 3:0}
    
    for screen_name, farr in pruner:
        try:
            age = collectedAges[screen_name]
        except KeyError:
            print "Anyway retrieved key: " + screen_name
            continue
        
        # write feature file    
        age_group = feature.ageToAgeGroup(age)
        age_group_count[age_group] += 1
        
        out.write(str(age_group) + " ")
        
        for label, value in farr:
            out.write(str(label) + ":" + str(value) + " ")
    
        out.write("\n")
        
        # write name prob file
        prob_array = pruner.getProbArray(screen_name)
        
        for i in range(len(prob_array)):
            prob_out.write(str(prob_array[i]))
            
            if i != len(prob_array) - 1:
                prob_out.write(",")
        
        prob_out.write("\n")

        # write screen_name
        screenname_out.write(screen_name + '\n')
        
    
    print "Completed writing the output file to: " + outputfile
    
    print "Collected Age Groups:"
    for group, count in age_group_count.iteritems():
        print str(group) + ": " + str(count)
        
    # Done
    out.close()
    prob_out.close()
    screenname_out.close()
        
    print "Done"
def run():
    dataroot = "/Users/yongjoo/workspace/tweets_process/data/"
    
    # Collect tweets for each screen_name
    inputfile = dataroot + "tweetsRetrieved-May10-AlmostVerified.jsonarr"
    collectedTweets = collectTweets(inputfile)
    
    print "Completed collecting tweets: " + str(len(collectedTweets))
    
    # Collect ages for each screen_name
    inputfile = dataroot + "ageEmbededTweets-May10-AlmostVerified.json"
    collectedAges = collectAges(inputfile)
    
    print "Completed collecting age data: " + str(len(collectedAges))
    
    outputfile = dataroot + "age-tweets.libsvm"
    out = open(outputfile, 'w')
    
    # Convert tweets into feature arrays
    
    # key: screen_name
    # value: FeatureArray instance
    pruner = feature.Pruner()
    
    manager = feature.FeatureManager()
    
    for screen_name, tweets in collectedTweets.iteritems():   
        if not pattern.isEnglish(tweets):
            continue
             
        farr = manager.convertTextIntoFeatureArray(tweets)
        pruner.recordFeatureArray(screen_name, farr)
        
    pruner.prune(5)
    
    print "Completed converting text into feature arrays"
    
    # Record the features in libsvm format
    # use that pruner is made iterable
    for screen_name, farr in pruner:
        try:
            age = collectedAges[screen_name]
        except KeyError:
            print "Anyway retrieved key: " + screen_name
            continue
            
        age_group = feature.ageToAgeGroup(age)
        
        out.write(str(age_group) +" ")
        
        for label, value in farr:
            out.write(str(label) + ":" + str(value) + " ")
    
        out.write("\n")
    
    print "Completed writing the output file to: " + outputfile
        
    # Done
    out.close()
        
    print "Done"