def _collect_textmass(timelines, user_id):
    """Build the cleaned, lower-cased text blob for one user's tweets.

    Every document returned by ``timelines.find({'user.id': user_id})``
    contributes its ``'text'`` field (UTF-8 encoded). Retweets are included;
    no retweet filtering is applied here.

    :param timelines: collection-like object exposing ``find(filter)``.
    :param user_id: value matched against the ``user.id`` field.
    :return: a single string of space-separated lower-case tokens; empty
        when the user has no tweets.
    """
    # Join once instead of growing a string tweet by tweet (which re-copies
    # the accumulated text on every iteration).
    pieces = [tweet['text'].encode('utf8')
              for tweet in timelines.find({'user.id': user_id})]
    raw = " ".join(pieces)

    # Replace @mentions, every character outside [0-9A-Za-z space tab] and
    # scheme://... URLs with a space, then collapse whitespace runs.
    cleaned = ' '.join(
        re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
               raw).split())

    # str.lower() returns a new string; the value must be kept, otherwise
    # the lower-casing silently does nothing.
    return cleaned.lower()


def process(poi, timelines):
    """Run LIWC over every user's timeline and store the result in ``poi``.

    Steps:

    1. Reset ``rliwc_anal.mined`` to ``False`` and ``rliwc_anal.result`` to
       ``None`` on every document in ``poi``.
    2. Create a compound index on ``timeline_auth_error_flag`` and
       ``rliwc_anal.mined``, the two fields used by the work-queue query.
    3. Repeatedly fetch up to 250 not-yet-mined users whose
       ``timeline_auth_error_flag`` is ``False``, summarise the text of their
       tweets with ``Liwc`` and write the result back, until none remain.

    A progress line (timestamp, tab, remaining count) is printed before each
    batch.

    :param poi: collection-like object exposing ``update``, ``create_index``
        and ``find``; its documents are read through their ``'id'`` field.
    :param timelines: collection-like object exposing ``find``; its documents
        are read through their ``'text'`` field and matched on ``user.id``.
    :return: ``None``.
    """
    # Start from a clean slate: mark every user as not processed.
    poi.update(
        {},
        {'$set': {
            "rliwc_anal.mined": False,
            "rliwc_anal.result": None
        }},
        multi=True)
    poi.create_index([('timeline_auth_error_flag', pymongo.ASCENDING),
                      ('rliwc_anal.mined', pymongo.ASCENDING)])

    # Users with an accessible timeline that LIWC has not processed yet.
    pending = {
        'timeline_auth_error_flag': False,
        "rliwc_anal.mined": False
    }

    while True:
        # How many users whose timelines have not been processed by LIWC
        count = poi.find(pending).count()
        if count == 0:
            break
        print(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") +
              "\t" + str(count) + " remaining")

        for user in poi.find(pending).limit(250):
            # A fresh Liwc instance per user, as before.
            # NOTE(review): if Liwc keeps no per-document state this could be
            # created once outside the loops - confirm before hoisting.
            liwc = Liwc()
            textmass = _collect_textmass(timelines, user['id'])
            result = liwc.summarize_document(textmass)

            # NOTE(review): this matches on 'id' and updates one document; if
            # two poi documents can share an 'id', one of them may never be
            # flagged as mined and the outer loop would not terminate -
            # confirm that 'id' is unique.
            poi.update({'id': user['id']}, {
                '$set': {
                    "rliwc_anal.mined": True,
                    "rliwc_anal.result": result
                }
            })
# -*- coding: utf-8 -*-
"""
Demo script: summarise an excerpt of the Gettysburg Address with the LIWC
lexicon and print the results.

Created on 11:55 AM, 11/4/15

@author: wt
"""

from collections import Counter
from lexicons.liwc import Liwc

# One lexicon instance is shared by all the calls below.
liwc_lexicon = Liwc()

# Sample document used for the demonstration.
gettysburg = (
    "Four score and seven years ago our fathers brought forth on this "
    "continent a new nation, conceived in liberty, and dedicated to the "
    "proposition that all men are created equal. Now we are engaged in a "
    "great civil war, testing whether that nation, or any nation so "
    "conceived and so dedicated, can long endure. We are met on a great "
    "battlefield of that war. We have come to dedicate a portion of that "
    "field, as a final resting place for those who here gave their lives "
    "that that nation might live. It is altogether fitting and proper that "
    "we should do this."
)

# Disabled alternative (the reason Counter is imported): read_document() is
# a generator, and Counter would consume the whole thing.
# category_counts = Counter(liwc_lexicon.read_document(gettysburg))
# print 'Basic category counts: {}'.format(category_counts)

# Summarise the full excerpt through the instance and print the result.
# (A tabulated report was also available via the disabled call below.)
full_counts = liwc_lexicon.summarize_document(gettysburg)
print(full_counts)
# liwc_lexicon.print_summarization(full_counts)

# Same summary, invoked through the class with the instance passed
# explicitly.
print(Liwc.summarize_document(liwc_lexicon, gettysburg))

# Summary of just the opening words.
print(liwc_lexicon.summarize_document(
    "Four score and seven years ago our fathers brought forth on this "
    "continent a new nation"))
}).count() if count == 0: break else: print datetime.datetime.now().strftime( "%Y-%m-%d-%H-%M-%S") + "\t" + str(count) + " remaining" for user in poi.find({ 'timeline_auth_error_flag': False, "liwc_anal.mined": False }).limit(250): #progcounter += 1 #if progcounter%1000 == 0: # print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + "\t" + str(progcounter) liwc = Liwc() textmass = "" for tweet in timelines.find({'user.id': user['id']}): # is it a retweet? #if not ('retweeted_status' in tweet): text = tweet['text'].encode('utf8') # text = re.sub(r"http\S+", "", text) # this doesn't do anything textmass = textmass + " " + text # print tweet['text'].encode('utf8') textmass = ' '.join( re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", textmass).split()) textmass.lower() result = Liwc.summarize_document(liwc, textmass)