# --- Build a bag-of-words corpus from stored tweets -------------------------
# For every distinct user, walk their tweets in ascending id order, tokenize
# and lemmatize the text, and map each word to a stable 1-based integer id
# while tracking per-word occurrence counts.
# NOTE(review): this excerpt ends inside the per-tweet loop; user_feed/feeds
# are presumably consumed in code beyond this chunk.
lemmatizer = WordNetLemmatizer()
# Split on runs of characters that are neither word chars nor apostrophes
# (gaps=True: the pattern matches separators, not tokens).
tokenizer = RegexpTokenizer(r"[^\w']+", gaps=True)
# Materialize the stopword list as a set once: membership is tested for
# every token, and set lookup is O(1) versus an O(n) list scan.
en_stopwords = set(stopwords.words('english'))

output_dir = argv[1]

users = tweets.distinct('user.screen_name')
word2num = {}    # word -> 1-based integer id
num2word = []    # id - 1 -> word (inverse mapping)
word_count = []  # id - 1 -> occurrence count
feeds = []

for user in users:
    user_feed = []
    for tweet in tweets.find({'user.screen_name': user},
                             fields={'text': 1}).sort([('id', 1)]):
        tokens = tokenizer.tokenize(tweet['text'])
        # Lowercase BEFORE the stopword test: the NLTK stopword list is all
        # lowercase, so the original case-sensitive check let capitalized
        # stopwords ("The", "And", ...) slip through the filter.
        words = [lemmatizer.lemmatize(w.encode('utf8'))
                 for w in (t.lower() for t in tokens)
                 if w not in en_stopwords]
        num_repr = []
        for word in words:
            wid = word2num.get(word)
            if wid is None:
                # First sighting: assign the next 1-based id and register
                # the word in the inverse mapping and the count table.
                wid = len(num2word) + 1
                word2num[word] = wid
                num2word.append(word)
                word_count.append(0)
            word_count[wid - 1] += 1
            num_repr.append(wid)
# --- Incrementally fetch a user's timeline into the tweets collection ------
# Pages backwards ("past") through a user's tweets via the Twitter API until
# max_count tweets are stored locally.
# NOTE(review): this excerpt ends inside the while loop; the actual API call
# and insertion presumably follow in code beyond this chunk.
target_user = argv[1]

# Baseline parameters for the user-timeline request; 200 is the API's
# per-request maximum.
search_params = dict(count=200,
                     result_type='recent',
                     include_entities=True,
                     include_rts=False,
                     exclude_replies=True)
search_params['screen_name'] = target_user

# Twitter only exposes roughly the most recent 3200 tweets per user.
max_count = 3200
if len(argv) > 2:
    # A tweet count is an integer (and is %d-formatted below); going through
    # float() first keeps accepting scientific-notation arguments like "1e3".
    max_count = int(float(argv[2]))

direction = "past"
twitter = Twitter(auth=auth)

while True:
    tweet_count = tweets.find({'user.screen_name': target_user}).count()
    print('tweets in database for user %s: %d' % (target_user, tweet_count))
    if tweet_count > max_count:
        print('max_count %d reached. stopping' % (max_count))
        break
    if direction == "past":
        # Resume from the oldest tweet already stored for this user; fall
        # back to maxint so the first request is effectively unconstrained.
        last_tweet = tweets.find_one({'user.screen_name': target_user},
                                     fields={'id': 1},
                                     sort=[('id', 1)])
        if last_tweet is None or len(last_tweet) < 1:
            last_tweet = maxint
        else:
            last_tweet = last_tweet['id']
        print('last loaded id: %d' % last_tweet)