Example #1
# Assumed context: NLTK for tokenizing/lemmatizing, sys.argv for CLI
# arguments, and `tweets` as a pymongo collection (see the setup sketch
# after this example).
from sys import argv

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
# Split on runs of characters that are neither word characters nor apostrophes.
tokenizer = RegexpTokenizer(r"[^\w']+", gaps=True)
en_stopwords = stopwords.words('english')

output_dir = argv[1]
# Every distinct screen name present in the collection.
users = tweets.distinct('user.screen_name')

# Vocabulary bookkeeping: word -> 1-based id, id -> word, per-word counts.
word2num = {}
num2word = []
word_count = []

feeds = []
for user in users:
    user_feed = []
    # Walk the user's tweets in chronological (ascending id) order;
    # `fields` is the legacy pymongo projection keyword.
    for tweet in tweets.find({'user.screen_name': user}, fields={'text': 1}).sort([('id', 1)]):
        tokens = tokenizer.tokenize(tweet['text'])
        # Lowercase before the stopword test so cased tokens like "The" are filtered too.
        words = [lemmatizer.lemmatize(w.lower().encode('utf8'))
                 for w in tokens if w.lower() not in en_stopwords]

        # Map each word to a 1-based integer id, growing the vocabulary on the fly.
        num_repr = []
        for word in words:
            wid = word2num.get(word)
            if wid is None:
                wid = len(num2word) + 1
                word2num[word] = wid
                num2word.append(word)
                word_count.append(0)
            word_count[wid - 1] += 1
            num_repr.append(wid)
        # Assumed continuation: collect each tweet's id sequence per user.
        user_feed.append(num_repr)
    feeds.append(user_feed)
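
Both examples assume a `tweets` pymongo collection already populated with Twitter statuses. A minimal sketch of that setup, plus how to turn a numeric representation back into words; the database and collection names here are placeholders, not taken from the snippets:

# Hypothetical setup; 'twitter_db' / 'tweets' are assumed names.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
tweets = client['twitter_db']['tweets']

# Decoding a tweet's numeric representation (ids are 1-based):
# words = [num2word[wid - 1] for wid in num_repr]
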
Example #2
# Assumed context: the sixohsix `twitter` client, sys for argv/maxint, and
# `tweets` (pymongo collection) and `auth` defined elsewhere (see the
# sketches around these examples).
from sys import argv, maxint

from twitter import Twitter

target_user = argv[1]

# Parameters for the user_timeline call: up to 200 tweets per page,
# excluding retweets and replies.
search_params = dict(count=200, result_type='recent',
    include_entities=True, include_rts=False, exclude_replies=True)
search_params['screen_name'] = target_user

# The API only serves roughly the 3200 most recent tweets of a user.
max_count = 3200
if len(argv) > 2:
    max_count = int(argv[2])

direction = "past"

twitter = Twitter(auth=auth)

while True:
    tweet_count = tweets.find({'user.screen_name': target_user}).count()
    print 'tweets in database for user %s: %d' % (target_user, tweet_count)
    if tweet_count > max_count:
        print 'max_count %d reached. stopping' % (max_count)
        break

    if direction == "past":
        last_tweet = tweets.find_one({'user.screen_name': target_user}, 
            fields={'id': 1}, sort=[('id', 1)])

        if last_tweet == None or len(last_tweet) < 1:
            last_tweet = maxint
        else:
            last_tweet = last_tweet['id']
        print 'last loaded id: %d' % last_tweet
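
The snippet cuts off before the actual fetch and relies on an `auth` object built elsewhere. A sketch of how those missing pieces typically look with this `twitter` package; the credential strings and the insert step are assumptions, not part of the original:

# Hypothetical auth setup and fetch step; credential values are placeholders.
from twitter import Twitter, OAuth

auth = OAuth('OAUTH_TOKEN', 'OAUTH_SECRET', 'CONSUMER_KEY', 'CONSUMER_SECRET')
twitter = Twitter(auth=auth)

# Page backwards: request only tweets strictly older than the oldest stored.
search_params['max_id'] = last_tweet - 1
for status in twitter.statuses.user_timeline(**search_params):
    tweets.insert(status)  # legacy pymongo insert, matching the snippet's API era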