def update_timeline(user_id, must_include=lambda x: True):
    """
    Prepare an update of the stored timeline of the user.

    Before issuing any request, the id of the first tweet stored in the
    user's TimelineFile is retrieved, if present. The update is flagged to be
    aborted when the file holds no tweets or when that id cannot be read.

    NOTE(review): only this preamble of the function is visible here; the
    code that consumes `since_id` / `abort` and builds the return value is
    not part of this block -- confirm against the full file.

    @param user_id numeric id of the user (formatted with %d in the log line)
    @param must_include predicate over tweets; not used by the visible code
    @return a TwitterResponse (per the original documentation)
    """
    # Here we just need to load the first line and get the id_str of the first
    # tweet. We are also assuming the file is there and accessible.
    log.msg("Downloading updates of user_id %d" % user_id)

    writer = TimelineFile(user_id)
    abort = False

    try:
        abort = (writer.get_total() == 0)
        first_tweet = writer.get_first()
        since_id = int(first_tweet['id_str'])
    except Exception:
        # Any failure while reading the stored timeline (missing file, empty
        # file, malformed first tweet) means there is nothing to update from.
        abort = True
def _single_line_text(text):
    """Drop newlines, carriage returns and tabs from *text*, then UTF-8 encode it."""
    return text.replace('\n', '').replace('\r', '').replace('\t', '').encode('utf8')


def crawl_timeline(user_id, must_include=lambda x: True):
    """
    Fetch one batch of the user's timeline and append it to the timeline file.

    Before issuing the request, the id of the last tweet stored in the user's
    TimelineFile is retrieved, if present; the fetch is started from that id
    minus one. When it cannot be read, -1 is passed instead.

    @param user_id numeric id of the user (formatted with %d in the log lines)
    @param must_include predicate called once per fetched tweet; only tweets
           for which it returns a true value are written. It is called one
           more time with None after the tweets have been added, to signal
           completion (the result of that call is ignored).
    @return a TwitterResponse carrying the fetch status and sleep time, with
            'timeline.total_included' and 'timeline.total_fetched' set
    """
    log.msg("Fetching timeline of user_id %d" % user_id)

    writer = TimelineFile(user_id)

    # The returned total is not used here; the call is kept so that any error
    # it raises still surfaces at the same point as before.
    # NOTE(review): drop the call if get_total() is a pure getter.
    writer.get_total()

    try:
        last_tweet_id = int(writer.get_last()['id_str']) - 1
    except Exception:
        # Deliberately best-effort, but no longer swallows KeyboardInterrupt
        # or SystemExit the way a bare `except:` did.
        log.msg("This seems to be a new timeline file")
        last_tweet_id = -1

    msg, timeline, sleep_time = fetch_timeline(user_id=user_id,
                                               last_tweet_id=last_tweet_id)

    total_fetched = len(timeline)

    # A list comprehension instead of filter(): the result is always a list,
    # so len() and indexing below keep working on Python 3 as well.
    timeline = [tweet for tweet in timeline if must_include(tweet)]
    total_included = len(timeline)

    writer.add_tweets(timeline)

    # Signal completion
    must_include(None)

    response = TwitterResponse(TwitterResponse.msg_to_status(msg),
                               user_id, 0, sleep_time)

    # `timeline` is the *filtered* list at this point: it may be empty even
    # when several tweets were fetched, so guard the indexing below.
    if total_fetched >= 2 and timeline:
        screen_name = timeline[0]['user']['screen_name']
        first_tweet = _single_line_text(timeline[0]['text'])
        last_tweet = _single_line_text(timeline[-1]['text'])

        # TODO: We could add some statistics like the number of hashtags and
        # so on, but maybe we could exploit the pub/sub architecture. Another
        # option is to use the must_include callback directly to collect
        # statistics.
        log.msg("Got %d tweets for user_id %d screen_name %s" %
                (total_fetched, user_id, screen_name))
        log.msg("  First tweet: '%s'" % first_tweet)
        log.msg("  Last tweet:  '%s'" % last_tweet)

    response['timeline.total_included'] = total_included
    response['timeline.total_fetched'] = total_fetched

    # Only persist when the fetch did not fail and returned something.
    if response.status != STATUS_ERROR and total_fetched > 0:
        writer.commit()

    return response