Example #1
def crawl_timeline(user_id, must_include=lambda x: True):
    """
    Try to download the entire timeline of the user, starting from a given page.
    Before issuing requests, the last tweet_id of the user is retrieved if present.

    @return a TwitterResponse
    """

    log.msg("Fetching timeline of user_id %d" % user_id)

    writer = TimelineFile(user_id)

    max_id = ''
    total_tweets = writer.get_total()

    try:
        last_tweet_id = int(writer.get_last()['id_str']) - 1
    except Exception:
        # No stored tweets yet: start the crawl from scratch.
        log.msg("This seems to be a new timeline file")
        last_tweet_id = -1

    msg, timeline, sleep_time = fetch_timeline(user_id=user_id, last_tweet_id=last_tweet_id)

    total_fetched = len(timeline)

    # Keep only the tweets accepted by the must_include predicate.
    timeline = filter(must_include, timeline)
    total_included = len(timeline)
    total_tweets += total_included

    writer.add_tweets(timeline)

    # Signal completion
    must_include(None)

    response = TwitterResponse(TwitterResponse.msg_to_status(msg),
                               user_id, 0, sleep_time)

    # Index the filtered list: it may be shorter than total_fetched.
    if len(timeline) >= 2:
        screen_name = timeline[0]['user']['screen_name']
        first_tweet = timeline[0]['text'].replace('\n', '').replace('\r', '').replace('\t', '').encode('utf8')
        last_tweet = timeline[-1]['text'].replace('\n', '').replace('\r', '').replace('\t', '').encode('utf8')

        # TODO: We could add some statistics like the number of hashtags and so on,
        # but maybe we could exploit the pub/sub architecture instead. Another option
        # is to use the must_follow callback directly to collect statistics.
        log.msg("Got %d tweets for user_id %d screen_name %s" % (total_fetched, user_id, screen_name))
        log.msg("  First tweet: '%s'" % first_tweet)
        log.msg("  Last tweet:  '%s'" % last_tweet)

    response['timeline.total_included'] = total_included
    response['timeline.total_fetched'] = total_fetched

    if response.status != STATUS_ERROR and total_fetched > 0:
        writer.commit()

    return response
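
A possible usage sketch (not from the source): must_include doubles as a per-tweet
filter predicate and as a completion hook, since crawl_timeline calls it one last
time with None after the fetch. The user_id value and the keep_hashtags helper are
illustrative only, and reading 'timeline.total_included' back assumes TwitterResponse
supports item access both ways.

def keep_hashtags(tweet):
    # crawl_timeline signals completion by calling the predicate once with None.
    if tweet is None:
        return False
    # Keep only tweets whose text mentions at least one hashtag.
    return '#' in tweet.get('text', '')

response = crawl_timeline(user_id=12345, must_include=keep_hashtags)
kept = response['timeline.total_included']   # counter set inside crawl_timeline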
Example #2
def update_timeline(user_id, must_include=lambda x: True):
    """
    Try to download the updates to the user's timeline since the most recent
    stored tweet. Before issuing requests, the id_str of the newest stored
    tweet is retrieved (if present) and used as since_id.

    @return a TwitterResponse
    """
    # Here we just need to load the first line and get the id_str of the first
    # tweet. We are also assuming the file is there and accessible.

    log.msg("Downloading updates of user_id %d" % user_id)

    writer = TimelineFile(user_id)

    abort = False

    try:
        abort = (writer.get_total() == 0)
        first_tweet = writer.get_first()
        since_id = int(first_tweet['id_str'])
    except Exception as exc:
        abort = True
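
Both examples go through the TimelineFile helper, whose implementation is not shown
here. The stub below only sketches the interface the examples rely on, with the
behaviour inferred from how it is called (tweets stored newest first, get_last
returning the oldest stored tweet); it is an assumption, not the project's class.

class TimelineFile(object):
    """Assumed interface of the per-user tweet store used by the examples."""

    def __init__(self, user_id):
        self.user_id = user_id
        self.tweets = []                # tweet dicts, newest first

    def get_total(self):
        return len(self.tweets)         # used to detect an empty or new timeline

    def get_first(self):
        return self.tweets[0]           # newest stored tweet (since_id source)

    def get_last(self):
        return self.tweets[-1]          # oldest stored tweet (last_tweet_id source)

    def add_tweets(self, tweets):
        self.tweets.extend(tweets)      # store a freshly fetched, filtered batch

    def commit(self):
        pass                            # the real class presumably persists to disk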