コード例 #1
0
def crawl_followers(user_id, cursor=-1, must_include=lambda x: True):
    """
    Fetch one batch of followers for user_id, starting at the given cursor,
    and store it in the user's follower file.

    The batch is committed to the file only when the resulting status is not
    STATUS_ERROR and at least one follower was returned.

    @param user_id numeric id of the user whose followers are requested
    @param cursor cursor forwarded to fetch_followers (defaults to -1)
    @param must_include accepted for interface compatibility; this function
                        body never calls it
    @return a TwitterResponse carrying the new cursor, the sleep time and the
            'follower.total_fetched' counter
    """

    log.msg("Retrieving followers of user_id %d" % user_id)

    # TODO: if duplicates become a problem, open the file read-write and load
    # the already stored users into a set so that duplicates can be dropped

    follower_file = FollowerFile(user_id)

    fetch_result = fetch_followers(user_id=user_id, cursor=cursor)
    msg, followers, sleep_time, new_cursor = fetch_result

    # The batch is always handed to the writer; whether it is persisted is
    # decided further down, once the response status is known.
    follower_file.add_followers(followers)

    status = TwitterResponse.msg_to_status(msg)
    response = TwitterResponse(status, user_id, new_cursor, sleep_time)

    fetched_count = len(followers)
    response['follower.total_fetched'] = fetched_count

    should_commit = response.status != STATUS_ERROR and fetched_count > 0
    if should_commit:
        follower_file.commit()

    return response
コード例 #2
0
def crawl_followers(user_id, cursor=-1, must_include=lambda x: True):
    """
    Download a single page of followers of user_id and append it to the
    follower file of that user.

    Nothing is committed when the response status is STATUS_ERROR or when the
    page came back empty.

    @param user_id numeric id of the user whose followers are requested
    @param cursor cursor passed through to fetch_followers (defaults to -1)
    @param must_include kept in the signature for callers; it is not used
                        anywhere in this function body
    @return a TwitterResponse holding the new cursor, the sleep time and the
            "follower.total_fetched" counter
    """

    log.msg("Retrieving followers of user_id %d" % user_id)

    # TODO: should duplication errors show up, it is better to open the file
    # as rw and load all the users into a set, thus removing duplicates

    storage = FollowerFile(user_id)

    msg, page, sleep_time, next_cursor = fetch_followers(
        user_id=user_id,
        cursor=cursor,
    )

    # Queue the page on the writer first; the commit decision comes later.
    storage.add_followers(page)

    response = TwitterResponse(
        TwitterResponse.msg_to_status(msg),
        user_id,
        next_cursor,
        sleep_time,
    )

    page_size = len(page)
    response["follower.total_fetched"] = page_size

    if response.status != STATUS_ERROR:
        if page_size > 0:
            storage.commit()

    return response
コード例 #3
0
ファイル: analyzer.py プロジェクト: antiface/twittomatic
def analyze_followers_of(user_id, start_cursor=0,
                         already_processed=lambda x: False,
                         must_follow=lambda x: True):
    """
    Analyze the stored followers of user_id and report which ones qualify.

    The follower file of the user is handed to analyze_followers together
    with start_cursor and already_processed. Every returned lookup record is
    then tested with must_follow, and the 'id_str' of each accepted record is
    collected.

    @param user_id numeric id of the user whose follower file is analyzed
    @param start_cursor position forwarded to analyze_followers; also echoed
                        back in the response when the follower file is empty
    @param already_processed predicate forwarded to analyze_followers
    @param must_follow predicate called on each lookup record; records for
                       which it returns a true value are included
    @return a TwitterResponse with 'analyzer.total_included',
            'analyzer.total_fetched' and 'analyzer.target_users' set, or a
            STATUS_UNAUTHORIZED response when the follower file is empty
    """

    log.msg("Analyzing followers of user_id %d" % user_id)

    follower_file = FollowerFile(user_id)

    if len(follower_file) == 0:
        log.msg("Follower file for user_id %d is not present. Bogus data?" % user_id)

        # An empty file is reported with the same status used for users that
        # cannot be accessed.
        return TwitterResponse(STATUS_UNAUTHORIZED, user_id, start_cursor, 0)

    def log_progress(lookup_infos, current, total):
        # Progress callback: logs how far through the follower file we are.
        percentage = 100 * (current / float(total))
        log.msg("user_id %d Follower file: analyzed %d of %d [%02d%%]" %
                (user_id, current, total, percentage))

    analysis = analyze_followers(
        follower_file,
        start_cursor=start_cursor,
        already_processed=already_processed,
        progress_cb=log_progress
    )
    msg, lookup_infos, sleep_time, current_line = analysis

    # Keep only the ids of the users accepted by the must_follow predicate.
    target_ids = [record['id_str'] for record in lookup_infos
                  if must_follow(record)]

    included_count = len(target_ids)
    fetched_count = len(lookup_infos)

    status = TwitterResponse.msg_to_status(msg)
    response = TwitterResponse(status, user_id, current_line, sleep_time)

    response['analyzer.total_included'] = included_count
    response['analyzer.total_fetched'] = fetched_count
    response['analyzer.target_users'] = target_ids

    return response
コード例 #4
0
ファイル: update.py プロジェクト: zymITsky/twittomatic
    msg, timeline, sleep_time = fetch_timeline(user_id=user_id,
                                               since_id=since_id)

    if len(timeline) > 0 and msg == MSG_OK:
        # Here we need to create a new file containing the delta timeline
        # and also include the previous tweets

        total_included = 0
        total_fetched = len(timeline)

        for tweet in timeline:
            if must_include(tweet):
                writer.add_tweet(tweet)
                total_included += 1

        response = TwitterResponse(TwitterResponse.msg_to_status(msg), user_id,
                                   0, sleep_time)

        response['update.total_included'] = total_included
        response['update.total_fetched'] = total_fetched

        writer.commit()

        return response

    # Well, this is an unfortunate yet interesting situation which we decided
    # not to handle at all. In this case the delta update is not completely
    # downloaded, maybe because you hit the rate limit. Therefore, if we
    # write the information we collected to the file, we will end up
    # having a hole in the timeline.
    # The official REST API documentation specifies that you can at most download
コード例 #5
0
ファイル: update.py プロジェクト: antiface/twittomatic
    msg, timeline, sleep_time = fetch_timeline(user_id=user_id, since_id=since_id)

    if len(timeline) > 0 and msg == MSG_OK:
        # Here we need to create a new file containing the delta timeline
        # and also include the previous tweets

        total_included = 0
        total_fetched = len(timeline)

        for tweet in timeline:
            if must_include(tweet):
                writer.add_tweet(tweet)
                total_included += 1

        response = TwitterResponse(TwitterResponse.msg_to_status(msg),
            user_id,
            0,
            sleep_time
        )

        response['update.total_included'] = total_included
        response['update.total_fetched'] = total_fetched

        writer.commit()

        return response

    # Well, this is an unfortunate yet interesting situation which we decided
    # not to handle at all. In this case the delta update is not
    # completely downloaded, maybe because you hit the rate limit. Therefore
コード例 #6
0
def crawl_timeline(user_id, must_include=lambda x: True):
    """
    Fetch the timeline of a user and store the tweets accepted by must_include.

    Before issuing the request, the id of the last tweet already stored in
    the timeline file is retrieved if present; it is passed, decremented by
    one, to fetch_timeline as last_tweet_id (-1 when no stored tweet can be
    read).

    @param user_id numeric id of the user whose timeline is requested
    @param must_include predicate called on every fetched tweet; only tweets
                        for which it returns a true value are stored. It is
                        called one extra time with None after the tweets have
                        been handed to the writer, to signal completion.
    @return a TwitterResponse with 'timeline.total_included' and
            'timeline.total_fetched' set
    """

    log.msg("Fetching timeline of user_id %d" % user_id)

    writer = TimelineFile(user_id)

    # NOTE(review): the returned total was never used by the original code;
    # the call is kept in case get_total() has side effects on the writer.
    writer.get_total()

    try:
        last_tweet_id = int(writer.get_last()['id_str']) - 1
    except Exception:
        # Deliberately best-effort: any failure to read the last stored tweet
        # is treated as "nothing stored yet". Catching Exception rather than
        # using a bare except lets KeyboardInterrupt/SystemExit propagate.
        log.msg("This seems to be a new timeline file")
        last_tweet_id = -1

    msg, timeline, sleep_time = fetch_timeline(user_id=user_id,
                                               last_tweet_id=last_tweet_id)

    total_fetched = len(timeline)

    # A list comprehension (instead of filter()) always yields a list, so the
    # len() and indexing below do not depend on filter() returning a list.
    timeline = [tweet for tweet in timeline if must_include(tweet)]
    total_included = len(timeline)

    writer.add_tweets(timeline)

    # Signal completion
    must_include(None)

    response = TwitterResponse(TwitterResponse.msg_to_status(msg), user_id, 0,
                               sleep_time)

    def one_line(text):
        # Strip line breaks and tabs so the tweet fits on a single log line.
        return text.replace('\n', '').replace('\r', '').replace(
            '\t', '').encode('utf8')

    # The summary indexes the *filtered* timeline, so it must also be
    # non-empty: must_include may have rejected every fetched tweet.
    if total_fetched >= 2 and total_included > 0:
        screen_name = timeline[0]['user']['screen_name']
        first_tweet = one_line(timeline[0]['text'])
        last_tweet = one_line(timeline[-1]['text'])

        # TODO: We could add some statistics such as the number of hashtags,
        # maybe by exploiting the pub/sub architecture. Another option is to
        # use the must_follow callback directly to collect statistics.
        log.msg("Got %d tweets for user_id %d screen_name %s" %
                (total_fetched, user_id, screen_name))
        log.msg("  First tweet: '%s'" % first_tweet)
        log.msg("  Last tweet:  '%s'" % last_tweet)

    response['timeline.total_included'] = total_included
    response['timeline.total_fetched'] = total_fetched

    if response.status != STATUS_ERROR and total_fetched > 0:
        writer.commit()

    return response