def crawl(users_collection,
          edges_collection,
          user_ids,
          twitter_api,
          depth=1,
          percentage=1,
          sample_more=False,
          get_friends=False,
          get_followers=True):
    """
    For each user in `user_ids`, gets all followers_ids and friends_ids and stores the edges in db.
    If 'get_followers' is True, it then also store a `percentage` sample of each user's followers in the db.
    If 'get_friends' is True, it then also store a `percentage` sample of each user's friends in the db.

    Do this for `depth` recursive iterations for each user that is stored in the database. The last level
    of users stored in the database will have edges for their friends/followers, but those won't be sampled
    and fetched to db.

    Parameters:
        users_collection - Mongo collection holding user documents
        edges_collection - Mongo collection holding follower/friend edges
        user_ids         - iterable of twitter user ids to start crawling from
        twitter_api      - authenticated Tweepy / smappPy TweepyPool api
        depth            - number of recursive levels (0 = edges only, no recursion)
        percentage       - fraction of friends/followers to sample (rounded up)
        sample_more      - if True, re-process users already marked 'sampled_followers'
        get_friends      - include a sample of each user's friends in the next level
        get_followers    - include a sample of each user's followers in the next level
    """
    users, code = call_with_error_handling(ensure_users_in_db, user_ids,
                                           users_collection, twitter_api)
    if code != 0:
        # logging.warn is deprecated; logging.warning is the supported spelling
        logging.warning(
            "Some error looking up some users, code {}".format(code))
        #TODO how should this be dealt with? does twitter return the OK ones or nothing?
        #IDEA users = ensure_users_one_by_one(user_ids, users_collection, twitter_api)
    for user in users:
        # Skip users whose followers were already sampled, unless asked to resample.
        if user.get('sampled_followers') and not sample_more:
            logging.info(
                ".. already sampled this user's followers. moving on.")
            continue

        ids_tup, code = call_with_error_handling(ensure_users_edges_in_db,
                                                 user, edges_collection,
                                                 twitter_api)
        if code != 0:
            logging.warning(
                ".. Some problem getting user {0}'s followers. Maybe she's protected or something. Skipping."
                .format(user['id']))
            continue
        friends_ids, followers_ids = ids_tup
        # Mark the user sampled before recursing so reruns can skip them.
        user['sampled_followers'] = True
        users_collection.save(user)

        # Build the next level's id pool: ceil-rounded samples of friends/followers.
        other_user_ids = []
        if get_friends:
            other_user_ids += random.sample(
                friends_ids, int(math.ceil(percentage * len(friends_ids))))
        if get_followers:
            other_user_ids += random.sample(
                followers_ids, int(math.ceil(percentage * len(followers_ids))))

        if depth > 0 and other_user_ids:
            crawl(users_collection, edges_collection, other_user_ids,
                  twitter_api, depth - 1, percentage, sample_more, get_friends,
                  get_followers)
def crawl(users_collection, edges_collection, user_ids, twitter_api, depth=1, percentage=1, sample_more=False, get_friends=False, get_followers=True):
    """
    For each user in `user_ids`, gets all followers_ids and friends_ids and stores the edges in db.
    If 'get_followers' is True, it then also store a `percentage` sample of each user's followers in the db.
    If 'get_friends' is True, it then also store a `percentage` sample of each user's friends in the db.

    Do this for `depth` recursive iterations for each user that is stored in the database. The last level
    of users stored in the database will have edges for their friends/followers, but those won't be sampled
    and fetched to db.

    Parameters:
        users_collection - Mongo collection holding user documents
        edges_collection - Mongo collection holding follower/friend edges
        user_ids         - iterable of twitter user ids to start crawling from
        twitter_api      - authenticated Tweepy / smappPy TweepyPool api
        depth            - number of recursive levels (0 = edges only, no recursion)
        percentage       - fraction of friends/followers to sample (rounded up)
        sample_more      - if True, re-process users already marked 'sampled_followers'
        get_friends      - include a sample of each user's friends in the next level
        get_followers    - include a sample of each user's followers in the next level
    """
    users, code = call_with_error_handling(ensure_users_in_db, user_ids, users_collection, twitter_api)
    if code != 0:
        # logging.warn is deprecated; logging.warning is the supported spelling
        logging.warning("Some error looking up some users, code {}".format(code))
        #TODO how should this be dealt with? does twitter return the OK ones or nothing?
        #IDEA users = ensure_users_one_by_one(user_ids, users_collection, twitter_api)
    for user in users:
        # Skip users whose followers were already sampled, unless asked to resample.
        if user.get('sampled_followers') and not sample_more:
            logging.info(".. already sampled this user's followers. moving on.")
            continue

        ids_tup, code = call_with_error_handling(ensure_users_edges_in_db, user, edges_collection, twitter_api)
        if code != 0:
            logging.warning(".. Some problem getting user {0}'s followers. Maybe she's protected or something. Skipping.".format(user['id']))
            continue
        friends_ids, followers_ids = ids_tup
        # Mark the user sampled before recursing so reruns can skip them.
        user['sampled_followers'] = True
        users_collection.save(user)

        # Build the next level's id pool: ceil-rounded samples of friends/followers.
        other_user_ids = []
        if get_friends:
            other_user_ids += random.sample(friends_ids, int(math.ceil(percentage * len(friends_ids))))
        if get_followers:
            other_user_ids += random.sample(followers_ids, int(math.ceil(percentage * len(followers_ids))))

        if depth > 0 and other_user_ids:
            crawl(users_collection, edges_collection, other_user_ids, twitter_api, depth-1, percentage, sample_more, get_friends, get_followers)
def get_followers_ids(api, user_id):
    """
    Fetch the follower IDs of `user_id` through a Tweepy/smappPy TweepyPool api.
    Requesting IDs only is much faster and yields more results per request than
    fetching full user objects.
    Parameters:
        api     - fully authenticated Tweepy api or smappPy TweepyPool api
        user_id - twitter user id
    Returns tuple: return code, list of IDs or None (if API call fails)
    """
    follower_cursor = Cursor(api.followers_ids, user_id=user_id)
    id_list, ret_code = call_with_error_handling(list, follower_cursor.items())

    if ret_code != 0:
        logger.warning("User {0}: Followers request failed".format(user_id))

    # call_with_error_handling hands back None for the list when the call failed
    return ret_code, id_list
def get_followers_ids(api, user_id):
    """
    Return (return_code, ids) for the followers of twitter user `user_id`,
    queried via a Tweepy/smappPy TweepyPool api. Only IDs are requested
    (faster, more per request). The ids value is None when the underlying
    API call fails.
    """
    items = Cursor(api.followers_ids, user_id=user_id).items()
    followers, code = call_with_error_handling(list, items)

    if code == 0:
        return code, followers

    # Failure path: log it; followers is None here (set by the error handler).
    logger.warning("User {0}: Followers request failed".format(user_id))
    return code, followers
# Example #5
# 0
def populate_user_tweets(api, user_collection, tweet_collection, tweets_per_user,
    ensure_indexes=True, requery=True, update_threshold=None):
    """
    Iterates through user_collection, querying Twitter API for last 'tweets_per_user'
    tweets. Considers last tweet fetched for each user. Updates user access time and last
    tweet fetched. Calculates and stores user tweet frequency.
    If requery is False, does not query for tweets of user that already has tweet ids in
    'tweet_ids' field.
    If update_threshold is given, only users whose 'tweets_updated' field is older than
    it (or null) are considered.
    """
    if ensure_indexes:
        logger.info("Ensuring indexes on tweet collection")
        create_tweet_indexes(tweet_collection)

    # Get DB cursor over users according to parameters
    if update_threshold:
        users = user_collection.find({"$or": [
                                        {"tweets_updated": {"$lt": update_threshold}},
                                        {"tweets_updated": {"$type": BSON_NULL}}
                                     ]},
                                     no_cursor_timeout=True,)
                                     # Remove sort for now. Can not execute on field without index
                                     # sort=[("tweets_updated", ASCENDING)])
    else:
        users = user_collection.find(no_cursor_timeout=True,)
                                     # Remove sort (see above)
                                     # sort=[("tweets_updated", ASCENDING)])
    logger.info("Considering {0} users total".format(users.count(with_limit_and_skip=True)))

    # Iterate over users, attempting to fetch and store tweets for each
    for user in users:
        logger.info("Considering user {0}".format(user["id"]))

        # Check requery and user tweets. If requery False and user has tweets, skip user
        if not requery and user["tweet_ids"]:
            logger.debug(".. User {0} has tweets, not re-querying".format(user["id"]))
            continue

        # since_id limits the timeline to tweets newer than the last one fetched
        if user["latest_tweet_id"]:
            cursor = tweepy.Cursor(api.user_timeline, 
                                   user_id=user["id"], 
                                   since_id=user["latest_tweet_id"], 
                                   include_rts=True)
        else:
            cursor = tweepy.Cursor(api.user_timeline, 
                                   user_id=user["id"], 
                                   include_rts=True)

        # While return is error, keep trying to get tweets depending on error type.
        # If error not well-understood, move on to next user
        # NOTE(review): every non-zero branch below breaks, so this loop body runs
        # at most once; retries are presumably inside call_with_error_handling — confirm.
        return_code = -1
        while return_code != 0:
            tweets, return_code = call_with_error_handling(list, cursor.items(tweets_per_user))

            # User no longer exists. Move on
            if return_code == 34:
                logger.warning(".. User {0} no longer exists, skipping".format(user["id"]))
                break
            elif return_code == 179:
                logger.warning(".. User {0}'s account is private, skipping".format(user["id"]))
                break
            elif return_code != 0:
                logger.warning(".. Error {0} for user {1}, skipping".format(return_code, user["id"]))
                break

        # Do a final check of tweet population. If None, there was an error that waiting
        # and retrying could not fix. If tweets is merely an empty list, still want to update
        # user's 'updated_timestamp' field.
        if tweets is None:
            continue

        # Reverse tweets when storing (given order is newest to oldest)
        saved_tweet_ids = []
        for tweet in tweets[::-1]:
            saved_id = save_tweet(tweet_collection, tweet)
            if saved_id:
                saved_tweet_ids.append(saved_id)

        # Calculate frequency in tweets/day ('or 1' guards a same-day span)
        if len(tweets) < 2:
            frequency = 0
        else:
            first_tweet_date = tweets[-1].created_at
            last_tweet_date = tweets[0].created_at
            frequency = len(tweets) / float((last_tweet_date - first_tweet_date).days or 1)

        latest_tweet_id = tweets[0].id if tweets else None
        update_user(user_collection, user, latest_tweet_id, frequency, saved_tweet_ids)
        logger.info(".. {0} tweets found, {1} saved".format(len(tweets), len(saved_tweet_ids)))
# Example #6
# 0
def populate_user_tweets(api,
                         user_collection,
                         tweet_collection,
                         tweets_per_user,
                         ensure_indexes=True,
                         requery=True,
                         update_threshold=None):
    """
    Iterates through user_collection, querying Twitter API for last 'tweets_per_user'
    tweets. Considers last tweet fetched for each user. Updates user access time and last
    tweet fetched. Calculates and stores user tweet frequency.
    If requery is False, does not query for tweets of user that already has tweet ids in
    'tweet_ids' field.
    If update_threshold is given, only users whose 'tweets_updated' field is older than
    it (or null) are considered.
    """
    if ensure_indexes:
        logger.info("Ensuring indexes on tweet collection")
        create_tweet_indexes(tweet_collection)

    # Get DB cursor over users according to parameters
    if update_threshold:
        users = user_collection.find(
            {
                "$or": [{
                    "tweets_updated": {
                        "$lt": update_threshold
                    }
                }, {
                    "tweets_updated": {
                        "$type": BSON_NULL
                    }
                }]
            },
            no_cursor_timeout=True,
        )
        # Remove sort for now. Can not execute on field without index
        # sort=[("tweets_updated", ASCENDING)])
    else:
        users = user_collection.find(no_cursor_timeout=True, )
        # Remove sort (see above)
        # sort=[("tweets_updated", ASCENDING)])
    logger.info("Considering {0} users total".format(
        users.count(with_limit_and_skip=True)))

    # Iterate over users, attempting to fetch and store tweets for each
    for user in users:
        logger.info("Considering user {0}".format(user["id"]))

        # Check requery and user tweets. If requery False and user has tweets, skip user
        if not requery and user["tweet_ids"]:
            logger.debug(".. User {0} has tweets, not re-querying".format(
                user["id"]))
            continue

        # since_id limits the timeline to tweets newer than the last one fetched
        if user["latest_tweet_id"]:
            cursor = tweepy.Cursor(api.user_timeline,
                                   user_id=user["id"],
                                   since_id=user["latest_tweet_id"],
                                   include_rts=True)
        else:
            cursor = tweepy.Cursor(api.user_timeline,
                                   user_id=user["id"],
                                   include_rts=True)

        # While return is error, keep trying to get tweets depending on error type.
        # If error not well-understood, move on to next user
        # NOTE(review): every non-zero branch below breaks, so this loop body runs
        # at most once; retries are presumably inside call_with_error_handling — confirm.
        return_code = -1
        while return_code != 0:
            tweets, return_code = call_with_error_handling(
                list, cursor.items(tweets_per_user))

            # User no longer exists. Move on
            if return_code == 34:
                logger.warning(".. User {0} no longer exists, skipping".format(
                    user["id"]))
                break
            elif return_code == 179:
                logger.warning(
                    ".. User {0}'s account is private, skipping".format(
                        user["id"]))
                break
            elif return_code != 0:
                logger.warning(".. Error {0} for user {1}, skipping".format(
                    return_code, user["id"]))
                break

        # Do a final check of tweet population. If None, there was an error that waiting
        # and retrying could not fix. If tweets is merely an empty list, still want to update
        # user's 'updated_timestamp' field.
        if tweets is None:
            continue

        # Reverse tweets when storing (given order is newest to oldest)
        saved_tweet_ids = []
        for tweet in tweets[::-1]:
            saved_id = save_tweet(tweet_collection, tweet)
            if saved_id:
                saved_tweet_ids.append(saved_id)

        # Calculate frequency in tweets/day ('or 1' guards a same-day span)
        if len(tweets) < 2:
            frequency = 0
        else:
            first_tweet_date = tweets[-1].created_at
            last_tweet_date = tweets[0].created_at
            frequency = len(tweets) / float(
                (last_tweet_date - first_tweet_date).days or 1)

        latest_tweet_id = tweets[0].id if tweets else None
        update_user(user_collection, user, latest_tweet_id, frequency,
                    saved_tweet_ids)
        logger.info(".. {0} tweets found, {1} saved".format(
            len(tweets), len(saved_tweet_ids)))
# Example #7
# 0
def populate_user_collection_from_ids(api,
                                      collection,
                                      user_ids,
                                      num_passes=2,
                                      not_found_file=None,
                                      sample=1.0):
    """
    Populates a collection (Pymongo collection object, fully connected and authenticated)
    with user data from the twitter REST API endpoint /users/show (removes 'status' - user's
    most recent tweet).
    Parameters:
        api         - Tweepy or smappPy TweepyPool API object, fully authenticated
        collection  - Pymongo collection object, fully connected and authenticated
        users       - Iterable of twitter user IDs to populate. Will pull totally into memory
        num_passes  - Number of retries on UIDs failing to come in the first time
        not_found_file - Filename to store all user IDs not found, line separated. If None, no output
        sample      - Proportion of users in user_ids list to populate, sampled randomly. Rounded DOWN
    """

    # Ensure standard userdoc indexes on collection
    ensure_userdoc_indexes(collection)

    # Set up list of users not yet retrieved from Twitter and passes counter
    users_not_found = list(set([str(i) for i in user_ids]))
    if sample < 1.0:
        users_not_found = random.sample(users_not_found,
                                        int(len(users_not_found) * sample))
    passnum = 0

    # User-fetching loop
    # NOTE: single-argument parenthesized print is identical on Python 2
    # (prints the same string) and is required syntax on Python 3.
    while len(users_not_found) > 0 and num_passes > passnum:

        print("Pass {0}, attempting to find {1} users".format(
            passnum, len(users_not_found)))
        users_found_this_pass = []

        # Twitter's users/lookup endpoint accepts at most 100 ids per request
        for user_group in grouper(100, users_not_found, pad=False):
            user_list, return_code = call_with_error_handling(
                api.lookup_users, user_ids=user_group)
            if return_code == 130:
                print(".. Twitter over capacity. Sleeping {0} seconds".format(
                    CAPACITY_WAIT))
                time.sleep(CAPACITY_WAIT)
                continue
            elif return_code != 0:
                print(".. Error {0}. Continuing".format(return_code))
                continue

            for user in user_list:
                # Skip empty results or results without a raw json payload
                if not user or not user._json:
                    continue
                users_found_this_pass.append(str(user.id))

                userdoc = create_userdoc_from_twitter_user(user._json)
                try:
                    collection.save(userdoc)
                except DuplicateKeyError:
                    # Already stored by a previous pass/run; not an error
                    print(".. User {0}: already exists in DB. Skipping".format(
                        user.id))
                    continue

        # Remove found users from users not found list
        users_not_found = list(
            set(users_not_found) - set(users_found_this_pass))
        passnum += 1

    # Report and finish
    print("Total users not found: {0}".format(len(users_not_found)))
    if not_found_file and len(users_not_found) > 0:
        print("Writing IDs not found to file: {0}".format(not_found_file))
        with open(not_found_file, "w") as handle:
            for uid in users_not_found:
                handle.write("{0}\n".format(uid))
    print("Complete")
def populate_user_collection_from_ids(api, collection, user_ids, num_passes=2, not_found_file=None,
    sample=1.0):
    """
    Populates a collection (Pymongo collection object, fully connected and authenticated)
    with user data from the twitter REST API endpoint /users/show (removes 'status' - user's
    most recent tweet).
    Parameters:
        api         - Tweepy or smappPy TweepyPool API object, fully authenticated
        collection  - Pymongo collection object, fully connected and authenticated
        users       - Iterable of twitter user IDs to populate. Will pull totally into memory
        num_passes  - Number of retries on UIDs failing to come in the first time
        not_found_file - Filename to store all user IDs not found, line separated. If None, no output
        sample      - Proportion of users in user_ids list to populate, sampled randomly. Rounded DOWN
    """

    # Ensure standard userdoc indexes on collection
    ensure_userdoc_indexes(collection)

    # Set up list of users not yet retrieved from Twitter and passes counter
    users_not_found = list(set([str(i) for i in user_ids]))
    if sample < 1.0:
        users_not_found = random.sample(users_not_found, int(len(users_not_found) * sample))
    passnum = 0

    # User-fetching loop
    # NOTE: single-argument parenthesized print is identical on Python 2
    # (prints the same string) and is required syntax on Python 3.
    while len(users_not_found) > 0 and num_passes > passnum:

        print("Pass {0}, attempting to find {1} users".format(passnum, len(users_not_found)))
        users_found_this_pass = []

        # Twitter's users/lookup endpoint accepts at most 100 ids per request
        for user_group in grouper(100, users_not_found, pad=False):
            user_list, return_code = call_with_error_handling(api.lookup_users, user_ids=user_group)
            if return_code == 130:
                print(".. Twitter over capacity. Sleeping {0} seconds".format(CAPACITY_WAIT))
                time.sleep(CAPACITY_WAIT)
                continue
            elif return_code != 0:
                print(".. Error {0}. Continuing".format(return_code))
                continue

            for user in user_list:
                # Skip empty results or results without a raw json payload
                if not user or not user._json:
                    continue
                users_found_this_pass.append(str(user.id))

                userdoc = create_userdoc_from_twitter_user(user._json)
                try:
                    collection.save(userdoc)
                except DuplicateKeyError:
                    # Already stored by a previous pass/run; not an error
                    print(".. User {0}: already exists in DB. Skipping".format(user.id))
                    continue

        # Remove found users from users not found list
        users_not_found = list(set(users_not_found) - set(users_found_this_pass))
        passnum += 1

    # Report and finish
    print("Total users not found: {0}".format(len(users_not_found)))
    if not_found_file and len(users_not_found) > 0:
        print("Writing IDs not found to file: {0}".format(not_found_file))
        with open(not_found_file, "w") as handle:
            for uid in users_not_found:
                handle.write("{0}\n".format(uid))
    print("Complete")