def crawl(users_collection, edges_collection, user_ids, twitter_api, depth=1,
          percentage=1, sample_more=False, get_friends=False,
          get_followers=True):
    """
    For each user in `user_ids`, gets all followers_ids and friends_ids and
    stores the edges in db.
    If `get_followers` is True, it then also stores a `percentage` sample of
    each user's followers in the db.
    If `get_friends` is True, it then also stores a `percentage` sample of
    each user's friends in the db.
    Does this for `depth` recursive iterations for each user that is stored
    in the database. The last level of users stored in the database will have
    edges for their friends/followers, but those won't be sampled and fetched
    to db.
    """
    users, code = call_with_error_handling(ensure_users_in_db, user_ids,
                                           users_collection, twitter_api)
    if code != 0:
        # warning() is the supported spelling; warn() is deprecated
        logging.warning("Some error looking up some users, code {}".format(code))
        #TODO how should this be dealt with? does twitter return the OK ones or nothing?
        #IDEA users = ensure_users_one_by_one(user_ids, users_collection, twitter_api)

    for user in users:
        # Skip users already sampled unless the caller asked to re-sample.
        # .get() covers both "key missing" and "key falsy" in one lookup.
        if user.get('sampled_followers') and not sample_more:
            logging.info(".. already sampled this user's followers. moving on.")
            continue

        ids_tup, code = call_with_error_handling(ensure_users_edges_in_db,
                                                 user, edges_collection,
                                                 twitter_api)
        if code != 0:
            logging.warning(
                ".. Some problem getting user {0}'s followers. Maybe she's protected or something. Skipping."
                .format(user['id']))
            continue
        friends_ids, followers_ids = ids_tup

        # Mark sampled before recursing so re-entrant runs skip this user
        user['sampled_followers'] = True
        users_collection.save(user)

        # math.ceil guarantees at least one id is sampled from a non-empty
        # list whenever percentage > 0
        other_user_ids = []
        if get_friends:
            other_user_ids += random.sample(
                friends_ids, int(math.ceil(percentage * len(friends_ids))))
        if get_followers:
            other_user_ids += random.sample(
                followers_ids, int(math.ceil(percentage * len(followers_ids))))

        if depth > 0 and other_user_ids:
            crawl(users_collection, edges_collection, other_user_ids,
                  twitter_api, depth - 1, percentage, sample_more,
                  get_friends, get_followers)
def crawl(users_collection, edges_collection, user_ids, twitter_api, depth=1,
          percentage=1, sample_more=False, get_friends=False,
          get_followers=True):
    """
    For each user in `user_ids`, gets all followers_ids and friends_ids and
    stores the edges in db.
    If `get_followers` is True, it then also stores a `percentage` sample of
    each user's followers in the db.
    If `get_friends` is True, it then also stores a `percentage` sample of
    each user's friends in the db.
    Does this for `depth` recursive iterations for each user that is stored
    in the database. The last level of users stored in the database will have
    edges for their friends/followers, but those won't be sampled and fetched
    to db.
    """
    users, code = call_with_error_handling(ensure_users_in_db, user_ids,
                                           users_collection, twitter_api)
    if code != 0:
        # logging.warn is deprecated in favor of logging.warning
        logging.warning("Some error looking up some users, code {}".format(code))
        #TODO how should this be dealt with? does twitter return the OK ones or nothing?
        #IDEA users = ensure_users_one_by_one(user_ids, users_collection, twitter_api)

    for user in users:
        # Already-sampled users are skipped unless sample_more forces a redo
        if user.get('sampled_followers') and not sample_more:
            logging.info(".. already sampled this user's followers. moving on.")
            continue

        ids_tup, code = call_with_error_handling(
            ensure_users_edges_in_db, user, edges_collection, twitter_api)
        if code != 0:
            logging.warning(
                ".. Some problem getting user {0}'s followers. Maybe she's protected or something. Skipping."
                .format(user['id']))
            continue
        friends_ids, followers_ids = ids_tup

        # Record the sampled flag before recursing so retries skip this user
        user['sampled_followers'] = True
        users_collection.save(user)

        other_user_ids = []
        # ceil ensures at least one id is taken from a non-empty list when
        # percentage > 0
        if get_friends:
            other_user_ids += random.sample(
                friends_ids, int(math.ceil(percentage * len(friends_ids))))
        if get_followers:
            other_user_ids += random.sample(
                followers_ids, int(math.ceil(percentage * len(followers_ids))))

        if depth > 0 and other_user_ids:
            crawl(users_collection, edges_collection, other_user_ids,
                  twitter_api, depth - 1, percentage, sample_more,
                  get_friends, get_followers)
def get_followers_ids(api, user_id):
    """
    Given a Tweepy/smappPy TweepyPool api, query twitter's rest API for
    followers of given user_id. Returns IDs only (much faster / more per
    request).
    Parameters:
        api     - fully authenticated Tweepy api or smappPy TweepyPool api
        user_id - twitter user id
    Returns tuple: return code, list of IDs or None (if API call fails)
    """
    followers_cursor = Cursor(api.followers_ids, user_id=user_id)
    id_list, code = call_with_error_handling(list, followers_cursor.items())
    if code != 0:
        logger.warning("User {0}: Followers request failed".format(user_id))
    # id_list is the list fetched from the API, or None when the call failed
    # (call_with_error_handling returns None on failure)
    return code, id_list
def get_followers_ids(api, user_id):
    """
    Given a Tweepy/smappPy TweepyPool api, query twitter's rest API for
    followers of given user_id. Returns IDs only (much faster / more per
    request).
    Parameters:
        api     - fully authenticated Tweepy api or smappPy TweepyPool api
        user_id - twitter user id
    Returns tuple: return code, list of IDs or None (if API call fails)
    """
    # Exhaust the paginated follower-id cursor in one guarded call
    ids, status = call_with_error_handling(
        list, Cursor(api.followers_ids, user_id=user_id).items())
    if status != 0:
        logger.warning("User {0}: Followers request failed".format(user_id))
    # On failure call_with_error_handling yields None for the result, so the
    # caller receives (non-zero code, None)
    return status, ids
def populate_user_tweets(api, user_collection, tweet_collection,
                         tweets_per_user, ensure_indexes=True, requery=True,
                         update_threshold=None):
    """
    Iterates through user_collection, querying Twitter API for last
    'tweets_per_user' tweets. Considers last tweet fetched for each user.
    Updates user access time and last tweet fetched. Calculates and stores
    user tweet frequency.
    If requery is False, does not query for tweets of user that already has
    tweet ids in 'tweet_ids' field.
    If update_threshold is given, only users whose 'tweets_updated' field is
    older than the threshold or null are considered.
    """
    if ensure_indexes:
        logger.info("Ensuring indexes on tweet collection")
        create_tweet_indexes(tweet_collection)

    # Get DB cursor over users according to parameters
    if update_threshold:
        users = user_collection.find(
            {"$or": [
                {"tweets_updated": {"$lt": update_threshold}},
                {"tweets_updated": {"$type": BSON_NULL}}
            ]},
            no_cursor_timeout=True,)
        # Remove sort for now. Can not execute on field without index
        # sort=[("tweets_updated", ASCENDING)])
    else:
        users = user_collection.find(no_cursor_timeout=True,)
        # Remove sort (see above)
        # sort=[("tweets_updated", ASCENDING)])

    logger.info("Considering {0} users total".format(
        users.count(with_limit_and_skip=True)))

    # Iterate over users, attempting to fetch and store tweets for each
    for user in users:
        logger.info("Considering user {0}".format(user["id"]))

        # Check requery and user tweets. If requery False and user has
        # tweets, skip user
        if not requery and user["tweet_ids"]:
            logger.debug(".. User {0} has tweets, not re-querying".format(
                user["id"]))
            continue

        # Only fetch tweets newer than the last one already stored, if any
        if user["latest_tweet_id"]:
            cursor = tweepy.Cursor(api.user_timeline, user_id=user["id"],
                                   since_id=user["latest_tweet_id"],
                                   include_rts=True)
        else:
            cursor = tweepy.Cursor(api.user_timeline, user_id=user["id"],
                                   include_rts=True)

        # While return is error, keep trying to get tweets depending on error
        # type. If error not well-understood, move on to next user
        return_code = -1
        while return_code != 0:
            tweets, return_code = call_with_error_handling(
                list, cursor.items(tweets_per_user))

            # User no longer exists. Move on
            if return_code == 34:
                # warning() is the non-deprecated spelling of warn()
                logger.warning(".. User {0} no longer exists, skipping".format(
                    user["id"]))
                break
            elif return_code == 179:
                logger.warning(
                    ".. User {0}'s account is private, skipping".format(
                        user["id"]))
                break
            elif return_code != 0:
                logger.warning(".. Error {0} for user {1}, skipping".format(
                    return_code, user["id"]))
                break

        # Do a final check of tweet population. If None, there was an error
        # that waiting and retrying could not fix. If tweets is merely an
        # empty list, still want to update user's 'updated_timestamp' field.
        # (identity check: None comparisons should use 'is', not '==')
        if tweets is None:
            continue

        # Reverse tweets when storing (given order is newest to oldest)
        saved_tweet_ids = []
        for tweet in tweets[::-1]:
            saved_id = save_tweet(tweet_collection, tweet)
            if saved_id:
                saved_tweet_ids.append(saved_id)

        # Calculate frequency as tweets per day over the fetched window;
        # 'or 1' avoids division by zero when all tweets are from one day
        if len(tweets) < 2:
            frequency = 0
        else:
            first_tweet_date = tweets[-1].created_at
            last_tweet_date = tweets[0].created_at
            frequency = len(tweets) / float(
                (last_tweet_date - first_tweet_date).days or 1)

        latest_tweet_id = tweets[0].id if tweets else None
        update_user(user_collection, user, latest_tweet_id, frequency,
                    saved_tweet_ids)
        logger.info(".. {0} tweets found, {1} saved".format(
            len(tweets), len(saved_tweet_ids)))
def populate_user_tweets(api, user_collection, tweet_collection,
                         tweets_per_user, ensure_indexes=True, requery=True,
                         update_threshold=None):
    """
    Iterates through user_collection, querying Twitter API for last
    'tweets_per_user' tweets. Considers last tweet fetched for each user.
    Updates user access time and last tweet fetched. Calculates and stores
    user tweet frequency.
    If requery is False, does not query for tweets of user that already has
    tweet ids in 'tweet_ids' field.
    If update_threshold is given, only users whose 'tweets_updated' field is
    older than the threshold or null are considered.
    """
    if ensure_indexes:
        logger.info("Ensuring indexes on tweet collection")
        create_tweet_indexes(tweet_collection)

    # Get DB cursor over users according to parameters
    if update_threshold:
        users = user_collection.find(
            {
                "$or": [{
                    "tweets_updated": {"$lt": update_threshold}
                }, {
                    "tweets_updated": {"$type": BSON_NULL}
                }]
            },
            no_cursor_timeout=True,
        )
        # Remove sort for now. Can not execute on field without index
        # sort=[("tweets_updated", ASCENDING)])
    else:
        users = user_collection.find(no_cursor_timeout=True, )
        # Remove sort (see above)
        # sort=[("tweets_updated", ASCENDING)])

    logger.info("Considering {0} users total".format(
        users.count(with_limit_and_skip=True)))

    # Iterate over users, attempting to fetch and store tweets for each
    for user in users:
        logger.info("Considering user {0}".format(user["id"]))

        # Check requery and user tweets. If requery False and user has
        # tweets, skip user
        if not requery and user["tweet_ids"]:
            logger.debug(".. User {0} has tweets, not re-querying".format(
                user["id"]))
            continue

        # Resume from the last stored tweet when one is recorded
        if user["latest_tweet_id"]:
            cursor = tweepy.Cursor(api.user_timeline,
                                   user_id=user["id"],
                                   since_id=user["latest_tweet_id"],
                                   include_rts=True)
        else:
            cursor = tweepy.Cursor(api.user_timeline,
                                   user_id=user["id"],
                                   include_rts=True)

        # While return is error, keep trying to get tweets depending on error
        # type. If error not well-understood, move on to next user
        return_code = -1
        while return_code != 0:
            tweets, return_code = call_with_error_handling(
                list, cursor.items(tweets_per_user))

            # User no longer exists. Move on
            if return_code == 34:
                # logger.warn is deprecated; warning() is the supported name
                logger.warning(".. User {0} no longer exists, skipping".format(
                    user["id"]))
                break
            elif return_code == 179:
                logger.warning(
                    ".. User {0}'s account is private, skipping".format(
                        user["id"]))
                break
            elif return_code != 0:
                logger.warning(".. Error {0} for user {1}, skipping".format(
                    return_code, user["id"]))
                break

        # Do a final check of tweet population. If None, there was an error
        # that waiting and retrying could not fix. If tweets is merely an
        # empty list, still want to update user's 'updated_timestamp' field.
        # None must be compared with 'is', not '=='
        if tweets is None:
            continue

        # Reverse tweets when storing (given order is newest to oldest)
        saved_tweet_ids = []
        for tweet in tweets[::-1]:
            saved_id = save_tweet(tweet_collection, tweet)
            if saved_id:
                saved_tweet_ids.append(saved_id)

        # Calculate frequency (tweets per day across fetched window);
        # 'or 1' guards against a zero-day window
        if len(tweets) < 2:
            frequency = 0
        else:
            first_tweet_date = tweets[-1].created_at
            last_tweet_date = tweets[0].created_at
            frequency = len(tweets) / float(
                (last_tweet_date - first_tweet_date).days or 1)

        latest_tweet_id = tweets[0].id if tweets else None
        update_user(user_collection, user, latest_tweet_id, frequency,
                    saved_tweet_ids)
        logger.info(".. {0} tweets found, {1} saved".format(
            len(tweets), len(saved_tweet_ids)))
def populate_user_collection_from_ids(api, collection, user_ids, num_passes=2, not_found_file=None, sample=1.0): """ Populates a collection (Pymongo collection object, fully connected and authenticated) with user data from the twitter REST API endpoint /users/show (removes 'status' - user's most recent tweet). Parameters: api - Tweepy or smappPy TweepyPool API object, fully authenticated collection - Pymongo collection object, fully connected and authenticated users - Iterable of twitter user IDs to populate. Will pull totally into memory num_passes - Number of retries on UIDs failing to come in the first time not_found_file - Filename to store all user IDs not found, line separated. If None, no output sample - Proportion of users in user_ids list to populate, sampled randomly. Rounded DOWN """ # Ensure standard userdoc indexes on collection ensure_userdoc_indexes(collection) # Set up list of users not yet retrieved from Twitter and passes counter users_not_found = list(set([str(i) for i in user_ids])) if sample < 1.0: users_not_found = random.sample(users_not_found, int(len(users_not_found) * sample)) passnum = 0 # User-fetching loop while len(users_not_found) > 0 and num_passes > passnum: print "Pass {0}, attempting to find {1} users".format( passnum, len(users_not_found)) users_found_this_pass = [] for user_group in grouper(100, users_not_found, pad=False): user_list, return_code = call_with_error_handling( api.lookup_users, user_ids=user_group) if return_code == 130: print ".. Twitter over capacity. Sleeping {0} seconds".format( CAPACITY_WAIT) time.sleep(CAPACITY_WAIT) continue elif return_code != 0: print ".. Error {0}. Continuing".format(return_code) continue for user in user_list: if not user or not user._json: continue users_found_this_pass.append(str(user.id)) userdoc = create_userdoc_from_twitter_user(user._json) try: collection.save(userdoc) except DuplicateKeyError as e: print ".. User {0}: already exists in DB. 
Skipping".format( user.id) continue # Remove found users from users not found list users_not_found = list( set(users_not_found) - set(users_found_this_pass)) passnum += 1 # Report and finish print "Total users not found: {0}".format(len(users_not_found)) if not_found_file and len(users_not_found) > 0: print "Writing IDs not found to file: {0}".format(not_found_file) with open(not_found_file, "w") as handle: for uid in users_not_found: handle.write("{0}\n".format(uid)) print "Complete"
def populate_user_collection_from_ids(api, collection, user_ids, num_passes=2, not_found_file=None, sample=1.0): """ Populates a collection (Pymongo collection object, fully connected and authenticated) with user data from the twitter REST API endpoint /users/show (removes 'status' - user's most recent tweet). Parameters: api - Tweepy or smappPy TweepyPool API object, fully authenticated collection - Pymongo collection object, fully connected and authenticated users - Iterable of twitter user IDs to populate. Will pull totally into memory num_passes - Number of retries on UIDs failing to come in the first time not_found_file - Filename to store all user IDs not found, line separated. If None, no output sample - Proportion of users in user_ids list to populate, sampled randomly. Rounded DOWN """ # Ensure standard userdoc indexes on collection ensure_userdoc_indexes(collection) # Set up list of users not yet retrieved from Twitter and passes counter users_not_found = list(set([str(i) for i in user_ids])) if sample < 1.0: users_not_found = random.sample(users_not_found, int(len(users_not_found) * sample)) passnum = 0 # User-fetching loop while len(users_not_found) > 0 and num_passes > passnum: print "Pass {0}, attempting to find {1} users".format(passnum, len(users_not_found)) users_found_this_pass = [] for user_group in grouper(100, users_not_found, pad=False): user_list, return_code = call_with_error_handling(api.lookup_users, user_ids=user_group) if return_code == 130: print ".. Twitter over capacity. Sleeping {0} seconds".format(CAPACITY_WAIT) time.sleep(CAPACITY_WAIT) continue elif return_code != 0: print ".. Error {0}. Continuing".format(return_code) continue for user in user_list: if not user or not user._json: continue users_found_this_pass.append(str(user.id)) userdoc = create_userdoc_from_twitter_user(user._json) try: collection.save(userdoc) except DuplicateKeyError as e: print ".. User {0}: already exists in DB. 
Skipping".format(user.id) continue # Remove found users from users not found list users_not_found = list(set(users_not_found) - set(users_found_this_pass)) passnum += 1 # Report and finish print "Total users not found: {0}".format(len(users_not_found)) if not_found_file and len(users_not_found) > 0: print "Writing IDs not found to file: {0}".format(not_found_file) with open(not_found_file, "w") as handle: for uid in users_not_found: handle.write("{0}\n".format(uid)) print "Complete"