def snowball_following(poi_db, net_db, level, check='N'):
    """Crawl the followings of one batch of unprocessed users at ``level``.

    Selects up to 200 documents in ``poi_db`` whose ``level`` equals
    ``level``, whose ``protected`` is False and that have no
    ``following_scrape_flag`` field. For each of them the followee ids are
    fetched page by page through ``get_followings`` and resolved to profiles
    100 ids at a time through ``lookup.get_users_info``. Every profile
    accepted by ``profiles_check.check_user(profile, check)`` is

    * tagged with ``following_prelevel_node`` (the id of the user it was
      reached from) and ``level + 1``, then inserted into ``poi_db``;
    * recorded in ``net_db`` as ``{'user': followee, 'follower': user,
      'scraped_at': now}``.

    Duplicate-key errors on either insert are ignored. After a user's pages
    are exhausted (or a page request returns nothing) the user document is
    updated with ``following_scrape_flag: True``.

    Args:
        poi_db: collection holding the user profiles.
        net_db: collection holding the follow edges.
        level: snowball level whose users should be expanded.
        check: passed through unchanged to ``profiles_check.check_user``.

    Returns:
        False if no unprocessed user exists at ``level``; True after one
        batch has been processed.
    """
    # NOTE(review): the original text was whitespace-collapsed. It wrapped
    # this body in ``while True`` but always returned during the first pass
    # (assuming the trailing ``return True`` belongs to the batch branch), so
    # the loop has been dropped.
    start_level = level
    pending_query = {
        'level': start_level,
        'protected': False,
        'following_scrape_flag': {'$exists': False},
    }
    count = poi_db.count(pending_query)
    if count == 0:
        return False

    for user in poi_db.find(pending_query, ['id_str']).limit(min(200, count)):
        next_cursor = -1
        params = {'user_id': user['id_str'], 'count': 5000, 'stringify_ids': True}
        # A cursor value of 0 marks the last page of followee ids.
        while next_cursor != 0:
            params['cursor'] = next_cursor
            followees = get_followings(params)
            if not followees:
                # Nothing usable came back; stop paging for this user.
                break
            followee_ids = followees['ids']
            list_size = len(followee_ids)
            # Single-argument print(...) emits the same text as the Python 2
            # print statement and also parses under Python 3.
            print('%s Process followings %s for user %s' % (
                datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"),
                list_size, user['id_str']))
            # Profiles are looked up in slices of at most 100 ids.
            for index_begin in range(0, list_size, 100):
                index_end = min(list_size, index_begin + 100)
                profiles = lookup.get_users_info(followee_ids[index_begin:index_end])
                if not profiles:
                    # Same guard the sibling snowball functions use: without
                    # it len()/iteration fails when the lookup yields nothing.
                    continue
                print('user prof: %s %s %s' % (index_begin, index_end, len(profiles)))
                for profile in profiles:
                    if not profiles_check.check_user(profile, check):
                        continue
                    profile['following_prelevel_node'] = user['id_str']
                    profile['level'] = start_level + 1
                    try:
                        poi_db.insert(profile)
                    except pymongo.errors.DuplicateKeyError:
                        # Profile already stored; keep the existing document.
                        pass
                    try:
                        net_db.insert({
                            'user': int(profile['id_str']),
                            'follower': int(user['id_str']),
                            'scraped_at': datetime.datetime.now(),
                        })
                    except pymongo.errors.DuplicateKeyError:
                        # Edge already recorded.
                        pass
            # Prepare for the next page.
            next_cursor = followees['next_cursor']
        # Mark the user as done so the next call does not select it again.
        poi_db.update_one({'id': int(user['id_str'])},
                          {'$set': {"following_scrape_flag": True}},
                          upsert=False)
    return True
def get_tweet_retweeters(tweet_id, poi_db, check='N'):
    """Store the profiles of the users who retweeted ``tweet_id``.

    Retweeter ids are fetched page by page through ``get_retweeters``,
    resolved to profiles with ``lookup.get_users_info`` and every profile
    accepted by ``profiles_check.check_user(profile, check)`` is inserted
    into ``poi_db``. Duplicate-key errors on insert are ignored.

    Args:
        tweet_id: id of the tweet whose retweeters are collected.
        poi_db: collection receiving the accepted profiles.
        check: passed through unchanged to ``profiles_check.check_user``.

    Returns:
        None.
    """
    next_cursor = -1
    params = {'id': tweet_id, 'stringify_ids': True}
    # A cursor value of 0 marks the last page of retweeter ids.
    while next_cursor != 0:
        params['cursor'] = next_cursor
        retweeters = get_retweeters(params)
        if not retweeters:
            # Without this exit the cursor would never advance and the loop
            # would repeat the same request forever; the snowball functions
            # in this file break out in the same situation.
            break
        retweeter_ids = retweeters['ids']
        # Single-argument print(...) emits the same text as the Python 2
        # print statement and also parses under Python 3.
        print('Retweeters size %s' % len(retweeter_ids))
        profiles = lookup.get_users_info(retweeter_ids)
        if profiles:
            for profile in profiles:
                if profiles_check.check_user(profile, check):
                    try:
                        poi_db.insert(profile)
                    except pymongo.errors.DuplicateKeyError:
                        # Profile already stored; keep the existing document.
                        pass
        # Prepare for the next page.
        next_cursor = retweeters['next_cursor']
def snowball_follower(poi_db, net_db, level, check='N'):
    """Crawl the followers of every unprocessed user at ``level``.

    Works in batches of at most 200 users taken from ``poi_db`` (documents
    with the given ``level``, ``protected`` False and no
    ``follower_scrape_flag`` field) and keeps taking batches until none is
    left. For each user the follower ids are paged through ``get_followers``
    and resolved 100 at a time with ``lookup.get_users_info``. Profiles
    accepted by ``profiles_check.check_user(profile, check)`` are tagged with
    ``follower_prelevel_node`` and ``level + 1`` and inserted into
    ``poi_db``; the edge ``{'user': user, 'follower': profile}`` is inserted
    into ``net_db``. Duplicate-key errors are ignored. Each processed user
    gets ``follower_scrape_flag: True``.

    Args:
        poi_db: collection holding the user profiles.
        net_db: collection holding the follow edges.
        level: snowball level whose users should be expanded.
        check: passed through unchanged to ``profiles_check.check_user``.

    Returns:
        False, once no unprocessed user remains at ``level``.
    """
    start_level = level
    unscraped = {
        'level': start_level,
        'protected': False,
        'follower_scrape_flag': {'$exists': False},
    }
    while True:
        # Stop as soon as no candidate is left at this level.
        if poi_db.find_one(unscraped) is None:
            return False

        for user in poi_db.find(unscraped, ['id_str']).limit(200):
            params = {
                'user_id': user['id_str'],
                'count': 5000,
                'stringify_ids': True,
            }
            next_cursor = -1
            # A cursor value of 0 marks the last page of follower ids.
            while next_cursor != 0:
                params['cursor'] = next_cursor
                followers = get_followers(params)
                if not followers:
                    break
                follower_ids = followers['ids']
                total = len(follower_ids)
                chunk_count = int(math.ceil(total / 100.0))
                stamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
                # Single-argument print(...) emits the same text as the
                # Python 2 print statement with comma-separated items.
                print('%s %s %s %s %s' % (
                    stamp, 'Process followers', total, 'for user',
                    user['id_str']))
                for chunk_no in range(chunk_count):
                    lower = chunk_no * 100
                    upper = min(total, lower + 100)
                    profiles = lookup.get_users_info(follower_ids[lower:upper])
                    if not profiles:
                        continue
                    print('%s %s %s %s' % (
                        'user prof:', lower, upper, len(profiles)))
                    for profile in profiles:
                        if not profiles_check.check_user(profile, check):
                            continue
                        profile['follower_prelevel_node'] = user['id_str']
                        profile['level'] = start_level + 1
                        try:
                            poi_db.insert(profile)
                        except pymongo.errors.DuplicateKeyError:
                            pass
                        # NOTE(review): scraped_at is a formatted string
                        # built from local time with a literal "+0000";
                        # the other snowball functions store a datetime
                        # object. Confirm which form readers expect.
                        try:
                            net_db.insert({
                                'user': int(user['id_str']),
                                'follower': int(profile['id_str']),
                                'scraped_at': datetime.datetime.now().strftime(
                                    '%a %b %d %H:%M:%S +0000 %Y'),
                            })
                        except pymongo.errors.DuplicateKeyError:
                            pass
                # Prepare for the next page.
                next_cursor = followers['next_cursor']
            # Flag the user so the next batch query skips it.
            poi_db.update_one({'id': int(user['id_str'])},
                              {'$set': {"follower_scrape_flag": True}},
                              upsert=False)
def snowball_following_proportion(poi_db,
                                  net_db,
                                  level,
                                  check='N',
                                  proportation=0.1):
    """Crawl a sampled share of the followings for one batch of users.

    Selects up to 200 documents in ``poi_db`` with the given ``level``,
    ``protected`` False and no ``following_scrape_flag`` field. For each
    user at most ``int(friends_count * proportation)`` followee ids are
    requested through ``get_followings`` (never more than 5000 per page) and
    resolved 100 at a time with ``lookup.get_users_info``. A profile accepted
    by ``profiles_check.check_user(profile, check)`` is kept with probability
    ``(user friends_count + 1) / (profile followers_count + 1)``; a kept
    profile is tagged with ``following_prelevel_node`` and ``level + 1`` and
    inserted into ``poi_db``, and the edge ``{'user': profile, 'follower':
    user}`` is inserted into ``net_db``. Duplicate-key errors are ignored.
    Each processed user gets ``following_scrape_flag: True``.

    Args:
        poi_db: collection holding the user profiles.
        net_db: collection holding the follow edges.
        level: snowball level whose users should be expanded.
        check: passed through unchanged to ``profiles_check.check_user``.
        proportation: fraction of each user's ``friends_count`` to retrieve.

    Returns:
        False if no unprocessed user exists at ``level``; True after one
        batch has been processed.
    """
    # NOTE(review): reconstructed from whitespace-collapsed text. Two
    # assumptions to confirm: (1) the trailing ``return True`` ends the call
    # after one batch, so the original ``while True`` never iterated twice
    # and is written here as straight-line code; (2) both inserts sit under
    # the random-sampling condition.
    start_level = level
    unscraped = {
        'level': start_level,
        'protected': False,
        'following_scrape_flag': {'$exists': False},
    }
    if poi_db.find_one(unscraped) is None:
        return False

    batch = poi_db.find(unscraped, ['id_str', 'friends_count']).limit(200)
    for user in batch:
        # Remaining number of followee ids still to request for this user.
        following_limit = int(user['friends_count'] * proportation)
        next_cursor = -1
        params = {'user_id': user['id_str'], 'stringify_ids': True}
        while next_cursor != 0 and following_limit > 0:
            params['cursor'] = next_cursor
            # Single-argument print(...) emits the same text as the Python 2
            # print statement with comma-separated items.
            print('%s %s %s' % (
                user['id_str'], ' following limit ', following_limit))
            params['count'] = min(following_limit, 5000)
            followees = get_followings(params)
            if not followees:
                break
            followee_ids = followees['ids']
            total = len(followee_ids)
            following_limit -= total
            chunk_count = int(math.ceil(total / 100.0))
            stamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
            print('%s %s %s %s %s' % (
                stamp, 'Process followings', total, 'for user',
                user['id_str']))
            for chunk_no in range(chunk_count):
                lower = chunk_no * 100
                upper = min(total, lower + 100)
                profiles = lookup.get_users_info(followee_ids[lower:upper])
                if not profiles:
                    continue
                print('%s %s %s %s' % (
                    'user prof:', lower, upper, len(profiles)))
                for profile in profiles:
                    if not profiles_check.check_user(profile, check):
                        continue
                    profile['following_prelevel_node'] = user['id_str']
                    profile['level'] = start_level + 1
                    keep_chance = float(user['friends_count'] + 1) / (
                        profile['followers_count'] + 1)
                    draw = random.uniform(0, 1)
                    if draw > keep_chance:
                        continue
                    try:
                        poi_db.insert(profile)
                    except pymongo.errors.DuplicateKeyError:
                        pass
                    try:
                        net_db.insert({
                            'user': int(profile['id_str']),
                            'follower': int(user['id_str']),
                            'scraped_at': datetime.datetime.now(),
                        })
                    except pymongo.errors.DuplicateKeyError:
                        pass
            # Prepare for the next page.
            next_cursor = followees['next_cursor']
        # Flag the user so the next call does not select it again.
        poi_db.update_one({'id': int(user['id_str'])},
                          {'$set': {"following_scrape_flag": True}},
                          upsert=False)
    return True