def crawl_person_infos(uid):
    """
    Fetch a user's profile and schedule the crawl of their fans and followers.

    Because of limits in weibo's backend, only 5 pages of fans and followers
    can be crawled, and the fans/followers of enterprise accounts cannot be
    viewed at all.

    :param uid: current user id; a falsy value makes the call a no-op
    :return: None
    """
    if not uid:
        return

    try:
        profile, already_crawled = get_profile(uid)

        if profile and profile.verify_type == 2:
            # Enterprise account: nothing to crawl, just flag the seed as done.
            SeedidsOper.set_seed_other_crawled(uid)
        elif not already_crawled:
            # Hand the fans/followers crawl over to its dedicated queue.
            app.send_task(
                'tasks.user.crawl_follower_fans',
                args=(uid, ),
                queue='fans_followers',
                routing_key='for_fans_followers',
            )
    # Starting celery with '--soft-time-limit secs' makes this handler put the
    # task back on the broker, e.g.
    # celery -A tasks.workers -Q user_crawler worker -l info -c 1 --soft-time-limit 10
    except SoftTimeLimitExceeded:
        crawler.error(
            "user SoftTimeLimitExceeded    uid={uid}".format(uid=uid))
        app.send_task(
            'tasks.user.crawl_person_infos',
            args=(uid, ),
            queue='user_crawler',
            routing_key='for_user_info',
        )
def crawl_person_infos(uid):
    """
    Crawl one user's info and queue the crawl of their fans and followers.

    Weibo's backend limits us to 5 pages of fans and followers, and we have
    no permission to view an enterprise account's followers and fans.

    :param uid: current user id; nothing happens when it is falsy
    :return: None
    """
    if uid:
        try:
            user_profile, profile_was_crawled = get_profile(uid)
            is_enterprise = user_profile and user_profile.verify_type == 2

            if is_enterprise:
                # Enterprise users are skipped, only their seed flag is set.
                SeedidsOper.set_seed_other_crawled(uid)
                return

            if not profile_was_crawled:
                # The fans/followers crawl runs as a separate task.
                app.send_task('tasks.user.crawl_follower_fans',
                              args=(uid,),
                              queue='fans_followers',
                              routing_key='for_fans_followers')
        # With '--soft-time-limit secs' on the celery command line the task is
        # sent to the broker again when the limit is hit, e.g.
        # celery -A tasks.workers -Q user_crawler worker -l info -c 1 --soft-time-limit 10
        except SoftTimeLimitExceeded:
            timeout_message = "user SoftTimeLimitExceeded    uid={uid}".format(uid=uid)
            crawler.error(timeout_message)
            app.send_task('tasks.user.crawl_person_infos',
                          args=(uid, ),
                          queue='user_crawler',
                          routing_key='for_user_info')
def crawl_follower_fans(uid):
    """
    Collect the fan and follower ids of a user and store them as new seeds.

    Work is only done while the user's seed has ``other_crawled == 0``;
    afterwards the seed is flagged through ``set_seed_other_crawled``.

    :param uid: id of the user whose fans and followers are collected
    :return: None
    """
    seed = SeedidsOper.get_seed_by_id(uid)
    # NOTE(review): seed is dereferenced without a None check -- confirm that
    # get_seed_by_id always returns a record for the ids passed in here.
    if seed.other_crawled != 0:
        return

    # presumably type 1 selects fans and type 2 followers -- TODO confirm
    related_ids = get_fans_or_followers_ids(uid, 1, 1)
    related_ids.extend(get_fans_or_followers_ids(uid, 2, 1))

    unique_ids = set(related_ids)
    # Nothing to insert when no ids were collected.
    if unique_ids:
        SeedidsOper.insert_seeds(unique_ids)

    SeedidsOper.set_seed_other_crawled(uid)
def crawl_follower_fans(uid):
    """
    Gather a user's fan and follower ids and insert them as seeds.

    The crawl runs only if the seed's ``other_crawled`` is still 0; once it
    has run, the seed is marked via ``set_seed_other_crawled``.

    :param uid: id of the user to process
    :return: None
    """
    seed_record = SeedidsOper.get_seed_by_id(uid)
    # NOTE(review): assumes a seed record exists for uid -- verify against
    # the callers, a missing record would fail on the attribute access below.
    needs_crawl = seed_record.other_crawled == 0

    if needs_crawl:
        # presumably 1 = fans, 2 = followers -- TODO confirm in the helper
        first_batch = get_fans_or_followers_ids(uid, 1, 1)
        second_batch = get_fans_or_followers_ids(uid, 2, 1)
        first_batch.extend(second_batch)

        # De-duplicate ids that show up in both batches.
        new_seed_ids = set(first_batch)
        if new_seed_ids:
            SeedidsOper.insert_seeds(new_seed_ids)

        SeedidsOper.set_seed_other_crawled(uid)
def crawl_follower_fans(uid):
    """
    Fetch the profiles of a user's fans and followers, then mark the user done.

    Enterprise accounts (``verify_type == 2``) are only flagged as crawled.
    For every other user the fan and follower ids are collected,
    de-duplicated, and ``get_profile`` is called for each of them before the
    user's own seed is flagged through ``set_seed_other_crawled``.

    :param uid: id of the user whose fans and followers are crawled
    :return: None
    """
    # Only the profile is needed here; the "already crawled" flag is unused.
    user, _ = get_profile(uid)
    if user and user.verify_type == 2:
        SeedidsOper.set_seed_other_crawled(uid)
        return

    # presumably type 1 selects fans and type 2 followers -- TODO confirm
    rs = get_fans_or_followers_ids(uid, 1, 1)
    rs.extend(get_fans_or_followers_ids(uid, 2, 1))
    datas = set(rs)

    # Use a distinct loop variable: iterating with `uid` would rebind the
    # parameter, and the final call below would then flag the last
    # fan/follower instead of the user this task was started for.
    for related_uid in datas:
        get_profile(related_uid)

    # NOTE(review): seed insertion was already disabled in the original code;
    # it is left here, disabled, for reference.
    # if datas:
    #     SeedidsOper.insert_seeds(datas)

    SeedidsOper.set_seed_other_crawled(uid)
def crawl_person_infos(uid):
    """
    Fetch a user's profile and, if needed, crawl their fans and followers.

    Enterprise accounts (``verify_type == 2``) are only flagged as crawled.
    When a ``SoftTimeLimitExceeded`` is raised the error is logged and the
    whole crawl is started again for the same uid.

    :param uid: current user id; a falsy value makes the call a no-op
    :return: None
    """
    if not uid:
        return

    try:
        profile, already_crawled = get_profile(uid)

        if profile and profile.verify_type == 2:
            # Enterprise user: skip the crawl, just set the seed flag.
            SeedidsOper.set_seed_other_crawled(uid)
        elif not already_crawled:
            # Fans and followers are crawled inline, inside the same try.
            crawl_follower_fans(uid)
    except SoftTimeLimitExceeded:
        timeout_message = "user SoftTimeLimitExceeded    uid={uid}".format(uid=uid)
        crawler.error(timeout_message)
        # Retry by calling ourselves again with the same uid.
        crawl_person_infos(uid)
def crawl_person_infos(uid):
    """
    Crawl a user's info and queue the crawl of their fans and followers.

    Due to limits of weibo's backend only 5 pages of fans and followers can
    be crawled, and enterprise accounts do not expose their followers and
    fans to us at all.

    :param uid: current user id; nothing is done when it is falsy
    :return: None
    """
    if not uid:
        return

    profile, already_crawled = get_profile(uid)

    is_enterprise = profile and profile.verify_type == 2
    if is_enterprise:
        # Enterprise user: only the seed flag is updated.
        SeedidsOper.set_seed_other_crawled(uid)
        return

    if already_crawled:
        return

    # Fans and followers are crawled by a separate task on its own queue.
    app.send_task(
        'tasks.user.crawl_follower_fans',
        args=(uid,),
        queue='fans_followers',
        routing_key='for_fans_followers',
    )