class CheckQualificationByPromotion():
    """Promote candidate ("pre") users to users based on their promotions page.

    Candidates are read through ``RedisClient``; for each one the promotions
    payload is fetched through ``GetRawData`` and, when it lists more than 10
    promotions, the candidate is recorded with ``RedisClient.add_users``.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Upper bound on the number of candidates checked per run() call.
        self.batch_size = 50

    def get_pre_users(self):
        """Collect up to ``batch_size`` candidates that are not users yet.

        Returns:
            list: candidate ids. The list may be shorter than ``batch_size``
            (or empty) when the store stops yielding candidates.
        """
        batch = []
        while len(batch) < self.batch_size:
            pre_user = self.redis_client.get_pre_users()
            if pre_user is None:
                # NOTE(review): assumes RedisClient.get_pre_users() returns
                # None once no candidate is left -- confirm against the
                # client. Without this guard the loop either never ends or
                # fills the batch with None entries.
                break
            # Keep only candidates that are not already in the user table.
            if not self.is_user(pre_user):
                batch.append(pre_user)
        return batch

    def count_pre_users(self):
        """Return the number of pending candidates reported by the client."""
        return self.redis_client.count_pre_users()

    def is_user(self, sec_user_id):
        """Return whether ``sec_user_id`` is already recorded as a user."""
        return self.redis_client.is_user(sec_user_id)

    def check_qualification_by_promotion(self, sec_user_id):
        """Fetch the promotions of ``sec_user_id`` and record it if qualified.

        A candidate qualifies when the payload really is a promotions page
        (it has a first ``columns`` entry) and lists more than 10 promotions.
        Failures are logged and swallowed so that one bad candidate does not
        abort the rest of the batch.

        Returns:
            None, always.
        """
        try:
            raw_data = self.get_raw_data.get_promotions(sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or hold a
            # non-string object, which made the handler itself raise.
            logger.error('get_promotions出错-' + str(e)
                         + '-sec_user_id-' + str(sec_user_id))
            return None
        try:
            # Probe only: raises when the payload is not a real page.
            raw_data.get('columns')[0].get('name')
            promotions = raw_data.get('promotions')
            # The page was really fetched and has more than 10 promotions.
            if len(promotions) > 10:
                self.redis_client.add_users(sec_user_id)
        except Exception as e:
            logger.error('解析promotions页面失败-sec_user_id-'
                         + str(sec_user_id) + '-' + str(e))
        return None

    def run(self):
        """Check one batch of candidates concurrently, or exit when none remain.

        Raises:
            SystemExit: when the client reports no pending candidates.
        """
        if self.count_pre_users() > 0:
            batch = self.get_pre_users()
            tasks = [
                gevent.spawn(self.check_qualification_by_promotion, sec_user_id)
                for sec_user_id in batch
            ]
            gevent.joinall(tasks)
        else:
            logger.info('pre_users列表空了,程序退出')
            sys.exit()
class GetPreUsers():
    """Harvest candidate ("pre") users from the following lists of seed users.

    Seed ids are read through ``RedisClient.get_following``; each seed's
    following list is fetched through ``GetRawData.get_following``, filtered
    with ``is_qualified_user`` and the surviving ``sec_uid`` values are written
    back as both new seeds and new candidates, subject to the size caps below.
    """

    # Caps on the two stores; nothing is added once a cap is reached.
    MAX_FOLLOWING = 100000
    MAX_PRE_USERS = 500000
    # A non-empty ``custom_verify`` must contain one of these to be accepted.
    ALLOWED_VERIFY_MARKERS = ('自媒体', '主播', '视频')

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Number of seed users processed per run() call.
        self.batch_size = 10
        # Qualified sec_uid values gathered during the current run().
        self.pre_user_list = []
        # Nickname substrings that disqualify a user.
        self.stupid_key_words = STUPID_KEY_WORDS

    def count_following(self):
        """Return the number of seed users reported by the client."""
        return self.redis_client.count_following()

    def count_pre_users(self):
        """Return the number of candidates reported by the client."""
        return self.redis_client.count_pre_users()

    def get_following(self):
        """Read up to ``batch_size`` seed ids from the client.

        Returns:
            list: the ids obtained; ``None`` results are dropped so workers
            are never spawned for a missing id.
        """
        batch = []
        for _ in range(self.batch_size):
            sec_user_id = self.redis_client.get_following()
            # NOTE(review): assumes the client signals "nothing available"
            # with None -- confirm against RedisClient.
            if sec_user_id is not None:
                batch.append(sec_user_id)
        return batch

    def add_following_and_pre_users(self):
        """Store every gathered id as a seed and as a candidate, within caps.

        The counts are re-read for every id because each insertion may change
        them.
        """
        for pre_user in self.pre_user_list:
            if self.count_following() < self.MAX_FOLLOWING:
                self.redis_client.add_following(pre_user)
            if self.count_pre_users() < self.MAX_PRE_USERS:
                self.redis_client.add_pre_users(pre_user)

    def is_qualified_user(self, user):
        """Return True when ``user`` passes all candidate filters.

        A user is rejected when ``is_gov_media_vip`` or
        ``enterprise_verify_reason`` is truthy, when a non-empty
        ``custom_verify`` contains none of ``ALLOWED_VERIFY_MARKERS``, or when
        the nickname contains one of ``stupid_key_words``.

        Args:
            user: mapping describing one entry of a following list.
        """
        if user.get('is_gov_media_vip'):
            return False
        if user.get('enterprise_verify_reason'):
            return False
        custom_verify = user.get('custom_verify')
        if custom_verify and not any(
                marker in custom_verify
                for marker in self.ALLOWED_VERIFY_MARKERS):
            return False
        # A missing/None nickname used to raise TypeError on the `in` test;
        # treat it as an empty nickname, which matches no keyword.
        nickname = user.get('nickname') or ''
        return not any(word in nickname for word in self.stupid_key_words)

    def get_pre_users(self, sec_user_id):
        """Fetch the following list of ``sec_user_id`` and keep qualified ids.

        Qualified ``sec_uid`` values are appended to ``pre_user_list``. Fetch
        errors and a ``status_code`` of 2096 (logged as "following list not
        visible") are logged and skipped.

        Returns:
            None, always.
        """
        try:
            raw_data = self.get_raw_data.get_following(sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or hold a
            # non-string object, which made the handler itself raise.
            logger.error('get_pre_user出错-' + str(e)
                         + '-sec_user_id-' + str(sec_user_id))
            return None
        if raw_data.get('status_code') == 2096:
            logger.info('关注不可见-sec_user_id-' + str(sec_user_id))
            return None
        # A payload without 'followings' is handled like an empty list
        # instead of failing on len(None).
        following_list = raw_data.get('followings') or []
        if not following_list:
            # Only logged: despite the message text, the process keeps running.
            logger.error('获取不到数据了,程序退出')
        for user in following_list:
            if self.is_qualified_user(user):
                self.pre_user_list.append(user.get('sec_uid'))
        return None

    def run(self):
        """Process one batch of seed users unless both stores are full."""
        if (self.count_following() < self.MAX_FOLLOWING
                or self.count_pre_users() < self.MAX_PRE_USERS):
            batch = self.get_following()
            tasks = [
                gevent.spawn(self.get_pre_users, sec_user_id)
                for sec_user_id in batch
            ]
            gevent.joinall(tasks)
            logger.info('获取到pre_user-' + str(len(self.pre_user_list)))
            self.add_following_and_pre_users()
            self.pre_user_list.clear()
        else:
            logger.info('已经有太多following或pre_users了')