class GetSecUserIds:
    """Resolve queued aweme ids to the ``sec_uid`` of their authors.

    Aweme ids are read from Redis via ``get_feigua_awemes``, looked up in
    groups through ``GetRawData.get_clips``, and every author ``sec_uid``
    found is stored with ``add_pre_users(sec_uid, -1)``.  Aweme ids whose
    author was resolved are removed with ``delete_feigua_awemes``.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()

    def get_aweme_lists(self):
        """Return the queued aweme ids split into consecutive groups of at most 20."""
        group_size = 20
        awemes = self.redis_client.get_feigua_awemes()
        return [
            awemes[begin:begin + group_size]
            for begin in range(0, len(awemes), group_size)
        ]

    def get_clips(self, aweme_list):
        """Fetch details for one group of aweme ids and record their authors.

        Args:
            aweme_list: aweme ids; each one must be convertible with ``int()``.

        Returns:
            Always ``None``; the results are written to Redis.
        """
        aweme_int_list = [int(aweme) for aweme in aweme_list]
        try:
            raw_data = self.get_raw_data.get_clips(aweme_int_list)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or hold a
            # non-string object, and concatenating that would raise inside
            # this handler and kill the greenlet.
            logger.error('get_clips出错-' + str(e))
            return None

        if raw_data.get('status_code') == 2053:
            # This status is reported as "this group has no videos".
            logger.info('这组没有视频')
            return None

        resolved = []
        # A missing 'aweme_details' or 'author' is treated as "nothing to record".
        for detail in raw_data.get('aweme_details') or []:
            author = detail.get('author') or {}
            sec_user_id = author.get('sec_uid')
            if sec_user_id:
                resolved.append((detail.get('aweme_id'), sec_user_id))

        # Same order as before: all deletions first, then all insertions.
        # NOTE(review): aweme ids without a sec_uid are never deleted, so they
        # stay in the queue -- confirm this is intended.
        for aweme_id, _ in resolved:
            self.redis_client.delete_feigua_awemes(aweme_id)
        for _, sec_user_id in resolved:
            self.redis_client.add_pre_users(sec_user_id, -1)
        return None

    def run(self):
        """Process every queued group, spawning one greenlet per group in a batch."""
        aweme_lists = self.get_aweme_lists()
        total = len(aweme_lists)
        logger.info('共有feigua_aweme组数:' + str(total))
        batch_size = 1
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)
            logger.info('get_clips爬取当前feigua_aweme组序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_clips, aweme_list)
                for aweme_list in aweme_lists[start:stop]
            ]
            gevent.joinall(tasks)
class CheckQualificationByPromotion:
    """Decide for each pending pre-user whether they have promotions.

    Users are read with ``get_pre_users(-1, -1)``.  A user whose promotions
    page was fetched but lists no promotions is removed with
    ``delete_pre_users``; a user with promotions is re-stored with
    ``add_pre_users(sec_user_id, 0)``.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()

    def get_users(self):
        """Return the users fetched by ``redis_client.get_pre_users(-1, -1)``."""
        return self.redis_client.get_pre_users(-1, -1)

    def check_commercial(self, sec_user_id):
        """Fetch one user's promotions page and update their state in Redis.

        Args:
            sec_user_id: identifier passed to ``get_promotions``.

        Returns:
            Always ``None``; failures are logged and swallowed.
        """
        try:
            raw_data = self.get_raw_data.get_promotions(sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or non-string,
            # which would make this handler itself raise.
            logger.error('get_promotions出错-' + str(e) + '-sec_user_id-' + str(sec_user_id))
            return None

        try:
            # Touching columns[0].name proves the page was really fetched;
            # it raises (and is logged below) when the page is incomplete.
            raw_data.get('columns')[0].get('name')
            data = raw_data.get('promotions')
            # len() is kept on purpose: a missing 'promotions' value raises
            # and is logged as a parse failure, so the user is not deleted.
            if len(data) == 0:
                # Page was fetched and still has no promotions: the user
                # really does not sell goods.
                logger.info('该用户不带货,将删除,sec_user_id-' + str(sec_user_id))
                self.redis_client.delete_pre_users(sec_user_id)
            else:
                # Promotions exist, so the user sells goods: set state to 0.
                self.redis_client.add_pre_users(sec_user_id, 0)
        except Exception as e:
            logger.error('解析promotions页面失败-sec_user_id-' + str(sec_user_id) + '-' + str(e))
        return None

    def run(self):
        """Check all pending users, 50 greenlets at a time."""
        users = self.get_users()
        total = len(users)
        logger.info('共有待确认是否带货用户数量:' + str(total))
        # Even though this is asynchronous it is still slow; with 200 at a
        # time it gets as slow as running synchronously (possibly some quirk
        # of Douyin), hence the small batch.
        batch_size = 50
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)
            logger.info('check_commercial爬取当前用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.check_commercial, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
class GetPreUsers:
    """Collect candidate users from the followings of already-known users.

    A batch of ids is taken from Redis with ``get_following``, each id's
    followings are fetched, filtered by ``is_qualified_user`` and the
    surviving ``sec_uid`` values are written back with ``add_following`` and
    ``add_pre_users`` while the respective counters are below their caps.
    """

    # Caps above which no more entries are added to the respective Redis sets.
    MAX_FOLLOWING = 100000
    MAX_PRE_USERS = 500000

    # A non-empty custom_verify is only accepted when it contains one of these.
    ALLOWED_VERIFY_TAGS = ('自媒体', '主播', '视频')

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.batch_size = 10
        self.pre_user_list = []
        self.stupid_key_words = STUPID_KEY_WORDS

    def count_following(self):
        """Return ``redis_client.count_following()``."""
        return self.redis_client.count_following()

    def count_pre_users(self):
        """Return ``redis_client.count_pre_users()``."""
        return self.redis_client.count_pre_users()

    def get_following(self):
        """Return ``batch_size`` values obtained from ``redis_client.get_following()``."""
        return [self.redis_client.get_following() for _ in range(self.batch_size)]

    def add_following_and_pre_users(self):
        """Store every collected ``sec_uid``, re-checking both caps per user."""
        for pre_user in self.pre_user_list:
            if self.count_following() < self.MAX_FOLLOWING:
                self.redis_client.add_following(pre_user)
            if self.count_pre_users() < self.MAX_PRE_USERS:
                # NOTE(review): other callers in this file pass a second
                # state argument (e.g. -1) to add_pre_users -- confirm the
                # one-argument form is valid for RedisClient.
                self.redis_client.add_pre_users(pre_user)

    def is_qualified_user(self, user):
        """Return ``True`` when ``user`` passes all filters.

        Rejected are users flagged ``is_gov_media_vip``, users with an
        ``enterprise_verify_reason``, users whose non-empty ``custom_verify``
        contains none of ``ALLOWED_VERIFY_TAGS``, and users whose nickname
        contains any of ``self.stupid_key_words``.
        """
        if user.get('is_gov_media_vip'):
            return False
        if user.get('enterprise_verify_reason'):
            return False
        custom_verify = user.get('custom_verify')
        if custom_verify and not any(tag in custom_verify for tag in self.ALLOWED_VERIFY_TAGS):
            return False
        # A missing nickname cannot contain a keyword; without the fallback
        # the membership test below would raise TypeError on None.
        nickname = user.get('nickname') or ''
        return not any(word in nickname for word in self.stupid_key_words)

    def get_pre_users(self, sec_user_id):
        """Fetch the followings of ``sec_user_id`` and collect the qualified ones.

        Qualified ``sec_uid`` values are appended to ``self.pre_user_list``.
        Always returns ``None``.
        """
        try:
            raw_data = self.get_raw_data.get_following(sec_user_id)
        except Exception as e:
            # str(...) instead of e.args[0] / raw sec_user_id: either may be
            # missing or non-string, which would make this handler raise.
            logger.error('get_pre_user出错-' + str(e) + '-sec_user_id-' + str(sec_user_id))
            return None

        if raw_data.get('status_code') == 2096:
            # This status is reported as "following list not visible".
            logger.info('关注不可见-sec_user_id-' + str(sec_user_id))
            return None

        following_list = raw_data.get('followings') or []
        if len(following_list) == 0:
            # The message announces an exit, but the exit call was disabled
            # in the original code; execution simply continues.
            logger.error('获取不到数据了,程序退出')
        for user in following_list:
            if self.is_qualified_user(user):
                self.pre_user_list.append(user.get('sec_uid'))
        return None

    def run(self):
        """Run one collection round unless both Redis sets are already full."""
        if self.count_following() < self.MAX_FOLLOWING or self.count_pre_users() < self.MAX_PRE_USERS:
            batch = self.get_following()
            tasks = [
                gevent.spawn(self.get_pre_users, sec_user_id)
                for sec_user_id in batch
            ]
            gevent.joinall(tasks)
            logger.info('获取到pre_user-' + str(len(self.pre_user_list)))
            self.add_following_and_pre_users()
            self.pre_user_list.clear()
        else:
            logger.info('已经有太多following或pre_users了')
class GetPreUsers:
    """Collect candidate users from the followings of already-known users.

    A batch of ids is taken from Redis with ``get_following``, each id's
    followings are fetched, filtered by ``is_qualified_user`` and the
    surviving ``sec_uid`` values are stored with ``add_following`` and
    ``add_pre_users(sec_uid, -1)``.

    NOTE(review): the visible source defines a class named ``GetPreUsers``
    more than once; if both live in one module, the later definition rebinds
    the name -- confirm which one is meant to be used.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.batch_size = 10
        self.pre_user_list = []
        # Nicknames containing any of these substrings are rejected.
        self.stupid_key_words = [
            '公司', '店', '铺', '厂', '行', '鞋', '装', '市', '服', '饰',
            '商', '贸', '牌', '汇', '馆', '裤', '业', '专', '卖',
        ]

    def get_following(self):
        """Return ``batch_size`` values obtained from ``redis_client.get_following()``."""
        return [self.redis_client.get_following() for _ in range(self.batch_size)]

    def add_following_and_pre_users(self):
        """Store every collected ``sec_uid`` in both Redis collections."""
        for pre_user in self.pre_user_list:
            self.redis_client.add_following(pre_user)
            self.redis_client.add_pre_users(pre_user, -1)

    def is_qualified_user(self, user):
        """Return ``True`` when ``user`` passes all filters.

        Rejected are users flagged ``is_gov_media_vip``, users whose
        ``enterprise_verify_reason`` or ``custom_verify`` is anything other
        than the empty string, and users whose nickname contains any of
        ``self.stupid_key_words``.
        """
        if user.get('is_gov_media_vip'):
            return False
        # NOTE(review): the strict comparison with '' also rejects users
        # where the field is missing or None -- kept as-is, confirm intent.
        if user.get('enterprise_verify_reason') != '':
            return False
        if user.get('custom_verify') != '':
            return False
        # A missing nickname cannot contain a keyword; without the fallback
        # the membership test below would raise TypeError on None.
        nickname = user.get('nickname') or ''
        return not any(word in nickname for word in self.stupid_key_words)

    def get_pre_users(self, sec_user_id):
        """Fetch the followings of ``sec_user_id`` and collect the qualified ones.

        Qualified ``sec_uid`` values are appended to ``self.pre_user_list``.
        Always returns ``None``.
        """
        try:
            raw_data = self.get_raw_data.get_following(sec_user_id)
        except Exception as e:
            # str(...) instead of e.args[0] / raw sec_user_id: either may be
            # missing or non-string, which would make this handler raise.
            logger.error('get_pre_user出错-' + str(e) + '-sec_user_id-' + str(sec_user_id))
            return None

        if raw_data.get('status_code') == 2096:
            # This status is reported as "following list not visible".
            logger.info('关注不可见-sec_user_id-' + str(sec_user_id))
            return None

        # A missing 'followings' value is treated as an empty list.
        for user in raw_data.get('followings') or []:
            if self.is_qualified_user(user):
                self.pre_user_list.append(user.get('sec_uid'))
        return None

    def run(self):
        """Run one collection round: fetch, filter, store, then reset the buffer."""
        batch = self.get_following()
        tasks = [
            gevent.spawn(self.get_pre_users, sec_user_id)
            for sec_user_id in batch
        ]
        gevent.joinall(tasks)
        logger.info('获取到pre_user-' + str(len(self.pre_user_list)))
        self.add_following_and_pre_users()
        self.pre_user_list.clear()