class CheckQualificationByPromotion():
    """Move pre-users that list more than 10 promotions into the users set.

    Each ``run()`` call takes up to ``batch_size`` candidates from the
    pre-user source, fetches every candidate's promotions page concurrently
    with gevent, and registers the ones that qualify via
    ``redis_client.add_users``.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Upper bound on the number of candidates checked by one run() call.
        self.batch_size = 50

    @staticmethod
    def _error_text(error):
        """Return a ``str`` description of *error* that is safe to log.

        ``error.args[0]`` is not guaranteed to exist or to be a string, so
        concatenating it directly can raise inside the ``except`` handler.
        """
        return str(error.args[0]) if error.args else repr(error)

    def get_pre_users(self):
        """Collect up to ``batch_size`` pre-users that are not yet users.

        Returns:
            A list of candidate ids; shorter than ``batch_size`` when the
            source runs out first.
        """
        batch = []
        while len(batch) < self.batch_size:
            pre_user = self.redis_client.get_pre_users()
            if pre_user is None:
                # NOTE(review): assumes the client returns None once no
                # pre-users are left - confirm against RedisClient. Without
                # this guard the loop never terminates when fewer than
                # batch_size fresh candidates exist.
                break
            # Keep only pre-users that do not exist in the users table yet.
            if not self.is_user(pre_user):
                batch.append(pre_user)
        return batch

    def count_pre_users(self):
        """Return ``redis_client.count_pre_users()``."""
        return self.redis_client.count_pre_users()

    def is_user(self, sec_user_id):
        """Return ``redis_client.is_user(sec_user_id)``."""
        return self.redis_client.is_user(sec_user_id)

    def check_qualification_by_promotion(self, sec_user_id):
        """Add *sec_user_id* to the users set if it has more than 10 promotions.

        Fetch and parse failures are logged and swallowed so that one bad
        user does not abort the rest of the batch. Always returns ``None``.
        """
        try:
            raw_data = self.get_raw_data.get_promotions(sec_user_id)
        except Exception as e:
            logger.error('get_promotions出错-' + self._error_text(e) +
                         '-sec_user_id-' + str(sec_user_id))
            return None
        try:
            # Probe a field of the response; this raises when the payload
            # is not a real promotions page.
            raw_data.get('columns')[0].get('name')
            data = raw_data.get('promotions')
            # The page was really fetched and lists more than 10 promotions.
            if len(data) > 10:
                self.redis_client.add_users(sec_user_id)
        except Exception as e:
            logger.error('解析promotions页面失败-sec_user_id-' +
                         str(sec_user_id) + '-' + self._error_text(e))
        return None

    def run(self):
        """Check one batch of pre-users, or exit when none are left."""
        if self.count_pre_users() > 0:
            batch = self.get_pre_users()
            tasks = [
                gevent.spawn(self.check_qualification_by_promotion,
                             sec_user_id)
                for sec_user_id in batch
            ]
            gevent.joinall(tasks)
        else:
            logger.info('pre_users列表空了,程序退出')
            sys.exit()
class CheckQualificationByPromotion():
    """Confirm whether each pre-user actually lists promotions.

    For every pre-user, the promotions page is fetched: users with an empty
    promotions list are removed via ``redis_client.delete_pre_users`` and
    the others are re-added with state ``0``.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()

    @staticmethod
    def _error_text(error):
        """Return a ``str`` description of *error* that is safe to log.

        ``error.args[0]`` is not guaranteed to exist or to be a string, so
        concatenating it directly can raise inside the ``except`` handler.
        """
        return str(error.args[0]) if error.args else repr(error)

    def get_users(self):
        """Return the result of ``redis_client.get_pre_users(-1, -1)``."""
        # NOTE(review): the meaning of the two -1 arguments is defined by
        # RedisClient, which is not visible here.
        return self.redis_client.get_pre_users(-1, -1)

    def check_commercial(self, sec_user_id):
        """Delete or re-flag *sec_user_id* depending on its promotions list.

        Fetch and parse failures are logged and swallowed so that one bad
        user does not abort the rest of the batch. Always returns ``None``.
        """
        try:
            raw_data = self.get_raw_data.get_promotions(sec_user_id)
        except Exception as e:
            logger.error('get_promotions出错-' + self._error_text(e) +
                         '-sec_user_id-' + str(sec_user_id))
            return None
        try:
            # Probe a field of the response; this raises when the payload
            # is not a real promotions page.
            raw_data.get('columns')[0].get('name')
            data = raw_data.get('promotions')
            if len(data) == 0:
                # The page was really fetched and still has no promotions,
                # so this user genuinely sells nothing.
                logger.info('该用户不带货,将删除,sec_user_id-' + sec_user_id)
                self.redis_client.delete_pre_users(sec_user_id)
            else:
                # Promotions exist, so the user sells goods: set state to 0.
                self.redis_client.add_pre_users(sec_user_id, 0)
        except Exception as e:
            logger.error('解析promotions页面失败-sec_user_id-' +
                         str(sec_user_id) + '-' + self._error_text(e))
        return None

    def run(self):
        """Check all pre-users in batches of 50 concurrent greenlets."""
        users = self.get_users()
        logger.info('共有待确认是否带货用户数量:' + str(len(users)))
        # Even though the requests are asynchronous they are still slow:
        # 200 at once is about as slow as running synchronously (possibly a
        # quirk of the remote service), so the batches are kept small.
        batch_size = 50
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('check_commercial爬取当前用户序号-' + str(start + 1) +
                        '-' + str(stop))
            tasks = [
                gevent.spawn(self.check_commercial, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
class GetPromotions():
    """Crawl every user's promotions and append them to text files as JSON.

    Aweme ids referenced by the parsed promotions are collected in
    ``aweme_id_list`` and handed to ``redis_client.add_awemes``.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Aweme ids gathered by parse_promotions since the last save_awemes().
        self.aweme_id_list = []

    @staticmethod
    def _error_text(error):
        """Return a ``str`` description of *error* that is safe to log.

        ``error.args`` is a tuple whose first item may be missing or not a
        string, so it cannot be concatenated into a message directly.
        """
        return str(error.args[0]) if error.args else repr(error)

    def get_users(self):
        """Return ``redis_client.get_users()``."""
        return self.redis_client.get_users()

    def get_promotions(self, sec_user_id):
        """Fetch, parse and persist the promotions of *sec_user_id*.

        Fetch and parse failures are logged and swallowed. Nothing is
        written when the user has no promotions. Always returns ``None``.
        """
        try:
            raw_data = self.get_raw_data.get_promotions(sec_user_id)
        except Exception as e:
            logger.error('get_promotions出错-' + self._error_text(e) +
                         '-sec_user_id-' + str(sec_user_id))
            return None
        try:
            promotions = self.parse_promotions(raw_data, sec_user_id)
        except Exception as e:
            logger.error('parse_promotions错误-' + self._error_text(e) +
                         '-sec_user_id-' + str(sec_user_id))
            return None
        if promotions is None:
            # parse_promotions returns None for a user without promotions
            # (and has already deleted that user); do not write "null".
            return None
        self.write_to_file(json.dumps(promotions, ensure_ascii=False))
        return None

    def parse_promotions(self, raw_data, sec_user_id):
        """Convert the raw promotions payload into a list of flat dicts.

        Args:
            raw_data: Mapping whose ``'promotions'`` entry is a list of
                promotion items.
            sec_user_id: Id stored on every resulting promotion.

        Returns:
            A list with one dict per promotion, or ``None`` when the
            promotions list is empty; in that case the user is removed via
            ``redis_client.delete_users``.
        """
        data = raw_data.get('promotions')
        if data == []:
            logger.info('该用户不带货,将删除,sec_user_id-' + sec_user_id)
            self.redis_client.delete_users(sec_user_id)
            return None
        return [self._build_promotion(each, sec_user_id) for each in data]

    def _build_promotion(self, each, sec_user_id):
        """Build the output dict for one raw promotion item."""
        promotion = {}
        promotion['sec_user_id'] = sec_user_id
        # price, min_price and market_price are divided by 100 before being
        # stored.
        promotion['price'] = each.get('price') / 100
        promotion['cover_url'] = each.get('images')[0].get('url_list')[0]
        promotion['title'] = each.get('title')
        promotion['product_id'] = each.get('product_id')
        promotion['product_url'] = each.get('detail_url')
        promotion['min_price'] = str(int(each.get('min_price')) / 100)
        promotion['douyin_sales'] = each.get('sales')
        promotion['product_source'] = each.get('goods_source')
        promotion['create_time'] = str(int(time.time()))
        market_price = each.get('market_price')
        if market_price:
            promotion['market_price'] = market_price / 100
        last_aweme_id = each.get('last_aweme_id')
        if last_aweme_id:
            promotion['promotion_type'] = 'video'
            promotion['aweme_id'] = int(last_aweme_id)
            # Remember the aweme id so save_awemes() can store it later.
            self.aweme_id_list.append(promotion['aweme_id'])
        else:
            promotion['promotion_type'] = 'picture'
        taobao = each.get('taobao')
        if taobao:
            coupon = taobao.get('coupon')
            if coupon:
                promotion['coupon_amount'] = coupon.get('coupon_amount')
                promotion['price_after_coupon'] = (
                    promotion['price'] - float(promotion['coupon_amount']))
                promotion['coupon_url'] = coupon.get('coupon_web_url')
        return promotion

    def write_to_file(self, promotions):
        """Append the *promotions* string as one line to a promotions file.

        The target is chosen at random among 100 files named
        ``promotions_<0-99>.txt`` under ``FILE_DIRECTORY``.
        """
        shard = str(random.randrange(100))
        path = FILE_DIRECTORY + '/' + 'promotions' + '_' + shard + '.txt'
        with open(path, 'a', encoding='utf-8') as file:
            file.write(promotions + '\n')

    def save_awemes(self):
        """Pass every collected aweme id to ``redis_client.add_awemes``."""
        for each in self.aweme_id_list:
            self.redis_client.add_awemes(each)

    def run(self):
        """Crawl all users in batches of 50 concurrent greenlets."""
        users = self.get_users()
        logger.info('共有用户数量:' + str(len(users)))
        # Even though the requests are asynchronous they are still slow:
        # 200 at once is about as slow as running synchronously (possibly a
        # quirk of the remote service), so the batches are kept small.
        batch_size = 50
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('get_promotions爬取当前用户序号-' + str(start + 1) +
                        '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_promotions, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
            # Flush the aweme ids gathered by this batch before the next one.
            self.save_awemes()
            self.aweme_id_list.clear()