def __init__(self):
    """Create the API/Redis helpers and the per-run working state."""
    raw_client, redis_client = GetRawData(), RedisClient()
    self.get_raw_data = raw_client
    self.redis_client = redis_client
    self.batch_size = 10
    self.pre_user_list = []
    # Keyword list; presumably matched against nicknames by the enclosing
    # class (not visible from this method) -- confirm there.
    keywords = [
        '公司', '店', '铺', '厂', '行', '鞋', '装',
        '市', '服', '饰', '商', '贸', '牌', '汇',
        '馆', '裤', '业', '专', '卖',
    ]
    self.stupid_key_words = keywords
def __init__(self):
    """Open the local MySQL connection and create the helper clients."""
    self.get_raw_data = GetRawData()
    # A remote server (47.114.166.130:13306, database 'bxdb') was used here
    # previously; the connection below targets the local instance.
    connection_options = {
        'host': 'localhost',
        'port': 3306,
        'user': '******',
        'password': '******',
        'db': 'bxmind',
        'charset': 'utf8mb4',
    }
    self.db = pymysql.connect(**connection_options)
    self.cursor = self.db.cursor()
    self.mysql_client = MysqlClient()
    self.redis_client = RedisClient()
    self.stupid_key_words = STUPID_KEY_WORDS
    # Working lists, both start empty.
    self.a_list, self.b_list = [], []
class GetRooms():
    """Collect live rooms (and their owners) from the channel feed.

    A room is kept when its owner has at least 10000 followers and the
    room's item list contains at least one promotion.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.sec_user_id_list = []
        self.room_id_list = []

    def get_channel(self):
        """Fetch the channel feed, filter it and push the results to Redis.

        Always returns None.  Fetch and parse failures are logged and
        nothing is written to Redis in that case.
        """
        try:
            channel_raw_data = self.get_raw_data.get_channel()
        except Exception as e:
            # str(e) works for every exception; e.args[0] can be missing or
            # a non-string, which would raise inside this handler.
            logger.error('get_channel出错-' + str(e))
            return None
        try:
            self.parse_channel(channel_raw_data)
        except Exception as e:
            logger.error('parse_channel出错-' + str(e))
            return None
        for room_id in self.room_id_list:
            self.redis_client.add_rooms(room_id)
        for sec_user_id in self.sec_user_id_list:
            self.redis_client.add_users(sec_user_id, 1)

    def parse_channel(self, channel_raw_data):
        """Append qualifying rooms/owners of the feed to the instance lists.

        Parsing stops (returning None) as soon as one item-list request
        fails; entries collected before the failure are kept.
        """
        for each in channel_raw_data.get('data'):
            room = each.get('data')
            owner = room.get('owner')
            room_id = room.get('id_str')
            sec_user_id = owner.get('sec_uid')
            follower = owner.get('follow_info').get('follower_count')
            if follower < 10000:
                continue
            try:
                item_list = self.get_raw_data.get_item_list(
                    sec_user_id, room_id)
            except Exception as e:
                logger.error('get_item_list出错-' + str(e))
                return None
            # Truthiness also covers a missing/None 'promotions' value.
            if item_list.get('promotions'):
                self.room_id_list.append(room_id)
                self.sec_user_id_list.append(sec_user_id)

    def run(self):
        """Run one collection pass, log the totals and reset the lists."""
        tasks = [gevent.spawn(self.get_channel) for _ in range(1)]
        gevent.joinall(tasks)
        logger.info('本批次共获得room_id和sec_user_id-' +
                    str(len(self.sec_user_id_list)) + '-' +
                    str(len(self.room_id_list)))
        self.sec_user_id_list.clear()
        self.room_id_list.clear()
class GetUsers():
    """Harvest author ids from the per-category feeds into Redis."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()

    def get_users(self, category_id, page):
        """Fetch one feed page and store the author ids it contains.

        Fetch failures are logged and swallowed; nothing is stored then.
        """
        try:
            raw_data = self.get_raw_data.get_users(category_id, page)
        except Exception as e:
            # The original concatenated the e.args tuple to a str, which
            # raised TypeError inside the handler; str() is always safe.
            logger.error('get_users错误-' + str(e) + '-category_id-' +
                         str(category_id) + '-page-' + str(page))
            return None
        if raw_data:
            self.save_to_redis(self.parse_users(raw_data))

    def parse_users(self, raw_data):
        """Return the non-empty ``sec_uid`` of every author in the page.

        A missing ``aweme_list`` or ``author`` yields no ids instead of
        raising.
        """
        sec_user_id_list = []
        for each in raw_data.get('aweme_list') or []:
            author = each.get('author') or {}
            sec_user_id = author.get('sec_uid')
            if sec_user_id:
                sec_user_id_list.append(sec_user_id)
        return sec_user_id_list

    def save_to_redis(self, sec_user_id_list):
        """Add every id in ``sec_user_id_list`` to Redis."""
        for each in sec_user_id_list:
            self.redis_client.add_users(each)

    def run(self):
        """Crawl pages 0-99 of categories -1..14, one category at a time."""
        for cate in range(-1, 15):
            logger.info('get_users当前爬取cate-' + str(cate))
            tasks = [
                gevent.spawn(self.get_users, str(cate), str(page))
                for page in range(0, 100)
            ]
            gevent.joinall(tasks)
class GetCurrentRoom():
    """Check whether stored users are currently live and collect room ids."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.room_id_list = []

    def get_users(self):
        """Return the users provided by the Redis client."""
        return self.redis_client.get_users()

    def save_rooms(self):
        """Add every collected room id to Redis (second argument 0)."""
        for each in self.room_id_list:
            self.redis_client.add_rooms(each, 0)

    @staticmethod
    def _live_room_id(raw_data):
        """Return the first room id of the user's own room, or None.

        Raises if the response lacks the expected ``data``/``pay_grade``
        structure, which the caller treats as a failed page load.
        """
        data = raw_data.get('data')
        # Probe a nested field only to verify that a real page came back.
        data.get('pay_grade').get('grade_describe')
        own_room = data.get('own_room')
        if not own_room:
            return None
        return own_room.get('room_ids_str')[0]

    def get_current_room(self, sec_user_id):
        """Record the live room of ``sec_user_id`` if one exists.

        Fetch and parse failures are logged and swallowed; always returns
        None.
        """
        try:
            raw_data = self.get_raw_data.get_current_room(sec_user_id)
        except Exception as e:
            # str(e) is safe for any exception, unlike e.args[0].
            logger.error('get_current_room出错-' + str(e) +
                         '-sec_user_id-' + sec_user_id)
            return None
        try:
            room_id = self._live_room_id(raw_data)
        except Exception as e:
            logger.error('parse_current_room出错' + str(e) +
                         '-sec_user_id-' + sec_user_id)
            return None
        if room_id is None:
            logger.info(sec_user_id + '-未在直播')
            return None
        # An own_room entry means the broadcast has already started.
        self.room_id_list.append(room_id)
        logger.info(sec_user_id + '-正在直播,room_id-' + str(room_id))

    def run(self):
        """Check all users in batches of 20 concurrent greenlets."""
        users = self.get_users()
        logger.info('共有users-' + str(len(users)))
        batch_size = 20
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前爬取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_current_room, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
class GetUserDongtai():
    """Detect live users through the first entry of their activity feed."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.room_id_list = []

    def get_users(self):
        """Return the users provided by the Redis client."""
        return self.redis_client.get_users()

    def save_rooms(self):
        """Add every collected room id to Redis (second argument 0)."""
        for each in self.room_id_list:
            self.redis_client.add_rooms(each, 0)

    def get_user_dongtai(self, sec_user_id):
        """Fetch and parse the feed of one user; failures are logged."""
        try:
            raw_data = self.get_raw_data.get_user_dongtai(sec_user_id)
        except Exception as e:
            # str(e) is safe for any exception, unlike e.args[0].
            logger.error('get_user_dongtai出错-' + str(e) +
                         '-sec_user_id-' + sec_user_id)
            return None
        try:
            self.parse_user_dongtai(raw_data)
        except Exception as e:
            logger.error('parse_user_dongtai出错-' + str(e) +
                         '-sec_user_id-' + sec_user_id)

    @staticmethod
    def _live_room_id(raw_data):
        """Return the author's room id from the first feed entry, or None.

        Both ``0`` and a missing value mean "not live".  The original
        ``!= 0`` test treated a missing id as live and stored ``'None'``.
        """
        first_entry = raw_data.get('dongtai_list')[0]
        room_id = first_entry.get('aweme').get('author').get('room_id')
        if room_id is None or room_id == 0:
            return None
        return room_id

    def parse_user_dongtai(self, raw_data):
        """Record the room id (as a string) when the author is live."""
        room_id = self._live_room_id(raw_data)
        if room_id is None:
            logger.info('该主播尚未开始直播')
            return
        self.room_id_list.append(str(room_id))
        logger.info('该主播已开始直播,room_id-' + str(room_id))

    def run(self):
        """Check all users in batches of 20 concurrent greenlets."""
        users = self.get_users()
        logger.info('共有users-' + str(len(users)))
        # Original author's note: even 20 at a time fails to get data.
        batch_size = 20
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前爬取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_user_dongtai, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
class GetSecUserIds():
    """Resolve stored video ids to their authors' ``sec_uid`` values."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()

    def get_aweme_lists(self):
        """Return the stored video ids split into groups of at most 20."""
        chunk_size = 20
        awemes = self.redis_client.get_feigua_awemes()
        return [
            awemes[i:i + chunk_size]
            for i in range(0, len(awemes), chunk_size)
        ]

    def get_clips(self, aweme_list):
        """Look up one group of videos and store the authors found.

        Resolved video ids are deleted from Redis and their authors are
        added as pre-users with value -1.  Entries without a ``sec_uid``
        are left untouched.  Fetch failures are logged and swallowed.
        """
        aweme_int_list = [int(aweme) for aweme in aweme_list]
        try:
            raw_data = self.get_raw_data.get_clips(aweme_int_list)
        except Exception as e:
            # str(e) is safe for any exception, unlike e.args[0].
            logger.error('get_clips出错-' + str(e))
            return None
        if raw_data.get('status_code') == 2053:
            logger.info('这组没有视频')
            return None
        resolved = []
        # Tolerate a missing detail list or author instead of crashing.
        for each in raw_data.get('aweme_details') or []:
            sec_user_id = (each.get('author') or {}).get('sec_uid')
            if sec_user_id:
                resolved.append((each.get('aweme_id'), sec_user_id))
        for aweme_id, _ in resolved:
            self.redis_client.delete_feigua_awemes(aweme_id)
        for _, sec_user_id in resolved:
            self.redis_client.add_pre_users(sec_user_id, -1)

    def run(self):
        """Process every group, one group per gevent batch."""
        aweme_lists = self.get_aweme_lists()
        logger.info('共有feigua_aweme组数:' + str(len(aweme_lists)))
        batch_size = 1
        for start in range(0, len(aweme_lists), batch_size):
            stop = min(start + batch_size, len(aweme_lists))
            logger.info('get_clips爬取当前feigua_aweme组序号-' +
                        str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_clips, aweme_list)
                for aweme_list in aweme_lists[start:stop]
            ]
            gevent.joinall(tasks)
class CheckQualificationByPromotion():
    """Promote pre-users that list more than 10 promotions to users."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.batch_size = 50

    def get_pre_users(self):
        """Return up to ``batch_size`` pre-users that are not yet users.

        Collection stops early when Redis returns a falsy value, which is
        assumed to mean that no pre-users are left (TODO confirm against
        RedisClient).  Previously the loop kept appending that value until
        the batch was full.
        """
        batch = []
        while len(batch) < self.batch_size:
            pre_user = self.redis_client.get_pre_users()
            if not pre_user:
                break
            # Keep only pre-users that are not in the user table yet.
            if not self.is_user(pre_user):
                batch.append(pre_user)
        return batch

    def count_pre_users(self):
        """Return the number of pre-users reported by Redis."""
        return self.redis_client.count_pre_users()

    def is_user(self, sec_user_id):
        """Return whether Redis already knows ``sec_user_id`` as a user."""
        return self.redis_client.is_user(sec_user_id)

    def check_qualification_by_promotion(self, sec_user_id):
        """Add ``sec_user_id`` as a user when it has over 10 promotions.

        Fetch and parse failures are logged and swallowed.
        """
        try:
            raw_data = self.get_raw_data.get_promotions(sec_user_id)
        except Exception as e:
            # str(e) is safe for any exception, unlike e.args[0].
            logger.error('get_promotions出错-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None
        try:
            # Accessing the first column proves a real page was returned.
            raw_data.get('columns')[0].get('name')
            if len(raw_data.get('promotions')) > 10:
                self.redis_client.add_users(sec_user_id)
        except Exception as e:
            logger.error('解析promotions页面失败-sec_user_id-' + sec_user_id +
                         '-' + str(e))

    def run(self):
        """Check one batch, or exit the process when no pre-users remain."""
        if self.count_pre_users() > 0:
            batch = self.get_pre_users()
            tasks = [
                gevent.spawn(self.check_qualification_by_promotion,
                             sec_user_id) for sec_user_id in batch
            ]
            gevent.joinall(tasks)
        else:
            logger.info('pre_users列表空了,程序退出')
            sys.exit()
class CheckQualificationByPromotion():
    """Confirm whether unverified pre-users actually sell products."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()

    def get_users(self):
        """Return the pre-users selected by the range (-1, -1)."""
        return self.redis_client.get_pre_users(-1, -1)

    def check_commercial(self, sec_user_id):
        """Delete the pre-user if it has no promotions, else set value 0.

        Fetch and parse failures are logged and swallowed.
        """
        try:
            raw_data = self.get_raw_data.get_promotions(sec_user_id)
        except Exception as e:
            # str(e) is safe for any exception, unlike e.args[0].
            logger.error('get_promotions出错-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None
        try:
            # Accessing the first column proves a real page was returned.
            raw_data.get('columns')[0].get('name')
            if len(raw_data.get('promotions')) == 0:
                # The page loaded and still lists nothing: not a seller.
                logger.info('该用户不带货,将删除,sec_user_id-' + sec_user_id)
                self.redis_client.delete_pre_users(sec_user_id)
            else:
                # Promotions exist, so mark the user with value 0.
                self.redis_client.add_pre_users(sec_user_id, 0)
        except Exception as e:
            logger.error('解析promotions页面失败-sec_user_id-' + sec_user_id +
                         '-' + str(e))

    def run(self):
        """Check all selected pre-users in batches of 50 greenlets."""
        users = self.get_users()
        logger.info('共有待确认是否带货用户数量:' + str(len(users)))
        # Original author's note: larger batches (e.g. 200) became as slow
        # as running synchronously.
        batch_size = 50
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('check_commercial爬取当前用户序号-' + str(start + 1) +
                        '-' + str(stop))
            tasks = [
                gevent.spawn(self.check_commercial, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
def __init__(self):
    """Create the raw-data fetcher and the Redis client for this object."""
    raw_client, redis_client = GetRawData(), RedisClient()
    self.get_raw_data = raw_client
    self.redis_client = redis_client
class GetPromotions():
    """Download the promotion lists of all users and dump them to files."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.aweme_id_list = []

    def get_users(self):
        """Return the users provided by the Redis client."""
        return self.redis_client.get_users()

    def get_promotions(self, sec_user_id):
        """Fetch, parse and persist the promotions of one user.

        Nothing is written when the user has no promotions (the original
        wrote a literal ``null`` line in that case).  Failures are logged
        and swallowed.
        """
        try:
            raw_data = self.get_raw_data.get_promotions(sec_user_id)
        except Exception as e:
            # str(e) is safe for any exception, unlike e.args[0].
            logger.error('get_promotions出错-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None
        try:
            promotions = self.parse_promotions(raw_data, sec_user_id)
        except Exception as e:
            # The original concatenated the e.args tuple, raising TypeError.
            logger.error('parse_promotions错误-' + str(e) +
                         '-sec_user_id-' + sec_user_id)
            return None
        if promotions:
            self.write_to_file(json.dumps(promotions, ensure_ascii=False))

    def parse_promotions(self, raw_data, sec_user_id):
        """Convert the raw promotions into flat dicts.

        Returns a list of dicts, or None after deleting the user from
        Redis when the promotion list is empty.  Video promotions also
        append their ``aweme_id`` to ``self.aweme_id_list``.
        """
        data = raw_data.get('promotions')
        if data == []:
            logger.info('该用户不带货,将删除,sec_user_id-' + sec_user_id)
            self.redis_client.delete_users(sec_user_id)
            return None
        promotions = []
        for each in data:
            promotion = {
                'sec_user_id': sec_user_id,
                # Source prices are divided by 100 throughout.
                'price': each.get('price') / 100,
                'cover_url': each.get('images')[0].get('url_list')[0],
                'title': each.get('title'),
                'product_id': each.get('product_id'),
                'product_url': each.get('detail_url'),
                'min_price': str(int(each.get('min_price')) / 100),
                'douyin_sales': each.get('sales'),
                'product_source': each.get('goods_source'),
                'create_time': str(int(time.time())),
            }
            if each.get('market_price'):
                promotion['market_price'] = each.get('market_price') / 100
            if each.get('last_aweme_id'):
                promotion['promotion_type'] = 'video'
                promotion['aweme_id'] = int(each.get('last_aweme_id'))
                self.aweme_id_list.append(promotion['aweme_id'])
            else:
                promotion['promotion_type'] = 'picture'
            coupon = (each.get('taobao') or {}).get('coupon')
            if coupon:
                promotion['coupon_amount'] = coupon.get('coupon_amount')
                promotion['price_after_coupon'] = promotion['price'] - float(
                    promotion['coupon_amount'])
                promotion['coupon_url'] = coupon.get('coupon_web_url')
            promotions.append(promotion)
        return promotions

    def write_to_file(self, promotions):
        """Append one line to a randomly chosen ``promotions_<0-99>.txt``."""
        shard = str(random.randrange(100))
        with open(FILE_DIRECTORY + '/' + 'promotions' + '_' + shard + '.txt',
                  'a',
                  encoding='utf-8') as file:
            file.write(promotions + '\n')

    def save_awemes(self):
        """Add every collected video id to Redis."""
        for each in self.aweme_id_list:
            self.redis_client.add_awemes(each)

    def run(self):
        """Process all users in batches of 50, then store the video ids."""
        users = self.get_users()
        logger.info('共有用户数量:' + str(len(users)))
        # Original author's note: larger batches (e.g. 200) became as slow
        # as running synchronously.
        batch_size = 50
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('get_promotions爬取当前用户序号-' + str(start + 1) +
                        '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_promotions, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
        self.save_awemes()
        self.aweme_id_list.clear()
def __init__(self):
    """Create the helper clients and an empty ``lives_off_list``."""
    raw_client, redis_client = GetRawData(), RedisClient()
    self.get_raw_data = raw_client
    self.redis_client = redis_client
    # Starts empty on every new instance.
    self.lives_off_list = list()
class SaveLiveUsers():
    """One-off maintenance jobs that move scraped live data around.

    Each ``run_*`` method is an independent entry point working on
    hard-coded file names and tables.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        # A remote server (47.114.166.130:13306, database 'bxdb') was used
        # here previously; the local instance is used now.
        self.db = pymysql.connect(host='localhost',
                                  port=3306,
                                  user='******',
                                  password='******',
                                  db='bxmind',
                                  charset='utf8mb4')
        self.cursor = self.db.cursor()
        self.mysql_client = MysqlClient()
        self.redis_client = RedisClient()
        self.stupid_key_words = STUPID_KEY_WORDS
        self.a_list = []
        self.b_list = []

    def into_mysql(self, data, table):
        """Insert one row built from the ``data`` mapping into ``table``.

        Column names are the keys of ``data``; values are bound as query
        parameters.  Errors are printed, not raised.
        """
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        # Identifiers cannot be bound as parameters, so table and column
        # names are interpolated: only pass trusted names here.
        sql = 'insert into %s (%s) values (%s)' % (table, keys, values)
        try:
            self.cursor.execute(sql, tuple(data.values()))
            self.db.commit()
        except Exception as e:
            print(e.args)

    def is_qualified(self, nickname):
        """Return False when ``nickname`` contains a blacklisted keyword.

        A missing nickname contains no keyword and therefore qualifies
        (previously ``None`` raised TypeError).
        """
        nickname = nickname or ''
        return not any(word in nickname for word in self.stupid_key_words)

    def run_a(self):
        """Load ``lives_20200614.txt`` into ``dy_live_lives``.

        Rows with status 4 are inserted when the nickname qualifies;
        otherwise the user is removed from both Redis user sets.
        """
        table = 'dy_live_lives'
        with open('lives_20200614.txt', 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                if data.get('status') != 4:
                    continue
                if self.is_qualified(data.get('nickname')):
                    self.into_mysql(data, table)
                else:
                    sec_user_id = data.get('sec_user_id')
                    self.redis_client.delete_users(sec_user_id)
                    self.redis_client.delete_live_users(sec_user_id)

    def replicate_table(self):
        """Create ``dy_sample`` with the structure of ``dy_live_lives``."""
        sql = 'CREATE TABLE dy_sample LIKE dy_live_lives'
        self.cursor.execute(sql)

    def select_users(self):
        """Append the high-traffic rows of ``dy_live_lives`` to ``a_list``.

        ``a_list`` is not cleared first, so repeated calls accumulate.
        """
        columns = ('room_id', 'sec_user_id', 'nickname', 'short_id',
                   'total_viewer', 'like_count', 'follower_count',
                   'signature', 'city')
        sql = 'SELECT room_id, sec_user_id, nickname, short_id, total_viewer, like_count, follower_count, signature, city FROM dy_live_lives WHERE total_viewer > 50000 AND follower_count > 500000'
        self.cursor.execute(sql)
        row = self.cursor.fetchone()
        while row:
            self.a_list.append(dict(zip(columns, row)))
            row = self.cursor.fetchone()

    def get_cates(self, data):
        """Add ``data`` to ``b_list`` when the user sells enough food items.

        A user matches when any of the listed categories has a count of at
        least 3.  Fetch failures are logged and swallowed.
        """
        sec_user_id = data['sec_user_id']
        try:
            cates_raw_data = self.get_raw_data.get_cates(sec_user_id)
        except Exception as e:
            # str(e) is safe for any exception, unlike e.args[0].
            logger.error('get_cates出错-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None
        for each in cates_raw_data.get('user_shop_categories') or []:
            if each['name'] in ['零食', '食品', '花茶', '果茶'] \
                    and each['count'] >= 3:
                self.b_list.append(data)
                break

    def run_b(self):
        """Copy selected users that pass ``get_cates`` into ``dy_sample``."""
        self.select_users()
        logger.info('a_list共有数据-' + str(len(self.a_list)))
        batch_size = 100
        for start in range(0, len(self.a_list), batch_size):
            stop = min(start + batch_size, len(self.a_list))
            logger.info('当前爬取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_cates, data)
                for data in self.a_list[start:stop]
            ]
            gevent.joinall(tasks)
        logger.info('b_list共有数据-' + str(len(self.b_list)))
        for data in self.b_list:
            self.into_mysql(data, 'dy_sample')

    def run_c(self):
        """Copy every selected user into ``dy_sample``."""
        self.select_users()
        for data in self.a_list:
            self.into_mysql(data, 'dy_sample')

    def select_rooms(self):
        """Return all ``room_id`` values stored in ``dy_sample``."""
        room_list = []
        sql = 'SELECT room_id FROM dy_sample'
        self.cursor.execute(sql)
        row = self.cursor.fetchone()
        while row:
            room_list.append(row[0])
            row = self.cursor.fetchone()
        return room_list

    def get_txt(self):
        """Print the first line of ``lives_20200605.txt`` (if any)."""
        with open('lives_20200605.txt', 'r', encoding='utf-8') as f:
            first_line = f.readline()
        if first_line:
            print(first_line)

    def write_to_file(self, item_list):
        """Append one line to today's ``item_lists_sample_<date>.txt``."""
        today = time.strftime('%Y%m%d', time.localtime())
        with open(FILE_DIRECTORY + '/' + 'item_lists_sample' + '_' + today +
                  '.txt',
                  'a',
                  encoding='utf-8') as file:
            file.write(item_list + '\n')

    def run_d(self):
        """Request a fixed Taobao detail URL and print the response body."""
        url = 'https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId=606547898363&sellerId=2206709156233&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,activity,fqg,zjys,couponActivity,soldQuantity,page,originalPrice,tradeContract&callback=onSibRequestSuccess'
        headers = {
            'Referer':
            'https://item.taobao.com/item.htm?id=606547898363',
            'Sec-Fetch-Mode':
            'no-cors',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
        }
        response = requests.get(url, headers=headers, allow_redirects=False)
        print(response.text)

    def run_e(self):
        """Bulk-load ``lives_20200609.txt`` through the async MySQL client.

        Qualified rows with status 4 are inserted in batches of 200 with
        the ``mobile`` field removed; unqualified users are deleted from
        Redis.  The last partial batch is flushed too (it was silently
        dropped before).
        """
        file = 'lives_20200609.txt'
        table = 'dy_live_lives'
        data_batch = []
        batch_size = 200
        loop = asyncio.get_event_loop()
        task = loop.create_task(self.mysql_client.connect_mysql(loop))
        loop.run_until_complete(task)

        def flush():
            # Insert the pending rows concurrently.  gather() accepts
            # coroutines (asyncio.wait rejects them since Python 3.11);
            # return_exceptions keeps one failed insert from aborting the
            # whole load, as asyncio.wait did.
            if not data_batch:
                return
            inserts = [
                self.mysql_client.into_mysql(loop, row, table)
                for row in data_batch
            ]
            loop.run_until_complete(
                asyncio.gather(*inserts, return_exceptions=True))
            data_batch.clear()

        with open(file, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = json.loads(raw_line)
                if self.is_qualified(line.get('nickname')):
                    # Author's note: status must be the integer 4, not the
                    # string '4' -- that mistake once left the table empty.
                    if line.get('status') == 4:
                        line.pop('mobile', None)
                        data_batch.append(line)
                        if len(data_batch) >= batch_size:
                            flush()
                else:
                    self.redis_client.delete_users(line.get('sec_user_id'))
                    self.redis_client.delete_live_users(
                        line.get('sec_user_id'))
                    print('删除user', line.get('nickname'))
        flush()

    def run_f(self):
        """Export qualified status-4 rows of ``lives_20200623.txt`` to CSV.

        The column set is taken from the first input row without
        ``mobile``; no header row is written.  An empty input file is a
        no-op.
        """
        file_a = 'lives_20200623.txt'
        file_b = '第四批_抖音主播_去重前.csv'
        with open(file_a, 'r', encoding='utf-8') as f, \
                open(file_b, 'a', encoding='utf-8-sig', newline='') as g:
            lines = f.readlines()
            if not lines:
                return
            first_line = json.loads(lines[0])
            first_line.pop('mobile', None)
            writer = csv.DictWriter(g, fieldnames=list(first_line.keys()))
            for raw_line in lines:
                line = json.loads(raw_line)
                if self.is_qualified(line.get('nickname')):
                    if line.get('status') == 4:
                        line.pop('mobile', None)
                        writer.writerow(line)

    @staticmethod
    def _extract_mobile(text):
        """Return the last 11-digit run starting with 1 in ``text``, or None.

        The leading ``.*`` is greedy, so the right-most match is captured.
        """
        found = re.match(r'.*(1\d{10}).*', str(text), re.S)
        return found.group(1) if found else None

    def run_g(self):
        """Copy the workbook and add a column with extracted phone numbers.

        The text is read from column index 15 of ``Sheet1`` and the number
        is written into the first column after the existing ones.
        """
        read_workbook = xlrd.open_workbook(
            'C:/Users/百芯科技/scraping/douyin7/第四批_抖音主播_去重后.xlsx')
        write_workbook = copy(read_workbook)
        read_sheet = read_workbook.sheet_by_name('Sheet1')
        write_sheet = write_workbook.get_sheet(0)
        nrows = read_sheet.nrows
        ncolumns = read_sheet.ncols
        # Row 0 is skipped.
        for row in range(1, nrows):
            text = read_sheet.row(row)[15].value
            if not text:
                continue
            mobile = self._extract_mobile(text)
            if mobile:
                write_sheet.write(row, ncolumns, mobile)
        write_workbook.save(
            'C:/Users/百芯科技/scraping/douyin7/第四批_抖音主播_去重后_电话.xlsx')
class GetUserProfile():
    """Download user profiles, keep qualified ones and drop the rest."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.stupid_key_words = STUPID_KEY_WORDS

    def get_users(self):
        """Return the users provided by ``RedisClient.test_b``."""
        return self.redis_client.test_b()

    def get_user_profile(self, sec_user_id):
        """Fetch one profile and append it to today's file if qualified.

        After a successful write ``RedisClient.test_c`` is called with the
        user id.  Failures are logged and swallowed.
        """
        try:
            raw_data = self.get_raw_data.get_user_profile(sec_user_id)
        except Exception as e:
            # str(e) is safe for any exception, unlike e.args[0].
            logger.error('get_user_profile出错-' + str(e) +
                         '-sec_user_id-' + sec_user_id)
            return None
        try:
            user_profile = self.parse_user_profile(raw_data)
        except Exception as e:
            logger.error('parse_user_profile出错-' + str(e) +
                         '-sec_user_id-' + sec_user_id)
            return None
        if user_profile:
            self.write_to_file(json.dumps(user_profile, ensure_ascii=False))
            self.redis_client.test_c(sec_user_id)

    def is_qualified_user(self, user):
        """Return True for unverified individuals with commerce enabled.

        A user is rejected when the nickname contains a blacklisted
        keyword, when any verification field is set, or when one of the
        commerce flags is missing.  Verification fields that are absent
        (``None``) count as "not verified"; the earlier ``!= ''`` check
        rejected, and thereby deleted, such users.
        """
        nickname = user.get('nickname') or ''
        if any(word in nickname for word in self.stupid_key_words):
            return False
        verified = (user.get('is_gov_media_vip')
                    or user.get('enterprise_verify_reason')
                    or user.get('custom_verify'))
        if verified:
            return False
        required_flags = ('with_fusion_shop_entry', 'live_commerce',
                          'with_commerce_entry')
        return all(user.get(flag) for flag in required_flags)

    def parse_user_profile(self, raw_data):
        """Return the flattened profile, or None for unqualified users.

        Unqualified users are deleted from Redis as a side effect.
        """
        data = raw_data.get('user')
        sec_uid = data.get('sec_uid')
        if not self.is_qualified_user(data):
            self.redis_client.delete_users(sec_uid)
            logger.info('删除user-sec_user_id-' + sec_uid)
            return None
        user_profile = {'sec_uid': sec_uid}
        # Fields copied verbatim from the API response.
        for key in ('follower_count', 'nickname', 'gender', 'location',
                    'birthday'):
            user_profile[key] = data.get(key)
        user_profile['avatar_url'] = data.get('avatar_larger').get(
            'url_list')[0]
        for key in ('school_name', 'signature', 'uid', 'short_id',
                    'unique_id'):
            user_profile[key] = data.get(key)
        user_profile['star_atlas'] = data.get('commerce_user_info').get(
            'star_atlas')
        for key in ('aweme_count', 'dongtai_count', 'following_count',
                    'favoriting_count', 'total_favorited', 'live_commerce'):
            user_profile[key] = data.get(key)
        user_profile['create_time'] = str(int(time.time()))
        return user_profile

    def write_to_file(self, user_profile):
        """Append one line to today's ``user_profiles_<date>.txt``."""
        today = time.strftime('%Y%m%d', time.localtime())
        with open(FILE_DIRECTORY + '/' + 'user_profiles' + '_' + today +
                  '.txt',
                  'a',
                  encoding='utf-8') as file:
            file.write(user_profile + '\n')

    def run(self):
        """Process all users one at a time."""
        users = self.get_users()
        logger.info('共有users-' + str(len(users)))
        # Original author's note: batches of 50 returned no data.
        batch_size = 1
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前爬取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_user_profile, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
class CheckQualificationByRankList():
    """Find users that are live right now through the rank-list endpoint."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.live_user_list = []
        self.room_id_list = []
        self.stupid_key_words = STUPID_KEY_WORDS

    def get_users(self):
        """Return the users provided by the Redis client.

        Original author's note: this returns the 10000 lowest-scored users
        on each call (behaviour of RedisClient, not visible here).
        """
        return self.redis_client.get_users()

    def is_live_user(self, sec_user_id):
        """Return whether Redis already lists the user as a live user."""
        return self.redis_client.is_live_user(sec_user_id)

    def save_rooms(self):
        """Add every collected room id to Redis."""
        for each in self.room_id_list:
            self.redis_client.add_rooms(each)

    def add_to_live_users(self):
        """Add every collected user to the live users with value 1."""
        for each in self.live_user_list:
            self.redis_client.add_live_users(each, 1)

    def increase_user_score(self, sec_user_id):
        """Increase the Redis score of ``sec_user_id``."""
        self.redis_client.increase_user_score(sec_user_id)

    def is_qualified_user(self, user):
        """Return False when the nickname contains a blacklisted keyword.

        A missing nickname contains no keyword and therefore qualifies.
        """
        nickname = user.get('nickname') or ''
        return not any(word in nickname for word in self.stupid_key_words)

    def get_rank_list(self, sec_user_id):
        """Check one user and record the room if a broadcast is running.

        Users already known as live only get their score increased.
        Unqualified users are deleted from Redis.  Fetch and parse
        failures are logged and swallowed.
        """
        if self.is_live_user(sec_user_id):
            self.increase_user_score(sec_user_id)
            return None
        try:
            raw_data = self.get_raw_data.get_rank_list(sec_user_id)
        except Exception as e:
            # str(e) is safe for any exception, unlike e.args[0].
            logger.error('get_rank_list出错-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None
        try:
            user = raw_data.get('data').get('anchor_info').get('user')
            # Evaluated inside the try so a missing user is logged rather
            # than crashing the greenlet.
            qualified = self.is_qualified_user(user)
        except Exception as e:
            logger.error('parse_current_room出错' + str(e) +
                         '-sec_user_id-' + sec_user_id)
            return None
        if not qualified:
            self.redis_client.delete_users(sec_user_id)
            logger.info('删除user-sec_user_id-' + sec_user_id)
            return None
        own_room = user.get('own_room')
        if own_room:
            # An own_room entry means the broadcast has started.
            room_id = own_room.get('room_ids_str')[0]
            self.live_user_list.append(sec_user_id)
            self.room_id_list.append(room_id)
            self.increase_user_score(sec_user_id)

    def run(self):
        """Check all users in batches of 50, then persist the findings."""
        users = self.get_users()
        # Original author's note: 80 per batch worked for this endpoint
        # (about 4000 users in total); larger batches were not tried.
        batch_size = 50
        for start in range(0, len(users), batch_size):
            stop = min(start + batch_size, len(users))
            logger.info('当前获取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_rank_list, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
        self.save_rooms()
        self.add_to_live_users()
        logger.info('新增room_id-' + str(len(self.room_id_list)))
        self.room_id_list.clear()
        self.live_user_list.clear()
class GetItemLists():
    """Download the item lists of finished live rooms and persist them."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.item_lists_saved_list = []
        self.item_list = []

    def get_room_sec_ids(self):
        """Return the pending ``<room_id>_<sec_user_id>`` keys from Redis."""
        return self.redis_client.get_item_lists()

    def delete_item_lists(self):
        """Remove every processed key from the pending list in Redis."""
        for each in self.item_lists_saved_list:
            self.redis_client.delete_item_lists(each)

    def save_items(self):
        """Store all collected item keys in Redis in one call."""
        self.redis_client.add_items(self.item_list)

    def get_item_lists(self, room_sec_ids):
        """Fetch, parse and persist the item list of one room.

        ``room_sec_ids`` is split at the first underscore into room id and
        user id.  The key is marked as processed whether or not the room
        had items.  Failures are logged and swallowed.
        """
        room_id, separator, sec_user_id = room_sec_ids.partition('_')
        if not separator:
            # Previously a key without '_' raised IndexError unhandled.
            logger.error('room_sec_ids格式错误-' + room_sec_ids)
            return None
        try:
            item_list_raw_data = self.get_raw_data.get_item_list(
                sec_user_id, room_id)
        except Exception as e:
            # str(e) is safe for any exception, unlike e.args[0].
            logger.error('get_item_list出错-' + str(e) + '-room_sec_ids-' +
                         room_sec_ids)
            return None
        try:
            item_list = self.parse_item_lists(item_list_raw_data, room_id,
                                              sec_user_id)
        except Exception as e:
            logger.error(room_sec_ids + '-parse_item_list失败-' + str(e))
            return None
        if item_list:
            # A non-empty list means this broadcast listed products.
            self.write_to_file(json.dumps(item_list, ensure_ascii=False))
        # Recorded only after the write, so the key is deleted afterwards.
        self.item_lists_saved_list.append(room_sec_ids)

    def parse_item_lists(self, item_list_raw_data, room_id, sec_user_id):
        """Convert raw promotions into flat dicts and return them as a list.

        Every item also appends
        ``<promotion_id>_<room_id>_<sec_user_id>`` to ``self.item_list``.
        """
        item_list = []
        for item in item_list_raw_data.get('promotions'):
            item_info = {
                'room_id': room_id,
                'sec_user_id': sec_user_id,
                'title': item.get('title'),
                'short_title': item.get('short_title'),
                'product_id': item.get('product_id'),
                'promotion_id': item.get('promotion_id'),
                # Source prices are divided by 100.
                'price': item.get('price') / 100,
                'min_price': item.get('min_price') / 100,
                'item_source': item.get('platform_label'),
                'shop_id': item.get('shop_id'),
                'item_type': item.get('item_type'),
                'cover': item.get('cover'),
                'index': item.get('index'),
            }
            coupon_info = item.get('coupons')
            if coupon_info:
                item_info['coupon_tag'] = coupon_info[0].get('tag')
                item_info['coupon_url'] = coupon_info[0].get('coupon_url')
            item_list.append(item_info)
            # str() keeps numeric promotion ids from raising TypeError.
            self.item_list.append(
                str(item_info['promotion_id']) + '_' + room_id + '_' +
                sec_user_id)
        return item_list

    def write_to_file(self, item_list):
        """Append one line to today's ``item_lists_<date>.txt``."""
        today = time.strftime('%Y%m%d', time.localtime())
        with open(FILE_DIRECTORY + '/' + 'item_lists' + '_' + today + '.txt',
                  'a',
                  encoding='utf-8') as file:
            file.write(item_list + '\n')

    def run(self):
        """Process all pending rooms in batches of 200, then update Redis."""
        all_room_sec_ids = self.get_room_sec_ids()
        logger.info('此前已结束直播并需要获取商品信息的直播间数量:' +
                    str(len(all_room_sec_ids)))
        batch_size = 200
        for start in range(0, len(all_room_sec_ids), batch_size):
            stop = min(start + batch_size, len(all_room_sec_ids))
            logger.info('待获取的商品所对应的直播间-' + str(start + 1) + '-' +
                        str(stop))
            tasks = [
                gevent.spawn(self.get_item_lists, room_sec_ids)
                for room_sec_ids in all_room_sec_ids[start:stop]
            ]
            gevent.joinall(tasks)
        logger.info('新获取商品列表/未挂商品的直播间数量-' +
                    str(len(self.item_lists_saved_list)))
        logger.info('新获取商品的数量-' + str(len(self.item_list)))
        # Original author's note: this call may be what slows the run down.
        self.save_items()
        self.delete_item_lists()
        self.item_lists_saved_list.clear()
        self.item_list.clear()
class CheckRooms():
    """Detect stored rooms whose broadcast has started and sells items."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.lives_on_list = []

    def get_rooms(self):
        """Return the rooms selected from Redis by the range (0, 0)."""
        return self.redis_client.get_rooms(0, 0)

    def change_room_status(self):
        """Re-add every newly live room to Redis with value 1."""
        for each in self.lives_on_list:
            self.redis_client.add_rooms(each, 1)

    def check_room(self, room_id):
        """Inspect one room and record it when it is live with promotions.

        Rooms whose owner has fewer than 10000 followers are deleted from
        Redis together with the owner.  Fetch and parse failures are
        logged and swallowed.
        """
        try:
            room_raw_data = self.get_raw_data.get_live(room_id)
        except Exception as e:
            # str(e) is safe for any exception, unlike e.args[0].
            logger.error('get_live出错-' + str(e) + '-room_id-' + room_id)
            return None
        try:
            owner = room_raw_data.get('data').get('owner')
            follower_count = owner.get('follow_info').get('follower_count')
            sec_user_id = owner.get('sec_uid')
        except Exception as e:
            logger.error('解析room_raw_data出错-' + str(e) + '-room_id-' +
                         room_id)
            return None
        if follower_count < 10000:
            self.redis_client.delete_users(sec_user_id)
            self.redis_client.delete_rooms(room_id)
            return None
        if room_raw_data.get('data').get('status') != 2:
            return None
        try:
            # Status 2: check whether this broadcast actually sells items.
            item_list_raw_data = self.get_raw_data.get_item_list(
                sec_user_id, room_id)
        except Exception as e:
            logger.error('get_item_list出错' + str(e) +
                         '-sec_user_id和room_id-' + sec_user_id + '-' +
                         room_id)
            return None
        # Truthiness also covers a missing/None 'promotions' value, which
        # used to raise TypeError in len().
        if item_list_raw_data.get('promotions'):
            self.lives_on_list.append(room_id)

    def run(self):
        """Check all selected rooms in batches of 200, then update Redis."""
        all_room_ids = self.get_rooms()
        logger.info('此前未在直播的直播间数量:' + str(len(all_room_ids)))
        batch_size = 200
        for start in range(0, len(all_room_ids), batch_size):
            stop = min(start + batch_size, len(all_room_ids))
            logger.info('待查看的此前未在直播的直播间-' + str(start + 1) + '-' +
                        str(stop))
            tasks = [
                gevent.spawn(self.check_room, room_id)
                for room_id in all_room_ids[start:stop]
            ]
            gevent.joinall(tasks)
        logger.info('新发现开始的直播数量-' + str(len(self.lives_on_list)))
        self.change_room_status()
        self.lives_on_list.clear()
class GetPreUsers():
    """Discover candidate users through the followings of known users."""

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        self.batch_size = 10
        self.pre_user_list = []
        # Nicknames containing any of these keywords are rejected.
        self.stupid_key_words = [
            '公司', '店', '铺', '厂', '行', '鞋', '装', '市', '服', '饰', '商', '贸',
            '牌', '汇', '馆', '裤', '业', '专', '卖'
        ]

    def get_following(self):
        """Return ``batch_size`` entries fetched one by one from Redis."""
        return [
            self.redis_client.get_following()
            for _ in range(self.batch_size)
        ]

    def add_following_and_pre_users(self):
        """Store every collected id as following and as pre-user (-1)."""
        for pre_user in self.pre_user_list:
            self.redis_client.add_following(pre_user)
            self.redis_client.add_pre_users(pre_user, -1)

    def is_qualified_user(self, user):
        """Return True for unverified users with a clean nickname.

        Verification fields that are absent (``None``) count as "not
        verified"; the earlier ``!= ''`` test rejected such users.  A
        missing nickname no longer raises.
        """
        if user.get('is_gov_media_vip'):
            return False
        if user.get('enterprise_verify_reason'):
            return False
        if user.get('custom_verify'):
            return False
        nickname = user.get('nickname') or ''
        return not any(word in nickname for word in self.stupid_key_words)

    def get_pre_users(self, sec_user_id):
        """Collect the qualified followings of ``sec_user_id``.

        Status code 2096 is logged as "followings not visible".  Fetch
        failures are logged and swallowed.
        """
        try:
            raw_data = self.get_raw_data.get_following(sec_user_id)
        except Exception as e:
            # str(e) is safe for any exception, unlike e.args[0].
            logger.error('get_pre_user出错-' + str(e) + '-sec_user_id-' +
                         sec_user_id)
            return None
        if raw_data.get('status_code') == 2096:
            logger.info('关注不可见-sec_user_id-' + sec_user_id)
            return None
        # A missing list is treated as empty instead of raising.
        for user in raw_data.get('followings') or []:
            if self.is_qualified_user(user):
                self.pre_user_list.append(user.get('sec_uid'))

    def run(self):
        """Process one batch of seed users and store the candidates."""
        batch = self.get_following()
        tasks = [
            gevent.spawn(self.get_pre_users, sec_user_id)
            for sec_user_id in batch
        ]
        gevent.joinall(tasks)
        logger.info('获取到pre_user-' + str(len(self.pre_user_list)))
        self.add_following_and_pre_users()
        self.pre_user_list.clear()
def __init__(self):
    """Create the helper clients and the two empty result lists."""
    raw_client, redis_client = GetRawData(), RedisClient()
    self.get_raw_data = raw_client
    self.redis_client = redis_client
    # Two independent lists, both empty at start.
    self.sec_user_id_list, self.room_id_list = [], []
class GetPreUsers():
    """Expand the candidate-user pool, bounded by size caps in Redis.

    A batch of user ids is taken from Redis, each user's following list is
    fetched, and the followings that pass ``is_qualified_user`` are stored
    back while the following / pre-user counts stay below their caps.
    """

    # Upper bounds for the two Redis collections.
    FOLLOWING_CAP = 100000
    PRE_USER_CAP = 500000
    # A custom verification containing one of these is still accepted.
    ALLOWED_VERIFY_TAGS = ('自媒体', '主播', '视频')

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Number of ids taken from Redis per run.
        self.batch_size = 10
        # sec_uids that passed the filter during the current run.
        self.pre_user_list = []
        # Nickname substrings that disqualify a user.
        self.stupid_key_words = STUPID_KEY_WORDS

    def count_following(self):
        """Return ``redis_client.count_following()``."""
        return self.redis_client.count_following()

    def count_pre_users(self):
        """Return ``redis_client.count_pre_users()``."""
        return self.redis_client.count_pre_users()

    def get_following(self):
        """Return ``batch_size`` results of ``redis_client.get_following()``."""
        return [
            self.redis_client.get_following() for _ in range(self.batch_size)
        ]

    def add_following_and_pre_users(self):
        """Store the collected users while each collection is below its cap.

        The counts are re-read for every user on purpose: the collections
        grow while this loop runs.
        """
        for pre_user in self.pre_user_list:
            if self.count_following() < self.FOLLOWING_CAP:
                self.redis_client.add_following(pre_user)
            if self.count_pre_users() < self.PRE_USER_CAP:
                self.redis_client.add_pre_users(pre_user)

    def is_qualified_user(self, user):
        """Return True when ``user`` looks like an individual creator.

        Rejected: government/media VIPs, enterprise-verified users, users
        whose custom verification mentions none of ``ALLOWED_VERIFY_TAGS``,
        and nicknames containing a blocked keyword. A missing nickname is
        treated as empty instead of raising.
        """
        if user.get('is_gov_media_vip') or user.get('enterprise_verify_reason'):
            return False
        custom_verify = user.get('custom_verify')
        if custom_verify and not any(
                tag in custom_verify for tag in self.ALLOWED_VERIFY_TAGS):
            return False
        nickname = user.get('nickname') or ''
        return not any(word in nickname for word in self.stupid_key_words)

    def get_pre_users(self, sec_user_id):
        """Fetch the followings of ``sec_user_id`` and keep the qualified ones.

        Status code 2096 is logged as "following list not visible". Always
        returns ``None``; request failures are logged.
        """
        try:
            raw_data = self.get_raw_data.get_following(sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or hold a
            # non-string, which made the handler itself raise.
            logger.error('get_pre_user出错-' + str(e) + '-sec_user_id-' +
                         str(sec_user_id))
            return None

        if raw_data.get('status_code') == 2096:
            logger.info('关注不可见-sec_user_id-' + str(sec_user_id))
            return None

        # A missing 'followings' entry is handled like an empty list.
        following_list = raw_data.get('followings') or []
        if not following_list:
            # NOTE: the message announces an exit, but the original exit
            # call was disabled; processing simply continues.
            logger.error('获取不到数据了,程序退出')
        for user in following_list:
            if self.is_qualified_user(user):
                self.pre_user_list.append(user.get('sec_uid'))
        return None

    def run(self):
        """Process one batch unless both collections already hit their caps."""
        if (self.count_following() >= self.FOLLOWING_CAP
                and self.count_pre_users() >= self.PRE_USER_CAP):
            logger.info('已经有太多following或pre_users了')
            return
        tasks = [
            gevent.spawn(self.get_pre_users, sec_user_id)
            for sec_user_id in self.get_following()
        ]
        gevent.joinall(tasks)
        logger.info('获取到pre_user-' + str(len(self.pre_user_list)))
        self.add_following_and_pre_users()
        self.pre_user_list.clear()
class GetRankList():
    """Detect which stored users have started a live room.

    Users are read from Redis with ``get_live_users(0, 0)`` (presumably the
    "not live" status range -- confirm against RedisClient). For every user
    whose rank-list payload carries an ``own_room``, the room id is stored
    and the user is re-added with value ``1``.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Users found live and the matching room ids for the current run.
        self.user_list = []
        self.room_id_list = []

    def get_users(self):
        """Return the users Redis yields for the ``(0, 0)`` range."""
        return self.redis_client.get_live_users(0, 0)

    def save_rooms(self):
        """Add every collected room id to Redis."""
        for room_id in self.room_id_list:
            self.redis_client.add_rooms(room_id)

    def change_user_status(self):
        """Re-add every collected user to Redis with value ``1``."""
        for sec_user_id in self.user_list:
            self.redis_client.add_live_users(sec_user_id, 1)

    def get_rank_list(self, sec_user_id):
        """Record ``sec_user_id`` and its room id when the user is live.

        Always returns ``None``; request and parse failures are logged.
        """
        try:
            raw_data = self.get_raw_data.get_rank_list(sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or hold a
            # non-string, which made the handler itself raise.
            logger.error('get_rank_list出错-' + str(e) + '-sec_user_id-' +
                         str(sec_user_id))
            return None

        try:
            own_room = raw_data.get('data').get('anchor_info').get(
                'user').get('own_room')
        except Exception as e:
            logger.error('parse_current_room出错' + str(e) +
                         '-sec_user_id-' + str(sec_user_id))
            return None

        # 'own_room' is only present once the live session has started.
        if not own_room:
            return None
        room_ids = own_room.get('room_ids_str')
        if not room_ids:
            # No usable room id: handled like "not live" instead of letting
            # the index lookup raise.
            return None
        self.user_list.append(sec_user_id)
        self.room_id_list.append(room_ids[0])
        return None

    def run(self):
        """Check all stored users in batches of 50 and persist the result."""
        users = self.get_users()
        total = len(users)
        logger.info('共有未在直播的users-' + str(total))
        # Batches of 80 are known to work for this endpoint (about 4000
        # users in total); larger batches were never tried.
        batch_size = 50
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)
            logger.info('当前获取用户序号-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_rank_list, sec_user_id)
                for sec_user_id in users[start:stop]
            ]
            gevent.joinall(tasks)
        self.save_rooms()
        self.change_user_status()
        logger.info('新增room_id-' + str(len(self.room_id_list)))
        self.room_id_list.clear()
        self.user_list.clear()
def __init__(self):
    """Instantiate the helpers and the per-run state of the collector."""
    self.get_raw_data = GetRawData()
    self.redis_client = RedisClient()
    # Batch size and the (initially empty) buffer of collected users.
    self.batch_size, self.pre_user_list = 10, []
    # Shared keyword list defined at module level.
    self.stupid_key_words = STUPID_KEY_WORDS
class GetClipsH5():
    """Fetch clip details for stored aweme ids and append them to a file.

    Ids are read from Redis, requested in groups of 20, parsed into flat
    dicts, written as one JSON line per group and then removed from Redis.
    """

    # Number of aweme ids sent per request.
    GROUP_SIZE = 20

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()

    def get_aweme_lists(self):
        """Return the stored aweme ids split into groups of ``GROUP_SIZE``.

        An empty or missing result from Redis yields an empty list.
        """
        awemes = self.redis_client.get_awemes() or []
        return [
            awemes[i:i + self.GROUP_SIZE]
            for i in range(0, len(awemes), self.GROUP_SIZE)
        ]

    def get_clips_h5(self, aweme_list):
        """Fetch, parse and persist one group of awemes.

        The ids are deleted from Redis only after the parsed clips were
        written. Always returns ``None``; failures are logged.
        """
        try:
            # The int conversion sits inside the try so a malformed id is
            # logged instead of killing the greenlet.
            aweme_int_list = [int(aweme) for aweme in aweme_list]
            raw_data = self.get_raw_data.get_clips_h5(aweme_int_list)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or hold a
            # non-string, which made the handler itself raise.
            logger.error('get_clips_h5出错-' + str(e))
            return None

        if raw_data.get('status_code') == 2053:
            logger.error('status_code 2053,一整批都不是视频')
            return None

        try:
            clips = self.parse_clips_h5(raw_data)
        except Exception as e:
            logger.error('parse_clips_h5出错-' + str(e))
            return None

        # The original author no longer remembers why this check exists;
        # possibly some awemes of a group are not videos.
        if not clips:
            logger.error('该组clips数量为0')
            return None

        self.write_to_file(json.dumps(clips, ensure_ascii=False))
        for aweme in aweme_list:
            self.redis_client.delete_awemes(aweme)
        return None

    def parse_clips_h5(self, raw_data):
        """Flatten ``raw_data['item_list']`` into a list of clip dicts.

        Author fields are omitted (and an error is logged) when an item has
        no ``author``. Any malformed item raises; the caller logs it.
        """
        clips = []
        for each in raw_data.get('item_list'):
            video = each.get('video')
            clip = {
                'aweme_share_url': each.get('share_url'),
                'user_id': each.get('author_user_id'),
                'aweme_duration': each.get('duration'),
                'aweme_time': each.get('create_time'),
                'aweme_id': each.get('aweme_id'),
                'aweme_cover': video.get('cover').get('url_list')[0],
                'aweme_url': video.get('play_addr').get('url_list')[0],
            }
            author = each.get('author')
            if author:
                clip['user_nickname'] = author.get('nickname')
                clip['user_avatar'] = author.get('avatar_larger').get(
                    'url_list')[0]
                clip['user_short_id'] = author.get('short_id')
                clip['user_signature'] = author.get('signature')
                clip['user_unique_id'] = author.get('unique_id')
            else:
                logger.error('未获取到author')
            statistics = each.get('statistics')
            clip['comment_count'] = statistics.get('comment_count')
            clip['like_count'] = statistics.get('digg_count')
            clips.append(clip)
        return clips

    def write_to_file(self, clips):
        """Append ``clips`` (a JSON string) as one line to today's file."""
        today = time.strftime('%Y%m%d', time.localtime())
        path = FILE_DIRECTORY + '/' + 'clips_h5' + '_' + today + '.txt'
        with open(path, 'a', encoding='utf-8') as file:
            file.write(clips + '\n')

    def run(self):
        """Process all aweme groups, 200 groups per concurrent batch."""
        aweme_lists = self.get_aweme_lists()
        total = len(aweme_lists)
        logger.info('共有aweme组数:' + str(total))
        batch_size = 200
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)
            logger.info('get_clips_h5爬取当前aweme组序号-' + str(start + 1) +
                        '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_clips_h5, aweme_list)
                for aweme_list in aweme_lists[start:stop]
            ]
            gevent.joinall(tasks)
def __init__(self):
    """Instantiate the GetRawData / RedisClient helpers and keyword list."""
    self.get_raw_data = GetRawData()
    self.redis_client = RedisClient()
    # Shared keyword list defined at module level.
    self.stupid_key_words = STUPID_KEY_WORDS
class GetLives():
    """Snapshot every stored live room and retire the ones that have ended.

    Each room is fetched, flattened into a record and appended to the daily
    ``lives_YYYYMMDD.txt`` file. Rooms reporting status 4 (ended) are then
    removed from Redis and queued for item-list collection.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # '<room_id>_<sec_user_id>' keys of rooms found ended in this run.
        self.lives_off_list = []

    def get_rooms(self):
        """Return ``redis_client.get_rooms()``."""
        return self.redis_client.get_rooms()

    def delete_rooms(self):
        """Retire every ended room collected in ``lives_off_list``.

        For each key the user is re-added with value ``0``, the room id is
        deleted and the full key is added to the item-list collection.
        """
        for key in self.lives_off_list:
            # Split once only: the sec_user_id part may contain '_'.
            room_id, sec_user_id = key.split('_', 1)
            self.redis_client.add_live_users(sec_user_id, 0)
            self.redis_client.delete_rooms(room_id)
            self.redis_client.add_item_lists(key)

    def get_lives(self, room_id):
        """Fetch, record and classify one room.

        Always returns ``None``; request and parse failures are logged.
        """
        try:
            live_raw_data = self.get_raw_data.get_live(room_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or hold a
            # non-string, which made the handler itself raise.
            logger.error('get_live出错-' + str(e) + '-room_id-' + str(room_id))
            return None

        try:
            live_info = self.parse_lives(live_raw_data, room_id)
        except Exception as e:
            logger.error('parse_lives出错-' + str(e) + '-room_id-' +
                         str(room_id))
            return None

        self.write_to_file(live_info)
        # Queue the room for removal only after the record was written.
        if live_info['status'] == 4:
            self.lives_off_list.append(live_info['room_id'] + '_' +
                                       live_info['sec_user_id'])
        return None

    def parse_lives(self, live_raw_data, room_id):
        """Flatten a raw live payload into the record written to disk.

        Key order is kept stable because it is the column order of the
        JSON lines. Malformed payloads raise; the caller logs them.
        """
        data = live_raw_data.get('data')
        owner = data.get('owner')
        stats = data.get('stats')
        return {
            'room_id': room_id,
            'start_time': data.get('create_time'),  # live start time
            'like_count': data.get('like_count'),
            'share_url': data.get('share_url'),
            'title': data.get('title'),
            'status': data.get('status'),  # 2 = live, 4 = ended
            'viewer_count': data.get('user_count'),  # current viewers
            'cover_url': data.get('cover').get('url_list')[0],
            'avatar_url': owner.get('avatar_large').get('url_list')[0],
            'city': owner.get('city'),
            'follower_count': owner.get('follow_info').get('follower_count'),
            'gender': owner.get('gender'),
            'short_id': owner.get('short_id'),  # host short id
            'id': owner.get('id_str'),  # host long id
            'nickname': owner.get('nickname'),
            'signature': owner.get('signature'),
            'sec_user_id': owner.get('sec_uid'),
            'ticket_count': owner.get('ticket_count'),  # host total tickets
            'create_time': str(int(time.time())),  # time of this record
            'fan_ticket': stats.get('fan_ticket'),  # tickets this session
            'follow_count': stats.get('follow_count'),  # follows this session
            'gift_count': stats.get('gift_uv_count'),  # gifts this session
            'total_viewer': stats.get('total_user'),  # total viewers
        }

    def write_to_file(self, live_info):
        """Append ``live_info`` as one JSON line to today's file."""
        today = time.strftime('%Y%m%d', time.localtime())
        path = FILE_DIRECTORY + '/' + 'lives' + '_' + today + '.txt'
        with open(path, 'a', encoding='utf-8') as file:
            file.write(json.dumps(live_info, ensure_ascii=False) + '\n')

    def run(self):
        """Snapshot all stored rooms in batches and retire the ended ones."""
        all_room_ids = self.get_rooms()
        total = len(all_room_ids)
        logger.info('此前正在直播的直播间数量:' + str(total))
        # 200 per batch works locally but not on the cloud phone.
        batch_size = 150
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)
            logger.info('待查看并获取的此前正在直播的直播间-' + str(start + 1) +
                        '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_lives, room_id)
                for room_id in all_room_ids[start:stop]
            ]
            gevent.joinall(tasks)
        logger.info('新发现已结束的直播数量-' + str(len(self.lives_off_list)))
        self.delete_rooms()
        self.lives_off_list.clear()
def __init__(self):
    """Instantiate the GetRawData / RedisClient helpers and empty buffers."""
    self.get_raw_data = GetRawData()
    self.redis_client = RedisClient()
    # Two independent, initially empty buffers.
    self.item_lists_saved_list, self.item_list = [], []
def __init__(self):
    """Instantiate the helpers and the inline keyword list."""
    self.get_raw_data = GetRawData()
    self.redis_client = RedisClient()
    # Keyword list kept inline here; presumably nickname substrings used
    # for filtering, as in the sibling classes -- confirm against the
    # methods of this class.
    self.stupid_key_words = [
        '公司', '店', '铺', '厂', '行', '鞋', '装', '市', '服', '饰',
        '商', '贸', '牌', '汇', '馆', '裤', '业', '专', '卖', '时尚',
        '穿', '搭', '品', '玩具', '语文', '数学', '英语', '科学', '物理',
        '化学', '生物', '政治', '历史',
    ]
def __init__(self):
    """Instantiate the helpers, empty buffers and the keyword list."""
    self.get_raw_data = GetRawData()
    self.redis_client = RedisClient()
    # Two independent, initially empty result buffers.
    self.live_user_list, self.room_id_list = [], []
    # Shared keyword list defined at module level.
    self.stupid_key_words = STUPID_KEY_WORDS
class GetItems():
    """Fetch details of queued promotion items and append them to a file.

    Item keys have the form ``<promotion_id>_<room_id>_<sec_user_id>``.
    Every successfully written item is removed from Redis afterwards.
    """

    def __init__(self):
        self.get_raw_data = GetRawData()
        self.redis_client = RedisClient()
        # Item keys written to disk during the current run.
        self.saved_items_list = []

    def get_item_ids(self):
        """Return ``redis_client.get_items()``."""
        return self.redis_client.get_items()

    def delete_items(self):
        """Delete every saved item key from Redis."""
        for item in self.saved_items_list:
            self.redis_client.delete_items(item)

    def get_items(self, item):
        """Fetch, parse and persist one item key.

        Always returns ``None``; failures (including a malformed key) are
        logged and leave the key in Redis.
        """
        try:
            # Unpacking inside the try: a key without three parts is logged
            # instead of raising out of the greenlet. Split at most twice so
            # a sec_user_id containing '_' stays intact.
            promotion_id, room_id, sec_user_id = item.split('_', 2)
            item_raw_data = self.get_raw_data.get_item(promotion_id)
        except Exception as e:
            # The cause is now part of the message, matching the
            # parse_items handler below.
            logger.error('get_item出错-' + str(e) + '-item-' + str(item))
            return None

        try:
            item_info = self.parse_items(item_raw_data, room_id, sec_user_id)
        except Exception as e:
            # str(e) instead of e.args[0]: args may be empty or hold a
            # non-string, which made the handler itself raise.
            logger.error('parse_items出错-' + str(e) + '-item-' + str(item))
            return None

        self.write_to_file(item_info)
        self.saved_items_list.append(item)
        return None

    def parse_items(self, item_raw_data, room_id, sec_user_id):
        """Flatten a raw item payload into the record written to disk.

        ``item_raw_data['promotion']`` is a JSON string holding a list; its
        first entry is used. Prices are divided by 100 (presumably cents
        to currency units -- confirm); a missing or zero market price is
        stored as ``''``. Malformed payloads raise; the caller logs them.
        """
        promotion = json.loads(item_raw_data.get('promotion'))[0]
        market_price = promotion.get('market_price')
        return {
            'room_id': room_id,
            'sec_user_id': sec_user_id,
            'promotion_id': promotion.get('promotion_id'),
            'product_id': promotion.get('product_id'),
            'title': promotion.get('title'),
            'sales': promotion.get('sales'),
            'detail_url': promotion.get('detail_url'),
            'image_url': promotion.get('images')[0].get('url_list')[0],
            'price': promotion.get('price') / 100,
            'market_price': market_price / 100 if market_price else '',
        }

    def write_to_file(self, item_info):
        """Append ``item_info`` as one JSON line to today's file."""
        today = time.strftime('%Y%m%d', time.localtime())
        path = FILE_DIRECTORY + '/' + 'items' + '_' + today + '.txt'
        with open(path, 'a', encoding='utf-8') as file:
            file.write(json.dumps(item_info, ensure_ascii=False) + '\n')

    def run(self):
        """Process all queued items in batches of 200 and clean up Redis."""
        all_items = self.get_item_ids()
        total = len(all_items)
        logger.info('共有item数量-' + str(total))
        batch_size = 200
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)
            logger.info('当前获取的item-' + str(start + 1) + '-' + str(stop))
            tasks = [
                gevent.spawn(self.get_items, item)
                for item in all_items[start:stop]
            ]
            gevent.joinall(tasks)
        logger.info('获取到items-' + str(len(self.saved_items_list)))
        self.delete_items()
        self.saved_items_list.clear()