def call_crawlers(dvdid_list: list, used_crawlers=None):
    """Scrape movie data with each selected crawler and dump the results.

    For every ID in ``dvdid_list`` each crawler is run against a fresh
    ``MovieInfo``. On success the result is dumped to
    ``{data_dir}{os.sep}{avid} ({name}).json``. After all crawlers have run
    for an ID, a summary of which crawlers succeeded and which failed is
    printed with ``tqdm.write``.

    Args:
        dvdid_list (list): List of movie IDs to scrape.
        used_crawlers (list[str], optional): Names of the crawlers to use,
            looked up in ``all_crawler``. When omitted or empty, every
            crawler in ``all_crawler`` is used.
    """
    if used_crawlers:
        crawlers = {i: all_crawler[i] for i in used_crawlers}
    else:
        crawlers = all_crawler
    outer_bar = tqdm(dvdid_list, desc='抓取影片数据', leave=False)
    for avid in outer_bar:
        success, fail = [], []
        outer_bar.set_description(f'抓取影片数据: {avid}')
        inner_bar = tqdm(crawlers.items(), desc='抓取器', leave=False)
        for name, parser in inner_bar:
            inner_bar.set_description(f'正在抓取{name}'.rjust(10 + len(avid)))
            # A brand-new instance is created on every iteration, so the
            # results of different crawlers cannot influence one another.
            # The 'fanza' crawler receives the ID through the cid keyword.
            if name != 'fanza':
                movie = MovieInfo(avid)
            else:
                movie = MovieInfo(cid=avid)
            try:
                parser(movie)
                path = f"{data_dir}{os.sep}{avid} ({name}).json"
                movie.dump(path)
                success.append(name)
            except Exception:
                # Best effort: any ordinary crawler error marks this crawler
                # as failed. KeyboardInterrupt/SystemExit are deliberately
                # not caught so the user can still abort a long run.
                fail.append(name)
        out = "{} 抓取完成: 成功{}个 {}; 失败{}个 {}".format(
            avid, len(success), ' '.join(success), len(fail), ' '.join(fail))
        tqdm.write(out)
def compare(avid, scraper, file):
    """Compare a locally stored movie record with freshly scraped data.

    A ``MovieInfo`` is loaded from ``file``; a second one is filled in by the
    ``parse_data`` function of the ``web.<scraper>`` module found in
    ``sys.modules``. The attributes of the scraped instance are then checked
    one by one against the stored instance. When a check fails and the
    ``GITHUB_ACTIONS`` environment variable is not set, the scraped data is
    dumped over ``file`` before the ``AssertionError`` is re-raised.

    Args:
        avid: Movie ID; passed as ``cid`` when ``scraper`` is ``'fanza'``.
        scraper: Scraper name, i.e. the suffix of the ``web.<scraper>`` module.
        file: Local data file given to ``MovieInfo(from_file=...)`` and
            overwritten with the scraped data on mismatch (outside CI).

    Raises:
        AssertionError: If any attribute of the scraped data does not match.
    """
    stored = MovieInfo(from_file=file)
    fetched = MovieInfo(cid=avid) if scraper == 'fanza' else MovieInfo(avid)
    sys.modules[f'web.{scraper}'].parse_data(fetched)

    # Compare attribute by attribute so that a failing test points straight
    # at the key whose values differ.
    stored_fields = vars(stored)
    fetched_fields = vars(fetched)
    presence_only = ('score', 'magnet')
    unordered = ('genre', 'genre_id', 'genre_norm', 'actress')
    from_javbus = scraper == 'javbus'
    try:
        for key, fetched_value in fetched_fields.items():
            stored_value = stored_fields.get(key)
            if key in presence_only or (
                    key == 'preview_video' and scraper in ('airav', 'javdb')):
                # These fields may change over time, so it is enough that
                # both sides agree on whether a value is present at all.
                assert bool(fetched_value) == bool(stored_value)
            elif from_javbus and key == 'cover':
                # When JavBus is reached through a proxy-free domain the image
                # URLs use that domain too, so only the path part is compared.
                assert urlsplit(fetched_value).path == urlsplit(stored_value).path
            elif from_javbus and key == 'actress_pics':
                # Same reasoning as for the cover: compare URL paths only.
                stored_paths = {}
                if stored_value:
                    stored_paths = {
                        who: urlsplit(link).path
                        for who, link in stored_value.items()
                    }
                fetched_paths = {}
                if fetched_value:
                    fetched_paths = {
                        who: urlsplit(link).path
                        for who, link in fetched_value.items()
                    }
                assert stored_paths == fetched_paths
            elif key in unordered and isinstance(fetched_value, list):
                # Order carries no meaning for these list fields, so it is
                # ignored when comparing.
                assert sorted(fetched_value) == sorted(stored_fields.get(key, []))
            else:
                assert fetched_value == stored_value
    except AssertionError:
        # On a local run, refresh the existing test data so the differences
        # can be inspected with the version control system.
        if not os.getenv('GITHUB_ACTIONS'):
            fetched.dump(file)
        raise