# Example #1
# 0
def call_crawlers(dvdid_list: list, used_crawlers=None):
    """Crawl movie data for every ID using the selected crawlers.

    Each (movie, crawler) result is dumped to a per-crawler JSON file under
    ``data_dir``; a per-movie success/failure summary is printed via tqdm.

    Args:
        dvdid_list (list): list of movie IDs to crawl
        used_crawlers (list[str], optional): names of the crawlers to use;
            all crawlers in ``all_crawler`` are used when not specified
    """
    if used_crawlers:
        crawlers = {i: all_crawler[i] for i in used_crawlers}
    else:
        crawlers = all_crawler
    outer_bar = tqdm(dvdid_list, desc='抓取影片数据', leave=False)
    for avid in outer_bar:
        success, fail = [], []
        outer_bar.set_description(f'抓取影片数据: {avid}')
        inner_bar = tqdm(crawlers.items(), desc='抓取器', leave=False)
        for name, parser in inner_bar:
            inner_bar.set_description(f'正在抓取{name}'.rjust(10+len(avid)))
            # A brand-new instance per crawler, so results of different
            # crawlers cannot affect each other.
            if name != 'fanza':
                movie = MovieInfo(avid)
            else:
                # fanza looks movies up by cid rather than the regular avid
                movie = MovieInfo(cid=avid)
            try:
                parser(movie)
                path = f"{data_dir}{os.sep}{avid} ({name}).json"
                movie.dump(path)
                success.append(name)
            except Exception:
                # Best-effort aggregation: a failing crawler only records
                # itself as failed. Was a bare `except:`, which would also
                # swallow KeyboardInterrupt/SystemExit and make the loop
                # impossible to interrupt.
                fail.append(name)
        out = "{} 抓取完成: 成功{}个 {}; 失败{}个 {}".format(avid, len(success), ' '.join(success), len(fail), ' '.join(fail))
        tqdm.write(out)
# Example #2
# 0
def compare(avid, scraper, file):
    """Build a MovieInfo from a local data file and diff it against freshly crawled data."""
    local = MovieInfo(from_file=file)
    # fanza identifies movies by cid instead of the regular avid
    online = MovieInfo(cid=avid) if scraper == 'fanza' else MovieInfo(avid)
    parse_data = getattr(sys.modules[f'web.{scraper}'], 'parse_data')
    parse_data(online)
    # Unpack both objects and compare attribute by attribute, so a failing
    # test immediately pinpoints which key is unequal.
    local_vars = vars(local)
    online_vars = vars(online)
    try:
        for key, online_val in online_vars.items():
            local_val = local_vars.get(key, None)
            if key in ('score', 'magnet'):
                # These fields drift over time; only require that both sides
                # agree on presence/absence of a value.
                assert bool(online_val) == bool(local_val)
            elif key == 'preview_video' and scraper in ('airav', 'javdb'):
                assert bool(online_val) == bool(local_val)
            elif key == 'cover' and scraper == 'javbus':
                # With JavBus proxy-free domains the image host varies too,
                # so only the URL path is compared.
                assert urlsplit(online_val).path == urlsplit(local_val).path
            elif key == 'actress_pics' and scraper == 'javbus':
                local_paths, online_paths = {}, {}
                local_pics = local_vars.get('actress_pics')
                if local_pics:
                    local_paths = {name: urlsplit(url).path for name, url in local_pics.items()}
                if online_val:
                    online_paths = {name: urlsplit(url).path for name, url in online_val.items()}
                assert local_paths == online_paths
            elif key in ('genre', 'genre_id', 'genre_norm', 'actress'):
                # Order carries no meaning for these list fields, so compare
                # them as sorted sequences.
                if isinstance(online_val, list):
                    assert sorted(online_val) == sorted(local_vars.get(key, []))
                else:
                    assert online_val == local_val
            else:
                assert online_val == local_val
    except AssertionError:
        # When run locally (not in CI), refresh the stored test data so the
        # version control diff shows exactly which fields changed.
        if not os.getenv('GITHUB_ACTIONS'):
            online.dump(file)
        raise