Example #1
0
def translate_movie_info(info: MovieInfo):
    """Translate the movie's title and plot in place, as enabled by the config.

    The original title is kept in ``info.ori_title`` and the original plot in a
    dynamically added ``info.ori_plot`` attribute.

    Returns:
        True when every requested translation succeeded (or none was requested),
        False as soon as one translation result carries no 'trans' entry.
    """
    # Title
    if info.title and cfg.Translate.translate_title:
        title_result = translate(info.title, cfg.Translate.engine, info.actress)
        if 'trans' not in title_result:
            logger.error('翻译标题时出错: ' + title_result['error'])
            return False
        info.ori_title = info.title
        info.title = title_result['trans']
        # Attach sentence-break information when the result provides it
        break_fields = (
            ('orig_break', 'ori_title_break'),
            ('trans_break', 'title_break'),
        )
        for result_key, attr_name in break_fields:
            if result_key in title_result:
                setattr(info, attr_name, title_result[result_key])
    # Plot
    if info.plot and cfg.Translate.translate_plot:
        plot_result = translate(info.plot, cfg.Translate.engine, info.actress)
        if 'trans' not in plot_result:
            logger.error('翻译简介时出错: ' + plot_result['error'])
            return False
        # Only movies whose plot was translated can need ori_plot, so it is added
        # at runtime rather than declared in the type definition
        setattr(info, 'ori_plot', info.plot)
        info.plot = plot_result['trans']
    return True
Example #2
0
def parse_data(movie: MovieInfo):
    """Query the airav API for ``movie.dvdid`` and store the parsed fields on ``movie``.

    If the direct lookup reports zero results, ``search_movie`` is tried to find
    a barcode; when that also yields nothing, a debug message is logged and the
    function returns without modifying ``movie``.
    """
    # airav also serves Simplified Chinese, but the Traditional Chinese data is
    # fetched so that actress names etc. stay consistent with other sites
    api_url = f'{base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW'
    resp = request.get(api_url).json()
    if resp['count'] == 0:
        barcode = search_movie(movie.dvdid)
        if not barcode:
            logger.debug(f"'{movie.dvdid}': airav无资源")
            return
        api_url = f'{base_url}/api/video/barcode/{barcode}?lng=zh-TW'
        resp = request.get(api_url).json()

    # Pick the needed fields out of the API response
    # TODO: the data holds more information (e.g. Chinese/Japanese actress name
    # pairs) that may help future features
    data = resp['result']
    dvdid = data['barcode']
    movie.url = base_url + '/video/' + dvdid
    # plot and title may contain HTML escape sequences and must be unescaped
    movie.plot = unescape(data['description']) or None
    movie.cover = data['img_url']
    # airav organises genres as search keywords; there is no dedicated genre_id
    movie.genre = [tag['name'] for tag in data['tags']]
    movie.title = unescape(data['name'])
    movie.actress = [actor['name'] for actor in data['actors']]
    movie.publish_date = data['publish_date']
    movie.preview_pics = data['images'] or []
    factories = data['factories']
    if factories:
        movie.producer = factories[0]['name']

    if cfg.Crawler.hardworking_mode:
        # Note: the dvdid returned by the API is used here, not the incoming
        # movie.dvdid (e.g. '1pondo_012717_472' versus '012717_472')
        media_url = f"{base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}"
        media = request.get(media_url).json()
        # On failure the response looks like {'msg': 'fail', 'status': 'fail'}
        if 'data' in media:
            # Besides url there are url_cdn, url_hlx and url_hls_cdn; the latter two
            # are m3u8. For now url is used as the preview video address
            # TODO: the classic-format preview is wrong for some movies (e.g. 080719-976)
            movie.preview_video = media['data'].get('url')

    # Some airav entries are tagged '馬賽克破壞版'; their title, plot and genre
    # are no longer accurate, so they are discarded
    marker = '馬賽克破壞版'
    if marker in movie.title or (movie.plot and marker in movie.plot):
        movie.title = movie.plot = movie.genre = None
Example #3
0
def parse_data_raw(movie: MovieInfo, html):
    """Extract movie metadata from an already parsed product page into ``movie``.

    ``html`` is queried with XPath; most fields are taken with ``[0]`` and
    therefore raise IndexError when the page lacks them.
    """
    box = html.xpath("//div[@class='section product_layout_01']")[0]
    title = box.xpath("div/h1")[0].text_content().strip()
    cover = box.xpath("div/p/a[@class='sample_image']/@href")[0]
    # following-sibling is used instead of getnext here, because getnext would
    # pick up whitespace-only text such as spaces and tabs.
    # Spaces inside actress names are removed to match the other sites.
    actress = [
        name.replace(' ', '')
        for name in box.xpath("//dt[text()='出演:']/following-sibling::dd[1]/a/text()")
    ]
    duration_text = box.xpath("//dt[text()='収録時間:']")[0].getnext().text_content()
    duration_match = re.search(r'\d+', duration_text)
    if duration_match:
        movie.duration = duration_match.group(0)
    publish_date = box.xpath(
        "//dt[text()='発売日:']/following-sibling::dd[1]/a/text()")[0].replace('/', '-')
    producer = box.xpath("//dt[text()='メーカー名:']/following-sibling::dd[1]/a/text()")[0]
    # Not used below; the lookup is kept so a page without this field still fails here
    dvdid = box.xpath("//dt[text()='品番:']")[0].getnext().text_content()
    genre_links = box.xpath("//dt[text()='ジャンル:']/following-sibling::dd[1]/a")
    genre = [link.text for link in genre_links]
    genre_id = [link.get('href').split('=')[-1] for link in genre_links]
    serial = box.xpath("//dt[text()='レーベル:']/following-sibling::dd[1]/a/text()")[0]
    plot = box.xpath("//h2[text()='レビュー']/following-sibling::p")[0].text.strip()
    preview_pics = box.xpath("//li/a[@class='sample_image']/@href")

    # For movies from 2016 on, try to derive the high-resolution cover address
    # (not every movie has one, especially in early 2016)
    if int(publish_date.split('-')[0]) >= 2016:
        # For an address like '/images/corner/goods/prestige/abp/647/pb_e_abp-647.jpg',
        # removing the '_e' gives the high-resolution cover
        movie.big_cover = cover.replace('_e_', '_')

    movie.title = title
    movie.cover = cover
    movie.actress = actress
    movie.publish_date = publish_date
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.serial = serial
    movie.plot = plot
    movie.preview_pics = preview_pics
    # prestige's servers are in Japan and it sells publicly to the domestic
    # Japanese market, so it will not carry uncensored titles
    movie.uncensored = False
Example #4
0
def parse_data(movie: MovieInfo):
    """Scrape the mgstage product page for ``movie.dvdid`` and fill in ``movie``.

    When the request was redirected (per the site behaviour noted below, an
    unknown product URL is redirected to the home page), a debug message is
    logged and the function returns without modifying ``movie``.

    Fields read with ``[0]`` raise IndexError when the page lacks them.
    """
    url = f'{base_url}/product/product_detail/{movie.dvdid}/'
    resp = request.get(url)
    # A non-existent url is redirected to the home page; a non-empty history
    # means a redirect happened
    if resp.history:
        logger.debug(f"'{movie.dvdid}': mgstage无资源")
        return
    html = resp2html(resp)
    # mgstage text contains lots of whitespace ('\n \t'), which strip removes
    title = html.xpath(
        "//div[@class='common_detail_cover']/h1/text()")[0].strip()
    container = html.xpath("//div[@class='detail_left']")[0]
    cover = container.xpath("//a[@id='EnlargeImage']/@href")[0]
    # Actresses with a link and actresses given as plain text need different
    # expressions, so both are matched separately and the lists are merged
    actress_text = container.xpath(
        "//th[text()='出演:']/following-sibling::td/text()")
    actress_link = container.xpath(
        "//th[text()='出演:']/following-sibling::td/a/text()")
    actress = [i.strip() for i in actress_text + actress_link]
    actress = [i for i in actress if i]  # drop empty strings
    producer = container.xpath(
        "//th[text()='メーカー:']/following-sibling::td/a/text()")[0].strip()
    duration_str = container.xpath(
        "//th[text()='収録時間:']/following-sibling::td/text()")[0]
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    dvdid = container.xpath(
        "//th[text()='品番:']/following-sibling::td/text()")[0]
    date_str = container.xpath(
        "//th[text()='配信開始日:']/following-sibling::td/text()")[0]
    publish_date = date_str.replace('/', '-')
    serial = container.xpath(
        "//th[text()='シリーズ:']/following-sibling::td/a/text()")[0].strip()
    # label: roughly, a planned series sharing the same ID prefix (e.g. IDs starting
    # with ABS have the label 'ABSOLUTELY PERFECT'); not needed for now
    # label = container.xpath("//th[text()='レーベル:']/following-sibling::td/text()")[0].strip()
    genre_tags = container.xpath(
        "//th[text()='ジャンル:']/following-sibling::td/a")
    genre = [i.text.strip() for i in genre_tags]
    score_str = container.xpath("//td[@class='review']/span")[0].tail.strip()
    match = re.search(r'^[\.\d]+', score_str)
    if match:
        # The matched rating is doubled before being stored
        score = float(match.group()) * 2
        movie.score = f'{score:.2f}'
    # The plot may contain nested markup; to keep its line breaks, the tags
    # inside it are processed by hand
    plots = []
    plot_p_tags = container.xpath(
        "//dl[@id='introduction']/dd/p[not(@class='more')]")
    for p in plot_p_tags:
        children = p.getchildren()
        # No children means the paragraph has no markup; plain text extraction suffices
        if not children:
            plots.append(p.text_content())
            continue
        for child in children:
            # A <br> adds a newline unless nothing has been collected yet or the
            # collected text already ends with one. `plots` can still be empty
            # here (first paragraph starting with <br>), so it is checked before
            # indexing to avoid an IndexError.
            # NOTE(review): when a newline is emitted, that <br>'s tail text is
            # not appended — confirm this is intended.
            if child.tag == 'br' and plots and plots[-1] != '\n':
                plots.append('\n')
            else:
                if child.text:
                    plots.append(child.text)
                if child.tail:
                    plots.append(child.tail)
    plot = ''.join(plots).strip()
    preview_pics = container.xpath("//a[@class='sample_image']/@href")

    if cfg.Crawler.hardworking_mode:
        # The preview video is loaded after a button click; it is not in the static page
        btn_url = container.xpath("//a[@class='button_sample']/@href")[0]
        video_pid = btn_url.split('/')[-1]
        req_url = f'{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}'
        resp = request.get(req_url).json()
        video_url = resp.get('url')
        if video_url:
            # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX
            preview_video = video_url.split('.ism/')[0] + '.mp4'
            movie.preview_video = preview_video

    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.actress = actress
    movie.producer = producer
    movie.publish_date = publish_date
    movie.serial = serial
    movie.genre = genre
    movie.plot = plot
    movie.preview_pics = preview_pics
    # The servers are in Japan and sales target the domestic Japanese market,
    # so no uncensored titles are included
    movie.uncensored = False
Example #5
0
def parse_data(movie: MovieInfo):
    """Scrape the FANZA detail page for ``movie.cid`` and fill in ``movie``.

    When the page reports that the region is not allowed, an error is logged
    and the function returns without modifying ``movie``. Director, series and
    producer are optional; the other fields read with ``[0]`` raise IndexError
    when the page lacks them.
    """
    url = f'{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/'
    html = request.get_html(url)
    if 'not available in your region' in html.text_content():
        logger.error('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置')
        return
    title = html.xpath("//h1[@id='title']/text()")[0]
    # Note: browsers insert a 'tbody' element when rendering, but the raw html does
    # not have it, so the xpath has to follow the raw page
    container = html.xpath("//table[@class='mg-b12']/tr/td")[0]
    cover = container.xpath("//div[@id='sample-video']/a/@href")[0]
    # '配信開始日' is used as the publish date: https://www.zhihu.com/question/57513172/answer/153219083
    date_str = container.xpath(
        "//td[text()='配信開始日:']/following-sibling::td/text()")[0].strip()
    publish_date = date_str.replace('/', '-')
    duration_str = container.xpath(
        "//td[text()='収録時間:']/following-sibling::td/text()")[0].strip()
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    # Actress, director, series: a missing field yields an empty list. No case has
    # been seen so far where a name is not shown inside an <a> tag
    actress = container.xpath("//span[@id='performer']/a/text()")
    director_tag = container.xpath(
        "//td[text()='監督:']/following-sibling::td/a/text()")
    if director_tag:
        movie.director = director_tag[0].strip()
    serial_tag = container.xpath(
        "//td[text()='シリーズ:']/following-sibling::td/a/text()")
    if serial_tag:
        movie.serial = serial_tag[0].strip()
    producer_tag = container.xpath(
        "//td[text()='メーカー:']/following-sibling::td/a/text()")
    if producer_tag:
        movie.producer = producer_tag[0].strip()
    # label: roughly, a planned series sharing the same ID prefix (e.g. IDs starting
    # with ABS have the label 'ABSOLUTELY PERFECT'); not needed for now
    # label_tag = container.xpath("//td[text()='レーベル:']/following-sibling::td/a/text()")
    # if label_tag:
    #     label = label_tag[0].strip()
    # fanza also writes promotion info into the genre row, so tags are filtered
    # by the kind of link they point to
    genre_tags = container.xpath(
        "//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]"
    )
    genre, genre_id = [], []
    for tag in genre_tags:
        genre.append(tag.text.strip())
        genre_id.append(tag.get('href').split('=')[-1].strip('/'))
    cid = container.xpath(
        "//td[text()='品番:']/following-sibling::td/text()")[0].strip()
    plot = container.xpath("//div[@class='mg-b20 lh4']/text()")[0].strip()
    preview_pics = container.xpath("//a[@name='sample-image']/img/@src")
    score_str = container.xpath(
        "//p[@class='d-review__average']/strong/text()")[0].strip()
    # Accept an optional fractional part so a rating such as '4.5' is not
    # truncated to 4; integer-only ratings match exactly as before
    match = re.search(r'\d+(?:\.\d+)?', score_str)
    if match:
        # The matched rating is doubled before being stored
        score = float(match.group()) * 2
        movie.score = f'{score:.2f}'

    if cfg.Crawler.hardworking_mode:
        # The preview video is loaded dynamically; it is not in the static page
        video_url = f'{base_url}/service/digitalapi/-/html5_player/=/cid={movie.cid}'
        html2 = request.get_html(video_url)
        # js is needed in few places so far, so no dedicated js evaluation module is
        # used: the text is extracted with a regex and then parsed as json
        script = html2.xpath(
            "//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()"
        )[0].strip()
        match = re.search(r'\{.*\}', script)
        # Mainly meant to catch json.loads errors, but the try-except also covers
        # the case where the regex did not match (match is None)
        try:
            data = json.loads(match.group())
            video_url = data.get('src')
            if video_url and video_url.startswith('//'):
                video_url = 'https:' + video_url
            movie.preview_video = video_url
        except Exception as e:
            logger.debug('解析视频地址时异常: ' + repr(e))

    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.publish_date = publish_date
    movie.actress = actress
    movie.genre = genre
    movie.genre_id = genre_id
    movie.plot = plot
    movie.preview_pics = preview_pics
    # The servers are in Japan and sales target the domestic Japanese market,
    # so no uncensored titles are included
    movie.uncensored = False
Example #6
0
def parse_data(movie: MovieInfo):
    """Search jav321 for ``movie.dvdid`` and store the parsed fields on ``movie``.

    Returns early, leaving ``movie`` untouched, when the id taken from the result
    link is 'search' (the response is still the search page).
    """
    html = post_html(f'{base_url}/search', data={'sn': movie.dvdid})
    page_url = html.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0]
    # TODO: note that cid is a dmm concept. For a movie from MGSTAGE the cid here was
    # probably made up by jav321 itself, e.g. 345SIMM-542
    cid = page_url.split('/')[-1]  # /video/ipx00177
    # A cid of 'search' means we are still on the search page: movie not found
    if cid == 'search':
        return
    title = html.xpath("//div[@class='panel-heading']/h3/text()")[0]
    info = html.xpath("//div[@class='col-md-9']")[0]
    # jav321 has no clear separators between its info fields, so the target tags
    # can only be matched through their urls
    producer = info.xpath("a[contains(@href,'/company/')]/text()")[0]
    # Actress names and their portrait urls
    actress, actress_pics = [], {}
    star_imgs = html.xpath(
        "//div[@class='thumbnail']/a[contains(@href,'/star/')]/img")
    for img in star_imgs:
        star_name = img.tail.strip()
        actress.append(star_name)
        # jav321's actress portraits are perfunctory: even an actress without a
        # portrait gets a plausible-looking url, so the url cannot tell whether the
        # image is valid. Prefer another source for portraits when one exists
        actress_pics[star_name] = img.get('src')
    # Genre names and ids
    genre_links = info.xpath("a[contains(@href,'/genre/')]")
    genre = [link.text for link in genre_links]
    genre_id = [link.get('href').split('/')[-2] for link in genre_links]  # genre/4025/1
    # Not used below; the lookup is kept so a page without this field still fails here
    dvdid = info.xpath("b[text()='品番']")[0].tail.replace(': ', '').upper()
    publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(': ', '')
    duration_text = info.xpath("b[text()='収録時間']")[0].tail
    duration_match = re.search(r'\d+', duration_text)
    if duration_match:
        movie.duration = duration_match.group(0)
    # Only some movies are rated, and only as coarse stars shown through an image:
    # '/img/35.gif' stands for 3.5 stars
    star_img_urls = info.xpath(
        "//b[text()='平均評価']/following-sibling::img/@data-original")
    if star_img_urls:
        # two digits from the image url, divided by 5 (i.e. /10*2)
        movie.score = str(int(star_img_urls[0][5:7]) / 5)
    series_names = info.xpath("a[contains(@href,'/series/')]/text()")
    if series_names:
        movie.serial = series_names[0]
    video_sources = info.xpath("//video/source/@src")
    if video_sources:
        movie.preview_video = video_sources[0]
    plot_texts = info.xpath(
        "//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()"
    )
    if plot_texts:
        movie.plot = plot_texts[0]
    pictures = html.xpath(
        "//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src"
    )
    # Magnet and ed2k links are loaded by js and cannot be parsed from the static page

    movie.url = page_url
    movie.cid = cid
    movie.title = title
    movie.actress = actress
    movie.actress_pics = actress_pics
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.publish_date = publish_date
    # The first picture is always the cover; the remaining ones are the previews
    movie.cover = pictures[0]
    movie.preview_pics = pictures[1:]