def translate_movie_info(info: MovieInfo):
    """Translate the movie info according to the configuration"""
    # Translate the title
    if info.title and cfg.Translate.translate_title:
        result = translate(info.title, cfg.Translate.engine, info.actress)
        if 'trans' in result:
            info.ori_title = info.title
            info.title = result['trans']
            # Attach the sentence-break info if it is available
            if 'orig_break' in result:
                setattr(info, 'ori_title_break', result['orig_break'])
            if 'trans_break' in result:
                setattr(info, 'title_break', result['trans_break'])
        else:
            logger.error('Error while translating the title: ' + result['error'])
            return False
    # Translate the plot
    if info.plot and cfg.Translate.translate_plot:
        result = translate(info.plot, cfg.Translate.engine, info.actress)
        if 'trans' in result:
            # Only movies whose plot has been translated may need the ori_plot attribute,
            # so it is added dynamically at runtime instead of being defined on the class
            setattr(info, 'ori_plot', info.plot)
            info.plot = result['trans']
        else:
            logger.error('Error while translating the plot: ' + result['error'])
            return False
    return True
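# Illustrative sketch, not part of the original module: translate_movie_info() above assumes that
# translate() returns a dict with a 'trans' key on success (optionally 'orig_break'/'trans_break')
# or an 'error' key on failure. The stub below is a hypothetical stand-in that mimics that
# contract, e.g. for exercising translate_movie_info() without a real translation engine.
def _fake_translate(text, engine=None, actress=None):
    """Hypothetical stand-in for translate(), returning the result structure assumed above."""
    if not text:
        return {'error': 'empty input'}
    # Echo the input as the "translation"; a real engine would return translated text here
    return {'trans': text}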
def parse_data(movie: MovieInfo):
    """Parse the movie data for the given movie ID"""
    # airav also serves Simplified Chinese, but fetch the Traditional Chinese data to keep
    # actress names etc. as consistent as possible with other sites
    url = f'{base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW'
    resp = request.get(url).json()
    if resp['count'] == 0:
        barcode = search_movie(movie.dvdid)
        if barcode:
            url = f'{base_url}/api/video/barcode/{barcode}?lng=zh-TW'
            resp = request.get(url).json()
        else:
            logger.debug(f"'{movie.dvdid}': no data found on airav")
            return
    # Extract the needed fields from the API response
    # TODO: the data contains more information (e.g. Chinese & Japanese name pairs for the
    # actresses), which may be useful for future features
    data = resp['result']
    dvdid = data['barcode']
    movie.url = base_url + '/video/' + dvdid
    # plot and title may contain HTML escape characters and need to be unescaped
    movie.plot = unescape(data['description']) or None
    movie.cover = data['img_url']
    # airav organizes genres as search keywords, so there is no dedicated genre_id
    movie.genre = [i['name'] for i in data['tags']]
    movie.title = unescape(data['name'])
    movie.actress = [i['name'] for i in data['actors']]
    movie.publish_date = data['publish_date']
    movie.preview_pics = data['images'] or []
    if data['factories']:
        movie.producer = data['factories'][0]['name']

    if cfg.Crawler.hardworking_mode:
        # Note: use the dvdid obtained from the API here, not the incoming movie.dvdid
        # (e.g. '1pondo_012717_472' vs. '012717_472')
        video_url = f"{base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}"
        resp = request.get(video_url).json()
        # On failure the response looks like {'msg': 'fail', 'status': 'fail'}
        if 'data' in resp:
            # Besides 'url' there are also url_cdn, url_hlx and url_hls_cdn fields; the latter
            # two are m3u8 format. Currently 'url' is used as the preview video address
            # TODO: the conventional-format preview of some movies (e.g. 080719-976) is broken
            movie.preview_video = resp['data'].get('url')

    # Some movies on airav are marked as '馬賽克破壞版' (mosaic-destroyed version); for these
    # the title, plot and genre are no longer accurate
    if '馬賽克破壞版' in movie.title or (movie.plot and '馬賽克破壞版' in movie.plot):
        movie.title = None
        movie.plot = None
        movie.genre = None
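# Illustrative sketch, not part of the original module: a minimal, hypothetical example of the
# airav API response fields that parse_data() above reads from resp['result']. The real response
# contains more data, and the values below are placeholders rather than actual API output.
_SAMPLE_AIRAV_RESULT = {
    'barcode': '012717_472',
    'name': 'Sample title',
    'description': 'Sample description, possibly with HTML escapes such as &amp;',
    'img_url': 'https://example.com/cover.jpg',
    'tags': [{'name': 'tag1'}, {'name': 'tag2'}],
    'actors': [{'name': 'actress1'}],
    'publish_date': '2017-01-27',
    'images': [],
    'factories': [{'name': 'studio'}],
    'vid': '12345',
}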
def parse_data_raw(movie: MovieInfo, html):
    """Parse the movie data for the given movie ID"""
    container = html.xpath("//div[@class='section product_layout_01']")[0]
    title = container.xpath("div/h1")[0].text_content().strip()
    cover = container.xpath("div/p/a[@class='sample_image']/@href")[0]
    # Use following-sibling instead of getnext here, because getnext would return empty
    # text nodes such as spaces and tabs
    actress = container.xpath("//dt[text()='出演:']/following-sibling::dd[1]/a/text()")
    # Remove spaces from actress names to keep them consistent with other sites
    actress = [i.replace(' ', '') for i in actress]
    duration_str = container.xpath("//dt[text()='収録時間:']")[0].getnext().text_content()
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    date_str = container.xpath("//dt[text()='発売日:']/following-sibling::dd[1]/a/text()")[0]
    publish_date = date_str.replace('/', '-')
    producer = container.xpath("//dt[text()='メーカー名:']/following-sibling::dd[1]/a/text()")[0]
    dvdid = container.xpath("//dt[text()='品番:']")[0].getnext().text_content()
    genre_tags = container.xpath("//dt[text()='ジャンル:']/following-sibling::dd[1]/a")
    genre, genre_id = [], []
    for tag in genre_tags:
        genre.append(tag.text)
        genre_id.append(tag.get('href').split('=')[-1])
    serial = container.xpath("//dt[text()='レーベル:']/following-sibling::dd[1]/a/text()")[0]
    plot = container.xpath("//h2[text()='レビュー']/following-sibling::p")[0].text.strip()
    preview_pics = container.xpath("//li/a[@class='sample_image']/@href")

    # For movies released since 2016, try to get the high-resolution cover address
    # (not every movie has one, especially the early 2016 releases)
    year = int(publish_date.split('-')[0])
    if year >= 2016:
        # The address looks like '/images/corner/goods/prestige/abp/647/pb_e_abp-647.jpg';
        # removing the '_e' part gives the high-resolution cover
        big_cover = cover.replace('_e_', '_')
        movie.big_cover = big_cover

    movie.title = title
    movie.cover = cover
    movie.actress = actress
    movie.publish_date = publish_date
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.serial = serial
    movie.plot = plot
    movie.preview_pics = preview_pics
    # prestige's servers are in Japan and it sells openly to the domestic Japanese market,
    # so it will not carry uncensored movies
    movie.uncensored = False
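# Illustrative sketch, not part of the original module: the high-resolution cover derivation used
# in parse_data_raw() above. The example path comes from the comment in that function; whether a
# given movie actually has the high-resolution file still has to be verified by requesting it.
def _demo_big_cover():
    cover = '/images/corner/goods/prestige/abp/647/pb_e_abp-647.jpg'
    big_cover = cover.replace('_e_', '_')
    assert big_cover == '/images/corner/goods/prestige/abp/647/pb_abp-647.jpg'
    return big_cover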
def parse_data(movie: MovieInfo):
    """Parse the movie data for the given movie ID"""
    url = f'{base_url}/product/product_detail/{movie.dvdid}/'
    resp = request.get(url)
    # A non-existent URL gets redirected to the home page; a non-empty history means a redirect happened
    if resp.history:
        logger.debug(f"'{movie.dvdid}': no data found on mgstage")
        return

    html = resp2html(resp)
    # mgstage's text contains lots of whitespace ('\n \t'), which has to be stripped
    title = html.xpath("//div[@class='common_detail_cover']/h1/text()")[0].strip()
    container = html.xpath("//div[@class='detail_left']")[0]
    cover = container.xpath("//a[@id='EnlargeImage']/@href")[0]
    # Actresses with a link and actresses that are plain text are matched differently,
    # so match them separately and merge the lists
    actress_text = container.xpath("//th[text()='出演:']/following-sibling::td/text()")
    actress_link = container.xpath("//th[text()='出演:']/following-sibling::td/a/text()")
    actress = [i.strip() for i in actress_text + actress_link]
    actress = [i for i in actress if i]     # remove empty strings
    producer = container.xpath("//th[text()='メーカー:']/following-sibling::td/a/text()")[0].strip()
    duration_str = container.xpath("//th[text()='収録時間:']/following-sibling::td/text()")[0]
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    dvdid = container.xpath("//th[text()='品番:']/following-sibling::td/text()")[0]
    date_str = container.xpath("//th[text()='配信開始日:']/following-sibling::td/text()")[0]
    publish_date = date_str.replace('/', '-')
    serial = container.xpath("//th[text()='シリーズ:']/following-sibling::td/a/text()")[0].strip()
    # label: roughly a line of titles planned under the same numbering scheme, e.g. IDs starting
    # with ABS have the label 'ABSOLUTELY PERFECT'; not used for now
    # label = container.xpath("//th[text()='レーベル:']/following-sibling::td/text()")[0].strip()
    genre_tags = container.xpath("//th[text()='ジャンル:']/following-sibling::td/a")
    genre = [i.text.strip() for i in genre_tags]
    score_str = container.xpath("//td[@class='review']/span")[0].tail.strip()
    match = re.search(r'^[\.\d]+', score_str)
    if match:
        score = float(match.group()) * 2
        movie.score = f'{score:.2f}'
    # The plot may contain nested markup; handle the tags manually to preserve the line breaks
    plots = []
    plot_p_tags = container.xpath("//dl[@id='introduction']/dd/p[not(@class='more')]")
    for p in plot_p_tags:
        children = p.getchildren()
        # No children means the plot has no markup; simply extracting the text is enough
        if not children:
            plots.append(p.text_content())
            continue
        for child in children:
            if child.tag == 'br' and plots[-1] != '\n':
                plots.append('\n')
            else:
                if child.text:
                    plots.append(child.text)
                if child.tail:
                    plots.append(child.tail)
    plot = ''.join(plots).strip()
    preview_pics = container.xpath("//a[@class='sample_image']/@href")

    if cfg.Crawler.hardworking_mode:
        # The preview video is loaded after a button click and is not in the static page
        btn_url = container.xpath("//a[@class='button_sample']/@href")[0]
        video_pid = btn_url.split('/')[-1]
        req_url = f'{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}'
        resp = request.get(req_url).json()
        video_url = resp.get('url')
        if video_url:
            # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX
            preview_video = video_url.split('.ism/')[0] + '.mp4'
            movie.preview_video = preview_video

    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.actress = actress
    movie.producer = producer
    movie.publish_date = publish_date
    movie.serial = serial
    movie.genre = genre
    movie.plot = plot
    movie.preview_pics = preview_pics
    # The servers are in Japan and the site sells openly to the domestic Japanese market,
    # so it will not carry uncensored movies
    movie.uncensored = False
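# Illustrative sketch, not part of the original module: the preview-video URL rewrite used in
# parse_data() above. The sample player returns a streaming URL ending in '.ism/request?...',
# and the code assumes that replacing the '.ism/...' suffix with '.mp4' yields a directly
# playable preview. The path is taken from the comment in parse_data(); uid/pid are placeholders.
def _demo_preview_video_url():
    video_url = '/sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX'
    preview_video = video_url.split('.ism/')[0] + '.mp4'
    assert preview_video == '/sample/shirouto/siro/3093/SIRO-3093_sample.mp4'
    return preview_video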
def parse_data(movie: MovieInfo):
    """Parse the movie data for the given movie ID"""
    url = f'{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/'
    html = request.get_html(url)
    if 'not available in your region' in html.text_content():
        logger.error('FANZA does not allow access from the current IP region. Please check your network and proxy settings')
        return

    title = html.xpath("//h1[@id='title']/text()")[0]
    # Note: browsers automatically insert a 'tbody' element when rendering, but the original
    # HTML page does not contain it, so the xpath has to follow the original page
    container = html.xpath("//table[@class='mg-b12']/tr/td")[0]
    cover = container.xpath("//div[@id='sample-video']/a/@href")[0]
    # Use '配信開始日' as the release date: https://www.zhihu.com/question/57513172/answer/153219083
    date_str = container.xpath("//td[text()='配信開始日:']/following-sibling::td/text()")[0].strip()
    publish_date = date_str.replace('/', '-')
    duration_str = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")[0].strip()
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    # Actress, director, series: when a field is absent the match yields an empty list.
    # No case has been seen so far where the name is not inside an <a> tag
    actress = container.xpath("//span[@id='performer']/a/text()")
    director_tag = container.xpath("//td[text()='監督:']/following-sibling::td/a/text()")
    if director_tag:
        movie.director = director_tag[0].strip()
    serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()")
    if serial_tag:
        movie.serial = serial_tag[0].strip()
    producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()")
    if producer_tag:
        movie.producer = producer_tag[0].strip()
    # label: roughly a line of titles planned under the same numbering scheme, e.g. IDs starting
    # with ABS have the label 'ABSOLUTELY PERFECT'; not used for now
    # label_tag = container.xpath("//td[text()='レーベル:']/following-sibling::td/a/text()")
    # if label_tag:
    #     label = label_tag[0].strip()
    # fanza also writes promotional info into the genre... so filter the tags by the link type they point to
    genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]")
    genre, genre_id = [], []
    for tag in genre_tags:
        genre.append(tag.text.strip())
        genre_id.append(tag.get('href').split('=')[-1].strip('/'))
    cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip()
    plot = container.xpath("//div[@class='mg-b20 lh4']/text()")[0].strip()
    preview_pics = container.xpath("//a[@name='sample-image']/img/@src")
    score_str = container.xpath("//p[@class='d-review__average']/strong/text()")[0].strip()
    match = re.search(r'\d+', score_str)
    if match:
        score = float(match.group()) * 2
        movie.score = f'{score:.2f}'

    if cfg.Crawler.hardworking_mode:
        # The preview video is loaded dynamically and is not in the static page
        video_url = f'{base_url}/service/digitalapi/-/html5_player/=/cid={movie.cid}'
        html2 = request.get_html(video_url)
        # JS is only needed in a few places, so instead of a dedicated JS evaluation module,
        # extract the text with a regex and parse the data with json
        script = html2.xpath("//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()")[0].strip()
        match = re.search(r'\{.*\}', script)
        # Mainly to catch json.loads exceptions, but the try-except also checks whether the regex matched
        try:
            data = json.loads(match.group())
            video_url = data.get('src')
            if video_url and video_url.startswith('//'):
                video_url = 'https:' + video_url
            movie.preview_video = video_url
        except Exception as e:
            logger.debug('Exception while parsing the preview video address: ' + repr(e))

    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.publish_date = publish_date
    movie.actress = actress
    movie.genre = genre
    movie.genre_id = genre_id
    movie.plot = plot
    movie.preview_pics = preview_pics
    # The servers are in Japan and the site sells openly to the domestic Japanese market,
    # so it will not carry uncensored movies
    movie.uncensored = False
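# Illustrative sketch, not part of the original module: the player-config extraction used in
# parse_data() above, which pulls a JSON object out of the html5 player <script> with a regex
# instead of a JS engine. The script text below is a simplified, hypothetical example; the real
# page embeds far more fields than 'src'.
def _demo_extract_player_src():
    script = 'var p = document.getElementById("dmmplayer"); setup({"src": "//example.com/sample_video.mp4", "autoplay": false});'
    match = re.search(r'\{.*\}', script)
    data = json.loads(match.group())
    video_url = data.get('src')
    if video_url and video_url.startswith('//'):
        video_url = 'https:' + video_url
    return video_url    # -> 'https://example.com/sample_video.mp4'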
def parse_data(movie: MovieInfo):
    """Parse the movie data for the given movie ID"""
    html = post_html(f'{base_url}/search', data={'sn': movie.dvdid})
    page_url = html.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0]
    # TODO: note that cid is a dmm concept. If the movie comes from MGSTAGE, the cid here is
    # most likely added by jav321 itself, e.g. 345SIMM-542
    cid = page_url.split('/')[-1]   # /video/ipx00177
    # If the cid matched from the URL is 'search', we are still on the search page and the movie cannot be found
    if cid == 'search':
        return
    title = html.xpath("//div[@class='panel-heading']/h3/text()")[0]
    info = html.xpath("//div[@class='col-md-9']")[0]
    # jav321 has no obvious separator between the info fields; the target tags can only be matched by their URLs
    producer = info.xpath("a[contains(@href,'/company/')]/text()")[0]
    # actress, actress_pics
    actress, actress_pics = [], {}
    actress_tags = html.xpath("//div[@class='thumbnail']/a[contains(@href,'/star/')]/img")
    for tag in actress_tags:
        name = tag.tail.strip()
        pic_url = tag.get('src')
        actress.append(name)
        # jav321's actress avatars are handled carelessly: even when an actress has no avatar
        # there is still a plausible-looking URL, so the URL cannot be used to tell whether the
        # avatar image is valid. Avoid jav321's actress avatar data when other sources are available
        actress_pics[name] = pic_url
    # genre, genre_id
    genre_tags = info.xpath("a[contains(@href,'/genre/')]")
    genre, genre_id = [], []
    for tag in genre_tags:
        genre.append(tag.text)
        genre_id.append(tag.get('href').split('/')[-2])     # genre/4025/1
    dvdid = info.xpath("b[text()='品番']")[0].tail.replace(': ', '').upper()
    publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(': ', '')
    duration_str = info.xpath("b[text()='収録時間']")[0].tail
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    # Only some movies have a rating, and only as a star level without a numeric score; it has
    # to be derived from the star image, e.g. '/img/35.gif' means 3.5 stars
    score_tag = info.xpath("//b[text()='平均評価']/following-sibling::img/@data-original")
    if score_tag:
        score = int(score_tag[0][5:7]) / 5  # /10*2
        movie.score = str(score)
    serial_tag = info.xpath("a[contains(@href,'/series/')]/text()")
    if serial_tag:
        movie.serial = serial_tag[0]
    preview_video_tag = info.xpath("//video/source/@src")
    if preview_video_tag:
        movie.preview_video = preview_video_tag[0]
    plot_tag = info.xpath("//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()")
    if plot_tag:
        movie.plot = plot_tag[0]
    preview_pics = html.xpath("//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src")
    # Magnet and ed2k links are loaded by JS scripts and cannot be parsed from the static page

    movie.url = page_url
    movie.cid = cid
    movie.title = title
    movie.actress = actress
    movie.actress_pics = actress_pics
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.publish_date = publish_date
    # The first image of preview_pics is always the cover; the rest are the preview images
    movie.cover = preview_pics[0]
    movie.preview_pics = preview_pics[1:]
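# Illustrative sketch, not part of the original module: the star-rating conversion used in
# parse_data() above. jav321 exposes the rating only as a star image such as '/img/35.gif'
# (3.5 stars out of 5), and the digits are sliced out of the filename to get a 10-point score.
def _demo_star_score():
    data_original = '/img/35.gif'           # example filename from the comment above
    score = int(data_original[5:7]) / 5     # '35' -> 35 / 5 = 7.0 on a 10-point scale
    assert str(score) == '7.0'
    return score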