def parse_data(movie: MovieInfo):
    """Parse the movie data for the given ID from the local fc2fan mirror."""
    mirror_file = f'{base_path}/{movie.dvdid}.html'
    if not os.path.exists(mirror_file):
        logger.debug(f"未找到fc2fan镜像网页: '{mirror_file}'")
        return
    doc = lxml.html.parse(mirror_file)
    container = doc.xpath("//div[@class='col-sm-8']")[0]
    title = container.xpath("h3/text()")[0]
    score_text = container.xpath("h5/strong[text()='影片评分']")[0].tail.strip()
    score_match = re.search(r'\d+', score_text)
    if score_match:
        # the fc2fan site owner rates on a 100-point scale; convert to 10 points
        movie.score = f'{int(score_match.group()) / 10:.1f}'
    resource_text = container.xpath("h5/strong[text()='资源参数']")[0].tail
    if '无码' in resource_text:
        movie.uncensored = True
    elif '有码' in resource_text:
        movie.uncensored = False
    # FC2 makes no maker/publisher distinction; being a personal marketplace,
    # the seller is the closest thing to a producer
    seller = container.xpath("h5/strong[text()='卖家信息']")[0].getnext().text.strip()
    tags = container.xpath("h5/strong[text()='影片标签']/../a/text()")
    performers = container.xpath("h5/strong[text()='女优名字']/../a/text()")
    pics = container.xpath("//ul[@class='slides']/li/img/@src")
    pics = [os.path.normpath(os.path.join(base_path, p)) for p in pics]
    # big_preview = container.xpath("//img[@id='thumbpic']/../@href")[0]  # real screenshot, unused for now
    movie.title = title
    movie.genre = tags
    movie.actress = performers
    movie.producer = seller
    if pics:
        movie.preview_pics = pics
        movie.cover = pics[0]
def parse_data_raw(movie: MovieInfo, html):
    """Parse the movie data for the given ID out of a pre-fetched JavBus page."""
    container = html.xpath("/html/body/div[@class='container']")[0]
    info = container.xpath("//div[@class='col-md-3 info']")[0]
    raw_title = container.xpath("h3/text()")[0]
    cover_url = container.xpath("//a[@class='bigImage']/img/@src")[0]
    sample_pics = container.xpath("//div[@id='sample-waterfall']/a/@href")
    avid = info.xpath("p/span[text()='識別碼:']")[0].getnext().text
    date_text = info.xpath("p/span[text()='發行日期:']")[0].tail.strip()
    minutes = info.xpath("p/span[text()='長度:']")[0].tail.replace('分鐘', '').strip()
    # an xpath with no match yields an empty list
    director_tag = info.xpath("p/span[text()='導演:']")
    if director_tag:
        movie.director = director_tag[0].getnext().text.strip()
    maker = info.xpath("p/span[text()='製作商:']")[0].getnext().text.strip()
    publisher_tag = info.xpath("p/span[text()='發行商:']")
    if publisher_tag:
        movie.publisher = publisher_tag[0].getnext().text.strip()
    serial_tag = info.xpath("p/span[text()='系列:']")
    if serial_tag:
        movie.serial = serial_tag[0].getnext().text
    # collect genre names/ids; ids under the uncensored subsite get a prefix
    genre, genre_id = [], []
    for tag in info.xpath("//span[@class='genre']/label/a"):
        link = tag.get('href')
        raw_id = link.split('/')[-1]
        genre.append(tag.text)
        if 'uncensored' in link:
            movie.uncensored = True
            genre_id.append('uncensored-' + raw_id)
        else:
            movie.uncensored = False
            genre_id.append(raw_id)
    # JavBus loads magnet links via javascript; they cannot be scraped statically
    # actresses and their avatar pictures
    actress, actress_pics = [], {}
    for tag in html.xpath("//a[@class='avatar-box']/div/img"):
        name = tag.get('title')
        avatar = tag.get('src')
        actress.append(name)
        if not avatar.endswith('nowprinting.gif'):  # skip the placeholder avatar
            actress_pics[name] = avatar
    # tidy the collected data and update the movie record
    movie.title = raw_title.replace(avid, '').strip()
    movie.cover = cover_url
    movie.preview_pics = sample_pics
    if date_text != '0000-00-00':  # discard the invalid placeholder date
        movie.publish_date = date_text
    movie.duration = minutes
    movie.producer = maker
    movie.genre = genre
    movie.genre_id = genre_id
    movie.actress = actress
    movie.actress_pics = actress_pics
def info_summary(movie: Movie, all_info):
    """Combine the online data gathered by multiple sources into the final info."""
    final_info = MovieInfo(movie)
    ########## fields with dedicated selection logic come first ##########
    # genre
    if 'javdb' in all_info:
        final_info.genre = all_info['javdb'].genre
    ########## for every remaining default-valued field, pick data by priority ##########
    # the parsers updated all_info's entries in place, and all_info itself was
    # built in priority order, so plain iteration already honors the priority
    field_names = [f for f in dir(final_info) if not f.startswith('_')]
    covers, big_covers = [], []
    for source, info in all_info.items():
        taken = []
        # walk all attributes; adopt the crawler's value whenever ours is empty
        for field in field_names:
            incoming = getattr(info, field)
            if field == 'cover':
                if incoming and (incoming not in covers):
                    covers.append(incoming)
                    taken.append(field)
            elif field == 'big_cover':
                if incoming and (incoming not in big_covers):
                    big_covers.append(incoming)
                    taken.append(field)
            elif incoming and not getattr(final_info, field):
                setattr(final_info, field, incoming)
                taken.append(field)
        if taken:
            logger.debug(f"从'{source}'中获取了字段: " + ' '.join(taken))
    setattr(final_info, 'covers', covers)
    setattr(final_info, 'big_covers', big_covers)
    # give cover/big_cover a value so the required-field check cannot trip on them
    if covers:
        final_info.cover = covers[0]
    if big_covers:
        final_info.big_cover = big_covers[0]
    ########## fields checked at the very end ##########
    # title
    if cfg.Crawler.title__chinese_first and 'airav' in all_info:
        airav_title = all_info['airav'].title
        if airav_title and final_info.title != airav_title:
            final_info.ori_title = final_info.title
            final_info.title = airav_title
    # verify that every required field ended up with a value
    for field in cfg.Crawler.required_keys:
        if not getattr(final_info, field, None):
            logger.error(f"所有爬虫均未获取到字段: '{field}',抓取失败")
            return False
    # all required fields present: attach the final data to the movie
    movie.info = final_info
    return True
def parse_data(movie: MovieInfo):
    """Parse the movie data for the given ID from the airav API."""
    # airav also serves simplified Chinese, but traditional Chinese is fetched
    # to keep actress names etc. consistent with the other sites
    url = f'{base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW'
    resp = request.get(url).json()
    if resp['count'] == 0:
        # direct lookup failed; fall back to searching for the barcode
        barcode = search_movie(movie.dvdid)
        if not barcode:
            logger.debug(f"'{movie.dvdid}': airav无资源")
            return
        url = f'{base_url}/api/video/barcode/{barcode}?lng=zh-TW'
        resp = request.get(url).json()
    # pick the needed fields out of the API payload
    # TODO: the payload carries more info (e.g. Chinese/Japanese actress name
    # pairs) that could support future features
    data = resp['result']
    dvdid = data['barcode']
    movie.url = base_url + '/video/' + dvdid
    # plot and title may contain HTML entities; unescape them
    movie.plot = unescape(data['description']) or None
    movie.cover = data['img_url']
    # airav organizes genres as search keywords without dedicated genre ids
    movie.genre = [tag['name'] for tag in data['tags']]
    movie.title = unescape(data['name'])
    movie.actress = [actor['name'] for actor in data['actors']]
    movie.publish_date = data['publish_date']
    movie.preview_pics = data['images'] or []
    if data['factories']:
        movie.producer = data['factories'][0]['name']
    if cfg.Crawler.hardworking_mode:
        # note: use the barcode returned by the API rather than movie.dvdid
        # (e.g. '1pondo_012717_472' vs '012717_472')
        media_url = f"{base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}"
        media_resp = request.get(media_url).json()
        # a failure looks like {'msg': 'fail', 'status': 'fail'}
        if 'data' in media_resp:
            # besides 'url' there are url_cdn, url_hlx, url_hls_cdn fields (the
            # latter two are m3u8); 'url' serves as the preview video for now
            # TODO: some movies (e.g. 080719-976) have a broken legacy preview
            movie.preview_video = media_resp['data'].get('url')
    # entries flagged '馬賽克破壞版' (mosaic-destroyed) have unreliable title,
    # plot and genre, so drop those fields
    if '馬賽克破壞版' in movie.title or (movie.plot and '馬賽克破壞版' in movie.plot):
        movie.title = None
        movie.plot = None
        movie.genre = None
def parse_data_raw(movie: MovieInfo, html):
    """Parse the movie data for the given ID out of a pre-fetched Prestige page."""
    container = html.xpath("//div[@class='section product_layout_01']")[0]
    title = container.xpath("div/h1")[0].text_content().strip()
    cover = container.xpath("div/p/a[@class='sample_image']/@href")[0]
    # following-sibling is used instead of getnext() here, because getnext()
    # would return whitespace/tab text nodes
    performers = container.xpath("//dt[text()='出演:']/following-sibling::dd[1]/a/text()")
    # drop spaces inside actress names to stay consistent with the other sites
    performers = [name.replace(' ', '') for name in performers]
    length_text = container.xpath("//dt[text()='収録時間:']")[0].getnext().text_content()
    length_match = re.search(r'\d+', length_text)
    if length_match:
        movie.duration = length_match.group(0)
    release_text = container.xpath("//dt[text()='発売日:']/following-sibling::dd[1]/a/text()")[0]
    release_date = release_text.replace('/', '-')
    maker = container.xpath("//dt[text()='メーカー名:']/following-sibling::dd[1]/a/text()")[0]
    dvdid = container.xpath("//dt[text()='品番:']")[0].getnext().text_content()
    genre, genre_id = [], []
    for tag in container.xpath("//dt[text()='ジャンル:']/following-sibling::dd[1]/a"):
        genre.append(tag.text)
        genre_id.append(tag.get('href').split('=')[-1])
    serial = container.xpath("//dt[text()='レーベル:']/following-sibling::dd[1]/a/text()")[0]
    plot = container.xpath("//h2[text()='レビュー']/following-sibling::p")[0].text.strip()
    sample_pics = container.xpath("//li/a[@class='sample_image']/@href")
    # movies from 2016 onward may have an HD cover (not every one does,
    # particularly in early 2016)
    if int(release_date.split('-')[0]) >= 2016:
        # a path like '/images/corner/goods/prestige/abp/647/pb_e_abp-647.jpg'
        # becomes the HD cover once the '_e' marker is removed
        movie.big_cover = cover.replace('_e_', '_')
    movie.title = title
    movie.cover = cover
    movie.actress = performers
    movie.publish_date = release_date
    movie.producer = maker
    movie.genre = genre
    movie.genre_id = genre_id
    movie.serial = serial
    movie.plot = plot
    movie.preview_pics = sample_pics
    # Prestige is hosted in Japan for the domestic market, so no uncensored titles
    movie.uncensored = False
def parse_data(movie: MovieInfo):
    """Parse the movie data for the given ID from avsox."""
    # avsox has no direct per-movie URL; search first, then pick the matching hit
    html = get_html(f'{base_url}/cn/search/{movie.dvdid}')
    ids = html.xpath("//div[@class='photo-info']/span/date[1]/text()")
    urls = html.xpath("//a[contains(@class, 'movie-box')]/@href")
    try:
        idx = [i.lower() for i in ids].index(movie.dvdid.lower())
    except ValueError:
        # ValueError means the movie was not found; bail out
        return
    url = urls[idx]
    # fetch and parse the movie page itself
    html = get_html(url)
    container = html.xpath("/html/body/div[@class='container']")[0]
    title = container.xpath("h3/text()")[0]
    cover = container.xpath("//a[@class='bigImage']/@href")[0]
    info = container.xpath("div/div[@class='col-md-3 info']")[0]
    dvdid = info.xpath("p/span[@style]/text()")[0]
    publish_date = info.xpath("p/span[text()='发行时间:']")[0].tail.strip()
    duration = info.xpath("p/span[text()='长度:']")[0].tail.replace('分钟', '').strip()
    producer_tag = info.xpath("p[text()='制作商: ']")[0].getnext().xpath("a")
    if producer_tag:
        movie.producer = producer_tag[0].text_content()
    serial_tag = info.xpath("p[text()='系列:']")
    if serial_tag:
        movie.serial = serial_tag[0].getnext().xpath("a/text()")[0]
    genre = info.xpath("p/span[@class='genre']/a/text()")
    actress = container.xpath("//a[@class='avatar-box']/span/text()")
    movie.url = url
    movie.title = title.replace(dvdid, '').strip()
    movie.cover = cover
    movie.publish_date = publish_date
    movie.duration = duration
    movie.genre = genre
    movie.actress = actress
def parse_data(movie: MovieInfo):
    """Parse the movie data for the given ID from mgstage.

    Args:
        movie (MovieInfo): movie to resolve; parsed fields are written onto it.
    """
    url = f'{base_url}/product/product_detail/{movie.dvdid}/'
    resp = request.get(url)
    # a non-existent movie gets redirected to the home page, so a non-empty
    # redirect history means there is no resource for this ID
    if resp.history:
        logger.debug(f"'{movie.dvdid}': mgstage无资源")
        return
    html = resp2html(resp)
    # mgstage text is littered with whitespace ('\n \t'); strip() everywhere
    title = html.xpath(
        "//div[@class='common_detail_cover']/h1/text()")[0].strip()
    container = html.xpath("//div[@class='detail_left']")[0]
    cover = container.xpath("//a[@id='EnlargeImage']/@href")[0]
    # linked and plain-text actresses need different xpaths; match both, merge
    actress_text = container.xpath(
        "//th[text()='出演:']/following-sibling::td/text()")
    actress_link = container.xpath(
        "//th[text()='出演:']/following-sibling::td/a/text()")
    actress = [i.strip() for i in actress_text + actress_link]
    actress = [i for i in actress if i]  # drop empty strings
    producer = container.xpath(
        "//th[text()='メーカー:']/following-sibling::td/a/text()")[0].strip()
    duration_str = container.xpath(
        "//th[text()='収録時間:']/following-sibling::td/text()")[0]
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    dvdid = container.xpath(
        "//th[text()='品番:']/following-sibling::td/text()")[0]
    date_str = container.xpath(
        "//th[text()='配信開始日:']/following-sibling::td/text()")[0]
    publish_date = date_str.replace('/', '-')
    serial = container.xpath(
        "//th[text()='シリーズ:']/following-sibling::td/a/text()")[0].strip()
    # label: roughly a sub-brand sharing an ID prefix (e.g. 'ABSOLUTELY PERFECT'
    # for ABS-* IDs); unused for now
    # label = container.xpath("//th[text()='レーベル:']/following-sibling::td/text()")[0].strip()
    genre_tags = container.xpath(
        "//th[text()='ジャンル:']/following-sibling::td/a")
    genre = [i.text.strip() for i in genre_tags]
    # score appears to be on a 5-point scale (x2 -> 10 points) — TODO confirm
    score_str = container.xpath("//td[@class='review']/span")[0].tail.strip()
    match = re.search(r'^[\.\d]+', score_str)
    if match:
        score = float(match.group()) * 2
        movie.score = f'{score:.2f}'
    # the plot may contain nested markup; walk the tags manually so that the
    # line breaks inside the plot are preserved
    plots = []
    plot_p_tags = container.xpath(
        "//dl[@id='introduction']/dd/p[not(@class='more')]")
    for p in plot_p_tags:
        children = p.getchildren()
        # no children means the plot has no inline markup: plain text suffices
        if not children:
            plots.append(p.text_content())
            continue
        for child in children:
            # FIX: also require a non-empty 'plots' — a leading <br/> previously
            # raised IndexError on plots[-1]
            if child.tag == 'br' and plots and plots[-1] != '\n':
                plots.append('\n')
            else:
                if child.text:
                    plots.append(child.text)
                if child.tail:
                    plots.append(child.tail)
    plot = ''.join(plots).strip()
    preview_pics = container.xpath("//a[@class='sample_image']/@href")
    if cfg.Crawler.hardworking_mode:
        # the preview video is loaded after a button click, not in the static page
        btn_url = container.xpath("//a[@class='button_sample']/@href")[0]
        video_pid = btn_url.split('/')[-1]
        req_url = f'{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}'
        resp = request.get(req_url).json()
        video_url = resp.get('url')
        if video_url:
            # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX
            preview_video = video_url.split('.ism/')[0] + '.mp4'
            movie.preview_video = preview_video
    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.actress = actress
    movie.producer = producer
    movie.publish_date = publish_date
    movie.serial = serial
    movie.genre = genre
    movie.plot = plot
    movie.preview_pics = preview_pics
    # hosted in Japan for the domestic market, so no uncensored titles
    movie.uncensored = False
def parse_data(movie: MovieInfo):
    """Parse the movie data for the given ID from FANZA (DMM), looked up by cid.

    Args:
        movie (MovieInfo): movie to resolve; parsed fields are written onto it.
    """
    url = f'{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/'
    html = request.get_html(url)
    # FANZA geo-blocks some regions
    if 'not available in your region' in html.text_content():
        logger.error('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置')
        return
    title = html.xpath("//h1[@id='title']/text()")[0]
    # note: browsers insert a 'tbody' element while rendering, but the raw html
    # has none, so the xpath must follow the raw page structure
    container = html.xpath("//table[@class='mg-b12']/tr/td")[0]
    cover = container.xpath("//div[@id='sample-video']/a/@href")[0]
    # use '配信開始日' as the publish date:
    # https://www.zhihu.com/question/57513172/answer/153219083
    date_str = container.xpath(
        "//td[text()='配信開始日:']/following-sibling::td/text()")[0].strip()
    publish_date = date_str.replace('/', '-')
    duration_str = container.xpath(
        "//td[text()='収録時間:']/following-sibling::td/text()")[0].strip()
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    # actress/director/serial: an unmatched xpath yields an empty list; no case
    # of a name outside the <a> tag has been observed so far
    actress = container.xpath("//span[@id='performer']/a/text()")
    director_tag = container.xpath(
        "//td[text()='監督:']/following-sibling::td/a/text()")
    if director_tag:
        movie.director = director_tag[0].strip()
    serial_tag = container.xpath(
        "//td[text()='シリーズ:']/following-sibling::td/a/text()")
    if serial_tag:
        movie.serial = serial_tag[0].strip()
    producer_tag = container.xpath(
        "//td[text()='メーカー:']/following-sibling::td/a/text()")
    if producer_tag:
        movie.producer = producer_tag[0].strip()
    # label: roughly a sub-brand sharing an ID prefix (e.g. 'ABSOLUTELY PERFECT'
    # for ABS-* IDs); unused for now
    # label_tag = container.xpath("//td[text()='レーベル:']/following-sibling::td/a/text()")
    # if label_tag:
    #     label = label_tag[0].strip()
    # fanza mixes promotions into the genre list, so filter tags by link type
    genre_tags = container.xpath(
        "//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]"
    )
    genre, genre_id = [], []
    for tag in genre_tags:
        genre.append(tag.text.strip())
        genre_id.append(tag.get('href').split('=')[-1].strip('/'))
    # NOTE(review): 'cid' is extracted but never written to movie.cid — confirm intended
    cid = container.xpath(
        "//td[text()='品番:']/following-sibling::td/text()")[0].strip()
    plot = container.xpath("//div[@class='mg-b20 lh4']/text()")[0].strip()
    preview_pics = container.xpath("//a[@name='sample-image']/img/@src")
    score_str = container.xpath(
        "//p[@class='d-review__average']/strong/text()")[0].strip()
    match = re.search(r'\d+', score_str)
    if match:
        # presumably a 5-point site rating converted to 10 points — TODO confirm
        score = float(match.group()) * 2
        movie.score = f'{score:.2f}'
    if cfg.Crawler.hardworking_mode:
        # the preview video is loaded dynamically, absent from the static page
        video_url = f'{base_url}/service/digitalapi/-/html5_player/=/cid={movie.cid}'
        html2 = request.get_html(video_url)
        # little js is needed elsewhere, so rather than a dedicated js evaluator,
        # extract the text with a regex and parse it as json
        script = html2.xpath(
            "//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()"
        )[0].strip()
        match = re.search(r'\{.*\}', script)
        # primarily catches json.loads errors, but the try/except also covers a
        # failed regex match (match is None -> AttributeError on .group())
        try:
            data = json.loads(match.group())
            video_url = data.get('src')
            if video_url and video_url.startswith('//'):
                # complete the protocol-relative URL
                video_url = 'https:' + video_url
            movie.preview_video = video_url
        except Exception as e:
            logger.debug('解析视频地址时异常: ' + repr(e))
    movie.url = url
    movie.title = title
    movie.cover = cover
    movie.publish_date = publish_date
    movie.actress = actress
    movie.genre = genre
    movie.genre_id = genre_id
    movie.plot = plot
    movie.preview_pics = preview_pics
    # hosted in Japan for the domestic market, so no uncensored titles
    movie.uncensored = False
def parse_data(movie: MovieInfo):
    """Fetch and parse the data of the movie with the given ID from JavDB.

    Args:
        movie (MovieInfo): movie to resolve; parsed fields are written onto it.

    Returns:
        bool: True when parsing succeeded and movie carries valid data,
            otherwise False.
    """
    # a JavDB search returns multiple hits; locate the one matching the ID
    html = get_html_wrapper(f'{base_url}/search?q={movie.dvdid}')
    ids = list(
        map(
            str.lower,
            html.xpath(
                "//div[@id='videos']/div/div/a/div[@class='uid']/text()")))
    movie_urls = html.xpath("//div[@id='videos']/div/div/a/@href")
    try:
        new_url = movie_urls[ids.index(movie.dvdid.lower())]
    except ValueError:
        logger.debug(f'搜索结果中未找到目标影片({movie.dvdid}): ' + ', '.join(ids))
        return False
    html = get_html_wrapper(new_url)
    container = html.xpath("/html/body/section/div[@class='container']")[0]
    info = container.xpath("div/div/div/nav")[0]
    title = container.xpath("h2/strong/text()")[0]
    cover = container.xpath("//img[@class='video-cover']/@src")[0]
    preview_pics = container.xpath(
        "//a[@class='tile-item'][@data-fancybox='gallery']/@href")
    preview_video_tag = container.xpath(
        "//video[@id='preview-video']/source/@src")
    if preview_video_tag:
        preview_video = preview_video_tag[0]
        if preview_video.startswith('//'):
            # complete the protocol-relative URL
            preview_video = 'https:' + preview_video
        movie.preview_video = preview_video
    dvdid = info.xpath("div/span")[0].text_content()
    publish_date = info.xpath("div/strong[text()='日期:']")[0].getnext().text
    duration = info.xpath(
        "div/strong[text()='時長:']")[0].getnext().text.replace('分鍾', '').strip()
    director_tag = info.xpath("div/strong[text()='導演:']")
    if director_tag:
        movie.director = director_tag[0].getnext().text_content().strip()
    producer_tag = info.xpath("div/strong[text()='片商:']")
    if producer_tag:
        movie.producer = producer_tag[0].getnext().text_content().strip()
    publisher_tag = info.xpath("div/strong[text()='發行:']")
    if publisher_tag:
        movie.publisher = publisher_tag[0].getnext().text_content().strip()
    serial_tag = info.xpath("div/strong[text()='系列:']")
    if serial_tag:
        movie.serial = serial_tag[0].getnext().text
    score_tag = info.xpath("//span[@class='score-stars']")
    if score_tag:
        score_str = score_tag[0].tail
        # FIX: guard the regex so a malformed score text cannot raise
        # AttributeError on .group()
        score_match = re.search(r'([\d.]+)分', score_str)
        if score_match:
            movie.score = "{:.2f}".format(float(score_match.group(1)) * 2)
    genre_tags = info.xpath("//strong[text()='類別:']/../span/a")
    genre, genre_id = [], []
    for tag in genre_tags:
        pre_id = tag.get('href').split('/')[-1]
        genre.append(tag.text)
        genre_id.append(pre_id)
        # decide censored/uncensored from the tag's subsite
        # NOTE(review): the last tag wins; assumes all tags of one movie point
        # to the same subsite — confirm
        subsite = pre_id.split('?')[0]
        movie.uncensored = {'uncensored': True, 'tags': False}.get(subsite)
    # JavDB lists male and female performers; keep only the actresses,
    # identified by the gender marker symbol
    actors_tag = info.xpath("//strong[text()='演員:']/../span")[0]
    all_actors = actors_tag.xpath("a/text()")
    genders = actors_tag.xpath("strong/text()")
    # FIX: pair names with markers positionally via zip; the previous
    # all_actors.index(i) lookup broke on duplicate names and was O(n^2)
    actress = [name for name, mark in zip(all_actors, genders) if mark == '♀']
    magnet = container.xpath("//td[@class='magnet-name']/a/@href")
    movie.url = new_url.replace(base_url, permanent_url)
    movie.title = title.replace(dvdid, '').strip()
    movie.cover = cover
    movie.preview_pics = preview_pics
    movie.publish_date = publish_date
    movie.duration = duration
    movie.genre = genre
    movie.genre_id = genre_id
    movie.actress = actress
    movie.magnet = [i.replace('[javdb.com]', '') for i in magnet]
    return True
def parse_data(movie: MovieInfo):
    """Parse the movie data for the given ID from jav321.

    Args:
        movie (MovieInfo): movie to resolve; parsed fields are written onto it.
    """
    html = post_html(f'{base_url}/search', data={'sn': movie.dvdid})
    page_url = html.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0]
    # TODO: 'cid' is a DMM concept; for MGSTAGE movies the cid here is likely
    # jav321's own invention, e.g. 345SIMM-542
    cid = page_url.split('/')[-1]  # /video/ipx00177
    # a cid of 'search' means we are still on the search page: movie not found
    if cid == 'search':
        return
    title = html.xpath("//div[@class='panel-heading']/h3/text()")[0]
    info = html.xpath("//div[@class='col-md-9']")[0]
    # jav321 has no clear separators between fields; match tags by their URL
    producer = info.xpath("a[contains(@href,'/company/')]/text()")[0]
    # actress, actress_pics
    actress, actress_pics = [], {}
    actress_tags = html.xpath(
        "//div[@class='thumbnail']/a[contains(@href,'/star/')]/img")
    for tag in actress_tags:
        name = tag.tail.strip()
        pic_url = tag.get('src')
        actress.append(name)
        # jav321 serves a plausible-looking avatar URL even when the actress has
        # no real photo, so validity cannot be judged from the URL; prefer other
        # sources for actress pictures when available
        actress_pics[name] = pic_url
    # genre, genre_id
    genre_tags = info.xpath("a[contains(@href,'/genre/')]")
    genre, genre_id = [], []
    for tag in genre_tags:
        genre.append(tag.text)
        genre_id.append(tag.get('href').split('/')[-2])  # genre/4025/1
    dvdid = info.xpath("b[text()='品番']")[0].tail.replace(': ', '').upper()
    publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(': ', '')
    duration_str = info.xpath("b[text()='収録時間']")[0].tail
    match = re.search(r'\d+', duration_str)
    if match:
        movie.duration = match.group(0)
    # only some movies carry a rating, and only as a star image such as
    # '/img/35.gif' (3.5 stars); derive the score from the image name
    score_tag = info.xpath(
        "//b[text()='平均評価']/following-sibling::img/@data-original")
    if score_tag:
        score = int(score_tag[0][5:7]) / 5  # /10*2
        movie.score = str(score)
    serial_tag = info.xpath("a[contains(@href,'/series/')]/text()")
    if serial_tag:
        movie.serial = serial_tag[0]
    preview_video_tag = info.xpath("//video/source/@src")
    if preview_video_tag:
        movie.preview_video = preview_video_tag[0]
    plot_tag = info.xpath(
        "//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()"
    )
    if plot_tag:
        movie.plot = plot_tag[0]
    preview_pics = html.xpath(
        "//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src"
    )
    # magnet and ed2k links are rendered via javascript; cannot scrape statically
    movie.url = page_url
    movie.cid = cid
    movie.title = title
    movie.actress = actress
    movie.actress_pics = actress_pics
    movie.producer = producer
    movie.genre = genre
    movie.genre_id = genre_id
    movie.publish_date = publish_date
    # the first preview picture is always the cover, the rest are real previews.
    # FIX: guard against an empty list, which previously raised IndexError
    if preview_pics:
        movie.cover = preview_pics[0]
        movie.preview_pics = preview_pics[1:]
def parse_data(movie: MovieInfo):
    """Parse the movie data for the given ID from JavLibrary.

    Args:
        movie (MovieInfo): movie to resolve; parsed fields are written onto it.
    """
    global base_url
    url = new_url = f'{base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}'
    resp = request.get(url)
    html = resp2html(resp)
    if resp.history:
        if urlsplit(resp.url).netloc == urlsplit(base_url).netloc:
            # a 301 redirect within the same netloc usually means the search
            # found the movie and it is the only result
            new_url = resp.url
        else:
            # a redirect to a different netloc is not a movie page: the path got
            # lost, making the address invalid (apparently a JavBus redirect
            # misconfiguration); adopt the new netloc as base_url and retry
            base_url = 'https://' + urlsplit(resp.url).netloc
            logger.warning(f"请将配置文件中的JavLib免代理地址更新为: {base_url}")
            return parse_data(movie)
    else:
        # multiple search hits do not auto-redirect; choose the right one here
        video_tags = html.xpath("//div[@class='video'][@id]/a")
        # the first movie is usually the wanted one, but check every hit anyway
        pre_choose = []
        for tag in video_tags:
            tag_dvdid = tag.xpath("div[@class='id']/text()")[0]
            if tag_dvdid.upper() == movie.dvdid.upper():
                pre_choose.append(tag)
        match_count = len(pre_choose)
        if match_count == 0:
            logger.debug(f"'{movie.dvdid}': 无法获取到影片结果")
            return
        elif match_count == 1:
            new_url = pre_choose[0].get('href')
            logger.debug(f"'{movie.dvdid}': 遇到多个搜索结果,已自动选择: {new_url}")
        elif match_count == 2:
            # with exactly two hits, prefer the non-Blu-ray edition
            no_blueray = []
            for tag in pre_choose:
                if 'ブルーレイディスク' not in tag.get('title'):  # Blu-ray Disc
                    no_blueray.append(tag)
            no_blueray_count = len(no_blueray)
            if no_blueray_count == 1:
                new_url = no_blueray[0].get('href')
                logger.debug(f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}")
            else:
                logger.error(f"'{movie.dvdid}': 存在{match_count}个搜索结果但是均非蓝光版,为避免误处理,已全部忽略")
                return
        else:
            # more than two hits have not been observed; bail out defensively
            logger.error(f"'{movie.dvdid}': 出现{match_count}个完全匹配目标番号的搜索结果,为避免误处理,已全部忽略")
            return
        # re-fetch the chosen movie page
        html = request.get_html(new_url)
    container = html.xpath("/html/body/div/div[@id='rightcolumn']")[0]
    title_tag = container.xpath("div/h3/a/text()")
    title = title_tag[0]
    cover = container.xpath("//img[@id='video_jacket_img']/@src")[0]
    info = container.xpath("//div[@id='video_info']")[0]
    dvdid = info.xpath("div[@id='video_id']//td[@class='text']/text()")[0]
    publish_date = info.xpath("div[@id='video_date']//td[@class='text']/text()")[0]
    duration = info.xpath("div[@id='video_length']//span[@class='text']/text()")[0]
    # an unmatched xpath yields an empty list, hence the guards below
    director_tag = info.xpath("//span[@class='director']/a/text()")
    if director_tag:
        movie.director = director_tag[0]
    producer = info.xpath("//span[@class='maker']/a/text()")[0]
    publisher_tag = info.xpath("//span[@class='label']/a/text()")
    if publisher_tag:
        movie.publisher = publisher_tag[0]
    score_tag = info.xpath("//span[@class='score']/text()")
    if score_tag:
        movie.score = score_tag[0].strip('()')
    genre = info.xpath("//span[@class='genre']/a/text()")
    actress = info.xpath("//span[@class='star']/a/text()")
    movie.url = new_url.replace(base_url, permanent_url)
    movie.title = title.replace(dvdid, '').strip()
    if cover.startswith('//'):
        # complete the protocol-relative cover URL
        cover = 'https:' + cover
    movie.cover = cover
    movie.publish_date = publish_date
    movie.duration = duration
    movie.producer = producer
    movie.genre = genre
    movie.actress = actress
def parse_data(movie: MovieInfo):
    """Parse the movie data for the given FC2 ID."""
    # strip the 'FC2' prefix from the ID
    lowered = movie.dvdid.lower()
    if not lowered.startswith('fc2-'):
        raise ValueError('Invalid FC2 number: ' + movie.dvdid)
    fc2_id = lowered.replace('fc2-', '')
    # fetch the article page
    url = f'{base_url}/article/{fc2_id}/'
    html = get_html(url)
    try:
        container = html.xpath("//div[@class='items_article_left']")[0]
    except IndexError:
        container = None
    if container is None:
        logger.debug('无影片: ' + movie.dvdid)
        return
    title = container.xpath(
        "//div[@class='items_article_headerInfo']/h3/text()")[0]
    thumb_tag = container.xpath(
        "//div[@class='items_article_MainitemThumb']")[0]
    thumb_pic = thumb_tag.xpath("span/img/@src")[0]
    duration_str = thumb_tag.xpath(
        "span/p[@class='items_article_info']/text()")[0]
    # FC2 has no maker/publisher distinction; being a personal marketplace, the
    # 'by' seller on the movie page is closest to the producer role
    seller = container.xpath("//li[text()='by ']/a/text()")[0]
    tags = container.xpath("//a[@class='tag tagTag']/text()")
    release_text = container.xpath(
        "//div[@class='items_article_Releasedate']/p/text()")[0]
    release_date = release_text[-10:].replace('/', '-')  # '販売日 : 2017/11/30'
    sample_pics = container.xpath(
        "//ul[@data-feed='sample-images']/li/a/@href")
    if cfg.Crawler.hardworking_mode:
        # compute an accurate score from the review data
        score = get_movie_score(fc2_id)
        if score:
            movie.score = f'{score:.2f}'
        # the preview video is loaded dynamically, not in the static page
        frame_src = container.xpath(
            "//section[@class='items_article_Contents']/iframe/@src")[0]
        # /widget/article/718323/description?ac=60fc08fa...
        key = frame_src.split('=')[-1]
        api_url = f'{base_url}/api/v2/videos/{fc2_id}/sample?key={key}'
        payload = request_get(api_url).json()
        movie.preview_video = payload['path']
    else:
        # the page itself only shows a star rating without a number; read it
        # from the css class name, e.g. 'items_article_Star5' means 5 stars
        star_class = container.xpath(
            "//a[@class='items_article_Stars']/p/span/@class")[0]
        movie.score = f'{int(star_class[-1]) * 2:.2f}'
    movie.url = url
    movie.title = title
    movie.genre = tags
    movie.producer = seller
    movie.duration = str(strftime_to_minutes(duration_str))
    movie.publish_date = release_date
    movie.preview_pics = sample_pics
    # FC2 covers are 220x220, far off the usual size and aspect ratio; use the
    # first preview picture as the cover whenever previews exist
    movie.cover = sample_pics[0] if movie.preview_pics else thumb_pic