def test_compute_subtitle_score(self):
    """Scores for episode and movie subtitle names match the expected fixtures."""
    cases = (
        (TestGuessSubtitle.test_episode_info, TestGuessSubtitle.test_episode_subs),
        (TestGuessSubtitle.test_movie_info, TestGuessSubtitle.test_movie_subs),
    )
    for video_info, sub_fixtures in cases:
        for sub_name, expected_score in sub_fixtures:
            self.assertEqual(
                compute_subtitle_score(video_info, sub_name),
                expected_score,
            )
def test_compute_subtitle_score_not_match_episode(self):
    """With match_episode=False, subtitle packages still score as expected."""
    for package_name, expected_score in TestGuessSubtitle.test_episode_package:
        actual = compute_subtitle_score(
            TestGuessSubtitle.test_episode_info,
            package_name,
            match_episode=False,
        )
        self.assertEqual(actual, expected_score)
def _parse_episode_page(self, session, link, info, match_episode=True):
    """
    Compute scores for each subtitle listed on an episode page.

    params:
        session: requests.Session used for all page fetches
        link: str, episode page link
        info: dict, result of guessit
        match_episode: bool, forwarded to compute_subtitle_score; when
            False, subtitles that do not match the episode number are
            still considered
    return:
        subs: dict, entry format same as get_subtitles; subtitles whose
            score is -1 (no match) are skipped.  Note: the dict is NOT
            sorted here — sorting by score happens in get_subtitles.
    """

    def _get_archive_download_link(sub_page_link):
        # The subtitle page links to an intermediate download page,
        # which in turn holds the actual archive link.
        r = session.get(sub_page_link)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        down_page_link = bs_obj.find("a", {"id": "down1"}).attrs["href"]
        down_page_link = urljoin(ZimukuDownloader.site_url, down_page_link)
        r = session.get(down_page_link)
        bs_obj = BeautifulSoup(r.text, "html.parser")
        download_link = bs_obj.find("a", {"rel": "nofollow"})
        download_link = download_link.attrs["href"]
        download_link = urljoin(ZimukuDownloader.site_url, download_link)
        return download_link

    r = session.get(link)
    bs_obj = BeautifulSoup(r.text, "html.parser")
    subs_body = bs_obj.find("div", class_="subs box clearfix").find("tbody")
    subs = {}
    for sub in subs_body.find_all("tr"):
        a = sub.find("a")
        name = extract_name(a.text, en=True)
        score = compute_subtitle_score(info, name, match_episode=match_episode)
        if score == -1:  # subtitle does not match the video at all
            continue
        # Encode available languages as a bitmask derived from flag icons.
        type_score = 0
        for img in sub.find("td", class_="tac lang").find_all("img"):
            if "uk" in img.attrs["src"]:
                type_score += 1
            elif "hongkong" in img.attrs["src"]:
                type_score += 2
            elif "china" in img.attrs["src"]:
                type_score += 4
            elif "jollyroger" in img.attrs["src"]:
                type_score += 8
        sub_page_link = urljoin(ZimukuDownloader.site_url, a.attrs["href"])
        download_link = _get_archive_download_link(sub_page_link)
        # Keep a private session copy carrying the episode page as Referer
        # (presumably required by the download endpoint — TODO confirm).
        backup_session = copy.deepcopy(session)
        backup_session.headers["Referer"] = link
        # TODO: consider download times when computing scores
        subs[ZimukuDownloader.choice_prefix + name] = {
            "link": download_link,
            "lan": type_score,
            "session": backup_session,
            "score": score,
        }
    return subs
def get_subtitles(self, video, sub_num=10):
    """
    Search ZIMUKU for subtitles matching *video*.

    params:
        video: video object exposing an ``info`` dict (guessit result)
        sub_num: int, maximum number of subtitles to return
    return:
        dict mapping prefixed subtitle names to entries with keys
        "link", "lan", "session" and "score"; at most sub_num items,
        ordered by descending score
    raises:
        ValueError: when the search result page has an unrecognized layout
    """
    print("Searching ZIMUKU...", end="\r")
    keywords = self.get_keywords(video)
    info_dict = video.info
    s = requests.session()
    s.headers.update(Downloader.header)
    sub_dict = dict()
    # Start from the most specific keyword combination and drop the last
    # keyword on every miss until enough subtitles are collected.
    for i in range(len(keywords), 1, -1):
        keyword = ".".join(keywords[:i])
        r = s.get(ZimukuDownloader.search_url + keyword, timeout=10)
        html = r.text
        if "搜索不到相关字幕" in html:  # "no matching subtitles found"
            continue
        bs_obj = BeautifulSoup(r.text, "html.parser")
        # Combined search-result page
        if bs_obj.find("div", {"class": "item"}):
            for item in bs_obj.find_all("div", {"class": "item"}):
                title_a = item.find("p", class_="tt clearfix").find("a")
                if info_dict["type"] == "episode":
                    title = title_a.text
                    # Guard: a result title may lack the "第X季" (season X)
                    # marker; skip it instead of crashing on .group() of None.
                    season_match = re.search("第(.*)季", title)
                    if season_match is None:
                        continue
                    season_cn1 = season_match.group(1).strip()
                    season_cn2 = num_to_cn(str(info_dict["season"]))
                    if season_cn1 != season_cn2:
                        continue
                episode_link = ZimukuDownloader.site_url + title_a.attrs[
                    "href"]
                new_subs = self._parse_episode_page(
                    s, episode_link, info_dict)
                if not new_subs:
                    # Retry without episode matching so that season
                    # packages are still picked up.
                    new_subs = self._parse_episode_page(
                        s, episode_link, info_dict, match_episode=False)
                sub_dict.update(new_subs)
        # Shooter subtitle page
        elif bs_obj.find("div", {"class": "persub"}):
            for persub in bs_obj.find_all("div", {"class": "persub"}):
                title = persub.h1.text.split("/")[-1]
                # NOTE: this will filter out all subtitle packages
                score = compute_subtitle_score(info_dict, title)
                if score == -1:
                    continue
                link = ZimukuDownloader.site_url + persub.h1.a.attrs["href"]
                sub = self._parse_shooter_episode_page(s, title, link)
                sub[list(sub.keys())[0]]["score"] = score
                sub_dict.update(sub)
        else:
            raise ValueError("zimuku downloader needs updates")
        if len(sub_dict) >= sub_num:
            del keywords[:]  # clear remaining keywords; search is done
            break
    # Highest-scoring subtitles first, trimmed to sub_num entries.
    sub_dict = OrderedDict(
        sorted(sub_dict.items(), key=lambda e: e[1]["score"], reverse=True))
    keys = list(sub_dict.keys())[:sub_num]
    return {key: sub_dict[key] for key in keys}