def get_subtitles(self, video_name, sub_num=5):
    print(prefix + ' Searching ZIMUZU...', end='\r')
    keywords, info_dict = Downloader.get_keywords(video_name)
    keyword = ' '.join(keywords)
    sub_dict = order_dict()
    s = requests.session()
    while True:
        # query with the current keyword
        r = s.get(ZimuzuDownloader.search_url.format(keyword),
                  headers=Downloader.header, timeout=10)
        bs_obj = BeautifulSoup(r.text, 'html.parser')
        tab_text = bs_obj.find('div', {'class': 'article-tab'}).text
        tab_text = tab_text.encode('utf8') if py == 2 else tab_text
        if '字幕(0)' not in tab_text:
            for one_box in bs_obj.find_all('div', {'class': 'search-item'}):
                sub_name = ZimuzuDownloader.choice_prefix + \
                    one_box.find('strong', {'class': 'list_title'}).text
                sub_name = sub_name.encode('utf8') if py == 2 else sub_name
                if info_dict['type'] == 'movie' and '美剧字幕' in sub_name:
                    continue
                a = one_box.find('a')
                text = a.text.encode('utf8') if py == 2 else a.text
                sub_url = ZimuzuDownloader.site_url + a.attrs['href']
                type_score = 0
                type_score += ('英文' in text) * 1
                type_score += ('繁体' in text) * 2
                type_score += ('简体' in text) * 4
                type_score += ('中英' in text) * 8
                sub_dict[sub_name] = {
                    'lan': type_score,
                    'link': sub_url,
                    'session': None
                }
                if len(sub_dict) >= sub_num:
                    del keywords[:]  # reached the subtitle limit; clear keywords
                    break
        if len(keywords) > 1:  # not enough subtitles yet; drop a keyword and retry
            keyword = keyword.replace(keywords[-1], '')
            keywords.pop(-1)
            continue
        break
    if (len(sub_dict.items()) > 0
            and list(sub_dict.items())[0][1]['lan'] < 8):
        # first candidate is not bilingual: re-sort by language score
        sub_dict = order_dict(
            sorted(sub_dict.items(),
                   key=lambda e: e[1]['lan'],
                   reverse=True))
    return sub_dict

def get_subtitles(self, video_name, sub_num=5):
    print(PREFIX + " Searching ZIMUZU...", end="\r")
    keywords, info_dict = Downloader.get_keywords(video_name)
    keyword = " ".join(keywords)
    sub_dict = order_dict()
    s = requests.session()
    while True:
        # query with the current keyword
        r = s.get(
            ZimuzuDownloader.search_url.format(keyword),
            headers=Downloader.header,
            timeout=10,
        )
        bs_obj = BeautifulSoup(r.text, "html.parser")
        tab_text = bs_obj.find("div", {"class": "article-tab"}).text
        if "字幕(0)" not in tab_text:
            for one_box in bs_obj.find_all("div", {"class": "search-item"}):
                sub_name = (
                    ZimuzuDownloader.choice_prefix
                    + one_box.find("strong", {"class": "list_title"}).text
                )
                if info_dict["type"] == "movie" and "美剧字幕" in sub_name:
                    continue
                a = one_box.find("a")
                text = a.text
                sub_url = ZimuzuDownloader.site_url + a.attrs["href"]
                type_score = 0
                type_score += ("英文" in text) * 1
                type_score += ("繁体" in text) * 2
                type_score += ("简体" in text) * 4
                type_score += ("中英" in text) * 8
                sub_dict[sub_name] = {
                    "lan": type_score,
                    "link": sub_url,
                    "session": None,
                }
                if len(sub_dict) >= sub_num:
                    del keywords[:]  # reached the subtitle limit; clear keywords
                    break
        if len(keywords) > 1:  # not enough subtitles yet; drop a keyword and retry
            keyword = keyword.replace(keywords[-1], "")
            keywords.pop(-1)
            continue
        break
    if len(sub_dict.items()) > 0 and list(sub_dict.items())[0][1]["lan"] < 8:
        # first candidate is not bilingual: re-sort by language score
        sub_dict = order_dict(
            sorted(sub_dict.items(), key=lambda e: e[1]["lan"], reverse=True)
        )
    return sub_dict

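# Both ZIMUZU versions above rank candidates with the same bit-flag
# language score (English = 1, Traditional = 2, Simplified = 4,
# bilingual = 8), so bilingual subtitles always sort first. A minimal
# standalone sketch of that scheme; `language_score` and `rank` are
# illustrative names, not part of the project API, and order_dict is
# assumed to alias collections.OrderedDict:

from collections import OrderedDict


def language_score(text):
    score = 0
    score += ('英文' in text) * 1  # English
    score += ('繁体' in text) * 2  # Traditional Chinese
    score += ('简体' in text) * 4  # Simplified Chinese
    score += ('中英' in text) * 8  # bilingual
    return score


def rank(sub_dict):
    # the same re-sort the downloaders apply when the first
    # candidate is not bilingual (score < 8)
    return OrderedDict(
        sorted(sub_dict.items(), key=lambda e: e[1]['lan'], reverse=True))


assert language_score('简体&英文') == 5
assert language_score('中英双语') == 8
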
def test_get_keywords(self):
    """ Test video info extraction """
    names = (
        'Show.S01E01.ShowName.1080p.AMZN.WEB-DL.DDP5.1.H.264-GRP.mkv',
        'Hanzawa.Naoki.Ep10.Final.Chi_Jap.BDrip.1280X720-ZhuixinFan.mp4',
        'Homeland.S02E12.PROPER.720p.HDTV.x264-EVOLVE.mkv',
        'La.La.Land.2016.1080p.BluRay.x264.Atmos.TrueHD.7.1-HDChina.mkv')
    results = (['Show%20s01', 'e01', 'Web', 'GRP', 'amzn', '1080p'],
               ['Hanzawa%20Naoki', 'e10', 'Bluray', 'ZhuixinFan', '720p'],
               ['Homeland%20s02', 'e12', 'HDTV', 'EVOLVE', '720p'],
               ['La%20La%20Land', '2016', 'Bluray', 'HDChina', '1080p'])
    for n, r in zip(names, results):
        self.assertEqual(Downloader.get_keywords(n)[0], r)

def test_get_keywords(self):
    """ Test video info extraction """
    names = (
        "Show.S01E01.ShowName.1080p.AMZN.WEB-DL.DDP5.1.H.264-GRP.mkv",
        "Hanzawa.Naoki.Ep10.Final.Chi_Jap.BDrip.1280X720-ZhuixinFan.mp4",
        "Homeland.S02E12.PROPER.720p.HDTV.x264-EVOLVE.mkv",
        "La.La.Land.2016.1080p.BluRay.x264.Atmos.TrueHD.7.1-HDChina.mkv",
    )
    results = (
        ["Show", "s01", "e01", "Web", "GRP", "amzn", "1080p"],
        ["Hanzawa%20Naoki", "e10", "Bluray", "ZhuixinFan", "720p"],
        ["Homeland", "s02", "e12", "HDTV", "EVOLVE", "720p"],
        ["La%20La%20Land", "2016", "Bluray", "HDChina", "1080p"],
    )
    for n, r in zip(names, results):
        self.assertEqual(Downloader.get_keywords(n)[0], r)

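# For illustration only: a rough sketch of how keywords in the shape the
# test expects could be derived with guessit. This is NOT the project's
# actual Downloader.get_keywords implementation, and it covers only a few
# of the fields (source, release group, etc. are omitted):

from guessit import guessit


def extract_keywords(video_name):
    info = guessit(video_name)
    keywords = [info['title'].replace(' ', '%20')]  # URL-encoded spaces
    if info.get('season'):
        keywords.append('s%02d' % info['season'])
    if info.get('episode'):
        keywords.append('e%02d' % info['episode'])
    if info.get('year'):
        keywords.append(str(info['year']))
    if info.get('screen_size'):
        keywords.append(str(info['screen_size']))
    return keywords


# e.g. extract_keywords('Homeland.S02E12.PROPER.720p.HDTV.x264-EVOLVE.mkv')
# yields something like ['Homeland', 's02', 'e12', '720p']
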
def get_subtitles(self, video_name, sub_num=5): print("Searching SUBHD...", end="\r") keywords, info_dict = Downloader.get_keywords(video_name) keyword = " ".join(keywords) sub_dict = order_dict() s = requests.session() s.headers.update(Downloader.header) while True: # 当前关键字查询 r = s.get( SubHDDownloader.search_url + keyword, timeout=10, ) bs_obj = BeautifulSoup(r.text, "html.parser") try: small_text = bs_obj.find("small").text except AttributeError: char_error = "The URI you submitted has disallowed characters" if char_error in bs_obj.text: print("[SUBHD ERROR] " + char_error + ": " + keyword) return sub_dict # 搜索验证按钮 time.sleep(2) continue if "总共 0 条" not in small_text: results = bs_obj.find_all( "div", class_="mb-4 bg-white rounded shadow-sm") for one_box in results: if info_dict["type"] == "movie" and not one_box.find( "div", class_="px-1 rounded-sm bg-danger text-white"): continue a = one_box.find("div", class_="f12 pt-1").find("a") sub_url = SubHDDownloader.site_url + a.attrs["href"] sub_name = SubHDDownloader.choice_prefix + a.text text = one_box.text if "/a" in a.attrs["href"]: type_score = 0 type_score += ("英文" in text) * 1 type_score += ("繁体" in text) * 2 type_score += ("简体" in text) * 4 type_score += ("双语" in text) * 8 sub_dict[sub_name] = { "lan": type_score, "link": sub_url, "session": None, } if len(sub_dict) >= sub_num: del keywords[:] # 字幕条数达到上限,清空keywords break if len(keywords) > 1: # 字幕数未满,更换关键词继续查询 keyword = keyword.replace(keywords[-1], "") keywords.pop(-1) continue break if len(sub_dict.items()) > 0 and list( sub_dict.items())[0][1]["lan"] < 8: # 第一个候选字幕没有双语 sub_dict = order_dict( sorted(sub_dict.items(), key=lambda e: e[1]["lan"], reverse=True)) return sub_dict
def get_subtitles(self, video_name, sub_num=5):
    print(prefix + ' Searching SUBHD...', end='\r')
    keywords, info_dict = Downloader.get_keywords(video_name)
    keyword = ' '.join(keywords)
    sub_dict = order_dict()
    s = requests.session()
    while True:
        # query with the current keyword
        r = s.get(SubHDDownloader.search_url + keyword,
                  headers=Downloader.header, timeout=10)
        bs_obj = BeautifulSoup(r.text, 'html.parser')
        try:
            small_text = bs_obj.find('small').text
        except AttributeError:
            char_error = 'The URI you submitted has disallowed characters'
            if char_error in bs_obj.text:
                print(prefix + ' [SUBHD ERROR] ' + char_error + ': ' + keyword)
                return sub_dict
            # the page is probably showing a verification button; wait and retry
            time.sleep(2)
            continue
        if "总共 0 条" not in small_text:
            results = bs_obj.find_all(
                "div", class_="mb-4 bg-white rounded shadow-sm")
            for one_box in results:
                if info_dict['type'] == 'movie' \
                        and not one_box.find(
                            'div', class_="px-1 rounded-sm bg-danger text-white"):
                    continue
                a = one_box.find('div', class_="f12 pt-1").find('a')
                sub_url = SubHDDownloader.site_url + a.attrs['href']
                sub_name = SubHDDownloader.choice_prefix + a.text
                text = one_box.text
                if '/a' in a.attrs['href']:
                    type_score = 0
                    type_score += ('英文' in text) * 1
                    type_score += ('繁体' in text) * 2
                    type_score += ('简体' in text) * 4
                    type_score += ('双语' in text) * 8
                    sub_dict[sub_name] = {
                        'lan': type_score,
                        'link': sub_url,
                        'session': None
                    }
                if len(sub_dict) >= sub_num:
                    del keywords[:]  # reached the subtitle limit; clear keywords
                    break
        if len(keywords) > 1:  # not enough subtitles yet; drop a keyword and retry
            keyword = keyword.replace(keywords[-1], '')
            keywords.pop(-1)
            continue
        break
    if (len(sub_dict.items()) > 0
            and list(sub_dict.items())[0][1]['lan'] < 8):
        # first candidate is not bilingual: re-sort by language score
        sub_dict = order_dict(
            sorted(sub_dict.items(),
                   key=lambda e: e[1]['lan'],
                   reverse=True))
    return sub_dict

def get_subtitles(self, video_name, sub_num=5):
    print(prefix + ' Searching SUBHD...', end='\r')
    keywords, info_dict = Downloader.get_keywords(video_name)
    keyword = ' '.join(keywords)
    sub_dict = order_dict()
    s = requests.session()
    while True:
        # query with the current keyword
        r = s.get(SubHDDownloader.search_url + keyword,
                  headers=Downloader.header, timeout=10)
        bs_obj = BeautifulSoup(r.text, 'html.parser')
        try:
            if py == 2:
                small_text = bs_obj.find('small').text.encode('utf8')
            else:
                small_text = bs_obj.find('small').text
        except AttributeError:
            char_error = 'The URI you submitted has disallowed characters'
            if char_error in bs_obj.text:
                print(prefix + ' [SUBHD ERROR] ' + char_error + ': ' + keyword)
                return sub_dict
            # the page is probably showing a verification button; wait and retry
            time.sleep(2)
            continue
        if '总共 0 条' not in small_text:
            for one_box in bs_obj.find_all('div', {'class': 'box'}):
                if info_dict['type'] == 'movie' \
                        and not one_box.find('div', {'class': 'movielist'}):
                    continue
                a = one_box.find('div', {'class': 'd_title'}).find('a')
                sub_url = SubHDDownloader.site_url + a.attrs['href']
                sub_name = SubHDDownloader.choice_prefix + a.text.encode('utf8') if py == 2 \
                    else SubHDDownloader.choice_prefix + a.text
                if py == 2:
                    text = one_box.text.encode('utf8')
                else:
                    text = one_box.text
                if '/ar' in a.attrs['href']:
                    type_score = 0
                    type_score += ('英文' in text) * 1
                    type_score += ('繁体' in text) * 2
                    type_score += ('简体' in text) * 4
                    type_score += ('双语' in text) * 8
                    # no_dot_text = text.replace('.', ' ').lower()
                    # for qkeyword in keywords:
                    #     if no_dot_text.find(qkeyword.strip().lower()) != -1:
                    #         type_score += 3
                    sub_dict[sub_name] = {
                        'lan': type_score,
                        'link': sub_url,
                        'session': None
                    }
                if len(sub_dict) >= sub_num:
                    del keywords[:]  # reached the subtitle limit; clear keywords
                    break
        if len(keywords) > 1:  # not enough subtitles yet; drop a keyword and retry
            keyword = keyword.replace(keywords[-1], '')
            keywords.pop(-1)
            continue
        break
    if (len(sub_dict.items()) > 0
            and list(sub_dict.items())[0][1]['lan'] < 8):
        # first candidate is not bilingual: re-sort by language score
        sub_dict = order_dict(
            sorted(sub_dict.items(),
                   key=lambda e: e[1]['lan'],
                   reverse=True))
    return sub_dict

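# Every searcher above shares the same narrowing loop: query with all
# keywords joined, and while too few candidates turn up, drop the last
# (most specific) keyword and retry. A minimal sketch of that loop;
# `collect_candidates` and `search` are hypothetical names, with `search`
# standing in for one site request returning a dict of candidates:


def collect_candidates(keywords, sub_num, search):
    found = {}
    keyword = ' '.join(keywords)
    while True:
        found.update(search(keyword))
        if len(found) >= sub_num:
            break  # enough candidates collected
        if len(keywords) > 1:
            keyword = keyword.replace(keywords[-1], '').strip()
            keywords.pop(-1)  # drop the last keyword and search again
            continue
        break
    return found
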
def get_subtitles(self, video_name, sub_num=10):
    print(prefix + ' Searching ZIMUKU...', end='\r')
    keywords, info_dict = Downloader.get_keywords(video_name)
    keyword = ' '.join(keywords)
    info = guessit(keyword)
    keywords.pop(0)
    keywords.insert(0, info['title'])
    if info.get('season'):
        season = str(info['season']).zfill(2)
        keywords.insert(1, 's' + season)
    sub_dict = order_dict()
    s = requests.session()
    s.headers.update(Downloader.header)
    while True:
        # search with the current keyword
        r = s.get(ZimukuDownloader.search_url + keyword, timeout=10)
        html = r.text
        if '搜索不到相关字幕' not in html:
            bs_obj = BeautifulSoup(r.text, 'html.parser')
            if bs_obj.find('div', {'class': 'item'}):
                # aggregate search results page
                for item in bs_obj.find_all('div', {'class': 'item'}):
                    title_boxes = item.find(
                        'div', {'class': 'title'}).find_all('p')
                    title_box = title_boxes[0]
                    sub_title_box = title_boxes[1]
                    item_title = title_box.text
                    item_sub_title = sub_title_box.text
                    item_info = guessit(item_title)
                    if info.get('year') and item_info.get('year'):
                        if info['year'] != item_info['year']:
                            # year mismatch, skip
                            continue
                    item_titles = [
                        item_info.get('title', '').lower(),
                        item_info.get('alternative_title', '').lower()
                    ] + item_sub_title.lower().strip().split(',')
                    title_included = sum([
                        1 for _ in item_sub_title
                        if info['title'].lower() not in _
                    ])
                    if title_included == 0:
                        # title extracted by guessit does not match, skip
                        item_title_split = \
                            [one.split() for one in item_titles]
                        info_title_split = info['title'].lower().split()
                        sum1 = sum([1 for _ in info_title_split
                                    if _ in item_title_split[0]])
                        sum2 = sum([1 for _ in info_title_split
                                    if _ in item_title_split[1]])
                        if not (sum1 / len(info_title_split) >= 0.5
                                or sum2 / len(info_title_split) >= 0.5):
                            # title mismatch, skip
                            continue
                    for a in item.find_all('td', {'class': 'first'})[:3]:
                        a = a.a
                        a_link = ZimukuDownloader.site_url + \
                            a.attrs['href']
                        a_title = a.text
                        a_title = ZimukuDownloader.choice_prefix + a_title
                        sub_dict[a_title] = {'type': 'default',
                                             'link': a_link}
            elif bs_obj.find('div', {'class': 'persub'}):
                # Shooter subtitle page
                for persub in bs_obj.find_all('div', {'class': 'persub'}):
                    a_title = persub.h1.text
                    a_link = ZimukuDownloader.site_url + \
                        persub.h1.a.attrs['href']
                    a_title = ZimukuDownloader.choice_prefix + a_title
                    sub_dict[a_title] = {'type': 'shooter', 'link': a_link}
            else:
                raise ValueError(
                    'Unknown page structure in Zimuku search results')
        if len(sub_dict) >= sub_num:
            del keywords[:]
            break
        if len(keywords) > 1:
            keyword = keyword.replace(keywords[-1], '').strip()
            keywords.pop(-1)
            continue
        break
    for sub_name, sub_info in sub_dict.items():
        if sub_info['type'] == 'default':
            # subtitle page reached from the aggregate search
            r = s.get(sub_info['link'], timeout=60)
            bs_obj = BeautifulSoup(r.text, 'html.parser')
            lang_box = bs_obj.find('ul', {'class': 'subinfo'}).find('li')
            type_score = 0
            for lang in lang_box.find_all('img'):
                if 'uk' in lang.attrs['src']:
                    type_score += 1
                elif 'hongkong' in lang.attrs['src']:
                    type_score += 2
                elif 'china' in lang.attrs['src']:
                    type_score += 4
                elif 'jollyroger' in lang.attrs['src']:
                    type_score += 8
            sub_info['lan'] = type_score
            download_link = bs_obj.find('a', {'id': 'down1'}).attrs['href']
            download_link = urljoin(
                ZimukuDownloader.site_url, download_link)
            r = s.get(download_link, timeout=60)
            bs_obj = BeautifulSoup(r.text, 'html.parser')
            download_link = bs_obj.find('a', {'rel': 'nofollow'})
            download_link = download_link.attrs['href']
            download_link = urljoin(
                ZimukuDownloader.site_url, download_link)
            sub_info['link'] = download_link
        else:
            # Shooter subtitle page
            r = s.get(sub_info['link'], timeout=60)
            bs_obj = BeautifulSoup(r.text, 'html.parser')
            lang_box = bs_obj.find('ul', {'class': 'subinfo'}).find('li')
            type_score = 0
            text = lang_box.text
            if '英' in text:
                type_score += 1
            elif '繁' in text:
                type_score += 2
            elif '简' in text:
                type_score += 4
            elif '双语' in text:
                type_score += 8
            sub_info['lan'] = type_score
            download_link = bs_obj.find('a', {'id': 'down1'}).attrs['href']
            sub_info['link'] = download_link
        backup_session = requests.session()
        backup_session.headers.update(s.headers)
        backup_session.headers['Referer'] = sub_info['link']
        backup_session.cookies.update(s.cookies)
        sub_info['session'] = backup_session
    if (len(sub_dict.items()) > 0
            and list(sub_dict.items())[0][1]['lan'] < 8):
        # first candidate is not bilingual: re-sort by language score
        sub_dict = order_dict(
            sorted(sub_dict.items(),
                   key=lambda e: e[1]['lan'],
                   reverse=True)
        )
    keys = list(sub_dict.keys())[:sub_num]
    return {key: sub_dict[key] for key in keys}

def get_subtitles(self, video_name, sub_num=10):
    print(PREFIX + " Searching ZIMUKU...", end="\r")
    keywords, info_dict = Downloader.get_keywords(video_name)
    keyword = " ".join(keywords)
    info = guessit(keyword)
    keywords.pop(0)
    keywords.insert(0, info["title"])
    if info.get("season"):
        season = str(info["season"]).zfill(2)
        keywords.insert(1, "s" + season)
    sub_dict = order_dict()
    s = requests.session()
    s.headers.update(Downloader.header)
    while True:
        # search with the current keyword
        r = s.get(ZimukuDownloader.search_url + keyword, timeout=10)
        html = r.text
        if "搜索不到相关字幕" not in html:
            bs_obj = BeautifulSoup(r.text, "html.parser")
            if bs_obj.find("div", {"class": "item"}):
                # aggregate search results page
                for item in bs_obj.find_all("div", {"class": "item"}):
                    title_boxes = item.find("div", {"class": "title"}).find_all("p")
                    title_box = title_boxes[0]
                    sub_title_box = title_boxes[1]
                    item_title = title_box.text
                    item_sub_title = sub_title_box.text
                    item_info = guessit(item_title)
                    if info.get("year") and item_info.get("year"):
                        if info["year"] != item_info["year"]:
                            # year mismatch, skip
                            continue
                    item_titles = [
                        item_info.get("title", "").lower(),
                        item_info.get("alternative_title", "").lower(),
                    ] + item_sub_title.lower().strip().split(",")
                    title_included = sum(
                        [
                            1
                            for _ in item_sub_title
                            if info["title"].lower() not in _
                        ]
                    )
                    if title_included == 0:
                        # title extracted by guessit does not match, skip
                        item_title_split = [one.split() for one in item_titles]
                        info_title_split = info["title"].lower().split()
                        sum1 = sum(
                            [
                                1
                                for _ in info_title_split
                                if _ in item_title_split[0]
                            ]
                        )
                        sum2 = sum(
                            [
                                1
                                for _ in info_title_split
                                if _ in item_title_split[1]
                            ]
                        )
                        if not (
                            sum1 / len(info_title_split) >= 0.5
                            or sum2 / len(info_title_split) >= 0.5
                        ):
                            # title mismatch, skip
                            continue
                    for a in item.find_all("td", {"class": "first"})[:3]:
                        a = a.a
                        a_link = ZimukuDownloader.site_url + a.attrs["href"]
                        a_title = a.text
                        a_title = ZimukuDownloader.choice_prefix + a_title
                        sub_dict[a_title] = {"type": "default", "link": a_link}
            elif bs_obj.find("div", {"class": "persub"}):
                # Shooter subtitle page
                for persub in bs_obj.find_all("div", {"class": "persub"}):
                    a_title = persub.h1.text
                    a_link = ZimukuDownloader.site_url + persub.h1.a.attrs["href"]
                    a_title = ZimukuDownloader.choice_prefix + a_title
                    sub_dict[a_title] = {"type": "shooter", "link": a_link}
            else:
                raise ValueError("Unknown page structure in Zimuku search results")
        if len(sub_dict) >= sub_num:
            del keywords[:]
            break
        if len(keywords) > 1:
            keyword = keyword.replace(keywords[-1], "").strip()
            keywords.pop(-1)
            continue
        break
    for sub_name, sub_info in sub_dict.items():
        if sub_info["type"] == "default":
            # subtitle page reached from the aggregate search
            r = s.get(sub_info["link"], timeout=60)
            bs_obj = BeautifulSoup(r.text, "html.parser")
            lang_box = bs_obj.find("ul", {"class": "subinfo"}).find("li")
            type_score = 0
            for lang in lang_box.find_all("img"):
                if "uk" in lang.attrs["src"]:
                    type_score += 1
                elif "hongkong" in lang.attrs["src"]:
                    type_score += 2
                elif "china" in lang.attrs["src"]:
                    type_score += 4
                elif "jollyroger" in lang.attrs["src"]:
                    type_score += 8
            sub_info["lan"] = type_score
            download_link = bs_obj.find("a", {"id": "down1"}).attrs["href"]
            download_link = urljoin(ZimukuDownloader.site_url, download_link)
            r = s.get(download_link, timeout=60)
            bs_obj = BeautifulSoup(r.text, "html.parser")
            download_link = bs_obj.find("a", {"rel": "nofollow"})
            download_link = download_link.attrs["href"]
            download_link = urljoin(ZimukuDownloader.site_url, download_link)
            sub_info["link"] = download_link
        else:
            # Shooter subtitle page
            r = s.get(sub_info["link"], timeout=60)
            bs_obj = BeautifulSoup(r.text, "html.parser")
            lang_box = bs_obj.find("ul", {"class": "subinfo"}).find("li")
            type_score = 0
            text = lang_box.text
            if "英" in text:
                type_score += 1
            elif "繁" in text:
                type_score += 2
            elif "简" in text:
                type_score += 4
            elif "双语" in text:
                type_score += 8
            sub_info["lan"] = type_score
            download_link = bs_obj.find("a", {"id": "down1"}).attrs["href"]
            sub_info["link"] = download_link
        backup_session = requests.session()
        backup_session.headers.update(s.headers)
        backup_session.headers["Referer"] = sub_info["link"]
        backup_session.cookies.update(s.cookies)
        sub_info["session"] = backup_session
    if len(sub_dict.items()) > 0 and list(sub_dict.items())[0][1]["lan"] < 8:
        # first candidate is not bilingual: re-sort by language score
        sub_dict = order_dict(
            sorted(sub_dict.items(), key=lambda e: e[1]["lan"], reverse=True)
        )
    keys = list(sub_dict.keys())[:sub_num]
    return {key: sub_dict[key] for key in keys}

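# How the result of get_subtitles is meant to be consumed: each entry
# carries a ranked language score, a resolved download link, and (for
# ZIMUKU) a backup session whose Referer is already set, presumably
# because the site checks it on download. A hedged usage sketch; the
# output filename is a hypothetical stand-in, and the top-ranked entry
# is assumed to come first in the returned ordered dict:

import requests

downloader = ZimukuDownloader()
subs = downloader.get_subtitles(
    'Homeland.S02E12.PROPER.720p.HDTV.x264-EVOLVE.mkv')
for name, info in subs.items():
    session = info['session'] or requests.session()
    r = session.get(info['link'], timeout=60)  # Referer header already set
    with open('subtitle_archive.zip', 'wb') as f:  # hypothetical filename
        f.write(r.content)
    break  # take only the top-ranked candidate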