def parse_results_page(self, anime_page, anime_english):
    try:
        entries = anime_page.find("table", {"class": "video_lst_v"}).find_all("table", {"class": "video_cell"})
    except:
        tools.catch()
        return {}

    results = {}
    for e in entries:
        try:
            url = e.find("div", {"class": "search_name"}).find("a")
            episode_num = self.get_episode_num(url.text, anime_english)
            print(url.text)
            if not episode_num:
                continue
            if episode_num not in results:
                results[episode_num] = []
            results[episode_num].append((url.find("span").get("title"), url.get("href")))
        except:
            tools.catch()
            continue
    return results
def browser_open(self, url, method="GET", data={}, retry_count=5, retry_delay=5):
    error_reason = ""
    for retry in range(retry_count):
        try:
            return self.handle_method(url, method, data)
        except Exception as e:
            # Not every exception carries a .reason attribute, so fall back to the exception itself.
            error_reason = str(getattr(e, "reason", e))
            catch("url=%s " % url + error_reason)
            time.sleep(retry_delay)
    raise RuntimeError("Unable to open URL %s after %d retries" % (url, retry_count))
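# Standalone sketch of the retry pattern that browser_open above relies on; `fetch` and
# `open_with_retries` are illustrative names, not part of this repo.
import time

def open_with_retries(fetch, url, retry_count=5, retry_delay=5):
    # Try the request up to retry_count times, sleeping retry_delay seconds between attempts.
    for _ in range(retry_count):
        try:
            return fetch(url)
        except Exception as e:
            print("url=%s %s" % (url, e))
            time.sleep(retry_delay)
    raise RuntimeError("Unable to open URL %s after %d retries" % (url, retry_count))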
def search_anime(self, anime_english, anime_aliases=[], type_=""):
    names = [anime_english] + anime_aliases
    if anime_english in misc.ALIASES:
        names += misc.ALIASES[anime_english]

    found = False
    for anime_name in names:
        page_name = "%s.html" % anime_name
        page_data = self.load_page(page_name)
        print(anime_name)
        if not page_data:
            print("!page_data")
            try:
                built_url = self.build_search_url(anime_name)
                res = self.browser_open(built_url)
            except RuntimeError:
                tools.catch()
                continue
            resp = res.read()
            if not resp:
                print("!resp")
                continue
            anime_page_url = self._find_best_match(resp, anime_names=names)
            print("search: anime_page_url=%s" % anime_page_url)
            if not anime_page_url:
                print("!anime_page_url")
                continue
            try:
                #print(anime_page_url)
                res = self.browser_open(anime_page_url)
            except RuntimeError:
                tools.catch()
                continue
            page_data = res.get_data()
            self.save_page(page_name, page_data)
        found = True
        break

    if not found:
        return None
    return page_data
def search_anime(self, anime_english, anime_aliases=[], type_=""):
    names = [anime_english]
    if anime_english in misc.ALIASES:
        names += misc.ALIASES[anime_english]

    found = False
    for anime_name in names:
        page_name = "%s.html" % anime_name
        page_data = self.load_page(page_name)
        print(anime_name)
        print(names)
        if not page_data:
            try:
                built_url = self.build_search_url(anime_name)
                res = self.browser_open(built_url)
            except RuntimeError:
                tools.catch()
                return self.handler_resource_is_unavailable()
            page_data = res.get_data()
            redir_url = res.geturl()
            # not found
            html = BeautifulSoup(page_data, features="html5lib")
            p = html.find("div", {"class": "content"}).find("p", {"style": "margin-top:20px; text-align:center;"})
            # "По Вашему запросу ничего не найдено" = "Nothing was found for your query"
            if p and p.text == "По Вашему запросу ничего не найдено":
                print("not found")
                continue
        if not page_data:
            print("not page_data")
            continue
        self.save_page(page_name, page_data)
        found = True
        self.anime_aliases[anime_english] = anime_name
        break

    if not found:
        return None
    return page_data
def parse_anime_page(self, anime_english, type_=""):
    anime_page = self.search_anime(anime_english, type_=type_)
    if not anime_page:
        print("parse_anime_page: not found")
        return self.handler_anime_not_found(anime_english)

    content_main = BeautifulSoup(anime_page, features="html5lib")
    authors = "[Anilibria]"
    release_info = content_main.find("div", {"id": "xreleaseInfo"})
    if release_info:
        release_info = list(release_info.strings)
        # "Озвучка:" = "Voice-over:"; the next string holds the list of dubbers.
        if 'Озвучка:' in release_info:
            try:
                dubbers = release_info[release_info.index('Озвучка:') + 1].lstrip()
                dubbers = " & ".join(dubbers.split(", "))
                authors += "(%s)" % dubbers
            except IndexError:
                tools.catch()

    # Episode files are embedded in an inline "new Playerjs({...})" call on the page.
    videos = {}
    videos_start_idx = anime_page.find(b"new Playerjs(")
    if videos_start_idx < 0:
        return authors, videos
    videos_end_idx = anime_page.find(b"});", videos_start_idx)
    if videos_end_idx < 0:
        return authors, videos
    videos_unparsed = anime_page[videos_start_idx + len(b"new Playerjs("):videos_end_idx + 1]
    try:
        videos = {int(f["id"].split("s")[-1]): f["file"] for f in demjson.decode(videos_unparsed)["file"]}
    except:
        tools.catch()
    return authors, videos
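# Standalone sketch of the Playerjs extraction performed by parse_anime_page above; the
# sample `html` bytes are illustrative, not a real Anilibria page, and demjson is used only
# because the function above already depends on it.
import demjson

html = b'<script>var player = new Playerjs({"id":"player","file":[{"id":"s1","file":"//host/ep1.mp4"}]});</script>'
start = html.find(b"new Playerjs(")
end = html.find(b"});", start)
config = demjson.decode(html[start + len(b"new Playerjs("):end + 1].decode("utf-8"))
videos = {int(f["id"].split("s")[-1]): f["file"] for f in config["file"]}
print(videos)  # {1: '//host/ep1.mp4'}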
def _find_best_match(self, resp, anime_names):
    page = BeautifulSoup(resp, features="html5lib")
    urls = [article.find("a") for article in page.find_all("article")]
    if not urls:
        return
    try:
        results = {a.get("title").split(",")[0]: a.get("href") for a in urls}
    except Exception as e:
        tools.catch(str(e))
        return
    print(results)

    best_score = 0
    best_result = None
    print("_find_best_match: names: %s" % str(anime_names))
    for name in anime_names:
        print("_find_best_match: name: %s" % name)
        for k, v in results.items():
            score = fuzz.ratio(name, k)
            if score > best_score:
                best_score = score
                best_result = v
        print("%s: score=%d" % (best_result, best_score))

    if not best_result:
        return
    if best_score < self.name_match_threshold:
        print("%s has score %d, rejecting" % (str(best_result), best_score))
        best_result = None
    return best_result
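# Self-contained sketch of the fuzzy-title selection that _find_best_match above performs;
# the candidates dict, the threshold value, and the fuzzywuzzy import are assumptions, not
# taken from this repo.
from fuzzywuzzy import fuzz

def best_match(name, candidates, threshold=80):
    # candidates maps result titles to URLs; return the URL of the closest title,
    # or None if nothing reaches the threshold.
    best_url, best_score = None, 0
    for title, url in candidates.items():
        score = fuzz.ratio(name, title)
        if score > best_score:
            best_score, best_url = score, url
    return best_url if best_score >= threshold else None

print(best_match("Boku no Hero Academia", {"Boku no Hero Academia 4": "/anime/38408", "Black Clover": "/anime/34572"}))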
def get_videos_list(self, anime_english, episode_num, type_=""):
    existing_video = AnimeVideo.query.filter(AnimeVideo.anime_english == anime_english,
                                             AnimeVideo.episode == episode_num,
                                             AnimeVideo.url.like("%libria%")).first()
    if existing_video:
        return self.handler_epidode_exists(anime_english, episode_num, existing_video.url)

    try:
        obj = self.parse_anime_page(anime_english, type_)
        authors, videos = obj
    except:
        #print("parse_anime_page returned %s" % str(obj))
        raise

    if episode_num not in videos:
        return self.handler_epidode_not_found(anime_english, episode_num)
    if not authors:
        return self.handler_authors_not_found(anime_english)

    videos_list = pd.DataFrame(columns=["url", "episode", "kind", "quality", "video_hosting", "language", "author"])
    video_url = videos[episode_num]
    quality = "unknown"
    try:
        quality = self.get_quality(video_url)
    except:
        tools.catch()

    videos_list = videos_list.append({
        "url": self.url_to_embed(video_url),
        "episode": str(episode_num),
        "video_hosting": self.netloc,
        "author": authors,
        "quality": quality,
        "language": "russian",
        "kind": self.to_db_kind["fandub"]
    }, ignore_index=True)
    return videos_list
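# Side note, not from the repo: DataFrame.append was removed in pandas 2.x. A version-safe
# way to assemble a frame like videos_list above (values here are illustrative) is to
# collect row dicts and build the DataFrame once.
import pandas as pd

columns = ["url", "episode", "kind", "quality", "video_hosting", "language", "author"]
rows = [{"url": "https://example.org/embed/1", "episode": "1", "kind": "fandub",
         "quality": "unknown", "video_hosting": "example.org", "language": "russian",
         "author": "[Anilibria]"}]
videos_list = pd.DataFrame(rows, columns=columns)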
def find_animes(parsers=OrderedDict([("anilibria", anilibria.AnilibriaParser),
                                     ("smotretanime", anime365.Anime365Parser),
                                     ("sovetromantica", sovetromantica.SRParser),
                                     ("sibnet", sibnet.SibnetParser)]),
                anime_ids=[],
                media_kind=MEDIA_KIND_VIDEOS,
                fetch_only_ongoings=True,
                fetch_all_episodes=False,
                filter_by_unique_url=True,
                use_anime_aliases=True):
    if not anime_ids:
        ongoings.main()
        if fetch_only_ongoings:
            anime_ids = ongoings.ONGOING_IDS
        else:
            print(misc.MANUALLY_TRACKED_IDS)
            anime_ids = ongoings.ONGOING_IDS + misc.MANUALLY_TRACKED_IDS

    result = pd.DataFrame()
    for hosting, Parser in parsers.items():
        print("hosting: " + hosting)
        parser = Parser()
        if not parser.is_media_kind_supported(media_kind):
            print("parser doesn't support media kind %s" % media_kind)
            continue
        else:
            print("fetching media kind %s" % media_kind)

        total = len(anime_ids)
        for n, anime_id in enumerate(anime_ids, start=1):
            note = "found"
            shiki_ongoing_data = {}
            try:
                anime_info = routes.get_anime_info(anime_id)
            except:
                if not (fetch_only_ongoings or anime_id in ongoings.ONGOING_IDS):
                    note = "not found"
                    print("[%d / %d]: %s" % (n, total, note))
                    continue
                catch()
                try:
                    shiki_ongoing_data = ongoings.parse_ongoing(ongoings.get_ongoing_html(anime_id))
                    if not shiki_ongoing_data:
                        continue
                except:
                    catch()
                    continue
                if not shiki_ongoing_data["anime_russian"] or not shiki_ongoing_data["anime_english"]:
                    note = "not found in database and couldn't retrieve anime names, skipping"
                    print("[%d / %d]: %s: %s" % (n, total, anime_id, note))
                    continue
                note = "not found in database, will create first entries"
                anime_info = {
                    "anime_english": shiki_ongoing_data["anime_english"],
                    "anime_russian": shiki_ongoing_data["anime_russian"],
                    "duration": 0
                }
                print("[%d / %d] %s: %s" % (n, total, anime_info["anime_english"], note))

            if anime_info["anime_english"] in misc.SKIP:
                note = "anime was explicitly specified to skip fetch"
                print("[%d / %d] %s: %s" % (n, total, anime_info["anime_english"], note))
                continue

            search_kwargs = {}
            if fetch_only_ongoings or anime_id in ongoings.ONGOING_IDS:
                if not shiki_ongoing_data:
                    try:
                        shiki_ongoing_data = ongoings.parse_ongoing(ongoings.get_ongoing_html(anime_id))
                    except:
                        catch()
                        continue
                    if not shiki_ongoing_data:
                        continue
                if shiki_ongoing_data["type"]:
                    #print("type: %s" % shiki_ongoing_data["type"])
                    search_kwargs["type_"] = shiki_ongoing_data["type"]

            if use_anime_aliases:
                search_kwargs["anime_aliases"] = []
                if anime_info["anime_russian"]:
                    search_kwargs["anime_aliases"] = [anime_info["anime_russian"]]
                in_forced_list = False
                if hosting in misc.FORCE_ALIASES:
                    in_forced_list = anime_info["anime_english"] in misc.FORCE_ALIASES[hosting]
                #print("%s: in forced list: %d: %s" % (anime_info["anime_english"], in_forced_list, str(misc.FORCE_ALIASES[hosting])))
                #if in_forced_list:
                #    aliases = misc.FORCE_ALIASES[hosting][anime_info["anime_english"]]
                #    for a in aliases:
                #        if not a.endswith(".html"):
                #            anime_info["anime_english"] = a
                #            break
                #if (hosting in misc.FORCE_ALIASES) and (anime_info["anime_english"] in misc.FORCE_ALIASES[hosting]):
                #    aliases = misc.FORCE_ALIASES[hosting][anime_info["anime_english"]]
                #    #print("%s: forcing name '%s' because found in FORCE_ALIASES" % (anime_info["anime_english"], forced_name))
                #    #anime_info["anime_english"] = forced_name
                #    for a in aliases:
                #        if not a.endswith(".html"):
                #            search_kwargs["anime_aliases"] = a
                #            break

            if not parser.search_anime(anime_info["anime_english"], **search_kwargs):
                note = "not found"
                print("[%d / %d] %s: %s" % (n, total, anime_info["anime_english"], note))
                continue
            print("[%d / %d] %s: %s" % (n, total, anime_info["anime_english"], note))

            if media_kind == MEDIA_KIND_VIDEOS:
                tmp_videos_list = get_videos_list(parser, n, total, anime_id, hosting, anime_info,
                                                  shiki_ongoing_data, fetch_only_ongoings,
                                                  fetch_all_episodes, filter_by_unique_url)
                if tmp_videos_list is None or tmp_videos_list.empty:
                    continue
                result = merge_search_results(result, tmp_videos_list)
    return result
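# Hedged usage sketch for find_animes above (the anime id and single-parser setup are
# illustrative; OrderedDict and the anilibria module are assumed to be imported at module
# level, as the default arguments suggest):
#
#     df = find_animes(
#         parsers=OrderedDict([("anilibria", anilibria.AnilibriaParser)]),
#         anime_ids=[50000],        # hypothetical Shikimori anime id
#         fetch_only_ongoings=False,
#     )
#     print(df)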
def get_videos_list(self, anime_english, episode_num, type_=""):
    episodes = []
    anime_page = self.search_anime(anime_english, type_=type_)
    if not anime_page:
        return self.handler_anime_not_found(anime_english)

    content_main = BeautifulSoup(anime_page, features="html5lib")
    episodes += [a.get("href") for a in content_main.find_all("a", {"class": "episodeButtonDownload"})]
    if not episodes:
        return self.handler_episodes_list_not_found(anime_english)

    nav = {}
    try:
        nav = {a.text: a.get("href")
               for a in content_main.find("div", {"class": "episode_info"}).find("nav").find_all("a")}
    except:
        tools.catch()

    # Fall back to the main page if there is no separate fandub ("Озвучка") tab.
    content = content_main
    if "Озвучка" in nav:
        url = self.build_url(path=nav["Озвучка"])
        page_name = os.path.join(anime_english, "fandub.html")
        #print(page_name, url)
        anime_page = self.load_or_save_page(page_name, url)
        content = BeautifulSoup(anime_page, features="html5lib")
        episodes += [a.get("href") for a in content.find_all("a", {"class": "episodeButtonDownload"})]

    videos_list = pd.DataFrame(columns=["url", "episode", "kind", "quality", "video_hosting", "language", "author"])
    anime_id = self.get_anime_id(episodes[0])
    if not anime_id.isdigit():
        return
    anime_id = int(anime_id)

    quality = "unknown"
    author = "unknown"
    for url in episodes:
        episode_num_ = self.get_episode_num(url)
        if not episode_num_ or not episode_num_.isdigit():
            continue
        episode_num_ = int(episode_num_)
        if episode_num_ != episode_num:
            continue

        kind = self.get_anime_kind(url)
        shiki_kind = self.to_shiki_kind[kind]
        all_authors = content.find_all("div", {"class": "anime-team"})
        if shiki_kind == "subtitles":
            try:
                author = " & ".join([s for s in all_authors[0].strings if not s.isspace()])
                print(author)
            except:
                tools.catch()
            try:
                all_authors = [[i for i in list(d.strings) if not i.isspace()]
                               for d in content_main.find_all("div", {"class": "anime-team"})][0]
                author = " & ".join(all_authors)
                print(author)
            except:
                tools.catch()
        try:
            if shiki_kind == "fandub":
                author = " & ".join([s for s in all_authors[1].strings if not s.isspace()])
                print(author)
        except (IndexError, AttributeError):
            tools.catch()
            continue

        #print(url, kind)
        videos_list = videos_list.append({
            "url": self.url_to_embed(url, kind),
            "episode": str(episode_num_),
            "video_hosting": self.netloc,
            "author": author,
            "quality": quality,
            "language": "russian",
            "kind": self.to_db_kind[shiki_kind]
        }, ignore_index=True)
    return videos_list
def search_anime(self, anime_english, anime_aliases=[], type_=""):
    names = [anime_english]
    anime_page_url = ""
    #print("%s aliases = %s" % (anime_english, misc.FORCE_ALIASES["anilibria"][anime_english]))
    if anime_english in misc.FORCE_ALIASES["anilibria"]:
        for a in misc.FORCE_ALIASES["anilibria"][anime_english]:
            if not a.endswith(".html"):
                names += [a]
            else:
                print("release = %s" % a)
                anime_page_url = a

    found = (anime_page_url != "")
    if anime_page_url:
        try:
            #print(anime_page_url)
            res = self.browser_open(self.build_url(path=anime_page_url))
        except RuntimeError:
            tools.catch()
            return None
        page_name = "%s.html" % names[0]
        page_data = res.get_data()
        self.save_page(page_name, page_data)
        return page_data

    print("anilibria: search_anime: anime_english=%s, anime_aliases=%s, names=%s" % (anime_english, str(anime_aliases), str(names)))
    for anime_name in names:
        page_name = "%s.html" % anime_name
        page_data = self.load_page(page_name)
        print(anime_name)
        if not page_data:
            print("!page_data")
            try:
                built_url, query_kwargs = self.build_search_url(anime_name, method="POST")
                res = self.browser_open(built_url, method="POST", data=query_kwargs)
            except RuntimeError:
                tools.catch()
                continue
            resp = res.read()
            if not resp:
                self.save_page(page_name, b"")
                print("!resp")
                continue
            try:
                resp = demjson.decode(resp)
            except:
                tools.catch()
                continue
            if "err" not in resp or "mes" not in resp or resp["err"] != "ok":
                print("resp is not ok")
                continue
            resp_data = resp["mes"]
            if not resp_data and not anime_page_url:
                print("!resp_data")
                continue
            if not anime_page_url:
                anime_page_url = self._find_best_match(resp_data, anime_names=anime_aliases)
                print("search: anime_page_url=%s" % anime_page_url)
            if not anime_page_url:
                print("!anime_page_url")
                continue
            try:
                #print(anime_page_url)
                res = self.browser_open(self.build_url(path=anime_page_url))
            except RuntimeError:
                tools.catch()
                continue
        found = True
        break

    if not found:
        return None
    if not page_data:
        page_data = res.get_data()
        self.save_page(page_name, page_data)
    return page_data