Code example #1
	def parse_results_page(self, anime_page, anime_english):
		try:
			entries = anime_page.find("table", {"class": "video_lst_v"}).find_all("table", {"class": "video_cell"})
		except:
			tools.catch()
			return {}

		results = {}

		for e in entries:
			try:
				url = e.find("div", {"class": "search_name"}).find("a")
				episode_num = self.get_episode_num(url.text, anime_english)

				print(url.text)
				if not episode_num:
					continue

				if episode_num not in results:
					results[episode_num] = []

				results[episode_num].append((url.find("span").get("title"), url.get("href")))
			except:
				tools.catch()
				continue

		return results
Code example #2
File: parser.py  Project: PlayShikiApp/parsers
 def browser_open(self,
                  url,
                  method="GET",
                  data={},
                  retry_count=5,
                  retry_delay=5):
     error_reason = ""
     for retry in range(retry_count):
         try:
             return self.handle_method(url, method, data)
         except Exception as e:
              # not every exception carries a .reason attribute (urllib errors do)
              error_reason = str(getattr(e, "reason", e))
             catch("url=%s " % url + error_reason)
         time.sleep(retry_delay)
     raise RuntimeError("Unable to open URL %s after %d retries" %
                        (url, retry_count))
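
The helper above retries a request a fixed number of times with a constant delay before giving up. Below is a minimal standalone sketch of the same retry pattern, assuming a plain urllib fetch in place of the project's handle_method wrapper (the URL in the usage comment is made up):

import time
import urllib.request

def open_with_retries(url, retry_count=5, retry_delay=5):
    # Try the request up to retry_count times, sleeping between attempts.
    last_error = ""
    for attempt in range(retry_count):
        try:
            return urllib.request.urlopen(url)
        except Exception as e:
            last_error = str(e)
            print("attempt %d failed for %s: %s" % (attempt + 1, url, last_error))
        time.sleep(retry_delay)
    raise RuntimeError("Unable to open URL %s after %d retries (%s)" %
                       (url, retry_count, last_error))

# page = open_with_retries("https://example.org/search?q=anime")
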
Code example #3
    def search_anime(self, anime_english, anime_aliases=[], type_=""):
        names = [anime_english] + anime_aliases

        if anime_english in misc.ALIASES:
            names += misc.ALIASES[anime_english]

        found = False
        for anime_name in names:
            page_name = "%s.html" % anime_name
            page_data = self.load_page(page_name)
            print(anime_name)
            if not page_data:
                print("!page_data")
                try:
                    built_url = self.build_search_url(anime_name)
                    res = self.browser_open(built_url)
                except RuntimeError:
                    tools.catch()
                    continue
                resp = res.read()

                if not resp:
                    print("!resp")
                    continue

                anime_page_url = self._find_best_match(resp, anime_names=names)
                print("search: anime_page_url=%s" % anime_page_url)
                if not anime_page_url:
                    print("!anime_page_url")
                    continue
                try:
                    #print(anime_page_url)
                    res = self.browser_open(anime_page_url)
                except RuntimeError:
                    tools.catch()
                    continue

                page_data = res.get_data()

            self.save_page(page_name, page_data)
            found = True
            break

        if not found:
            return None

        return page_data
Code example #4
	def search_anime(self, anime_english, anime_aliases = [], type_ = ""):
		names = [anime_english]

		if anime_english in misc.ALIASES:
			names += misc.ALIASES[anime_english]

		found = False
		for anime_name in names:
			page_name = "%s.html" % anime_name
			page_data = self.load_page(page_name)
			print(anime_name)
			print(names)
			if not page_data:
				try:
					built_url = self.build_search_url(anime_name)
					res = self.browser_open(built_url)
				except RuntimeError:
					tools.catch()
					return self.handler_resource_is_unavailable()
				page_data = res.get_data()
				redir_url = res.geturl()

			# not found: "По Вашему запросу ничего не найдено" = "Nothing was found for your query"
			html = BeautifulSoup(page_data, features = "html5lib")
			p = html.find("div", {"class": "content"}).find("p", {"style": "margin-top:20px; text-align:center;"})
			if p and p.text == "По Вашему запросу ничего не найдено":
				print("not found")
				continue

			if not page_data:
				print("not page_data")
				continue
			self.save_page(page_name, page_data)
			found = True
			self.anime_aliases[anime_english] = anime_name
			break

		if not found:
			return None

		return page_data
Code example #5
File: anilibria.py  Project: PlayShikiApp/parsers
	def parse_anime_page(self, anime_english, type_ = ""):
		# pass type_ by keyword: the second positional parameter of search_anime is anime_aliases
		anime_page = self.search_anime(anime_english, type_=type_)
		if not anime_page:
			print("parse_anime_page: not found")
			return self.handler_anime_not_found(anime_english)

		content_main = BeautifulSoup(anime_page, features = "html5lib")
		
		authors = "[Anilibria]"
		release_info = content_main.find("div", {"id": "xreleaseInfo"})
		if release_info:
			release_info = list(release_info.strings)
			# 'Озвучка:' is Russian for 'Dubbing:' / 'Voice-over:'
			if 'Озвучка:' in release_info:
				try:
					dubbers = release_info[release_info.index('Озвучка:') + 1].lstrip()
					dubbers = " & ".join(dubbers.split(", "))
					authors += "(%s)" % dubbers

				except IndexError:
					tools.catch()
		

		videos = {}
		videos_start_idx = anime_page.find(b"new Playerjs(")
		if videos_start_idx < 0:
			return authors, videos

		videos_end_idx = anime_page.find(b"});", videos_start_idx)

		if videos_end_idx < 0:
			return authors, videos

		videos_unparsed = anime_page[videos_start_idx + len(b"new Playerjs("): videos_end_idx + 1]

		try:
			videos = {int(f["id"].split("s")[-1]): f["file"] for f in demjson.decode(videos_unparsed)["file"]}
		except:
			tools.catch()

		return authors, videos
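
The video list above is recovered by slicing the raw page bytes between "new Playerjs(" and the closing "});" and decoding the argument with demjson's lenient parser. A small self-contained sketch of that extraction step, using made-up sample markup:

import demjson  # lenient JSON/JS-object decoder used by the parser

def extract_playerjs_config(raw_html):
    # Return the object literal passed to "new Playerjs(...)", or None if absent.
    start = raw_html.find(b"new Playerjs(")
    if start < 0:
        return None
    end = raw_html.find(b"});", start)
    if end < 0:
        return None
    blob = raw_html[start + len(b"new Playerjs("): end + 1]
    return demjson.decode(blob.decode("utf-8"))

sample = b'<script>new Playerjs({"id": "player", "file": [{"id": "s1", "file": "ep1.mp4"}]});</script>'
print(extract_playerjs_config(sample))
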
Code example #6
    def _find_best_match(self, resp, anime_names):
        page = BeautifulSoup(resp, features="html5lib")
        urls = [article.find("a") for article in page.find_all("article")]
        if not urls:
            return

        try:
            results = {
                a.get("title").split(",")[0]: a.get("href")
                for a in urls
            }
        except Exception as e:
            tools.catch(str(e))
            return

        print(results)
        best_score = 0
        best_result = None
        print("_find_best_match: names: %s" % str(anime_names))

        for name in anime_names:
            print("_find_best_match: name: %s" % name)

            for k, v in results.items():
                score = fuzz.ratio(name, k)
                if score > best_score:
                    best_score = score
                    best_result = v
                    print("%s: score=%d" % (best_result, best_score))

        if not best_result:
            return

        if best_score < self.name_match_threshold:
            print("%s has score %d, rejecting" %
                  (str(best_result), best_score))
            best_result = None

        return best_result
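
The matching step above scores every candidate title against every known name with fuzz.ratio, keeps the highest-scoring URL, and rejects it if the score falls below the parser's name_match_threshold. A compact sketch of that selection logic, assuming the fuzzywuzzy package (rapidfuzz exposes a compatible fuzz.ratio) and a made-up threshold of 60:

from fuzzywuzzy import fuzz

def pick_best_match(anime_names, results, threshold=60):
    # results maps candidate page titles to their URLs.
    best_score, best_url = 0, None
    for name in anime_names:
        for title, url in results.items():
            score = fuzz.ratio(name, title)  # 0..100 similarity
            if score > best_score:
                best_score, best_url = score, url
    return best_url if best_score >= threshold else None

print(pick_best_match(["Fullmetal Alchemist: Brotherhood"],
                      {"Fullmetal Alchemist: Brotherhood, episode 1": "/fmab",
                       "Steins;Gate": "/sg"}))
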
Code example #7
File: anilibria.py  Project: PlayShikiApp/parsers
	def get_videos_list(self, anime_english, episode_num, type_ = ""):
		existing_video = AnimeVideo.query.filter(
			AnimeVideo.anime_english == anime_english,
			AnimeVideo.episode == episode_num,
			AnimeVideo.url.like("%libria%")).first()
		if existing_video:
			return self.handler_epidode_exists(anime_english, episode_num, existing_video.url)

		try:
			obj = self.parse_anime_page(anime_english, type_)
			authors, videos = obj
		except:
			#print("parse_anime_page returned %s" % str(obj))
			raise

		if episode_num not in videos:
			return self.handler_epidode_not_found(anime_english, episode_num)

		if not authors:
			return self.handler_authors_not_found(anime_english)

		videos_list = pd.DataFrame(columns = ["url", "episode", "kind", "quality", "video_hosting", "language", "author"])

		video_url = videos[episode_num]
		quality = "unknown"
		try:
			quality = self.get_quality(video_url)
		except:
			tools.catch()

		videos_list = videos_list.append({
			"url": self.url_to_embed(video_url),
			"episode": str(episode_num),
			"video_hosting": self.netloc,
			"author": authors,
			"quality": quality,
			"language": "russian",
			"kind": self.to_db_kind["fandub"]
		}, ignore_index = True)

		return videos_list
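
A note on the DataFrame building step: DataFrame.append (also used in example #9) was deprecated in pandas 1.4 and removed in 2.0. On a current pandas the same one-row append can be written with pd.concat, roughly as follows (field values below are made up for illustration):

import pandas as pd

columns = ["url", "episode", "kind", "quality", "video_hosting", "language", "author"]
videos_list = pd.DataFrame(columns=columns)

row = {
    "url": "https://example.org/embed/123",  # made-up values
    "episode": "1",
    "kind": "fandub",
    "quality": "unknown",
    "video_hosting": "example.org",
    "language": "russian",
    "author": "[Anilibria]",
}
# pd.concat replaces the removed DataFrame.append for adding a single row
videos_list = pd.concat([videos_list, pd.DataFrame([row])], ignore_index=True)
print(videos_list)
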
Code example #8
def find_animes(parsers=OrderedDict([("anilibria", anilibria.AnilibriaParser),
                                     ("smotretanime", anime365.Anime365Parser),
                                     ("sovetromantica",
                                      sovetromantica.SRParser),
                                     ("sibnet", sibnet.SibnetParser)]),
                anime_ids=[],
                media_kind=MEDIA_KIND_VIDEOS,
                fetch_only_ongoings=True,
                fetch_all_episodes=False,
                filter_by_unique_url=True,
                use_anime_aliases=True):

    if not anime_ids:
        ongoings.main()
        if fetch_only_ongoings:
            anime_ids = ongoings.ONGOING_IDS
        else:
            print(misc.MANUALLY_TRACKED_IDS)
            anime_ids = ongoings.ONGOING_IDS + misc.MANUALLY_TRACKED_IDS

    result = pd.DataFrame()
    for hosting, Parser in parsers.items():
        print("hosting: " + hosting)
        parser = Parser()
        if not parser.is_media_kind_supported(media_kind):
            print("parser doesn't support media kind %s" % media_kind)
            continue
        else:
            print("fetching media kind %s" % media_kind)

        total = len(anime_ids)
        for n, anime_id in enumerate(anime_ids, start=1):
            note = "found"
            shiki_ongoing_data = {}
            try:
                anime_info = routes.get_anime_info(anime_id)
            except:
                if not (fetch_only_ongoings or anime_id in ongoings.ONGOING_IDS):
                    note = "not found"
                    print("[%d / %d]: %s" % (n, total, note))
                    continue
                catch()
                try:
                    shiki_ongoing_data = ongoings.parse_ongoing(
                        ongoings.get_ongoing_html(anime_id))
                    if not shiki_ongoing_data:
                        continue
                except:
                    catch()
                    continue
                if not shiki_ongoing_data["anime_russian"] or not shiki_ongoing_data["anime_english"]:
                    note = "not found in database and couldn't retrieve anime names, skipping"
                    # anime_info is not available in this branch, so report the id instead
                    print("[%d / %d]: %s: %s" % (n, total, anime_id, note))
                    continue

                note = "not found in database, will create first entries"
                anime_info = {
                    "anime_english": shiki_ongoing_data["anime_english"],
                    "anime_russian": shiki_ongoing_data["anime_russian"],
                    "duration": 0
                }
                print("[%d / %d] %s: %s" %
                      (n, total, anime_info["anime_english"], note))

            if anime_info["anime_english"] in misc.SKIP:
                note = "anime was explicitly specified to skip fetch"
                print("[%d / %d] %s: %s" %
                      (n, total, anime_info["anime_english"], note))
                continue

            search_kwargs = {}

            if fetch_only_ongoings or anime_id in ongoings.ONGOING_IDS:
                if not shiki_ongoing_data:
                    try:
                        shiki_ongoing_data = ongoings.parse_ongoing(
                            ongoings.get_ongoing_html(anime_id))
                    except:
                        catch()
                        continue

                if not shiki_ongoing_data:
                    continue

                if shiki_ongoing_data["type"]:
                    #print("type: %s" % shiki_ongoing_data["type"])
                    search_kwargs["type_"] = shiki_ongoing_data["type"]

            if use_anime_aliases:
                search_kwargs["anime_aliases"] = []
                if anime_info["anime_russian"]:
                    search_kwargs["anime_aliases"] = [
                        anime_info["anime_russian"]
                    ]
                in_forced_list = False
                if hosting in misc.FORCE_ALIASES:
                    in_forced_list = anime_info["anime_english"] in misc.FORCE_ALIASES[hosting]
                #print("%s: in forced list: %d: %s" %(anime_info["anime_english"], in_forced_list, str(misc.FORCE_ALIASES[hosting])))
                #if in_forced_list:
                #	aliases = misc.FORCE_ALIASES[hosting][anime_info["anime_english"]]
                #	for a in aliases:
                #		if not a.endswith(".html"):
                #			anime_info["anime_english"] = a
                #			break

                #if (hosting in misc.FORCE_ALIASES) and (anime_info["anime_english"] in misc.FORCE_ALIASES[hosting]):
                #	aliases = misc.FORCE_ALIASES[hosting][anime_info["anime_english"]]
                #	#print("%s: forcing name '%s' because found in FORCE_ALIASES" % (anime_info["anime_english"], forced_name))
                #	#anime_info["anime_english"] = forced_name
                #	for a in aliases:
                #		if not a.endswith(".html"):
                #			search_kwargs["anime_aliases"]  = a
                #			break

            if not parser.search_anime(anime_info["anime_english"], **
                                       search_kwargs):
                note = "not found"
                print("[%d / %d] %s: %s" %
                      (n, total, anime_info["anime_english"], note))
                continue

            print("[%d / %d] %s: %s" %
                  (n, total, anime_info["anime_english"], note))

            tmp_videos_list = None
            if media_kind == MEDIA_KIND_VIDEOS:
                tmp_videos_list = get_videos_list(parser, n, total, anime_id,
                                                  hosting, anime_info,
                                                  shiki_ongoing_data,
                                                  fetch_only_ongoings,
                                                  fetch_all_episodes,
                                                  filter_by_unique_url)

            if tmp_videos_list is None or tmp_videos_list.empty:
                continue

            result = merge_search_results(result, tmp_videos_list)
    return result
Code example #9
    def get_videos_list(self, anime_english, episode_num, type_=""):
        episodes = []
        # pass type_ by keyword so it is not treated as anime_aliases
        anime_page = self.search_anime(anime_english, type_=type_)
        if not anime_page:
            return self.handler_anime_not_found(anime_english)

        content_main = BeautifulSoup(anime_page, features="html5lib")
        episodes += [
            a.get("href") for a in content_main.find_all(
                "a", {"class": "episodeButtonDownload"})
        ]

        if not episodes:
            return self.handler_episodes_list_not_found(anime_english)

        nav = {}
        try:
            nav = {
                a.text: a.get("href")
                for a in content_main.find("div", {
                    "class": "episode_info"
                }).find("nav").find_all("a")
            }
        except:
            tools.catch()

        if "Озвучка" in nav:
            url = self.build_url(path=nav["Озвучка"])
            page_name = os.path.join(anime_english, "fandub.html")
            #print(page_name, url)
            anime_page = self.load_or_save_page(page_name, url)
            content = BeautifulSoup(anime_page, features="html5lib")
            episodes += [
                a.get("href") for a in content.find_all(
                    "a", {"class": "episodeButtonDownload"})
            ]

        videos_list = pd.DataFrame(columns=[
            "url", "episode", "kind", "quality", "video_hosting", "language",
            "author"
        ])
        anime_id = self.get_anime_id(episodes[0])
        if not anime_id.isdigit():
            return
        anime_id = int(anime_id)
        quality = "unknown"
        author = "unknown"

        for url in episodes:
            episode_num_ = self.get_episode_num(url)
            if not episode_num_ or not (episode_num_.isdigit()):
                continue

            episode_num_ = int(episode_num_)

            if episode_num_ != episode_num:
                continue

            kind = self.get_anime_kind(url)
            shiki_kind = self.to_shiki_kind[kind]
            all_authors = content.find_all("div", {"class": "anime-team"})
            if shiki_kind == "subtitles":
                try:
                    author = " & ".join(
                        [s for s in all_authors[0].strings if not s.isspace()])
                    print(author)
                except Exception:
                    tools.catch()
                try:
                    all_authors = [[
                        i for i in list(d.strings) if not i.isspace()
                    ]
                                   for d in content_main.find_all(
                                       "div", {"class": "anime-team"})][0]
                    author = " & ".join(all_authors)
                    print(author)
                except Exception:
                    tools.catch()

            try:
                if shiki_kind == "fandub":
                    author = " & ".join(
                        [s for s in all_authors[1].strings if not s.isspace()])
                    print(author)
            except (IndexError, AttributeError):
                tools.catch()
                continue

            #print(url, kind)
            videos_list = videos_list.append(
                {
                    "url": self.url_to_embed(url, kind),
                    "episode": str(episode_num_),
                    "video_hosting": self.netloc,
                    "author": author,
                    "quality": quality,
                    "language": "russian",
                    "kind": self.to_db_kind[shiki_kind]
                },
                ignore_index=True)

        return videos_list
Code example #10
File: anilibria.py  Project: PlayShikiApp/parsers
	def search_anime(self, anime_english, anime_aliases = [], type_ = ""):
		names = [anime_english]
		anime_page_url = ""

		#print("%s aliases = %s" % (anime_english, misc.FORCE_ALIASES["anilibria"][anime_english]))

		if anime_english in misc.FORCE_ALIASES["anilibria"]:
			for a in misc.FORCE_ALIASES["anilibria"][anime_english]:
				if not a.endswith(".html"):
					names += [a]
				else:
					print("release = %s" % a)
					anime_page_url = a

		found = (anime_page_url != "")
		if anime_page_url:
			try:
				#print(anime_page_url)
				res = self.browser_open(self.build_url(path = anime_page_url))
			except RuntimeError:
				tools.catch()
				# res would be unbound below if the request failed
				return None

			page_name = "%s.html" % names[0]
			page_data = res.get_data()
			self.save_page(page_name, page_data)

			return page_data


		print("anilibria: search_anime: anime_english=%s, anime_aliases=%s, names=%s" % (anime_english, str(anime_aliases), str(names)))
		for anime_name in names:
			page_name = "%s.html" % anime_name
			page_data = self.load_page(page_name)
			print(anime_name)
			if not page_data:
				print("!page_data")
				try:
					built_url, query_kwargs = self.build_search_url(anime_name, method = "POST")
					res = self.browser_open(built_url, method = "POST", data = query_kwargs)
				except RuntimeError:
					tools.catch()
					continue
				resp = res.read()

				if not resp:
					self.save_page(page_name, b"")
					print("!resp")
					continue

				try:
					resp = demjson.decode(resp)
				except:
					tools.catch()
					continue

				if not "err" in resp or not "mes" in resp or resp["err"] != "ok":
					print("resp is not ok")
					continue

				resp_data = resp["mes"]
				if not resp_data and not anime_page_url:
					print("!resp_data")
					continue

				if not anime_page_url:
					anime_page_url = self._find_best_match(resp_data, anime_names = anime_aliases)

				print("search: anime_page_url=%s" % anime_page_url)
				if not anime_page_url:
					print("!anime_page_url")
					continue
				try:
					#print(anime_page_url)
					res = self.browser_open(self.build_url(path = anime_page_url))
				except RuntimeError:
					tools.catch()
					continue

			found = True
			break

		if not found:
			return None

		# only fetch and cache the page when it was not already loaded from disk;
		# otherwise res is unbound here
		if not page_data:
			page_data = res.get_data()
			self.save_page(page_name, page_data)

		return page_data