def get_detail(self, detail_page_url: str):
    """Fetch the detail page and build an AnimeDetailInfo with the title,
    category, description, cover and the single episode play-list."""
    url = self._base_url + detail_page_url
    logger.info(f"Parsing detail page: {url}")
    resp = self.get(url)
    if resp.status_code != 200:
        logger.warning(f"Response error: {resp.status_code} {url}")
        return AnimeDetailInfo()

    root = self.xpath(resp.text, '//div[@class="fire l"]')[0]
    detail = AnimeDetailInfo()
    detail.title = root.xpath("./div/h1/text()")[0]
    detail.category = " ".join(
        root.xpath('.//div[@class="sinfo"]/span[3]/a/text()'))
    detail.desc = root.xpath(
        './/div[@class="info"]/text()')[0].replace("\r\n", "").strip()
    detail.cover_url = root.xpath('.//div[@class="thumb l"]/img/@src')[0]

    playlist = VideoCollection()
    playlist.name = "播放列表"
    for node in root.xpath('.//div[@class="movurl"]//li'):
        episode = Video()
        episode.name = node.xpath("./a/text()")[0]
        episode.raw_url = node.xpath("./a/@href")[0]  # e.g. '/v/3849-162.html'
        episode.handler = "YHDMVideoHandler"
        playlist.append(episode)
    detail.append(playlist)
    return detail
def clear(self) -> float:
    """Drop every cached object and return the amount of memory freed (KB)."""
    freed_kb = self.size()
    logger.warning(
        f"CacheDB has been cleared, object in total: {len(self._db)}")
    self._db.clear()
    return freed_kb
async def search_danmaku(
        self,
        keyword: str,
        *,
        callback: Callable[[DanmakuMeta], None] = None,
        co_callback: Callable[[DanmakuMeta], Coroutine] = None
) -> None:
    """Search all enabled danmaku engines concurrently.

    :param keyword: search keyword
    :param callback: sync callback invoked for each result as it is produced
    :param co_callback: async callback; ignored when ``callback`` is set
    """

    async def run(searcher: DanmakuSearcher):
        logger.info(f"{searcher.__class__.__name__} is searching for [{keyword}]")
        if callback is not None:
            async for item in searcher._search(keyword):
                callback(item)
            return
        if co_callback is not None:
            async for item in searcher._search(keyword):
                await co_callback(item)

    searchers = self._loader.get_danmaku_searcher()
    if not searchers:
        logger.warning("No danmaku searcher enabled")
        return
    logger.info(f"Searching Danmaku -> [{keyword}], enabled engines: {len(searchers)}")
    start_time = perf_counter()
    # asyncio.wait() no longer accepts bare coroutines (deprecated in 3.8,
    # TypeError since 3.11); gather() schedules them as tasks and also
    # propagates exceptions raised inside a searcher.
    await asyncio.gather(*(run(s) for s in searchers))
    end_time = perf_counter()
    logger.info(f"Searching danmaku finished in {end_time - start_time:.2f}s")
def get_real_url(self):
    """Resolve the direct video link from the play API using the play_id.

    Returns the direct url string, or the literal "error" on HTTP failure.
    """
    play_url = "https://proxy.app.maoyuncloud.com/app/video/play" + self.get_raw_url()
    headers = {"User-Agent": "Dart/2.7 (dart:io)", "appid": "4150439554430555"}
    logger.debug(f"Parsing real url for {play_url}")
    resp = self.get(play_url, headers=headers)
    if resp.status_code != 200:
        logger.warning(f"Response error: {resp.status_code} {play_url}")
        return "error"
    data = resp.json()["data"][0]
    real_url = data["url"]
    parse_js = data.get("parse")
    if parse_js:  # link needs a further resolving step through a parse API
        logger.debug(parse_js)
        # there may be several parse endpoints; try them in order
        parse_apis = re.findall(r'"(https?://.+?)"', parse_js)
        for api in parse_apis:
            url = api + real_url
            resp = self.get(url)
            real_url = resp.json().get("url")
            if real_url is not None:
                break  # got the direct link
    elif "qq.com" in real_url:
        resp = self.head(real_url, allow_redirects=False)
        real_url = resp.headers.get("Location")  # direct link sits behind one redirect
    logger.debug(f"Video real url: {real_url}")
    return real_url
async def parse(self, play_url: str):
    """Fetch a Youku play page and extract its episode list.

    Returns a DanmakuDetail; it is empty on network failure or when the
    anti-crawler protection blocks us.
    """
    detail = DanmakuDetail()
    resp = await self.get(play_url)
    if not resp or resp.status != 200:
        return detail
    html = await resp.text()
    data = re.search(r"__INITIAL_DATA__\s*?=\s*?({.+?});", html)
    if not data:
        # most likely we tripped the anti-crawler mechanism
        logger.warning("We are blocked by youku")
        return detail
    # The payload nests "data"/"nodes" keys extremely deeply; the list we
    # need sits roughly 13 levels down, so unwrap it step by step.
    data = json.loads(data.group(1))
    data = data["data"]["data"]["nodes"][0]["nodes"]
    # `nodes` is a list; only the element with type == 10013 holds the playlist
    data = list(filter(lambda x: x["type"] == 10013, data))[0]
    # the episode entries live under that node's own `nodes` key
    data = data["nodes"]
    for item in data:
        info = item["data"]
        if info["videoType"] != "正片":
            continue  # skip trailers and other non-feature clips
        danmaku = Danmaku()
        danmaku.name = info["title"]
        danmaku.cid = info["action"]["value"]  # video id, e.g. "XMzk4NDE2Njc4OA=="
        detail.append(danmaku)
    return detail
def disable_engine(self, engine: str) -> bool:
    """Disable one engine; return True on success, False if it is unknown."""
    if engine not in self.get_all_engines():
        return False
    logger.warning(f"Engine {engine} disabled")
    self._dict["engines"][engine] = False
    self._save()
    return True
def disable_danmaku(self, danmaku: str) -> bool:
    """Disable one danmaku engine; return True on success, False if unknown."""
    if danmaku not in self.get_all_danmaku():
        return False
    logger.warning(f"Danmaku {danmaku} disabled")
    self._dict["danmaku"][danmaku] = False
    self._save()
    return True
async def parse_anime_real_url(self, anime: Anime) -> AnimeInfo:
    """Resolve the direct link of one episode; empty AnimeInfo on failure."""
    url_parser = self._loader.get_anime_url_parser(anime.module)
    logger.info(f"{url_parser.__class__.__name__} parsing {anime.raw_url}")
    info = await url_parser._parse(anime.raw_url)
    if not info.is_available():
        logger.warning("Parse real url failed")
        return AnimeInfo()
    return info
def get_video_data(hash_key: str):
    """Serve the video data stream via the API proxy for the given cache key."""
    video = self._anime_db.fetch(hash_key)
    if not video:
        return "URL Invalid"
    if not video.real_url:
        # direct link not resolved yet -> resolve it now and cache the result
        logger.warning("Not real url")
        video.real_url = self._engine_mgr.get_video_url(video)
        self._anime_db.update(hash_key, video)
    return self._engine_mgr.make_response_for(video)
async def parse_anime_real_url(self, anime: Anime) -> AnimeInfo:
    """Resolve the direct link of one episode, retrying up to three times."""
    url_parser = self._loader.get_anime_url_parser(anime.module)
    logger.info(f"{url_parser.__class__.__name__} parsing {anime.raw_url}")
    attempts_left = 3
    while attempts_left > 0:
        info = await url_parser._parse(anime.raw_url)
        if info.is_available():
            return info
        logger.warning("Parse real url failed, retry...")
        attempts_left -= 1
    logger.warning(
        "Parse real url failed 3 times, maybe this resource is not available"
    )
    return AnimeInfo()
def search(self, keyword: str):
    """Yield an AnimeMetaInfo for each of the first 100 search results."""
    logger.info(f"Searching for: {keyword}")
    query = {"limit": "100", "key": keyword, "page": "1"}
    resp = self.get(self._search_api, params=query, headers=self._headers)
    if resp.status_code != 200 or resp.json()["data"]["total"] == 0:
        logger.warning(f"Response error: {resp.status_code} {self._search_api}")
        return
    for item in resp.json().get("data").get("items"):
        anime = AnimeMetaInfo()
        anime.title = item["name"]
        anime.cover_url = item["pic"]
        anime.category = item["type"]
        anime.detail_page_url = item["id"]
        yield anime
def search(self, keyword: str):
    """Yield an AnimeMetaInfo for every search hit."""
    logger.info(f"Searching for: {keyword}")
    resp = self.post(self._search_api, data={"userid": "", "key": keyword})
    if resp.status_code != 200:
        logger.warning(
            f"Response error: {resp.status_code} {self._search_api}")
        return
    for item in resp.json().get("data"):
        anime = AnimeMetaInfo()
        anime.title = item["videoName"]
        anime.cover_url = item["videoImg"]
        anime.category = item["videoClass"]
        anime.detail_page_url = item["videoId"]
        yield anime
async def post(self, url: str, data: dict = None, **kwargs) -> Optional[ClientResponse]:
    """Issue a POST with a random User-Agent.

    :return: the ClientResponse, or None when any exception occurs.
    """
    try:
        url = self.set_headers(url, kwargs)
        logger.debug(f"POST {url} | Data: {data} | Args: {kwargs}")
        resp = await self.session.post(url, data=data, **kwargs)
        logger.debug(
            f"Code: {resp.status} | Type: {resp.content_type} | Length: {resp.content_length} ({url})"
        )
        return resp
    except Exception as e:
        logger.warning(f"Exception in {self.__class__}: {e}")
        return None
async def head(self, url: str, params: dict = None, **kwargs) -> Optional[ClientResponse]:
    """Issue a HEAD request with a random User-Agent.

    :return: the ClientResponse, or None when any exception occurs.
    """
    try:
        url = self.set_headers(url, kwargs)
        logger.debug(f"HEAD {url} | Params: {params} | Args: {kwargs}")
        resp = await self.session.head(url, params=params, **kwargs)
        logger.debug(
            f"Code: {resp.status} | Type: {resp.content_type} | Length: {resp.content_length} ({url})"
        )
        return resp
    except Exception as e:
        logger.warning(f"Exception in {self.__class__}: {e}")
        return None
async def get(self, url: str, params: dict = None, **kwargs) -> Optional[ClientResponse]:
    """Issue a GET with a random User-Agent.

    :param url: target url
    :param params: query-string parameters
    :return: the ClientResponse, or None on timeout or any other exception
    """
    import asyncio  # local import: guarantees the name is in scope in this module
    try:
        self.set_headers(kwargs)
        logger.debug(f"GET {url} | Params: {params} | Args: {kwargs}")
        resp = await self.session.get(url, params=params, **kwargs)
        logger.debug(
            f"Code: {resp.status} | Type: {resp.content_type} | Length: {resp.content_length} ({url})"
        )
        return resp
    except (asyncio.TimeoutError, TimeoutError):
        # aiohttp raises asyncio.TimeoutError, which on Python < 3.11 is NOT
        # the builtin TimeoutError — the original handler silently missed it.
        logger.warning(f"Connection timed out: {url}")
    except Exception as e:
        logger.warning(e)
def get_detail(self, anime_id: str):
    """Fetch the full detail info of one anime from the detail API."""
    resp = self.get(self._detail_api, params={"id": anime_id}, headers=self._headers)
    if resp.status_code != 200:
        # log the api actually requested (was wrongly logging the search api)
        logger.warning(f"Response error: {resp.status_code} {self._detail_api}")
        return AnimeDetailInfo()
    detail = resp.json().get("data")  # full detail payload
    anime_detail = AnimeDetailInfo()
    anime_detail.title = detail["name"]
    anime_detail.cover_url = detail["pic"]
    anime_detail.desc = detail["content"]  # complete synopsis
    anime_detail.category = detail["type"]
    for play_list in detail["parts"]:
        vc = VideoCollection()  # one collection per play route
        vc.name = play_list["play_zh"]  # route name, e.g. 线路 I / 线路 II
        for name in play_list["part"]:
            video_params = f"?id={anime_id}&play={play_list['play']}&part={name}"
            vc.append(Video(name, video_params, "BimibimiVideoHandler"))
        anime_detail.append(vc)
    return anime_detail
async def search_anime(
        self,
        keyword: str,
        *,
        callback: Callable[[AnimeMeta], None] = None,
        co_callback: Callable[[AnimeMeta], Coroutine] = None) -> None:
    """Search all enabled anime engines concurrently.

    :param keyword: search keyword
    :param callback: sync callback invoked for each result as it is produced
    :param co_callback: async callback; ignored when ``callback`` is set
    """
    if not keyword:
        return

    async def run(searcher: AnimeSearcher):
        logger.info(
            f"{searcher.__class__.__name__} is searching for [{keyword}]")
        if callback is not None:
            async for item in searcher._search(keyword):
                callback(item)  # handle each result as soon as it arrives
            return
        if co_callback is not None:
            async for item in searcher._search(keyword):
                await co_callback(item)

    searchers = self._loader.get_anime_searchers()
    if not searchers:
        logger.warning("No anime searcher enabled")
        return
    logger.info(
        f"Searching Anime -> [{keyword}], enabled engines: {len(searchers)}"
    )
    start_time = perf_counter()
    # asyncio.wait() no longer accepts bare coroutines (deprecated in 3.8,
    # TypeError since 3.11); gather() schedules tasks and surfaces exceptions.
    await asyncio.gather(*(run(s) for s in searchers))
    end_time = perf_counter()
    logger.info(
        f"Searching anime finished in {end_time - start_time:.2f}s")
def get_real_url(self) -> str:
    """Resolve the direct video link from the yhdm play page."""
    url = "http://www.yhdm.tv/" + self.get_raw_url()
    logger.info(f"Parsing real url for {url}")
    resp = self.get(url)
    if resp.status_code != 200:
        logger.warning(f"Response error: {resp.status_code} {url}")
        return "error"
    # the play box carries "url$format"; keep only the url part,
    # e.g. "http://quan.qq.com/video/1098_ae4be38407bf9d8227748e145a8f97a5"
    data_vid = self.xpath(resp.text, '//div[@id="playbox"]/@data-vid')[0]
    video_url, _, _fmt = data_vid.partition("$")
    if not video_url.startswith("http"):
        # occasionally the entry holds no usable link at all
        logger.warning(f"This video is not valid: {video_url}")
        return "error"
    logger.debug(f"Redirect for {video_url}")
    resp = self.head(video_url, allow_redirects=True)  # two redirects lead to the direct link
    logger.info(f"Video real url: {resp.url}")
    return resp.url
def search(self, keyword: str):
    """Yield a DanmakuMetaInfo for each of the first 100 matching titles."""
    logger.info(f"Searching for danmaku: {keyword}")
    query = {"limit": "100", "key": keyword, "page": "1"}
    resp = self.get(self._search_api, params=query, headers=self._headers)
    if resp.status_code != 200 or resp.json()["data"]["total"] == 0:
        logger.warning(
            f"Response error: {resp.status_code} {self._search_api}")
        return
    for item in resp.json().get("data").get("items"):
        meta = DanmakuMetaInfo()
        meta.title = item["name"]
        meta.play_page_url = str(item["id"])
        meta.num = item["total"]
        yield meta
def get_detail(self, detail_page_url: str):
    """Fetch the full detail info of one anime from the detail API."""
    resp = self.get(self._detail_api, params={"vid": detail_page_url})
    if resp.status_code != 200 or resp.json()["code"] != 1:
        # log the api actually requested (was wrongly logging the search api)
        logger.warning(
            f"Response error: {resp.status_code} {self._detail_api}")
        return AnimeDetailInfo()
    detail = resp.json().get("data")  # full detail payload
    anime_detail = AnimeDetailInfo()
    anime_detail.title = detail["name"]
    anime_detail.cover_url = detail["pic"]
    anime_detail.desc = detail["label"]
    anime_detail.category = detail["type"]
    vc = VideoCollection()
    vc.name = "视频列表"
    video_set = dict(detail["playUrl"])
    for name, url in video_set.items():
        vc.append(Video(name, url))
    anime_detail.append(vc)
    return anime_detail
def parse_one_page(self, keyword: str, page: int):
    """Search one result page; return (list of AnimeMetaInfo, raw html text)."""
    logger.info(f"Searching for {keyword}, page {page}")
    resp = self.get(self._search_api + "/" + keyword, params={"page": page})
    if resp.status_code != 200:
        logger.warning(
            f"Response error: {resp.status_code} {self._search_api}")
        return [], ""
    results = []
    for node in self.xpath(resp.text, '//div[@class="lpic"]//li'):
        anime = AnimeMetaInfo()
        anime.title = " ".join(node.xpath(".//h2/a/@title"))
        anime.cover_url = node.xpath("./a/img/@src")[0]
        anime.category = " ".join(node.xpath("./span[2]/a/text()"))
        anime.desc = node.xpath("./p/text()")[0]
        anime.detail_page_url = node.xpath("./a/@href")[0]  # e.g. /show/5031.html
        results.append(anime)
    return results, resp.text
def search(self, keyword: str):
    """Yield an AnimeMetaInfo for each of the first 100 search hits."""
    logger.info(f"Searching for: {keyword}")
    query = {"kw": keyword, "per_page": 100, "page": 1}  # first 100 results in one shot
    resp = self.get(self._search_api, params=query)
    if resp.status_code != 200 or resp.json()["code"] != 1:
        logger.warning(
            f"Response error: {resp.status_code} {self._search_api}")
        return
    payload = resp.json()
    meta_list = payload.get("data").get("data") if payload else []
    for item in meta_list:
        anime = AnimeMetaInfo()
        anime.title = item["name"]
        anime.cover_url = item["pic"]
        anime.category = item["type"]
        anime.detail_page_url = str(item["vid"])
        anime.desc = item["label"]
        yield anime
def detect_video_format(self) -> str: """判断视频真正的格式, url 可能没有视频后缀""" # 尝试从 url 提取后缀 url = self.get_cached_real_url() try: ext = url.split("?")[0].split(".")[-1].lower() if ext in ["mp4", "flv"]: return ext if ext == "m3u8": return "hls" except (IndexError, AttributeError): pass # 视频的元数据中包含了视频的格式信息, 在视频开头寻找十六进制标识符推断视频格式 format_hex = { "mp4": ["69736F6D", "70617663", "6D703432", "4D50454734", "4C617666"], "flv": ["464C56"], "hls": ["4558544D3355"] } _, data_iter = self._get_stream_from_server(0, 512) if not data_iter: logger.warning("Could not get video stream from server") return "unknown" logger.debug("Detecting video format from binary stream") video_meta = next(data_iter).hex().upper() for format_, hex_list in format_hex.items(): for hex_sign in hex_list: if hex_sign in video_meta: logger.debug(f"Video format: {format_}") return format_ logger.error("Could not detect video format from stream") logger.debug("Video raw binary stream (512byte):") logger.debug(video_meta) return "unknown"
def get_detail(self, detail_page_url: str):
    """Fetch the full detail info of one anime from the ZZFun detail API."""
    resp = self.get(self._detail_api, params={
        "userid": "",
        "videoId": detail_page_url
    })
    if resp.status_code != 200:
        # log the api actually requested (was wrongly logging the search api)
        logger.warning(
            f"Response error: {resp.status_code} {self._detail_api}")
        return AnimeDetailInfo()
    detail = resp.json().get("data")  # full detail payload
    anime_detail = AnimeDetailInfo()
    anime_detail.title = detail["videoName"]
    anime_detail.cover_url = detail["videoImg"]
    anime_detail.desc = detail["videoDoc"].replace("\r\n", "")  # complete synopsis
    anime_detail.category = detail["videoClass"]
    for play_list in detail["videoSets"]:
        vc = VideoCollection()  # one collection per play route
        vc.name = play_list["load"]  # route name, e.g. 线路 I / 线路 II
        for video in play_list["list"]:
            vc.append(
                Video(video["ji"], video["playid"], "ZZFunVideoHandler"))
        anime_detail.append(vc)
    return anime_detail
def get_real_url(self):
    """Resolve the direct video link from the play API using the play_id."""
    play_api = "http://service-agbhuggw-1259251677.gz.apigw.tencentcs.com/android/video/107play"
    play_id = self.get_raw_url()
    now = int(time.time() * 1000)  # 13-digit millisecond timestamp
    # request signature: md5 of the fixed secret key + timestamp
    sing = md5(("zandroidzz" + str(now)).encode("utf-8")).hexdigest()
    logger.info(f"Parsing real url for {play_id}")
    payload = {
        "playid": play_id,
        "userid": "",
        "apptoken": "",
        "sing": sing,
        "map": now
    }
    resp = self.post(play_api, data=payload)
    if resp.status_code != 200:
        logger.warning(f"Response error: {resp.status_code} {play_api}")
        logger.debug(f"POST params: {payload}")
        return "error"
    real_url = resp.json()["data"]["videoplayurl"]
    logger.info(f"Video real url: {real_url}")
    return real_url
def clear(self):
    """Log the number of cached objects, then drop them all."""
    count = len(self._db)
    logger.warning(f"{self.__class__.__name__} cleaning, object in total: {count}")
    self._db.clear()