def get_detail(self, play_page_url: str) -> DanmakuCollection:
    """Fetch video details."""
    ret = DanmakuCollection()
    resp = self.get(play_page_url)
    if resp.status_code != 200:
        return ret

    data = re.search(r"__INITIAL_DATA__\s*?=\s*?({.+?});", resp.text)
    if not data:  # most likely we hit the anti-crawler mechanism
        logger.error("We are blocked by youku")
        return ret

    data = json.loads(data.group(1))
    # The data we need is nested 13 levels deep! Whoever writes code like
    # this deserves the gallows.
    data = data["data"]["data"]["nodes"][0]["nodes"]
    # `nodes` is a list; only the element with type == 10013 holds the playlist data
    data = list(filter(lambda x: x["type"] == 10013, data))[0]
    # The data we want lives under this node's `nodes` key
    data = data["nodes"]
    for item in data:
        info = item["data"]
        if info["videoType"] != "正片":  # skip trailers and other extras
            continue
        dmk = Danmaku()
        dmk.name = info["title"]
        dmk.cid = info["action"]["value"]  # video id, e.g. "XMzk4NDE2Njc4OA=="
        ret.append(dmk)
    return ret

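# A minimal, runnable sketch of the __INITIAL_DATA__ extraction used above.
# The HTML fragment is a fabricated assumption for illustration, not a real
# Youku response.
import json
import re

sample = 'window.__INITIAL_DATA__ = {"data": {"data": {"nodes": []}}};'
match = re.search(r"__INITIAL_DATA__\s*?=\s*?({.+?});", sample)
if match:
    payload = json.loads(match.group(1))  # -> {'data': {'data': {'nodes': []}}}
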
def test_analyser(self):
    analyser = TextAnalyser(related=False)
    self.assertIsInstance(analyser, TextAnalyser)
    for _, text in load_texts():
        try:
            analyser.fit(text)
            self.assertTrue(hasattr(analyser, 'textrank_'))
            self.assertIsInstance(analyser.textrank_, TextRank)
            self.assertTrue(hasattr(analyser, 'articles_'))
            output = analyser.to_dict()
            self.assertIs(type(output), dict)
            self.assertIn('articles', output)
            self.assertIn('graph', output)
            keywords = analyser.textrank_.get_keywords(max_kws=10)
            self.assertIs(type(keywords), list)
            self.assertTrue(all(type(kw) is dict for kw in keywords))
            logger.debug(str(keywords))
        except NLPModelNotFound as e:
            logger.error(e)

def get_anime(self, p_index: int, ep_index: int) -> Optional[Anime]:
    """Get one video object from a given playlist."""
    try:
        return self[p_index][ep_index]
    except IndexError:
        logger.error(f"IndexError, anime index: {p_index} {ep_index}")
        return None

def parse_one_page(self, keyword: str, page: int):
    url = f"{self._base_url}/vodsearch/{keyword}----------{page}---.html"
    resp = self.get(url)
    if resp.status_code != 200:
        return [], ""
    if "请输入验证码" in resp.text:
        logger.error("We are blocked by K1080P, need to enter the verification code.")
        return [], ""

    ret = []
    meta_list = self.xpath(resp.text, "//ul[@class='stui-vodlist__media col-pd clearfix']//li")
    for meta in meta_list:
        anime = AnimeMetaInfo()
        cover_url = meta.xpath("./div[@class='thumb']/a/@data-original")[0]
        if not cover_url.startswith("http"):
            cover_url = self._base_url + cover_url
        anime.cover_url = cover_url
        anime.title = meta.xpath("./div[@class='detail']/h3/a/text()")[0]
        # e.g. /voddetail/414362.html
        anime.detail_page_url = meta.xpath("./div[@class='detail']/h3/a/@href")[0]
        desc = meta.xpath("./div[@class='detail']//span[contains(text(),'简介')]/parent::p/text()")
        anime.desc = desc[0] if desc else "无简介"  # "no description" placeholder
        anime.category = meta.xpath("./div[@class='detail']//span[contains(text(),'类型')]/parent::p/text()")[0]
        ret.append(anime)
    return ret, resp.text

def get_source(self, **identifiers: Union[str, List[str]]) -> RssFeed:
    source = self.find_all(**identifiers)
    if len(source) > 1:
        logger.warning(f"Parameters {identifiers} not unique")
    elif len(source) < 1:
        logger.error(f"No source found with parameters {identifiers}")
        raise KeyError(identifiers)
    return source[0]

async def fetch_html(self, keyword: str, page: int):
    url = f"https://www.k1080.net/vodsearch/{keyword}----------{page}---.html"
    resp = await self.get(url)
    if not resp or resp.status != 200:
        return ""
    html = await resp.text()
    if "请输入验证码" in html:
        logger.error("We are blocked by K1080P, need to enter the verification code.")
        return ""
    return html

def get_danmaku_data(self, dmk: Danmaku) -> List:
    """Fetch the danmaku data of one video."""
    if not dmk:
        logger.error("Invalid request")
        return []
    target_engine = self._danmaku_engine.get(dmk.dm_engine)
    if not target_engine:
        logger.error(f"Danmaku Engine not found: {dmk.dm_engine}")
        return []
    return target_engine()._get_danmaku(dmk.cid)

def get_danmaku_detail(self, meta: DanmakuMetaInfo) -> DanmakuCollection:
    """Parse the detail page of one series and return the info with its video list."""
    if not meta:
        logger.error("Invalid request")
        return DanmakuCollection()
    target_engine = self._danmaku_engine.get(meta.dm_engine)
    if not target_engine:
        logger.error(f"Danmaku Engine not found: {meta.dm_engine}")
        return DanmakuCollection()
    return target_engine()._get_detail(meta.play_page_url)

def make_response_for(self, video: Video) -> requests.Response:
    """Get the handler of a video, used to proxy the data and build the response for the client."""
    if not video:
        logger.error("Invalid request")
        return requests.Response()
    target_handler = self._handlers.get(video.handler)
    if not target_handler:
        logger.error(f"VideoHandler not found: {video.handler}")
        return requests.Response()
    return target_handler(video).make_response()

def get_video_url(self, video: Video) -> str:
    """Resolve the real url of a video."""
    if not video:
        logger.error("Invalid request")
        return "error"
    target_handler = self._handlers.get(video.handler)
    if not target_handler:
        logger.error(f"VideoHandler not found: {video.handler}")
        return "error"
    handler = target_handler(video)
    return handler.get_cached_real_url()

def change_module_state(self, module: str, enable: bool) -> bool:
    """Dynamically load/unload an engine and update the config file."""
    try:
        if enable:  # load the engine
            self.load_full_module(module)
            return self._config.update_module_state(module, True)
        else:  # unload the engine
            self.unload_full_module(module)
            return self._config.update_module_state(module, False)
    except ModuleNotFoundError:
        logger.error(f"Module not found: {module}")
        return False

def get_anime_detail(self, meta: AnimeMetaInfo) -> AnimeDetailInfo:
    """Parse the detail page of one series and return the info with its video list."""
    if not meta:
        logger.error("Invalid request")
        return AnimeDetailInfo()
    target_engine = self._engines.get(meta.engine)
    if target_engine is not None:
        return target_engine()._get_detail(meta.detail_page_url)
    # The engine is not loaded; load it temporarily for this request
    logger.info(f"Engine not found: {meta.engine}, it will be loaded temporarily.")
    self._load_engine(meta.engine)
    target_engine = self._engines.pop(meta.engine)  # pop it so it stays unloaded afterwards
    logger.info(f"Unloading engine: {target_engine}")
    return target_engine()._get_detail(meta.detail_page_url)

def save_resource(resource: Json, counts: Dict[str, int]) -> Dict[str, int]:
    try:
        if not resource["body"]:
            raise ValueError("Empty body")
        article_id = md5(resource["body"].encode()).hexdigest()
        article = Article(meta={"id": article_id}, **resource)
        article.save()
        counts["successful"] += 1
    except Exception as err:
        logger.error(err)
        counts["failed"] += 1
    counts["total"] += 1
    return counts

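# A hedged usage sketch for save_resource: the resource dict below is a
# fabricated example, and Article.save() assumes a configured Elasticsearch
# connection, so treat this as an illustration rather than a working test.
counts = {"successful": 0, "failed": 0, "total": 0}
for resource in [{"body": "some article text", "title": "An example"}]:
    counts = save_resource(resource, counts)
# counts["total"] is now 1; "successful"/"failed" depend on the backend.
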
async def _parse(self, raw_url: str) -> AnimeInfo:
    """Resolve the direct link, catching exceptions the engine module did not handle."""
    try:
        await self._before_init()
        await self.init_session()
        info = await self.parse(raw_url)
        if not isinstance(info, AnimeInfo):
            # allow parse() to return a plain url string
            info = AnimeInfo(info)
        await info.detect_more_info()
        if info.is_available():  # parsed successfully
            logger.info(f"Parse success: {info}")
            logger.info(f"Real url: {info.real_url}")
            return info
        logger.error(f"Parse failed: {info}")
        return AnimeInfo()
    except Exception as e:
        logger.exception(e)
        return AnimeInfo()

def detect_video_format(self) -> str:
    """Detect the actual video format; the url may not carry a file extension."""
    # First, try to extract the extension from the url
    url = self.get_cached_real_url()
    try:
        ext = url.split("?")[0].split(".")[-1].lower()
        if ext in ["mp4", "flv"]:
            return ext
        if ext == "m3u8":
            return "hls"
    except (IndexError, AttributeError):
        pass

    # The metadata at the start of the stream carries format information;
    # look for the hex signatures below to infer the format.
    format_hex = {
        "mp4": ["69736F6D", "70617663", "6D703432", "4D50454734", "4C617666"],
        "flv": ["464C56"],
        "hls": ["4558544D3355"]
    }
    _, data_iter = self._get_stream_from_server(0, 512)
    if not data_iter:
        logger.warning("Could not get video stream from server")
        return "unknown"
    logger.debug("Detecting video format from binary stream")
    video_meta = next(data_iter).hex().upper()
    for format_, hex_list in format_hex.items():
        for hex_sign in hex_list:
            if hex_sign in video_meta:
                logger.debug(f"Video format: {format_}")
                return format_
    logger.error("Could not detect video format from stream")
    logger.debug("Video raw binary stream (512byte):")
    logger.debug(video_meta)
    return "unknown"

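# A standalone sketch of the signature matching performed above: the magic
# numbers are hex-encoded byte sequences ("464C56" is b"FLV", "4558544D3355"
# is b"EXTM3U"), so membership in the hex dump of the first bytes identifies
# the container format. The sample header below is fabricated for illustration.
def sniff_format(head: bytes) -> str:
    format_hex = {
        "mp4": ["69736F6D", "70617663", "6D703432", "4D50454734", "4C617666"],
        "flv": ["464C56"],
        "hls": ["4558544D3355"],
    }
    meta = head.hex().upper()
    for format_, hex_list in format_hex.items():
        if any(sign in meta for sign in hex_list):
            return format_
    return "unknown"

assert sniff_format(b"FLV\x01") == "flv"  # fabricated 4-byte FLV header
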
def get_danmaku(self, index: int) -> Optional[Danmaku]:
    try:
        return self._dmk_list[index]
    except IndexError:
        logger.error(f"IndexError, danmaku index: {index}")
        return None

def fetch_articles(
        terms: List[str],
        source_language: str,
        search_languages: Optional[List[str]] = None,
        output_language: Optional[str] = None,  # TODO: Unused
        country: Optional[str] = None,  # TODO: Unused
        sources: Optional[str] = None,  # TODO: Unused
        groupby_options: Optional[Json] = None) -> Json:
    if not isinstance(terms, list):
        raise TypeError("terms must be a list of keywords eg: ['Donald Trump', 'Elections']")
    query_terms = ",".join(terms)
    search_languages = search_languages or ["en"]
    if not isinstance(search_languages, list):
        raise TypeError("search_languages must be a list of language codes eg: ['en', 'nl']")

    translations = {}
    # Translate terms to English to minimise the risk of a missing IBM model
    if source_language != 'en':
        translations[source_language] = query_terms
        query_terms = translate(query_terms, source_language, 'en')
    translations['en'] = query_terms
    logger.debug(query_terms)

    search = Article.search()
    articles = []
    for lang in search_languages:
        if lang not in translations:
            translated = translate(query_terms, 'en', lang)
            if not translated:
                logger.error(f"Could not translate {query_terms} in {lang}")
                continue
            translations[lang] = translated
            logger.debug(translated)
        terms = translations[lang].split(",")
        # Require at least half of the terms to match
        minimum_should_match = int(0.5 * len(terms))
        query = Q('bool',
                  must=Q("match", language=lang),
                  minimum_should_match=minimum_should_match,
                  should=[
                      Q("multi_match",
                        fields=['body', 'title'],
                        type='phrase',
                        query=term.strip()) for term in terms
                  ])
        query = search.query(query)
        logger.debug(query.to_dict())
        results = query.execute()
        logger.info(f"Got {results.hits.total.value} hits")
        for hit in results:
            logger.info(f"{hit.meta.score}: {hit.title}")
            article = hit.to_dict()
            article["relevance"] = hit.meta.score
            articles.append(article)

    if groupby_options:
        articles = groupby_category(articles, **groupby_options)
    # if output_language is not None:
    #     results = translate_results(results, output_language)
    return {"articles": articles}

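# A hedged usage sketch: the argument values are taken from the error
# messages above, and the call assumes a live Elasticsearch index plus a
# working translate() backend, so treat it as an illustration only.
result = fetch_articles(
    terms=["Donald Trump", "Elections"],
    source_language="en",
    search_languages=["en", "nl"],
)
for article in result["articles"]:
    print(article["relevance"], article["title"])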