Example #1
 def get_detail(self, play_page_url: str) -> DanmakuCollection:
     """Fetch the details of a video"""
     ret = DanmakuCollection()
     resp = self.get(play_page_url)
     if resp.status_code != 200:
         return ret
     data = re.search(r"__INITIAL_DATA__\s*?=\s*?({.+?});", resp.text)
     if not data:  # most likely we hit the anti-scraping mechanism
         logger.error("We are blocked by youku")
         return ret
     data = json.loads(data.group(1))
     # The data we need sits 13 levels deep! Whoever wrote this deserves the gallows
     data = data["data"]["data"]["nodes"][0]["nodes"]
     # nodes is a list; only the element with type == 10013 holds the playlist data
     data = list(filter(lambda x: x["type"] == 10013, data))[0]
     # the data we want lives under this node's own nodes field
     data = data["nodes"]
     for item in data:
         info = item["data"]
         if info["videoType"] != "正片":  # "正片" = feature episode
             continue  # trailers and the like may be mixed in
         dmk = Danmaku()
         dmk.name = info["title"]
         dmk.cid = info["action"]["value"]  # video id, e.g. "XMzk4NDE2Njc4OA=="
         ret.append(dmk)
     return ret
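The hand-rolled 13-level dig above breaks on the first missing key. Below is a minimal defensive alternative, assuming the same parsed __INITIAL_DATA__ dict; the dig helper is hypothetical, not part of the original code.

def dig(obj, *keys, default=None):
    """Walk nested dicts/lists, returning `default` at the first missing step."""
    for key in keys:
        try:
            obj = obj[key]
        except (KeyError, IndexError, TypeError):
            return default
    return obj

# Same traversal as above, but a missing level yields [] instead of a crash
nodes = dig(data, "data", "data", "nodes", 0, "nodes", default=[])
playlist = next((n for n in nodes if n.get("type") == 10013), None)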
Example #2
    def test_analyser(self):
        analyser = TextAnalyser(related=False)
        self.assertIsInstance(analyser, TextAnalyser)

        for _, text in load_texts():
            try:
                analyser.fit(text)
                self.assertTrue(hasattr(analyser, 'textrank_'))
                self.assertIsInstance(analyser.textrank_, TextRank)
                self.assertTrue(hasattr(analyser, 'articles_'))

                output = analyser.to_dict()
                self.assertIs(type(output), dict)
                self.assertIn('articles', output)
                self.assertIn('graph', output)

                keywords = analyser.textrank_.get_keywords(max_kws=10)
                self.assertIs(type(keywords), list)
                self.assertTrue(all(type(kw) is dict for kw in keywords))
                logger.debug(str(keywords))

            except NLPModelNotFound as e:
                logger.error(e)
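The test unpacks (label, text) pairs from load_texts(); the real fixture loader isn't shown in this example. A hypothetical stand-in that matches that shape:

from pathlib import Path

def load_texts():
    # Yield (name, content) pairs, matching how the test unpacks them;
    # the fixtures/ directory is an assumption for illustration
    for path in Path("fixtures").glob("*.txt"):
        yield path.stem, path.read_text(encoding="utf-8")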
Example #3
 def get_anime(self, p_index: int, ep_index: int) -> Optional[Anime]:
     """获取某一个播放列表的某个视频对象"""
     try:
         return self[p_index][ep_index]
     except IndexError:
         logger.error(f"IndexError, anime index: {p_index} {ep_index}")
         return None
Example #4
    def parse_one_page(self, keyword: str, page: int):
        url = f"{self._base_url}/vodsearch/{keyword}----------{page}---.html"
        resp = self.get(url)
        if resp.status_code != 200:
            return [], ""
        if "请输入验证码" in resp.text:
            logger.error(
                "We are blocked by K1080P, need to enter the verification code."
            )
            return [], ""

        ret = []
        meta_list = self.xpath(
            resp.text,
            "//ul[@class='stui-vodlist__media col-pd clearfix']//li")
        for meta in meta_list:
            anime = AnimeMetaInfo()
            cover_url = meta.xpath("./div[@class='thumb']/a/@data-original")[0]
            if not cover_url.startswith("http"):
                cover_url = self._base_url + cover_url
            anime.cover_url = cover_url
            anime.title = meta.xpath("./div[@class='detail']/h3/a/text()")[0]
            anime.detail_page_url = meta.xpath(
                "./div[@class='detail']/h3/a/@href")[0]  # /voddetail/414362.html
            desc = meta.xpath(
                "./div[@class='detail']//span[contains(text(),'简介')]/parent::p/text()"
            )
            anime.desc = desc[0] if desc else "无简介"  # fallback: "no description"
            anime.category = meta.xpath(
                "./div[@class='detail']//span[contains(text(),'类型')]/parent::p/text()"
            )[0]
            ret.append(anime)
        return ret, resp.text
Example #5
 def get_source(self, **identifiers: Union[str, List[str]]) -> RssFeed:
     source = self.find_all(**identifiers)
     if len(source) > 1:
         logger.warning(f"Parameters {identifiers} not unique")
     elif len(source) < 1:
         logger.error(f"No source found with parameters {identifiers}")
         raise KeyError(identifiers)
     return source[0]
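Since get_source raises KeyError when nothing matches and silently returns the first hit when the match is ambiguous, callers need to handle both cases. A hedged usage sketch; the feeds instance and the name keyword are assumptions, not the project's real API:

try:
    feed = feeds.get_source(name="hacker-news")  # hypothetical identifier
except KeyError:
    feed = None  # nothing matched; get_source already logged the error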
Example #6
 async def fetch_html(self, keyword: str, page: int):
     url = f"https://www.k1080.net/vodsearch/{keyword}----------{page}---.html"
     resp = await self.get(url)
     if not resp or resp.status != 200:
         return ""
     html = await resp.text()
     if "请输入验证码" in html:
         logger.error("We are blocked by K1080P, need to enter the verification code.")
         return ""
     return html
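fetch_html is a coroutine, so it needs an event loop to run. A minimal driver sketch; engine stands in for whatever object defines fetch_html, and the keyword is illustrative:

import asyncio

async def main():
    html = await engine.fetch_html("进击的巨人", page=1)  # hypothetical call
    if not html:
        return  # blocked or non-200; the error was already logged
    print(len(html))

asyncio.run(main())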
Example #7
 def get_danmaku_data(self, dmk: Danmaku) -> List:
     """解析一部番剧的详情页,返回包含视频列表的详细信息"""
     if not dmk:
         logger.error(f"Invalid request")
         return []
     target_engine = self._danmaku_engine.get(dmk.dm_engine)
     if not target_engine:
         logger.error(f"Danmaku Engine not found: {dmk.dm_engine}")
         return []
     return target_engine()._get_danmaku(dmk.cid)
Example #8
 def get_danmaku_detail(self, meta: DanmakuMetaInfo) -> DanmakuCollection:
     """解析一部番剧的详情页,返回包含视频列表的详细信息"""
     if not meta:
         logger.error(f"Invalid request")
         return DanmakuCollection()
     target_engine = self._danmaku_engine.get(meta.dm_engine)
     if not target_engine:
         logger.error(f"Danmaku Engine not found: {meta.dm_engine}")
         return DanmakuCollection()
     return target_engine()._get_detail(meta.play_page_url)
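Examples #7 and #8 (and #9/#10 below) all follow the same pattern: look the engine class up in a dict registry, log and bail out on a miss, otherwise instantiate and delegate. A self-contained sketch of that pattern with illustrative names, not the project's real API:

import logging
from typing import Callable, Dict

logger = logging.getLogger(__name__)
_registry: Dict[str, Callable[[], object]] = {}  # name -> engine class

def dispatch(name: str):
    engine_cls = _registry.get(name)
    if not engine_cls:
        logger.error(f"Engine not found: {name}")
        return None  # the methods above return an empty result object instead
    return engine_cls()  # instantiate on demand, as the examples do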
Example #9
 def make_response_for(self, video: Video) -> requests.Response:
     """获取视频对应的 handler 对象, 用于代理访问数据并返回响应给客户端"""
     if not video:
         logger.error(f"Invalid request")
         return requests.Response()
     target_handler = self._handlers.get(video.handler)
     if not target_handler:
         logger.error(f"VideoHandler not found: {video.handler}")
         return requests.Response()
     return target_handler(video).make_response()
Example #10
 def get_video_url(self, video: Video) -> str:
     """解析视频真实 url"""
     if not video:
         logger.error(f"Invalid request")
         return "error"
     target_handler = self._handlers.get(video.handler)
     if not target_handler:
         logger.error(f"VideoHandler not found: {video.handler}")
         return "error"
     target_handler = target_handler(video)
     return target_handler.get_cached_real_url()
Example #11
 def change_module_state(self, module: str, enable: bool) -> bool:
     """动态加载/卸载引擎, 并更新配置文件"""
     try:
         if enable:  # 加载引擎
             self.load_full_module(module)
             return self._config.update_module_state(module, True)
         else:  # 卸载引擎
             self.unload_full_module(module)
             return self._config.update_module_state(module, False)
     except ModuleNotFoundError:
         logger.error(f"Module not found: {module}")
         return False
Example #12
 def get_anime_detail(self, meta: AnimeMetaInfo) -> AnimeDetailInfo:
     """解析一部番剧的详情页,返回包含视频列表的详细信息"""
     if not meta:
         logger.error(f"Invalid request")
         return AnimeDetailInfo()
     target_engine = self._engines.get(meta.engine)
     if target_engine is not None:
         return target_engine()._get_detail(meta.detail_page_url)
     # if the engine isn't loaded, load it temporarily for this call
     logger.info(f"Engine not found: {meta.engine}, it will be loaded temporarily.")
     self._load_engine(meta.engine)
     target_engine = self._engines.pop(meta.engine)
     logger.info(f"Unloading engine: {target_engine}")
     return target_engine()._get_detail(meta.detail_page_url)
Example #13
def save_resource(resource: Json, counts: Dict[str, int]) -> Dict[str, int]:
    try:
        if not resource["body"]:
            raise ValueError("Empty body")
        article_id = md5(resource["body"].encode()).hexdigest()
        article = Article(meta={"id": article_id}, **resource)
        article.save()
        counts["successful"] += 1

    except Exception as err:
        logger.error(err)
        counts["failed"] += 1

    counts["total"] += 1
    return counts
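save_resource threads one shared counter dict through every call, so the caller both seeds and accumulates it. A hedged driver sketch; resources is assumed to be an iterable of parsed JSON documents:

counts = {"successful": 0, "failed": 0, "total": 0}
for resource in resources:  # hypothetical iterable of Json dicts
    counts = save_resource(resource, counts)
logger.info(f"Saved {counts['successful']}/{counts['total']} articles")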
Example #14
 async def _parse(self, raw_url: str) -> AnimeInfo:
     """解析直链, 捕获引擎模块未处理的异常"""
     try:
         await self._before_init()
         await self.init_session()
         info = await self.parse(raw_url)
         if not isinstance(info, AnimeInfo):
             info = AnimeInfo(info)  # lets parse() return a plain url string
         await info.detect_more_info()
         if info.is_available():  # parse succeeded
             logger.info(f"Parse success: {info}")
             logger.info(f"Real url: {info.real_url}")
             return info
         logger.error(f"Parse failed: {info}")
         return AnimeInfo()
     except Exception as e:
         logger.exception(e)
         return AnimeInfo()
Example #15
    def detect_video_format(self) -> str:
        """判断视频真正的格式, url 可能没有视频后缀"""
        # 尝试从 url 提取后缀
        url = self.get_cached_real_url()
        try:
            ext = url.split("?")[0].split(".")[-1].lower()
            if ext in ["mp4", "flv"]:
                return ext
            if ext == "m3u8":
                return "hls"
        except (IndexError, AttributeError):
            pass

        # The video's metadata encodes its format: look for hex signatures
        # at the start of the stream and infer the container type from them
        format_hex = {
            "mp4": ["69736F6D", "70617663", "6D703432", "4D50454734", "4C617666"],
            "flv": ["464C56"],
            "hls": ["4558544D3355"],
        }

        _, data_iter = self._get_stream_from_server(0, 512)
        if not data_iter:
            logger.warning("Could not get video stream from server")
            return "unknown"

        logger.debug("Detecting video format from binary stream")
        video_meta = next(data_iter).hex().upper()
        for format_, hex_list in format_hex.items():
            for hex_sign in hex_list:
                if hex_sign in video_meta:
                    logger.debug(f"Video format: {format_}")
                    return format_
        logger.error("Could not detect video format from stream")
        logger.debug("Video raw binary stream (512byte):")
        logger.debug(video_meta)
        return "unknown"
Example #16
 def get_danmaku(self, index: int) -> Optional[Danmaku]:
     try:
         return self._dmk_list[index]
     except IndexError:
         logger.error(f"IndexError, danmaku index: {index}")
         return None
Example #17
def fetch_articles(
        terms: List[str],
        source_language: str,
        search_languages: Optional[List[str]] = None,
        output_language: Optional[str] = None,  # TODO: Unused
        country: Optional[str] = None,  # TODO: Unused
        sources: Optional[str] = None,  # TODO: Unused
        groupby_options: Optional[Json] = None) -> Json:

    if not isinstance(terms, list):
        raise TypeError(
            "terms must be a list of keywords, e.g. ['Donald Trump', 'Elections']"
        )

    query_terms = ",".join(terms)
    search_languages = search_languages or ["en"]
    if not isinstance(search_languages, list):
        raise TypeError(
            "search_languages must be a list of language codes, e.g. ['en', 'nl']"
        )

    translations = {}

    # Translate the terms into English to minimise the risk of a missing IBM model
    if source_language != 'en':
        translations[source_language] = query_terms
        query_terms = translate(query_terms, source_language, 'en')

    translations['en'] = query_terms
    logger.debug(query_terms)

    search = Article.search()

    articles = []
    for lang in search_languages:
        if lang not in translations:
            translated = translate(query_terms, 'en', lang)
            if not translated:
                logger.error(f"Could not translate {query_terms} in {lang}")
                continue
            translations[lang] = translated
            logger.debug(translated)

        terms = translations[lang].split(",")
        minimum_should_match = int(0.5 * len(terms))

        query = Q('bool',
                  must=Q("match", language=lang),
                  minimum_should_match=minimum_should_match,
                  should=[
                      Q("multi_match",
                        fields=['body', 'title'],
                        type='phrase',
                        query=term.strip()) for term in terms
                  ])

        query = search.query(query)
        logger.debug(query.to_dict())

        results = query.execute()

        logger.info(f"Got {results.hits.total.value} hits")
        for hit in results:
            logger.info(f"{hit.meta.score}: {hit.title}")
            article = hit.to_dict()
            article["relevance"] = hit.meta.score
            articles.append(article)

    if groupby_options:
        articles = groupby_category(articles, **groupby_options)

    # if output_language is not None:
    #     results = translate_results(results, output_language)

    return {"articles": articles}