Example No. 1
    def textrank_algorithm(self) -> None:
        """Iteratively compute TextRank scores for the token graph."""
        G, d = self._graph, self.damping_factor
        k = len(G)
        outgoing = G.sum(0)  # each node's total outgoing weight (column sums)
        scores = np.ones((k,)) * 1/k  # start from a uniform distribution
        sse = lambda x, y: ((x - y)**2).sum()

        for step in range(10):

            newscores = np.empty((k,))
            for j in range(k):
                # Damped update: base probability plus the normalised scores
                # of the neighbours that link to node j.
                newscores[j] = d / k + (1-d) * np.sum([
                    scores[l] / outgoing[l]
                    for l in range(k)
                    if l != j and G[j,l] != 0
                ])

            logger.debug(f"{step} SSE:{sse(scores, newscores):.2e}")

            if sse(scores, newscores) < self.convergence_thresh:
                break

            scores = newscores

        self._scores = newscores
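
A minimal standalone sketch of the same update rule on a tiny three-node graph; the edge weights, the damping value and the convergence threshold below are arbitrary illustrative assumptions:

    import numpy as np

    # Symmetric co-occurrence graph over 3 nodes (arbitrary illustrative weights)
    G = np.array([[0, 1, 1],
                  [1, 0, 0],
                  [1, 0, 0]])
    d, k = 0.15, len(G)            # assumed damping factor and node count
    outgoing = G.sum(0)            # per-node total outgoing weight
    scores = np.ones(k) / k

    for _ in range(10):
        new = np.array([
            d / k + (1 - d) * sum(scores[l] / outgoing[l]
                                  for l in range(k) if l != j and G[j, l] != 0)
            for j in range(k)
        ])
        if ((scores - new) ** 2).sum() < 1e-8:
            break
        scores = new

    print(new)  # the hub node (index 0) ends up with the highest score
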
Example No. 2
    def fit(self, tokens: List[Tuple[str, str, str]], sentences: Iterator[Any]) -> 'TextRank':

        logger.debug("Start TextRank analysis")
        pos_filter = lambda token: token[2] in self.INCLUDE_PART_OF_SPEECH
        tokens = list(filter(pos_filter, tokens))
        self.build_graph(tokens)
        self.textrank_algorithm()

        lemma2scores = dict(zip(self._lemmas, self._scores))
        self._lemma2word = l2w = {lemma: word for word, lemma, _ in tokens}
        word2scores = {l2w[lemma]: score for lemma, score in lemma2scores.items()}

        phrase2scores = self.reconstruct_phrases(word2scores, sentences)
        if not phrase2scores:
            raise ValueError("No keywords found! There might be something wrong with the input features.")

        # Normalise and apply a sigmoid to the resulting scores
        weights = list(phrase2scores.values())
        mu, sigma = np.mean(weights), np.std(weights)
        norm = lambda weight: (weight - mu) / sigma
        sigmoid = lambda weight: (1 + np.exp(-weight))**(-1)
        scale = lambda weight: sigmoid(norm(weight))
        normalised_scores = {node: scale(weight) for node, weight in phrase2scores.items()}

        self.keywords_ = pd.DataFrame(normalised_scores.items(), columns=["keyword", "score"])
        self.keywords_.sort_values("score", ascending=False, inplace=True)
        logger.debug(f"Top 5 keywords: {' '.join(self.keywords_.head(5)['keyword'].values)}")
        return self
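
A small standalone illustration of the z-score-plus-sigmoid scaling applied above; the raw weights are arbitrary illustrative values:

    import numpy as np

    weights = np.array([0.8, 1.5, 3.1, 0.4])             # arbitrary raw phrase scores
    mu, sigma = weights.mean(), weights.std()
    scaled = 1 / (1 + np.exp(-(weights - mu) / sigma))   # same norm + sigmoid as above
    print(scaled.round(3))  # every score lands in (0, 1) and the ranking is preserved
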
Example No. 3
    def test_ui(self):
        logger.debug(f"Start ui test @ {UI_LOCATION}")
        csrf_token = self._get_csrf_token()
        assert csrf_token

        for _, text in load_texts():
            self._test_post_request(text, csrf_token)
Example No. 4
    def test_ui_under_pressure(self):
        logger.debug("Start hammering the server")
        queue = Queue()
        csrf_token = self._get_csrf_token()

        def threader():
            while True:
                text = queue.get()
                self._test_post_request(text, csrf_token)
                sleep(.5)
                queue.task_done()

        for _ in range(self.N_WORKERS):
            t = Thread(target=threader)
            t.daemon = True
            t.start()

        texts = list(load_texts("articles.txt"))

        for _ in range(self.MAX_REQUESTS):
            _, text = random.choice(texts)
            queue.put(text)

        queue.join()
Example No. 5
    async def init_session(self, session: Optional[ClientSession] = None):
        """
        初始化 ClientSession, 使用 get/post/head 方法之前需要调用一次,
        ClientSession 内部维护了连接池, 因此不建议每一个请求创建一个 session,
        这里默认为每一个类创建一个 persistent session, 或者手动设置一个, 以实现复用,
        在 __init__.py 中初始化 session 会出现 warning, 官方在 aiohttp 4.0 之后将只允许在协程中创建 session,
        See:

            https://github.com/aio-libs/aiohttp/issues/3658
            https://github.com/aio-libs/aiohttp/issues/4932

        :param session: 用于复用的 ClientSession 对象
        """
        if not self.session:
            if session:
                self.session = session
                return

            if self._dns_server:
                logger.debug(f"Use custom DNS Server: {self._dns_server}")
                resolver = AsyncResolver(nameservers=self._dns_server)
                con = TCPConnector(ssl=False,
                                   ttl_dns_cache=300,
                                   resolver=resolver)
            else:
                con = TCPConnector(ssl=False, ttl_dns_cache=300)

            jar = CookieJar(unsafe=True)
            self.session = ClientSession(connector=con, cookie_jar=jar)
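
A minimal reuse sketch, assuming a helper class (hypothetically named Requester here) that owns the init_session coroutine above; it shows two instances sharing one persistent session, created inside a coroutine as aiohttp requires:

    import asyncio
    from typing import Optional
    from aiohttp import ClientSession

    class Requester:
        """Hypothetical stand-in for the helper class defining init_session above."""
        def __init__(self) -> None:
            self.session: Optional[ClientSession] = None

        async def init_session(self, session: Optional[ClientSession] = None):
            if not self.session:
                self.session = session or ClientSession()

    async def main():
        shared = ClientSession()         # one persistent session, one connection pool
        a, b = Requester(), Requester()
        await a.init_session(shared)     # both helpers reuse the shared session
        await b.init_session(shared)
        await shared.close()

    asyncio.run(main())
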
Example No. 6
    def test_analyser(self):
        analyser = TextAnalyser(related=False)
        self.assertIsInstance(analyser, TextAnalyser)

        for _, text in load_texts():

            try:

                analyser.fit(text)
                self.assertTrue(hasattr(analyser, 'textrank_'))
                self.assertIsInstance(analyser.textrank_, TextRank)
                self.assertTrue(hasattr(analyser, 'articles_'))

                output = analyser.to_dict()
                self.assertIs(type(output), dict)
                self.assertIn('articles', output)
                self.assertIn('graph', output)

                keywords = analyser.textrank_.get_keywords(max_kws=10)
                self.assertIs(type(keywords), list)
                self.assertTrue(all(type(kw) is dict for kw in keywords))
                logger.debug(str(keywords))

            except NLPModelNotFound as e:
                logger.error(e)
Example No. 7
    def transmit(self, request):
        """Forward statistics requests to Baidu Analytics."""
        args = dict(request.args)
        ref_page_u = args.get("u", "")  # u = (file|http):///path/to/index.html#/
        ref_page_su = args.get("su", "")
        pat = re.compile(r".+?:///.+?/index\.html#(?P<route>/[^/]+).*")  # route = index|detail|tvlive|result
        # Replace the index file path with the flag domain: host/route
        args["u"] = pat.sub(rf"{self._flag_domain}\g<route>", ref_page_u)
        args["su"] = pat.sub(rf"{self._flag_domain}\g<route>", ref_page_su)

        cookies_str = ""
        for key, value in request.cookies.items():
            cookies_str += f"{key}={value};"

        stat_headers = {
            "User-Agent": request.headers.get("User-Agent"),
            "Referer": args["u"] or self._flag_domain,
            "Cookie": cookies_str,
        }
        logger.debug(args)
        logger.debug(stat_headers)
        resp = requests.get(self._hm_status_url, params=args, headers=stat_headers)
        return resp.content
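
A small illustration of what the route rewrite above does; the flag domain and the referring page URL are hypothetical values:

    import re

    flag_domain = "http://flag.example.com"  # hypothetical stand-in for self._flag_domain
    pat = re.compile(r".+?:///.+?/index\.html#(?P<route>/[^/]+).*")

    u = "file:///C:/app/index.html#/detail/12345"
    print(pat.sub(rf"{flag_domain}\g<route>", u))  # -> http://flag.example.com/detail
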
Example No. 8
    async def make_response(self, range_field: Optional[str] = None):
        """
        Read the remote video stream and return it to the client disguised as a
        local response. Consecutive 206 requests may break the connection: on
        Windows the asyncio library raises ConnectionAbortedError, and the
        occasional LocalProtocolError is caused by the conflict between RFC 2616
        and RFC 7231 on HEAD requests,
        See:

            https://bugs.python.org/issue26509
            https://gitlab.com/pgjones/quart/-/issues/45
        """
        if not self._url.is_available():
            return Response("resource not available", status=404)

        if self._url.format == "hls":  # m3u8 playlists are not proxied
            return redirect(self._url.real_url)

        url = self._url.real_url
        proxy_headers = self._get_proxy_headers(url)
        if range_field is not None:
            proxy_headers["range"] = range_field
            logger.debug(f"Client request stream range: {range_field}")

        await self.init_session()
        resp = await self.get(url, headers=proxy_headers)
        if not resp:
            return Response(b"", status=404)

        if self._url.format == "hls":
            return redirect(url)  # urls not ending in m3u8 skip Content-Type detection

        @stream_with_context
        async def stream_iter():
            while chunk := await resp.content.read(4096):
                yield chunk
Example No. 9
    async def make_response_with_range(self,
                                       range_field: Optional[str] = None) -> Response:
        """
        Read the remote video stream and return it to the client disguised as a
        local response. Consecutive 206 requests may break the connection: on
        Windows the asyncio library raises ConnectionAbortedError, and the
        occasional LocalProtocolError is caused by the conflict between RFC 2616
        and RFC 7231 on HEAD requests,
        See:

            https://bugs.python.org/issue26509
            https://gitlab.com/pgjones/quart/-/issues/45
        """
        url = self._info.real_url
        proxy_headers = self._get_proxy_headers(url)
        if range_field is not None:
            proxy_headers["range"] = range_field
            logger.debug(f"Client request stream range: {range_field}")

        await self.init_session()
        resp = await self.get(url, headers=proxy_headers)
        if not resp:
            return Response(b"", status=404)

        @stream_with_context
        async def stream_iter():
            while chunk := await resp.content.read(4096):
                yield chunk
Example No. 10
 def _test_request(self, params):
     response = self._make_request(params)
     if "message" in response:
         logger.debug(response)
         return
     self.assertIs(type(response), dict)
     self.assertIn("articles", response)
     self._test_groups(response["articles"], params["groupby_options"])
Example No. 11
 async def make_response_for_m3u8(self) -> Response:
     if not self._cache_m3u8_text:
         self._cache_m3u8_text = await self._get_fixed_m3u8_text()
         logger.debug(
             f"Cache m3u8 text, size: {len(self._cache_m3u8_text) // 1024}kb"
         )
     return Response(self._cache_m3u8_text,
                     mimetype="application/vnd.apple.mpegurl")
Example No. 12
 def _make_request(self, params):
     resp = requests.post(self.ENDPOINT, json=params)
     try:
         return resp.json()
     except Exception as err:
         logger.exception(err)
         logger.debug(resp.text)
         raise err
Example No. 13
    def _test_article(self, article):
        self.assertIs(type(article), dict)
        expected = [("source", dict), ("category", str), ("title", str),
                    ("body", str), ("publication_date", dt.datetime)]
        for key, typ in expected:
            self.assertIn(key, article)
            self.assertIs(type(article[key]), typ)

        logger.debug(article["title"])
        logger.debug(article["body"])
Example No. 14
    def test_translate(self):
        translator = IBMTranslator()
        preds = translator.translate(
            "The parrot is in the cage.", 
            source="en", target="nl", 
            return_all=True
        )

        self.assertIs(type(preds), list)
        self.assertTrue(all(type(pred) is dict for pred in preds))
        logger.debug(preds[0])
Example No. 15
 def store(self, obj: object) -> str:
     """Store an object and return its key"""
     if hasattr(obj, "hash"):
         key = obj.hash  # the object provides its own hash
     else:
         hash_str = str(id(obj))  # otherwise derive a temporary one
         key = md5(hash_str.encode("utf-8")).hexdigest()
     if key not in self._db:
         logger.debug(f"Store {obj} -> {key}")
         self._db[key] = obj
     return key
Example No. 16
 def _test_post_request(self, text, csrf_token):
     params = {"text": text, "csrf_token": csrf_token}
     resp = requests.post(UI_LOCATION, params)
     resp.raise_for_status()
     soup = BeautifulSoup(resp.text, "html.parser")
     graph_container = soup.find("div", {"id": "graph-container"})
     if not graph_container:
         warnings.warn("No graph found")
     articles = soup.find_all("div", {"class": "thumbnail article"})
     title = text.strip().split('\n')[0]
     logger.debug(f"{title[:50]}... {len(articles)} results")
Example No. 17
 def _test_article_format(self, articles):
     self.assertIs(type(articles), list)
     for article in articles:
         self.assertIsInstance(article, dict)
         expected = (
             "title", "body", "language", "relevance", 
             "image_url", "url", 
             "source", "category"
         )
         self.assertTrue(all(key in article for key in expected))
         logger.debug(article["title"])
Example No. 18
 async def parse_danmaku_data(self, danmaku: Danmaku) -> DanmakuData:
     """Parse the danmaku data for a single episode"""
     data_parser = self._loader.get_danmaku_data_parser(danmaku.module)
     if data_parser is not None:
         logger.debug(f"{data_parser.__class__.__name__} parsing {danmaku.cid}")
         start_time = perf_counter()
         data = await data_parser._parse(danmaku.cid)
         end_time = perf_counter()
         logger.info(f"Reading danmaku data finished in {end_time - start_time:.2f}s")
         return data
     return DanmakuData()
Example No. 19
def load_model(lang: Optional[str] = None, path: Optional[str] = None) -> Any:
    if path is None:
        if not lang:
            raise ValueError("Must provide either a language or a path to a model")
        elif lang not in SPACY_LANG_MODELS:
            raise NLPModelNotFound(f"Model not available for {lang}")
        path = find_model(SPACY_LANG_MODELS[lang])
    logger.debug(f"Loading model {path}")
    t0 = time()
    nlp = spacy.load(path)
    logger.debug(f"Model loaded in {time() - t0:.2f}s")
    return nlp
Example No. 20
 def build_anime_meta(self, hash_str: str):
     """尝试通过 hash 生成一个 AnimeMetaInfo"""
     try:
         engine, detail_page_url = b16decode(hash_str.upper()).decode("utf-8").split("|")
         meta = AnimeMetaInfo()
         meta.engine = engine
         meta.detail_page_url = detail_page_url
         logger.debug(f"Build AnimeMetaInfo from hash {hash_str}")
         logger.debug(f"engine: {engine} detail_page_url: {detail_page_url}")
         return meta
     except Exception:
         return
Example No. 21
 def post(url: str,
          data=None,
          html_encoding="utf-8",
          **kwargs) -> requests.Response:
     """"封装 POST 方法, 默认网页编码为 utf-8"""
     try:
         logger.debug(f"url: {url}, data: {data}")
         kwargs.setdefault("timeout", 5)
         kwargs.setdefault("headers", HtmlParseHelper._headers)
         ret = requests.post(url, data, verify=False, **kwargs)
         ret.encoding = html_encoding
         return ret
     except requests.RequestException as e:
         logger.exception(e)
         return requests.Response()
Example No. 22
 def get(url: str,
         params=None,
         html_encoding="utf-8",
         **kwargs) -> requests.Response:
     """封装 GET 方法, 默认网页编码为 utf-8"""
     try:
         logger.debug(f"url: {url}, params: {params}")
         kwargs.setdefault("timeout", 5)
         kwargs.setdefault("headers", HtmlParseHelper._headers)
         ret = requests.get(url, params, verify=False, **kwargs)
         ret.encoding = html_encoding  # some pages still use encodings like gb2312/gb18030, so set it explicitly
         return ret
     except requests.RequestException as e:
         logger.exception(e)
         return requests.Response()
Example No. 23
 def wrapper(*args: Any, **kwargs: Any) -> Union[Json, List[Json]]:
     logger.info(f"{func.__name__.title()}: {args[1:]} {kwargs}")
     return_all = kwargs.pop("return_all", True)
     response = func(*args, **kwargs)
     result = response.get_result()
     top_level_key = json_key + "s"
     if top_level_key in result:
         predictions = result[top_level_key]
         prediction = predictions[0]
         if "confidence" in prediction:
             logger.debug("Language: {language} Confidence: {confidence:.2%}"
                          .format(**prediction))
             predictions = merge_languages(predictions)
         return predictions if return_all else predictions[0][json_key]
     raise Exception("API Error: {}".format(result))
Example No. 24
 async def head(self,
                url: str,
                params: Optional[dict] = None,
                **kwargs) -> Optional[ClientResponse]:
     """
     HEAD method using a random User-Agent; returns None if an exception occurs
     """
     try:
         url = self.set_headers(url, kwargs)
         logger.debug(f"HEAD {url} | Params: {params} | Args: {kwargs}")
         resp = await self.session.head(url, params=params, **kwargs)
         logger.debug(
             f"Code: {resp.status} | Type: {resp.content_type} | Length: {resp.content_length} ({url})"
         )
         return resp
     except Exception as e:
         logger.warning(f"Exception in {self.__class__}: {e}")
Example No. 25
 async def post(self,
                url: str,
                data: Optional[dict] = None,
                **kwargs) -> Optional[ClientResponse]:
     """
     POST method using a random User-Agent; returns None if an exception occurs
     """
     try:
         url = self.set_headers(url, kwargs)
         logger.debug(f"POST {url} | Data: {data} | Args: {kwargs}")
         resp = await self.session.post(url, data=data, **kwargs)
         logger.debug(
             f"Code: {resp.status} | Type: {resp.content_type} | Length: {resp.content_length} ({url})"
         )
         return resp
     except Exception as e:
         logger.warning(f"Exception in {self.__class__}: {e}")
Example No. 26
    def store(self, obj: Any, key: str = None, overwrite: bool = False) -> str:
        """
        存储一个对象, 返回其 key
        :param obj: 待存储的对象
        :param key: 若不指定, 随机生成一个运行期间不会重复的 key
        :param overwrite: 存在相同的 key 时是否覆盖
        :return: 对象的 key
        """
        if not key:
            hash_str = str(id(obj))
            key = md5(hash_str.encode("utf-8")).hexdigest()

        if key not in self._db or overwrite:
            logger.debug(f"Store {obj} -> <Key {key}>")
            self._db[key] = obj
            return key
Example No. 27
 def head(url: str,
          params=None,
          allow_redirects=True,
          **kwargs) -> requests.Response:
     """封装 HEAD 方法, 默认开启 302 重定向, 用于获取目标直链"""
     try:
         logger.debug(
             f"url: {url}, params: {params}, allow_redirects: {allow_redirects}"
         )
         kwargs.setdefault("timeout", 5)
         kwargs.setdefault("headers", HtmlParseHelper._headers)
         return requests.head(url,
                              params=params,
                              verify=False,
                              allow_redirects=allow_redirects,
                              **kwargs)
     except requests.RequestException as e:
         logger.exception(e)
         return requests.Response()
Example No. 28
    def get_hm_js(self, localhost: str, cookies: dict) -> str:
        cookies_str = ""
        for key, value in cookies.items():
            cookies_str += f"{key}={value};"

        stat_headers = {
            "Referer": self._flag_domain,
            "Cookie": cookies_str,  # chrome blocked
        }
        logger.debug(stat_headers)
        resp = requests.get(self._hm_js_url, headers=stat_headers)
        if resp.status_code != 200:
            return ""
        localhost = localhost.replace("http://", "")  # ip:port
        text = resp.text.replace("https", "http") \
            .replace("hm.baidu.com/hm.gif", localhost + "/statistics") \
            .replace("hm.baidu.com", localhost) \
            .replace(f"{self._flag_domain}/statistics", localhost + "/statistics")
        return text
Example No. 29
 def get_real_url(self) -> str:
     # detail_page_url: https://www.k1080.net/vodplay/410172-1-12.html
     session = requests.Session()
     resp = session.get(self.get_raw_url())
     if resp.status_code != 200:
         return ""
     match = re.search(r"player_data=({.+?\})", resp.text)
     if not match:
         return ""
     player_data = json.loads(match.group(1))
     video_url = unquote(b64decode(player_data.get("url")).decode("utf8"))
     logger.debug(f"Video URL: {video_url}")
     if video_url.endswith(".mp4") or video_url.endswith(".m3u8"):
         return video_url
     if video_url.endswith(".html"):
         return ""
     # one more redirect is needed to reach the direct link
     resp = session.head(video_url, allow_redirects=False)
     if resp.status_code != 302:
         return ""
     return resp.headers.get("location", "")
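
For reference, the decode chain above (b64decode followed by unquote) reverses a percent-encoded, base64-wrapped URL; a self-contained round trip with a hypothetical URL:

    from base64 import b64decode, b64encode
    from urllib.parse import quote, unquote

    encoded = b64encode(quote("https://example.com/video.m3u8", safe="").encode()).decode()
    print(unquote(b64decode(encoded).decode("utf8")))  # -> https://example.com/video.m3u8
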
Example No. 30
    def pager(self,
              method: Callable,
              page_size: int = 100,
              **kwargs: Any) -> Results:
        logger.debug(f"{method.__name__}: {kwargs}")

        n_articles = 0
        total_results = 1
        while n_articles < total_results:
            kwargs["page_size"] = page_size
            kwargs["page"] = n_articles // page_size + 1
            response = method(**kwargs)
            total_results = response["totalResults"]
            if not n_articles:
                logger.debug(f"{total_results} results")
            cat, lang = kwargs.get("category", ""), kwargs.get("language", "")
            yield from map(self.parse(cat, lang), response["articles"])
            n_articles += page_size
            break  # Developer accounts are limited to a max of 100 results.