def textrank_algorithm(self) -> None:
    G, d = self._graph, self.damping_factor
    k = len(G)
    outgoing = G.sum(0)
    scores = np.ones((k,)) * 1 / k
    sse = lambda x, y: ((x - y) ** 2).sum()
    for step in range(10):
        newscores = np.empty((k,))
        for j in range(k):
            newscores[j] = d / k + (1 - d) * np.sum([
                scores[l] / outgoing[l]
                for l in range(k)
                if l != j and G[j, l] != 0
            ])
        logger.debug(f"{step} SSE:{sse(scores, newscores):.2e}")
        if sse(scores, newscores) < self.convergence_thresh:
            break
        scores = newscores
    self._scores = newscores
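
# Illustration only (not part of the class): a minimal, self-contained sketch of the same
# update rule applied to a hypothetical 3-node co-occurrence graph. The helper name,
# adjacency matrix and damping value below are made up for the example.
import numpy as np

def _textrank_step(G, scores, d=0.85):
    k = len(G)
    outgoing = G.sum(0)
    new = np.empty(k)
    for j in range(k):
        new[j] = d / k + (1 - d) * sum(
            scores[l] / outgoing[l] for l in range(k) if l != j and G[j, l] != 0
        )
    return new

if __name__ == "__main__":
    G = np.array([[0, 1, 1],
                  [1, 0, 0],
                  [1, 0, 0]])       # node 0 is connected to both other nodes
    scores = np.ones(3) / 3
    for _ in range(10):
        scores = _textrank_step(G, scores)
    print(scores)                   # the hub node (0) ends up with the highest score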
def fit(self, tokens: List[Tuple[str, str, str]], sentences: Iterator[Any]) -> 'TextRank':
    logger.debug("Start TextRank analysis")
    pos_filter = lambda token: token[2] in self.INCLUDE_PART_OF_SPEECH
    tokens = list(filter(pos_filter, tokens))
    self.build_graph(tokens)
    self.textrank_algorithm()
    lemma2scores = dict(zip(self._lemmas, self._scores))
    self._lemma2word = l2w = {lemma: word for word, lemma, _ in tokens}
    word2scores = {l2w[lemma]: score for lemma, score in lemma2scores.items()}
    phrase2scores = self.reconstruct_phrases(word2scores, sentences)
    # Normalise and apply sigmoid function to the resulting scores
    weights = list(phrase2scores.values())
    mu, sigma = np.mean(weights), np.std(weights)
    norm = lambda weight: (weight - mu) / sigma
    sigmoid = lambda weight: (1 + np.exp(-weight)) ** (-1)
    scale = lambda weight: sigmoid(norm(weight))
    normalised_scores = {node: scale(weight) for node, weight in phrase2scores.items()}
    if not normalised_scores:
        raise ValueError("No keyword found! There might be something wrong with the input features.")
    self.keywords_ = pd.DataFrame(normalised_scores.items(), columns=["keyword", "score"])
    self.keywords_.sort_values("score", ascending=False, inplace=True)
    logger.debug(f"Top 5 keywords: {' '.join(self.keywords_.head(5)['keyword'].values)}")
    return self
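
# Illustration only: how the z-score + sigmoid scaling used in fit() squashes raw phrase
# weights into (0, 1). The sample weights below are made up.
import numpy as np

weights = np.array([0.2, 0.5, 1.1, 3.0])
mu, sigma = weights.mean(), weights.std()
scaled = 1 / (1 + np.exp(-(weights - mu) / sigma))
print(scaled.round(3))  # roughly [0.29, 0.34, 0.48, 0.84]: the outlier is pulled towards 1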
def test_ui(self):
    logger.debug(f"Start ui test @ {UI_LOCATION}")
    csrf_token = self._get_csrf_token()
    assert bool(csrf_token)
    for i, (_, text) in enumerate(load_texts()):
        self._test_post_request(text, csrf_token)
def test_ui_under_pressure(self):
    logger.debug("Start hammering the server")
    queue = Queue()
    csrf_token = self._get_csrf_token()

    def threader():
        while True:
            text = queue.get()
            self._test_post_request(text, csrf_token)
            sleep(.5)
            queue.task_done()

    for _ in range(self.N_WORKERS):
        t = Thread(target=threader)
        t.daemon = True
        t.start()

    texts = list(load_texts("articles.txt"))
    i = 0
    while i < self.MAX_REQUESTS:
        i += 1
        _, text = random.choice(texts)
        queue.put(text)
    queue.join()
async def init_session(self, session: Optional[ClientSession] = None):
    """
    Initialise the ClientSession; must be called once before using the get/post/head methods.
    ClientSession maintains a connection pool internally, so creating one session per request
    is discouraged. By default a persistent session is created per class instance, or one can
    be passed in manually for reuse. Initialising the session in __init__.py produces a
    warning, and from aiohttp 4.0 onwards sessions may only be created inside a coroutine,
    See:
        https://github.com/aio-libs/aiohttp/issues/3658
        https://github.com/aio-libs/aiohttp/issues/4932

    :param session: a ClientSession object to reuse
    """
    if not self.session:
        if session:
            self.session = session
            return
        if self._dns_server:
            logger.debug(f"Use custom DNS Server: {self._dns_server}")
            resolver = AsyncResolver(nameservers=self._dns_server)
            con = TCPConnector(ssl=False, ttl_dns_cache=300, resolver=resolver)
        else:
            con = TCPConnector(ssl=False, ttl_dns_cache=300)
        jar = CookieJar(unsafe=True)
        self.session = ClientSession(connector=con, cookie_jar=jar)
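
# Usage sketch (illustration only): the session is created lazily inside a coroutine, as the
# docstring above recommends, and the same session is reused across requests. `Spider` and
# the url are made-up names standing in for whatever class defines init_session.
import asyncio

async def fetch_many(urls):
    spider = Spider()
    await spider.init_session()            # one persistent ClientSession per instance
    try:
        for url in urls:
            resp = await spider.head(url)  # reuses the pooled connections
            if resp:
                print(resp.status, url)
    finally:
        await spider.session.close()       # close the pool when done

# asyncio.run(fetch_many(["https://example.com"]))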
def test_analyser(self):
    analyser = TextAnalyser(related=False)
    self.assertIsInstance(analyser, TextAnalyser)
    for _, text in load_texts():
        try:
            analyser.fit(text)
            self.assertTrue(hasattr(analyser, 'textrank_'))
            self.assertIsInstance(analyser.textrank_, TextRank)
            self.assertTrue(hasattr(analyser, 'articles_'))
            output = analyser.to_dict()
            self.assertIs(type(output), dict)
            self.assertIn('articles', output)
            self.assertIn('graph', output)
            keywords = analyser.textrank_.get_keywords(max_kws=10)
            self.assertIs(type(keywords), list)
            self.assertTrue(all(type(kw) is dict for kw in keywords))
            logger.debug(str(keywords))
        except NLPModelNotFound as e:
            logger.error(e)
def transmit(self, request):
    """Forward Baidu Analytics (tongji) requests"""
    args = dict(request.args)
    ref_page_u = args.get("u", "")  # u = (file|http):///path/to/index.html#/
    ref_page_su = args.get("su", "")
    pat = re.compile(r".+?:///.+?/index\.html#(?P<route>/[^/]+).*")  # route = index|detail|tvlive|result
    # Replace the index file path with the flag domain, i.e. host/route
    args["u"] = pat.sub(rf"{self._flag_domain}\g<route>", ref_page_u)
    args["su"] = pat.sub(rf"{self._flag_domain}\g<route>", ref_page_su)
    cookies_str = ""
    for key, value in request.cookies.items():
        cookies_str += f"{key}={value};"
    stat_headers = {
        "User-Agent": request.headers.get("User-Agent"),
        "Referer": args["u"] or self._flag_domain,
        "Cookie": cookies_str,
    }
    logger.debug(args)
    logger.debug(stat_headers)
    resp = requests.get(self._hm_status_url, params=args, headers=stat_headers)
    return resp.content
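
# Illustration only: what the route-rewriting regex in transmit() does to a referrer URL.
# `flag_domain` is a fabricated value standing in for self._flag_domain.
import re

flag_domain = "http://flag.domain"
pat = re.compile(r".+?:///.+?/index\.html#(?P<route>/[^/]+).*")
u = "file:///C:/app/index.html#/detail/123"
print(pat.sub(rf"{flag_domain}\g<route>", u))  # -> http://flag.domain/detail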
async def make_response(self, range_field: str = None):
    """
    Read the remote video stream and disguise it as a local response for the client.
    Consecutive 206 requests can break the connection: on Windows the asyncio library raises
    ConnectionAbortedError, and an occasional LocalProtocolError is caused by the conflicting
    HEAD-request semantics of RFC2616 and RFC7231,
    See:
        https://bugs.python.org/issue26509
        https://gitlab.com/pgjones/quart/-/issues/45
    """
    if not self._url.is_available():
        return Response("resource not available", status=404)
    if self._url.format == "hls":  # m3u8 does not need the proxy
        return redirect(self._url.real_url)

    url = self._url.real_url
    proxy_headers = self._get_proxy_headers(url)
    if range_field is not None:
        proxy_headers["range"] = range_field
        logger.debug(f"Client request stream range: {range_field}")

    await self.init_session()
    resp = await self.get(url, headers=proxy_headers)
    if not resp:
        return Response(b"", status=404)
    if self._url.format == "hls":
        return redirect(url)  # urls not ending in m3u8 skip the Content-Type detection

    @stream_with_context
    async def stream_iter():
        while chunk := await resp.content.read(4096):
            yield chunk

    # Assumption: wrap the proxied stream in a response, 206 when the client asked for a range
    return Response(stream_iter(), status=206 if range_field is not None else 200)
async def make_response_with_range(self, range_field: str = None) -> Response:
    """
    Read the remote video stream and disguise it as a local response for the client.
    Consecutive 206 requests can break the connection: on Windows the asyncio library raises
    ConnectionAbortedError, and an occasional LocalProtocolError is caused by the conflicting
    HEAD-request semantics of RFC2616 and RFC7231,
    See:
        https://bugs.python.org/issue26509
        https://gitlab.com/pgjones/quart/-/issues/45
    """
    url = self._info.real_url
    proxy_headers = self._get_proxy_headers(url)
    if range_field is not None:
        proxy_headers["range"] = range_field
        logger.debug(f"Client request stream range: {range_field}")

    await self.init_session()
    resp = await self.get(url, headers=proxy_headers)
    if not resp:
        return Response(b"", status=404)

    @stream_with_context
    async def stream_iter():
        while chunk := await resp.content.read(4096):
            yield chunk

    # Assumption: wrap the proxied stream in a response, 206 when the client asked for a range
    return Response(stream_iter(), status=206 if range_field is not None else 200)
def _test_request(self, params):
    response = self._make_request(params)
    if "message" in response:
        logger.debug(response)
        return
    self.assertIs(type(response), dict)
    self.assertIn("articles", response)
    self._test_groups(response["articles"], params["groupby_options"])
async def make_response_for_m3u8(self) -> Response:
    if not self._cache_m3u8_text:
        self._cache_m3u8_text = await self._get_fixed_m3u8_text()
        logger.debug(
            f"Cache m3u8 text, size: {len(self._cache_m3u8_text) // 1024}kb"
        )
    return Response(self._cache_m3u8_text, mimetype="application/vnd.apple.mpegurl")
def _make_request(self, params):
    resp = requests.post(self.ENDPOINT, json=params)
    try:
        return resp.json()
    except Exception as err:
        logger.exception(err)
        logger.debug(resp.text)
        raise err
def _test_article(self, article):
    self.assertIs(type(article), dict)
    expected = [
        ("source", dict),
        ("category", str),
        ("title", str),
        ("body", str),
        ("publication_date", dt.datetime),
    ]
    for key, typ in expected:
        self.assertIn(key, article)
        self.assertIs(type(article[key]), typ)
    logger.debug(article["title"])
    logger.debug(article["body"])
def test_translate(self):
    translator = IBMTranslator()
    preds = translator.translate(
        "The parrot is in the cage.", source="en", target="nl", return_all=True
    )
    self.assertIs(type(preds), list)
    self.assertTrue(all(type(pred) is dict for pred in preds))
    logger.debug(preds[0])
def store(self, obj: object) -> str:
    """Store an object and return its key"""
    if hasattr(obj, "hash"):
        key = obj.hash  # use the object's own hash if it defines one
    else:
        hash_str = str(id(obj))  # compute a temporary one
        key = md5(hash_str.encode("utf-8")).hexdigest()
    if key not in self._db:
        logger.debug(f"Store {obj} -> {key}")
        self._db[key] = obj
    return key
def _test_post_request(self, text, csrf_token):
    params = {"text": text, "csrf_token": csrf_token}
    resp = requests.post(UI_LOCATION, params)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    graph_container = soup.find("div", {"id": "graph-container"})
    if not graph_container:
        warnings.warn("No graph found")
    articles = soup.find_all("div", {"class": "thumbnail article"})
    title = text.strip().split('\n')[0]
    logger.debug(f"{title[:50]}... {len(articles)} results")
def _test_article_format(self, articles):
    self.assertIs(type(articles), list)
    for article in articles:
        self.assertIsInstance(article, dict)
        expected = (
            "title", "body", "language", "relevance",
            "image_url", "url", "source", "category"
        )
        self.assertTrue(all(key in article for key in expected))
        logger.debug(article["title"])
async def parse_danmaku_data(self, danmaku: Danmaku) -> DanmakuData:
    """Parse the danmaku data of one episode"""
    data_parser = self._loader.get_danmaku_data_parser(danmaku.module)
    logger.debug(f"{data_parser.__class__.__name__} parsing {danmaku.cid}")
    if data_parser is not None:
        start_time = perf_counter()
        data = await data_parser._parse(danmaku.cid)
        end_time = perf_counter()
        logger.info(f"Reading danmaku data finished in {end_time - start_time:.2f}s")
        return data
    return DanmakuData()
def load_model(lang: Optional[str] = None, path: Optional[str] = None) -> Any:
    if path is None:
        if not lang:
            raise ValueError("Must provide one of language or path to model")
        elif lang not in SPACY_LANG_MODELS:
            raise NLPModelNotFound(f"Model not available for {lang}")
        path = find_model(SPACY_LANG_MODELS[lang])
    logger.debug(f"Loading model {path}")
    t0 = time()
    nlp = spacy.load(path)
    logger.debug(f"Model loaded in {time() - t0:.2f}s")
    return nlp
def build_anime_meta(self, hash_str: str):
    """Try to build an AnimeMetaInfo from a hash"""
    try:
        engine, detail_page_url = b16decode(hash_str.upper()).decode("utf-8").split("|")
        meta = AnimeMetaInfo()
        meta.engine = engine
        meta.detail_page_url = detail_page_url
        logger.debug(f"Build AnimeMetaInfo from hash {hash_str}")
        logger.debug(f"engine: {engine} detail_page_url: {detail_page_url}")
        return meta
    except Exception:
        return
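
# Illustration only: the hash format build_anime_meta expects, i.e. base16 of
# "engine|detail_page_url". The engine name and path below are made up.
from base64 import b16decode, b16encode

token = b16encode("api.example_engine|/anime/detail/123".encode("utf-8")).decode("utf-8")
engine, detail_page_url = b16decode(token.upper()).decode("utf-8").split("|")
print(engine, detail_page_url)  # -> api.example_engine /anime/detail/123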
def post(url: str, data=None, html_encoding="utf-8", **kwargs) -> requests.Response:
    """Wrap the POST method; the page encoding defaults to utf-8"""
    try:
        logger.debug(f"url: {url}, data: {data}")
        kwargs.setdefault("timeout", 5)
        kwargs.setdefault("headers", HtmlParseHelper._headers)
        ret = requests.post(url, data, verify=False, **kwargs)
        ret.encoding = html_encoding
        return ret
    except requests.RequestException as e:
        logger.exception(e)
        return requests.Response()
def get(url: str, params=None, html_encoding="utf-8", **kwargs) -> requests.Response:
    """Wrap the GET method; the page encoding defaults to utf-8"""
    try:
        logger.debug(f"url: {url}, params: {params}")
        kwargs.setdefault("timeout", 5)
        kwargs.setdefault("headers", HtmlParseHelper._headers)
        ret = requests.get(url, params, verify=False, **kwargs)
        ret.encoding = html_encoding  # some pages still use gb2312/gb18030-style encodings, so set it explicitly
        return ret
    except requests.RequestException as e:
        logger.exception(e)
        return requests.Response()
def wrapper(*args: Any, **kwargs: Any) -> Union[Json, List[Json]]:
    logger.info(f"{func.__name__.title()}: {args[1:]} {kwargs}")
    return_all = kwargs.pop("return_all", True)
    response = func(*args, **kwargs)
    result = response.get_result()
    top_level_key = json_key + "s"
    if top_level_key in result:
        predictions = result[top_level_key]
        prediction = predictions[0]
        if "confidence" in prediction:
            logger.debug(
                "Language: {language} Confidence: {confidence:.2%}".format(**prediction)
            )
            predictions = merge_languages(predictions)
        return predictions if return_all else predictions[0][json_key]
    raise Exception("API Error: {}".format(result))
async def head(self, url: str, params: dict = None, **kwargs) -> Optional[ClientResponse]:
    """HEAD method; uses a random User-Agent and returns None on exception"""
    try:
        url = self.set_headers(url, kwargs)
        logger.debug(f"HEAD {url} | Params: {params} | Args: {kwargs}")
        resp = await self.session.head(url, params=params, **kwargs)
        logger.debug(
            f"Code: {resp.status} | Type: {resp.content_type} | Length: {resp.content_length} ({url})"
        )
        return resp
    except Exception as e:
        logger.warning(f"Exception in {self.__class__}: {e}")
async def post(self, url: str, data: dict = None, **kwargs) -> Optional[ClientResponse]:
    """POST method; uses a random User-Agent and returns None on exception"""
    try:
        url = self.set_headers(url, kwargs)
        logger.debug(f"POST {url} | Data: {data} | Args: {kwargs}")
        resp = await self.session.post(url, data=data, **kwargs)
        logger.debug(
            f"Code: {resp.status} | Type: {resp.content_type} | Length: {resp.content_length} ({url})"
        )
        return resp
    except Exception as e:
        logger.warning(f"Exception in {self.__class__}: {e}")
def store(self, obj: Any, key: str = None, overwrite: bool = False) -> str:
    """
    Store an object and return its key

    :param obj: the object to store
    :param key: if not given, generate a random key that will not repeat during runtime
    :param overwrite: whether to overwrite when the same key already exists
    :return: the key of the object
    """
    if not key:
        hash_str = str(id(obj))
        key = md5(hash_str.encode("utf-8")).hexdigest()
    exist = key in self._db
    if (not exist) or (exist and overwrite):
        logger.debug(f"Store {obj} -> <Key {key}>")
        self._db[key] = obj
    return key
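
# Illustration only: the default key scheme used by store(). A key derived from id() is
# only guaranteed unique while the object is alive, hence "won't repeat during runtime"
# rather than being a stable content hash.
from hashlib import md5

obj = object()
key = md5(str(id(obj)).encode("utf-8")).hexdigest()
print(key)  # a 32-character hex digest that differs between runs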
def head(url: str, params=None, allow_redirects=True, **kwargs) -> requests.Response:
    """Wrap the HEAD method; 302 redirects are followed by default, used to resolve the target direct link"""
    try:
        logger.debug(
            f"url: {url}, params: {params}, allow_redirects: {allow_redirects}"
        )
        kwargs.setdefault("timeout", 5)
        kwargs.setdefault("headers", HtmlParseHelper._headers)
        return requests.head(url, params=params, verify=False, allow_redirects=allow_redirects, **kwargs)
    except requests.RequestException as e:
        logger.exception(e)
        return requests.Response()
def get_hm_js(self, localhost: str, cookies: dict) -> str:
    cookies_str = ""
    for key, value in cookies.items():
        cookies_str += f"{key}={value};"
    stat_headers = {
        "Referer": self._flag_domain,
        "Cookie": cookies_str,  # chrome blocked
    }
    logger.debug(stat_headers)
    resp = requests.get(self._hm_js_url, headers=stat_headers)
    if resp.status_code != 200:
        return ""
    localhost = localhost.replace("http://", "")  # ip:host
    text = (
        resp.text.replace("https", "http")
        .replace("hm.baidu.com/hm.gif", localhost + "/statistics")
        .replace("hm.baidu.com", localhost)
        .replace(f"{self._flag_domain}/statistics", localhost + "/statistics")
    )
    return text
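
# Illustration only: the effect of the string rewrites in get_hm_js on a fabricated
# fragment of hm.js, using a made-up local host.
js = 'c.src = "https://hm.baidu.com/hm.gif?x=1";'
localhost = "127.0.0.1:8080"
rewritten = (
    js.replace("https", "http")
    .replace("hm.baidu.com/hm.gif", localhost + "/statistics")
    .replace("hm.baidu.com", localhost)
)
print(rewritten)  # -> c.src = "http://127.0.0.1:8080/statistics?x=1";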
def get_real_url(self) -> str:
    # detail_page_url: https://www.k1080.net/vodplay/410172-1-12.html
    sessions = requests.Session()
    resp = sessions.get(self.get_raw_url())
    if resp.status_code != 200:
        return ""
    player_data = re.search(r"player_data=({.+?\})", resp.text).group(1)
    player_data = json.loads(player_data)
    video_url = unquote(b64decode(player_data.get("url")).decode("utf8"))
    logger.debug(f"Video URL: {video_url}")
    if video_url.endswith(".mp4") or video_url.endswith(".m3u8"):
        return video_url
    if video_url.endswith(".html"):
        return ""
    # one more redirect is needed
    resp = sessions.head(video_url, allow_redirects=False)
    if resp.status_code != 302:
        return ""
    return resp.headers.get("location", "")
def pager(self, method: Callable, page_size: Optional[int] = 100, **kwargs: Any) -> Results:
    logger.debug(f"{method.__name__}: {kwargs}")
    n_articles = 0
    total_results = 1
    while n_articles < total_results:
        kwargs["page_size"] = page_size
        kwargs["page"] = n_articles // page_size + 1
        response = method(**kwargs)
        total_results = response["totalResults"]
        if not n_articles:
            logger.debug(f"{total_results} results")
        cat, lang = kwargs.get("category", ""), kwargs.get("language", "")
        yield from map(self.parse(cat, lang), response["articles"])
        n_articles += page_size
        break  # Developer accounts are limited to a max of 100 results.