class SyncAsyncFeedReader:
    """Blocking facade over ``AsyncFeedReader``.

    Each public method drives the corresponding coroutine to completion
    with ``loop.run_until_complete``, so synchronous callers can use the
    async reader through the usual context-manager protocol.
    """

    def __init__(self, *args, **kwargs):
        # NOTE(review): asyncio.get_event_loop() is deprecated outside a
        # running loop on newer Pythons — confirm target version before
        # migrating to asyncio.new_event_loop().
        self._loop = asyncio.get_event_loop()
        self._loop_run = self._loop.run_until_complete
        self._reader = AsyncFeedReader(*args, **kwargs)

    @property
    def has_rss_proxy(self):
        """Proxy availability, delegated to the wrapped async reader."""
        return self._reader.has_rss_proxy

    def read(self, *args, **kwargs):
        """Synchronously perform ``AsyncFeedReader.read``."""
        coro = self._reader.read(*args, **kwargs)
        return self._loop_run(coro)

    def check_private_address(self, *args, **kwargs):
        """Synchronously perform ``AsyncFeedReader.check_private_address``."""
        coro = self._reader.check_private_address(*args, **kwargs)
        return self._loop_run(coro)

    def __enter__(self):
        self._loop_run(self._reader.__aenter__())
        return self

    def __exit__(self, *args):
        return self._loop_run(self._reader.__aexit__(*args))

    def close(self):
        """Synchronously close the wrapped reader."""
        return self._loop_run(self._reader.close())
async def do_fetch_story(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
    use_proxy: T.bool.default(False),
):
    """Fetch a story's web page, shrink oversized HTML, and forward the
    text to ``worker_rss.process_story_webpage``.

    The proxy is used only when both requested by the caller and
    actually available on the reader.
    """
    LOG.info(f'fetch story#{story_id} url={unquote(url)} begin')
    async with AsyncFeedReader(**_get_proxy_options()) as reader:
        use_proxy = use_proxy and reader.has_rss_proxy
        status, response = await reader.read(url, use_proxy=use_proxy)
    # prefer the final (post-redirect) URL when the response carries one
    if response and response.url:
        url = str(response.url)
    LOG.info(
        f'fetch story#{story_id} url={unquote(url)} status={status} finished')
    if not response or status != 200:
        return
    content = response.rssant_text
    if not content:
        LOG.error('story#%s url=%s response text is empty!',
                  story_id, unquote(url))
        return
    # keep payloads under ~1MB: clean first, fall back to plain text
    size_limit = 1024 * 1024
    if len(content) >= size_limit:
        content = story_html_clean(content)
        if len(content) >= size_limit:
            LOG.warning('too large story#%s size=%s url=%r',
                        story_id, len(content), url)
            content = story_html_to_text(content)
    await ctx.hope('worker_rss.process_story_webpage', dict(
        story_id=story_id,
        url=url,
        text=content,
    ))
async def do_fetch_story(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
):
    """Fetch a story's web page and forward its text to
    ``worker_rss.process_story_webpage``."""
    LOG.info(f'fetch story#{story_id} url={unquote(url)} begin')
    async with AsyncFeedReader() as reader:
        status, response = await reader.read(url)
    # prefer the final (post-redirect) URL when the response carries one
    if response and response.url:
        url = str(response.url)
    LOG.info(f'fetch story#{story_id} url={unquote(url)} status={status} finished')
    if not response or status != 200:
        return
    content = response.rssant_text
    if not content:
        LOG.error(f'story#{story_id} url={unquote(url)} response text is empty!')
        return
    size_limit = 1024 * 1024
    if len(content) >= size_limit:
        content = story_html_clean(content)
        if len(content) >= size_limit:
            # NOTE(review): oversized content is only logged, not
            # truncated, and is still forwarded as-is — confirm whether
            # a plain-text fallback is intended here.
            LOG.error(f'too large story#{story_id} size={len(content)} url={url}')
    await ctx.hope('worker_rss.process_story_webpage', dict(
        story_id=story_id,
        url=url,
        text=content,
    ))
async def check_private_address(url):
    """Raise ``ImageProxyError`` when *url* resolves to a private address.

    The check is skipped entirely when ``CONFIG.allow_private_address``
    is enabled.

    :raises ImageProxyError: if the address is private and not allowed.
    """
    if CONFIG.allow_private_address:
        return
    async with AsyncFeedReader() as reader:
        try:
            await reader.check_private_address(url)
        except PrivateAddressError as ex:
            # chain the cause explicitly so tracebacks show the original
            # PrivateAddressError as the direct cause (PEP 3134)
            raise ImageProxyError('private address not allowed') from ex
async def test_async_read_by_proxy(url):
    """Read *url* through the configured RSS proxy and verify success."""
    proxy_options = dict(
        rss_proxy_url=CONFIG.rss_proxy_url,
        rss_proxy_token=CONFIG.rss_proxy_token,
    )
    async with AsyncFeedReader(**proxy_options) as reader:
        status, response = await reader.read(url, use_proxy=True)
    assert status == 200
    assert response.status == 200
    assert str(response.url) == url
async def do_detect_story_images(
    ctx: ActorContext,
    story_id: T.int,
    story_url: T.url,
    image_urls: T.list(T.url).unique,
):
    """Probe every image URL of a story concurrently and report each
    URL's HTTP status to ``harbor_rss.update_story_images``.

    URLs on the referer-deny list are skipped with a synthetic status.
    On timeout, finished results are kept best-effort and unfinished
    tasks are cancelled.
    """
    LOG.info(
        f'detect story images story_id={story_id} num_images={len(image_urls)} begin'
    )
    async with AsyncFeedReader(allow_non_webpage=True) as reader:
        async def _read(url):
            # known referer-deny hosts would reject us anyway; skip request
            if is_referer_deny_url(url):
                return url, FeedResponseStatus.REFERER_DENY.value
            status, response = await reader.read(
                url, referer="https://rss.anyant.com/",
                ignore_content=True)
            return url, status
        futs = [asyncio.ensure_future(_read(url)) for url in image_urls]
        t_begin = time.time()
        try:
            results = await asyncio.gather(*futs)
        except (TimeoutError, concurrent.futures.TimeoutError):
            # Best-effort recovery. The original code called fut.result()
            # on every done future, which re-raises for the future that
            # produced the TimeoutError; it also left unfinished tasks
            # running after return. Cancel stragglers and keep only
            # futures that completed successfully.
            results = []
            for fut in futs:
                if not fut.done():
                    fut.cancel()
                elif not fut.cancelled() and fut.exception() is None:
                    results.append(fut.result())
        cost_ms = (time.time() - t_begin) * 1000
    num_ok = num_error = 0
    images = []
    for url, status in results:
        if status == 200:
            num_ok += 1
        else:
            num_error += 1
        images.append(dict(url=url, status=status))
    LOG.info(f'detect story images story_id={story_id} '
             f'num_images={len(image_urls)} finished, '
             f'ok={num_ok} error={num_error} cost={cost_ms:.0f}ms')
    await ctx.hope(
        'harbor_rss.update_story_images', dict(
            story_id=story_id,
            story_url=story_url,
            images=images,
        ))
def __init__(self, *args, **kwargs):
    """Build the wrapped async reader and capture the current event loop
    for driving its coroutines synchronously."""
    self._reader = AsyncFeedReader(*args, **kwargs)
    self._loop = asyncio.get_event_loop()
    self._loop_run = self._loop.run_until_complete
async def test_read(url):
    """A direct (no-proxy) read should succeed with the URL unchanged."""
    async with AsyncFeedReader() as reader:
        status, response = await reader.read(url)
    assert status == 200
    assert response.status == 200
    assert str(response.url) == url