Example 1
class SyncAsyncFeedReader:
    """Blocking facade over AsyncFeedReader.

    Each async call is driven to completion on the current event loop,
    so synchronous callers can use the reader without awaiting.
    """

    def __init__(self, *args, **kwargs):
        loop = asyncio.get_event_loop()
        self._loop = loop
        self._loop_run = loop.run_until_complete
        self._reader = AsyncFeedReader(*args, **kwargs)

    @property
    def has_rss_proxy(self):
        # Plain attribute on the wrapped reader; no coroutine involved.
        return self._reader.has_rss_proxy

    def read(self, *args, **kwargs):
        coro = self._reader.read(*args, **kwargs)
        return self._loop_run(coro)

    def check_private_address(self, *args, **kwargs):
        coro = self._reader.check_private_address(*args, **kwargs)
        return self._loop_run(coro)

    def __enter__(self):
        # Drive the async context-manager entry synchronously.
        self._loop_run(self._reader.__aenter__())
        return self

    def __exit__(self, *args):
        return self._loop_run(self._reader.__aexit__(*args))

    def close(self):
        return self._loop_run(self._reader.close())
Example 2
async def do_fetch_story(
        ctx: ActorContext,
        story_id: T.int,
        url: T.url,
        use_proxy: T.bool.default(False),
):
    """Fetch a story page (optionally via the RSS proxy) and forward its
    content to the worker_rss.process_story_webpage actor.
    """
    LOG.info(f'fetch story#{story_id} url={unquote(url)} begin')
    async with AsyncFeedReader(**_get_proxy_options()) as reader:
        # Only honor the proxy flag when a proxy is actually configured.
        use_proxy = use_proxy and reader.has_rss_proxy
        status, response = await reader.read(url, use_proxy=use_proxy)
    if response and response.url:
        # Redirects may change the URL; keep the final one.
        url = str(response.url)
    LOG.info(
        f'fetch story#{story_id} url={unquote(url)} status={status} finished')
    if not response or status != 200:
        return
    if not response.rssant_text:
        msg = 'story#%s url=%s response text is empty!'
        LOG.error(msg, story_id, unquote(url))
        return
    content = response.rssant_text
    size_limit = 1024 * 1024
    if len(content) >= size_limit:
        # First try stripping markup; if the page is still too big,
        # degrade it to plain text before handing it off.
        content = story_html_clean(content)
        if len(content) >= size_limit:
            msg = 'too large story#%s size=%s url=%r'
            LOG.warning(msg, story_id, len(content), url)
            content = story_html_to_text(content)
    payload = dict(
        story_id=story_id,
        url=url,
        text=content,
    )
    await ctx.hope('worker_rss.process_story_webpage', payload)
Example 3
async def do_fetch_story(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
):
    """Fetch a story page and forward its content to the webpage processor.

    Follows the final response URL, skips non-200 or empty responses,
    and shrinks over-large pages before handing them off to
    worker_rss.process_story_webpage.
    """
    LOG.info(f'fetch story#{story_id} url={unquote(url)} begin')
    async with AsyncFeedReader() as reader:
        status, response = await reader.read(url)
    if response and response.url:
        # Redirects may change the URL; keep the final one.
        url = str(response.url)
    LOG.info(f'fetch story#{story_id} url={unquote(url)} status={status} finished')
    if not (response and status == 200):
        return
    if not response.rssant_text:
        LOG.error(f'story#{story_id} url={unquote(url)} response text is empty!')
        return
    content = response.rssant_text
    if len(content) >= 1024 * 1024:
        content = story_html_clean(content)
        if len(content) >= 1024 * 1024:
            # BUG FIX: previously the oversized content was only logged (at
            # error level) and still sent downstream. Degrade it to plain
            # text like the proxy-aware variant of this task does, and log
            # a warning since the condition is handled.
            LOG.warning(f'too large story#{story_id} size={len(content)} url={url}')
            content = story_html_to_text(content)
    await ctx.hope('worker_rss.process_story_webpage', dict(
        story_id=story_id,
        url=url,
        text=content,
    ))
Example 4
async def check_private_address(url):
    """Reject URLs that resolve to private addresses.

    Does nothing when CONFIG.allow_private_address is set; otherwise
    translates PrivateAddressError into ImageProxyError for callers.
    """
    if CONFIG.allow_private_address:
        # Private targets explicitly permitted; nothing to verify.
        return
    async with AsyncFeedReader() as feed_reader:
        try:
            await feed_reader.check_private_address(url)
        except PrivateAddressError:
            raise ImageProxyError('private address not allowed')
Example 5
async def test_async_read_by_proxy(url):
    """Reading through the configured RSS proxy should succeed transparently."""
    proxy_reader = AsyncFeedReader(
        rss_proxy_url=CONFIG.rss_proxy_url,
        rss_proxy_token=CONFIG.rss_proxy_token,
    )
    async with proxy_reader as reader:
        status, response = await reader.read(url, use_proxy=True)
    assert status == 200
    assert response.status == 200
    assert str(response.url) == url
Example 6
async def do_detect_story_images(
    ctx: ActorContext,
    story_id: T.int,
    story_url: T.url,
    image_urls: T.list(T.url).unique,
):
    """Probe each image URL of a story and report per-URL HTTP status.

    The per-URL results are sent to harbor_rss.update_story_images so the
    story's image list can be updated with reachability information.
    """
    LOG.info(
        f'detect story images story_id={story_id} num_images={len(image_urls)} begin'
    )
    async with AsyncFeedReader(allow_non_webpage=True) as reader:

        async def _read(url):
            # Some hosts reject our referer outright; skip the request.
            if is_referer_deny_url(url):
                return url, FeedResponseStatus.REFERER_DENY.value
            status, response = await reader.read(
                url, referer="https://rss.anyant.com/", ignore_content=True)
            return url, status

        futs = []
        for url in image_urls:
            futs.append(asyncio.ensure_future(_read(url)))
        t_begin = time.time()
        try:
            results = await asyncio.gather(*futs)
        except (TimeoutError, concurrent.futures.TimeoutError):
            # BUG FIX: gather() propagates the first timeout but leaves the
            # other tasks running. Previously `fut.result()` on the failed
            # (done) future re-raised the timeout out of this handler, and
            # unfinished tasks outlived the reader's context. Collect only
            # successful results and cancel everything else.
            results = []
            for fut in futs:
                if not fut.done():
                    fut.cancel()
                elif not fut.cancelled() and fut.exception() is None:
                    results.append(fut.result())
        cost_ms = (time.time() - t_begin) * 1000
    num_ok = num_error = 0
    images = []
    for url, status in results:
        if status == 200:
            num_ok += 1
        else:
            num_error += 1
        images.append(dict(url=url, status=status))
    LOG.info(f'detect story images story_id={story_id} '
             f'num_images={len(image_urls)} finished, '
             f'ok={num_ok} error={num_error} cost={cost_ms:.0f}ms')
    await ctx.hope(
        'harbor_rss.update_story_images',
        dict(
            story_id=story_id,
            story_url=story_url,
            images=images,
        ))
Example 7
 def __init__(self, *args, **kwargs):
     """Bind the current event loop and wrap an AsyncFeedReader instance."""
     loop = asyncio.get_event_loop()
     self._loop = loop
     self._loop_run = loop.run_until_complete
     self._reader = AsyncFeedReader(*args, **kwargs)
Example 8
async def test_read(url):
    """A direct (non-proxy) read should return the page with HTTP 200."""
    async with AsyncFeedReader() as feed_reader:
        status, response = await feed_reader.read(url)
    assert status == 200
    assert response.status == 200
    assert str(response.url) == url