def do_dns_service_refresh(ctx: ActorContext):
    DNS_SERVICE.refresh()
    # snapshot resolved records as plain lists so the message is serializable
    records = {}
    for host, ip_set in DNS_SERVICE.records.items():
        records[host] = list(ip_set)
    msg = dict(records=records)
    # broadcast to all remote nodes; let the message expire after 1 hour
    expire_at = time.time() + 60 * 60
    for node in ctx.registery.remote_nodes:
        ctx.tell('actor.dns_service_update', msg,
                 dst_node=node.name, expire_at=expire_at)
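# Illustration only: the message broadcast above has this shape. The hostnames
# and addresses below are made up; real values come from DNS_SERVICE.records.
_EXAMPLE_DNS_UPDATE_MSG = dict(records={
    'example.com': ['93.184.216.34'],
    'cdn.example.org': ['203.0.113.7', '203.0.113.8'],
})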
async def do_fetch_story(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
    use_proxy: T.bool.default(False),
    num_sub_sentences: T.int.optional,
):
    LOG.info(f'fetch story#{story_id} url={unquote(url)} begin')
    options = _get_proxy_options()
    options.update(allow_private_address=CONFIG.allow_private_address)
    # urls already resolved by the DNS service are reachable without proxy
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    async with AsyncFeedReader(**options) as reader:
        use_proxy = use_proxy and reader.has_rss_proxy
        url_content = await _fetch_story(reader, story_id, url, use_proxy=use_proxy)
    if not url_content:
        return
    url, content = url_content
    # clean the html, then truncate as plain text if it is still too large
    if len(content) >= _MAX_STORY_HTML_LENGTH:
        content = story_html_clean(content)
        if len(content) >= _MAX_STORY_HTML_LENGTH:
            msg = 'too large story#%s size=%s url=%r'
            LOG.warning(msg, story_id, len(content), url)
            content = story_html_to_text(content)[:_MAX_STORY_HTML_LENGTH]
    await ctx.hope('worker_rss.process_story_webpage', dict(
        story_id=story_id,
        url=url,
        text=content,
        num_sub_sentences=num_sub_sentences,
    ))
def _create_aiohttp_client_session():
    loop = asyncio.get_event_loop()
    resolver = DNS_SERVICE.aiohttp_resolver(loop=loop)
    request_timeout = 30
    session = aiohttp_client_session(
        resolver=resolver, timeout=request_timeout)
    return session
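# A minimal usage sketch for the session factory above, assuming it is called
# from within a running event loop (aiohttp sessions must be used in async
# context). The function name and URL are placeholders, not project code.
async def _example_fetch(url='https://example.com/feed.xml'):
    session = _create_aiohttp_client_session()
    try:
        async with session.get(url) as resp:
            return await resp.text()
    finally:
        await session.close()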
async def do_fetch_story(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int.min(0),
    url: T.url,
    use_proxy: T.bool.default(False),
    num_sub_sentences: T.int.optional,
) -> SCHEMA_FETCH_STORY_RESULT:
    LOG.info(f'fetch story#{feed_id},{offset} url={unquote(url)} begin')
    options = _proxy_helper.get_proxy_options()
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    # make timeout less than actor default 30s to avoid ask timeout
    options.update(request_timeout=25)
    async with AsyncFeedReader(**options) as reader:
        use_proxy = use_proxy and reader.has_proxy
        url, content, response = await _fetch_story(
            reader, feed_id, offset, url, use_proxy=use_proxy)
    DEFAULT_RESULT = dict(
        feed_id=feed_id, offset=offset, url=url,
        response_status=response.status, use_proxy=response.use_proxy)
    if not content:
        return DEFAULT_RESULT
    # clean the html, then truncate as plain text if it is still too large
    if len(content) >= _MAX_STORY_HTML_LENGTH:
        content = story_html_clean(content)
        if len(content) >= _MAX_STORY_HTML_LENGTH:
            msg = 'too large story#%s,%s size=%s url=%r'
            LOG.warning(msg, feed_id, offset, len(content), url)
            content = story_html_to_text(content)[:_MAX_STORY_HTML_LENGTH]
    # reply with the processing result when asked, otherwise fire-and-forget
    msg_func = ctx.ask if ctx.message.is_ask else ctx.hope
    result = await msg_func('worker_rss.process_story_webpage', dict(
        feed_id=feed_id,
        offset=offset,
        url=url,
        text=content,
        num_sub_sentences=num_sub_sentences,
    ))
    if not ctx.message.is_ask:
        return DEFAULT_RESULT
    result.update(DEFAULT_RESULT)
    return result
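# Caller-side sketch, assuming the actor above is registered under the name
# 'worker_rss.fetch_story' and that ctx.ask awaits the returned
# SCHEMA_FETCH_STORY_RESULT; the ids and url below are placeholders.
async def _example_ask_fetch_story(ctx: ActorContext):
    result = await ctx.ask('worker_rss.fetch_story', dict(
        feed_id=123, offset=0, url='https://example.com/post.html'))
    return result['response_status']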
def do_sync_feed(
    ctx: ActorContext,
    feed_id: T.int,
    url: T.url,
    use_proxy: T.bool.default(False),
    checksum_data: T.bytes.maxlen(4096).optional,
    content_hash_base64: T.str.optional,
    etag: T.str.optional,
    last_modified: T.str.optional,
    is_refresh: T.bool.default(False),
):
    params = {}
    # send conditional request headers unless a forced refresh is requested
    if not is_refresh:
        params = dict(etag=etag, last_modified=last_modified)
    options = _proxy_helper.get_proxy_options()
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    # the prob of switch from use proxy to not use proxy
    switch_prob = 0.25
    with FeedReader(**options) as reader:
        use_proxy = reader.has_proxy and use_proxy
        if use_proxy and random.random() < switch_prob:
            use_proxy = False
        response = reader.read(url, **params, use_proxy=use_proxy)
        LOG.info(
            f'read feed#{feed_id} url={unquote(url)} status={response.status}')
        need_proxy = FeedResponseStatus.is_need_proxy(response.status)
        if (not use_proxy) and reader.has_proxy and need_proxy:
            # direct request failed in a proxy-fixable way, retry via proxy
            LOG.info(f'try use proxy read feed#{feed_id} url={unquote(url)}')
            proxy_response = reader.read(url, **params, use_proxy=True)
            LOG.info(
                f'proxy read feed#{feed_id} url={unquote(url)} status={proxy_response.status}'
            )
            if proxy_response.ok:
                response = proxy_response
    if (not response.ok) or (not response.content):
        status = FeedStatus.READY if response.status == 304 else FeedStatus.ERROR
        _update_feed_info(ctx, feed_id, status=status, response=response)
        return
    new_hash = compute_hash_base64(response.content)
    if (not is_refresh) and (new_hash == content_hash_base64):
        LOG.info(
            f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!'
        )
        _update_feed_info(ctx, feed_id, response=response)
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url), ex)
        _update_feed_info(ctx, feed_id, status=FeedStatus.ERROR,
                          response=response, warnings=str(ex))
        return
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url), warnings)
    try:
        feed = _parse_found(
            (response, raw_result), checksum_data=checksum_data, is_refresh=is_refresh)
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s', feed_id, unquote(url), ex, exc_info=ex)
        _update_feed_info(ctx, feed_id, status=FeedStatus.ERROR,
                          response=response, warnings=str(ex))
        return
    ctx.tell('harbor_rss.update_feed',
             dict(feed_id=feed_id, feed=feed, is_refresh=is_refresh))
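# compute_hash_base64 is defined elsewhere in the project. A minimal sketch of
# a compatible helper, assuming it hashes the raw bytes and base64-encodes the
# digest; the actual hash algorithm used by the project may differ.
import base64
import hashlib

def _example_compute_hash_base64(content: bytes) -> str:
    digest = hashlib.sha1(content).digest()
    return base64.standard_b64encode(digest).decode()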
def do_dns_service_update(ctx, records: T.dict.key(T.str).value(T.list(T.str))):
    LOG.info('dns_service_update %r', records)
    DNS_SERVICE.update(records)
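# Sketch of what such an update could look like, assuming the service keeps a
# {host: set_of_ips} mapping; this is illustrative, not the project's actual
# DNS_SERVICE implementation.
def _example_merge_records(records_map: dict, new_records: dict):
    for host, ips in new_records.items():
        records_map.setdefault(host, set()).update(ips)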
def test_dns_service_refresh():
    DNS_SERVICE.refresh()
async def _async_test_dns_service_aiohttp(url):
    resolver = DNS_SERVICE.aiohttp_resolver()
    async with aiohttp_client_session(resolver=resolver) as session:
        async with session.get(url) as resp:
            assert resp.status == 200
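# Sketch of driving the async check above from a synchronous test, with a
# placeholder URL; asyncio.run requires Python 3.7+.
def test_dns_service_aiohttp():
    asyncio.run(_async_test_dns_service_aiohttp('https://example.com'))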