import itertools
from asyncio import PriorityQueue
from collections import defaultdict, namedtuple
from typing import Iterable, Union

from yarl import URL  # assumed: url objects are provided by yarl

# ``Counter`` is not shown in the original source; the definition below is a
# reconstruction based on the fields used in this class (all defaulting to 0).
Counter = namedtuple(
    'Counter',
    'count put discarded done failed todo working keys sum min max',
    defaults=(0,) * 11,
)


class UrlsManager:
    """Tracks the processing state of each url internally and stores the final results."""

    class _counter:
        """Custom class for internal counting; it maintains a ``Counter`` namedtuple."""

        def __new__(cls, fields):
            cls.fields = fields._field_defaults  # the namedtuple's default values
            return super().__new__(cls)

        def __init__(self, fields: Counter):
            # _dict holds the actual counts
            self._dict = {name: 0 for name in fields._fields}

        def __setattr__(self, name, value):
            """Invoked on every attribute assignment."""
            # block direct re-assignment of _dict once it exists
            if (name == '_dict' and hasattr(self, '_dict')
                    and isinstance(getattr(self, '_dict'), dict)):
                raise ValueError(f' Forbidden to modify attribute:[{name}]')
            if name == '_dict':
                # Apart from updating counts, this blocks any other assignment or new
                # attribute, mimicking the exception a namedtuple would raise.
                super().__setattr__(name, value)
            elif name in self._dict:
                self._dict[name] = value
            else:
                raise ValueError(f' Got unexpected field names:[{name}]')

        def __getattribute__(self, name):
            """
            ``__getattribute__`` is invoked on every attribute lookup (including special
            attributes), so the lookups below must go through ``super()`` to avoid
            infinite recursion. ``__getattr__``, by contrast, is only called when the
            name cannot be found on the instance or its class.
            """
            # Mind special attributes and class-level attributes in real use.
            if name in super().__getattribute__('_dict'):
                return super().__getattribute__('_dict')[name]
            else:
                return super().__getattribute__(name)

        def __delattr__(self, name):
            """Intercepts every attribute deletion."""
            raise ValueError(f' Forbidden to delete attribute:[{name}]')

        def add(self, n: Counter):
            """Accumulate the given values into the counter."""
            for key in n._fields:
                v = getattr(n, key)
                if v:
                    self._dict[key] += v

        def update(self, n: Counter):
            """Overwrite counter fields with the non-zero values from ``n``."""
            for key in n._fields:
                v = getattr(n, key)
                if v:
                    self._dict[key] = v

        @property
        def values(self):
            return Counter(**self._dict)

    def __init__(self, max_tried_times: int = 3):
        """
        Parameter max_tried_times: maximum number of attempts before a task is
        considered failed; must be an integer greater than 0.
        """
        self.max_tried_times = max_tried_times
        # The queue below must be created only after the main event loop is
        # established. Its constructor is Queue(maxsize=0, *, loop=None); when loop
        # is not given, the loop obtained internally may differ from the main one,
        # in which case join() raises a RuntimeError.
        self.__queue_todo = None  # holds pending tasks
        self.__total_key_urls = defaultdict(int)  # de-duplicates urls; key: url, value: times processed
        self.__done_urls = set()  # completed urls
        self.__failed_urls = set()  # failed urls
        self.__working_urls = set()  # urls currently being processed
        self.__discarded_urls = set()  # discarded urls
        self.__counter = self._counter(Counter)  # internal processing counter

    async def prepare(self) -> bool:
        """Initialization or resource preparation whose timing needs to be controlled."""
        self.__queue_todo = PriorityQueue()
        return True

    async def put(self, url: Union[str, URL]) -> Counter:
        """
        Send one url to the UrlsManager.
        Parameter url: a string (or URL).
        Returns: a Counter describing how this url was handled.
        Internally the url is handled in one of two ways: put into the task queue, or discarded.
        """
        url = URL(url)
        put = discarded = 0  # values passed to the _counter
        times = self.__total_key_urls[url]
        _todo = {url: rank for rank, url in self.__queue_todo._queue}  # convert the priority-queue payload into a dict
        # Discard the url if it has reached the retry limit or is already being tracked.
        if (times >= self.max_tried_times or url in itertools.chain(
                _todo, self.__working_urls, self.__done_urls)):
            self.__discarded_urls.add(url)
            discarded = 1
        else:
            self.__queue_todo.put_nowait((times, url))
            put = 1
        c = Counter(count=1, put=put, discarded=discarded)
        self.__counter.add(c)  # update the counter
        return c

    async def put_urls(self, urls: Iterable) -> Counter:
        """
        Send several urls to the UrlsManager.
        Parameter urls: a list, tuple or set.
        Returns: a Counter summarising how the urls were handled.
        """
        c = self._counter(Counter)
        for url in urls:
            c.add(await self.put(url))
        return c

    async def task_done(self, url: Union[str, URL], is_OK=True) -> Counter:
        """
        Notify the UrlsManager that processing of this url has finished.
        Parameter is_OK: whether processing completed successfully.
        Internally the url is marked as done or failed and removed from the working pool.
        """
        self.__working_urls.remove(url)
        self.__total_key_urls[url] = self.__total_key_urls[url] + 1
        if is_OK:
            self.__done_urls.add(url)
            c = Counter(done=1)
            self.__counter.add(c)
            self.__queue_todo.task_done()
            return c
        else:
            times = self.__total_key_urls[url]
            if times >= self.max_tried_times:
                self.__failed_urls.add(url)
                c = Counter(failed=1)
                self.__counter.add(c)
            else:
                c = await self.put(url)
            self.__queue_todo.task_done()
            return c

    async def get(self) -> str:
        """Get one url from the UrlsManager."""
        urlItem = await self.__queue_todo.get()  # the priority queue stores tuples
        self.__working_urls.add(urlItem[1])
        return urlItem[1]

    async def get_urls(self, qty: int) -> tuple:
        """Get several urls."""  # not implemented

    async def join(self) -> bool:
        """Block the caller until every url in the UrlsManager has been taken out and processed."""
        await self.__queue_todo.join()
        return True

    async def get_todo(self):
        """Return the pending urls as a tuple."""
        _todo = self.__queue_todo._queue
        _todo = (url for v, url in _todo)
        return tuple(_todo)

    async def get_results(self):
        # count put discarded done failed todo working
        _todo = self.__queue_todo._queue
        _todo = (url for v, url in _todo)
        _todo_urls = set(_todo)
        results = namedtuple(
            'results',
            'key_urls discarded_urls done_urls failed_urls todo_urls working_urls'
        )
        results = results(self.__total_key_urls, self.__discarded_urls,
                          self.__done_urls, self.__failed_urls, _todo_urls,
                          self.__working_urls)
        return results

    async def get_count(self) -> Counter:
        todo = len(self.__queue_todo._queue)
        working = len(self.__working_urls)
        ikeys = len(self.__total_key_urls)
        isum = sum(self.__total_key_urls.values())
        imin = min(self.__total_key_urls.values())
        imax = max(self.__total_key_urls.values())
        c = Counter(todo=todo,
                    working=working,
                    keys=ikeys,
                    sum=isum,
                    min=imin,
                    max=imax)
        self.__counter.update(c)
        return self.__counter.values
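# --- Usage sketch (not part of the original module) ---
# A minimal driver showing the intended put/get/task_done cycle of
# ``UrlsManager``. ``fetch`` is a hypothetical stand-in for real download
# logic; everything else uses only the public methods defined above.
async def _example_crawl(urls):
    async def fetch(url) -> bool:  # hypothetical worker; always "succeeds"
        return True

    manager = UrlsManager(max_tried_times=3)
    await manager.prepare()  # the queue must be created inside the running loop
    await manager.put_urls(urls)

    while await manager.get_todo():  # keep going while urls are pending
        url = await manager.get()
        ok = await fetch(url)
        await manager.task_done(url, is_OK=ok)  # failures are re-queued up to max_tried_times

    return await manager.get_results()


# To run:  import asyncio; asyncio.run(_example_crawl(['https://example.com/a']))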
class BrokerHandler(BrokerHandlerSetup):
    """Broker Handler class."""

    __slots__ = "_handlers", "_records", "_retry", "_queue", "_consumers", "_consumer_concurrency"

    def __init__(
        self,
        records: int,
        handlers: dict[str, Optional[Callable]],
        retry: int,
        publisher: BrokerPublisher,
        consumer_concurrency: int = 15,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self._handlers = handlers
        self._records = records
        self._retry = retry

        self._queue = PriorityQueue(maxsize=self._records)
        self._consumers: list[Task] = list()
        self._consumer_concurrency = consumer_concurrency

        self._publisher = publisher

    @classmethod
    def _from_config(cls, config: MinosConfig, **kwargs) -> BrokerHandler:
        kwargs["handlers"] = cls._get_handlers(config, **kwargs)
        kwargs["publisher"] = cls._get_publisher(**kwargs)
        # noinspection PyProtectedMember
        return cls(**config.broker.queue._asdict(), **kwargs)

    @staticmethod
    def _get_handlers(
        config: MinosConfig, handlers: dict[str, Optional[Callable]] = None, **kwargs
    ) -> dict[str, Callable[[BrokerRequest], Awaitable[Optional[BrokerResponse]]]]:
        if handlers is None:
            builder = EnrouteBuilder(*config.services, middleware=config.middleware)
            decorators = builder.get_broker_command_query_event(config=config, **kwargs)
            handlers = {decorator.topic: fn for decorator, fn in decorators.items()}
        return handlers

    # noinspection PyUnusedLocal
    @staticmethod
    @inject
    def _get_publisher(
        publisher: Optional[BrokerPublisher] = None,
        broker_publisher: BrokerPublisher = Provide["broker_publisher"],
        **kwargs,
    ) -> BrokerPublisher:
        if publisher is None:
            publisher = broker_publisher
        if publisher is None or isinstance(publisher, Provide):
            raise NotProvidedException(f"A {BrokerPublisher!r} object must be provided.")
        return publisher

    async def _setup(self) -> None:
        await super()._setup()
        await self._create_consumers()

    async def _destroy(self) -> None:
        await self._destroy_consumers()
        await super()._destroy()

    async def _create_consumers(self):
        while len(self._consumers) < self._consumer_concurrency:
            self._consumers.append(create_task(self._consume()))

    async def _destroy_consumers(self):
        for consumer in self._consumers:
            consumer.cancel()
        await gather(*self._consumers, return_exceptions=True)
        self._consumers = list()

        while not self._queue.empty():
            entry = self._queue.get_nowait()
            await self.submit_query(self._queries["update_not_processed"], (entry.id,))

    async def _consume(self) -> None:
        while True:
            await self._consume_one()

    async def _consume_one(self) -> None:
        entry = await self._queue.get()
        try:
            await self._dispatch_one(entry)
        finally:
            self._queue.task_done()

    @property
    def publisher(self) -> BrokerPublisher:
        """Get the publisher instance.

        :return: A ``BrokerPublisher`` instance.
        """
        return self._publisher

    @property
    def consumers(self) -> list[Task]:
        """Get the consumers.

        :return: A list of ``Task`` instances.
        """
        return self._consumers

    @property
    def handlers(self) -> dict[str, Optional[Callable]]:
        """Handlers getter.

        :return: A dictionary in which the keys are topics and the values are the handler.
        """
        return self._handlers

    @property
    def topics(self) -> KeysView[str]:
        """Get an iterable containing the topic names.

        :return: An ``Iterable`` of ``str``.
        """
        return self.handlers.keys()

    async def dispatch_forever(self, max_wait: Optional[float] = 60.0) -> NoReturn:
        """Dispatch the items in the consuming queue forever.

        :param max_wait: Maximum seconds to wait for notifications. If ``None`` the wait is performed until infinity.
        :return: This method does not return anything.
""" async with self.cursor() as cursor: await self._listen_entries(cursor) try: while True: await self._wait_for_entries(cursor, max_wait) await self.dispatch(cursor, background_mode=True) finally: await self._unlisten_entries(cursor) async def _listen_entries(self, cursor: Cursor): for topic in self.topics: # noinspection PyTypeChecker await cursor.execute(_LISTEN_QUERY.format(Identifier(topic))) async def _unlisten_entries(self, cursor: Cursor) -> None: for topic in self.topics: # noinspection PyTypeChecker await cursor.execute(_UNLISTEN_QUERY.format(Identifier(topic))) async def _wait_for_entries(self, cursor: Cursor, max_wait: Optional[float]) -> None: if await self._get_count(cursor): return while True: try: return await wait_for(consume_queue(cursor.connection.notifies, self._records), max_wait) except TimeoutError: if await self._get_count(cursor): return async def _get_count(self, cursor) -> int: if not len(self.topics): return 0 await cursor.execute(_COUNT_NOT_PROCESSED_QUERY, (self._retry, tuple(self.topics))) count = (await cursor.fetchone())[0] return count async def dispatch(self, cursor: Optional[Cursor] = None, background_mode: bool = False) -> None: """Dispatch a batch of ``HandlerEntry`` instances from the database's queue. :param cursor: The cursor to interact with the database. If ``None`` is provided a new one is acquired. :param background_mode: If ``True`` the entries dispatching waits until every entry is processed. Otherwise, the dispatching is performed on background. :return: This method does not return anything. """ is_external_cursor = cursor is not None if not is_external_cursor: cursor = await self.cursor().__aenter__() async with cursor.begin(): await cursor.execute( self._queries["select_not_processed"], (self._retry, tuple(self.topics), self._records) ) result = await cursor.fetchall() if len(result): entries = self._build_entries(result) await cursor.execute(self._queries["mark_processing"], (tuple(e.id for e in entries),)) for entry in entries: await self._queue.put(entry) if not is_external_cursor: await cursor.__aexit__(None, None, None) if not background_mode: await self._queue.join() def _build_entries(self, rows: list[tuple]) -> list[BrokerHandlerEntry]: kwargs = {"callback_lookup": self.get_action} return [BrokerHandlerEntry(*row, **kwargs) for row in rows] async def _dispatch_one(self, entry: BrokerHandlerEntry) -> None: logger.debug(f"Dispatching '{entry!r}'...") try: await self.dispatch_one(entry) except (CancelledError, Exception) as exc: logger.warning(f"Raised an exception while dispatching {entry!r}: {exc!r}") entry.exception = exc if isinstance(exc, CancelledError): raise exc finally: query_id = "delete_processed" if entry.success else "update_not_processed" await self.submit_query(self._queries[query_id], (entry.id,)) async def dispatch_one(self, entry: BrokerHandlerEntry) -> None: """Dispatch one row. :param entry: Entry to be dispatched. :return: This method does not return anything. 
""" logger.info(f"Dispatching '{entry!s}'...") fn = self.get_callback(entry.callback) message = entry.data data, status, headers = await fn(message) if message.reply_topic is not None: await self.publisher.send( data, topic=message.reply_topic, identifier=message.identifier, status=status, user=message.user, headers=headers, ) @staticmethod def get_callback( fn: Callable[[BrokerRequest], Union[Optional[BrokerRequest], Awaitable[Optional[BrokerRequest]]]] ) -> Callable[[BrokerMessage], Awaitable[tuple[Any, BrokerMessageStatus, dict[str, str]]]]: """Get the handler function to be used by the Broker Handler. :param fn: The action function. :return: A wrapper function around the given one that is compatible with the Broker Handler API. """ @wraps(fn) async def _wrapper(raw: BrokerMessage) -> tuple[Any, BrokerMessageStatus, dict[str, str]]: request = BrokerRequest(raw) user_token = REQUEST_USER_CONTEXT_VAR.set(request.user) headers_token = REQUEST_HEADERS_CONTEXT_VAR.set(raw.headers) try: response = fn(request) if isawaitable(response): response = await response if isinstance(response, Response): response = await response.content() return response, BrokerMessageStatus.SUCCESS, REQUEST_HEADERS_CONTEXT_VAR.get() except ResponseException as exc: logger.warning(f"Raised an application exception: {exc!s}") return repr(exc), BrokerMessageStatus.ERROR, REQUEST_HEADERS_CONTEXT_VAR.get() except Exception as exc: logger.exception(f"Raised a system exception: {exc!r}") return repr(exc), BrokerMessageStatus.SYSTEM_ERROR, REQUEST_HEADERS_CONTEXT_VAR.get() finally: REQUEST_USER_CONTEXT_VAR.reset(user_token) REQUEST_HEADERS_CONTEXT_VAR.reset(headers_token) return _wrapper def get_action(self, topic: str) -> Optional[Callable]: """Get handling function to be called. Gets the instance of the class and method to call. Args: topic: Kafka topic. Example: "TicketAdded" Raises: MinosNetworkException: topic TicketAdded have no controller/action configured, please review th configuration file. """ if topic not in self._handlers: raise MinosActionNotFoundException( f"topic {topic} have no controller/action configured, " f"please review th configuration file" ) handler = self._handlers[topic] logger.debug(f"Loaded {handler!r} action!") return handler @cached_property def _queries(self) -> dict[str, str]: # noinspection PyTypeChecker return { "count_not_processed": _COUNT_NOT_PROCESSED_QUERY, "select_not_processed": _SELECT_NOT_PROCESSED_QUERY, "mark_processing": _MARK_PROCESSING_QUERY, "delete_processed": _DELETE_PROCESSED_QUERY, "update_not_processed": _UPDATE_NOT_PROCESSED_QUERY, }
import asyncio
import random
import time
from asyncio import Lock, PriorityQueue, Queue
from collections import deque

import aiohttp

# PROVIDERS, IP_HOSTS, JUDGES and get_all_ip are assumed to come from elsewhere
# in the project (proxy provider definitions and an ip-extraction helper).


class AsyncProxyBroker:
    def __init__(self,
                 check_url,
                 allowed_anonymity_levels=None,
                 qps_per_proxy=1,
                 max_consecutive_failures=5,
                 providers=PROVIDERS,
                 timeout=5):
        self._proxies = Queue()
        self._pending_providers = Queue()
        self._providers = providers

        self._verified_proxies = {}
        self._throttled_proxies = PriorityQueue()
        self._errors = {}

        self._check_url = check_url
        self._qps_per_proxy = qps_per_proxy
        self._max_consecutive_failures = max_consecutive_failures
        self._timeout = timeout

        self._ip = None
        self._ip_lock = Lock()

        if not allowed_anonymity_levels:
            self._allowed_anonymity_levels = ['Anonymous', 'Elite']
        else:
            self._allowed_anonymity_levels = allowed_anonymity_levels

    async def _get_real_ip(self):
        while not self._ip:
            async with self._ip_lock:
                if self._ip:
                    return self._ip

                try:
                    async with aiohttp.request(
                            url=random.choice(IP_HOSTS),
                            method='GET',
                            timeout=aiohttp.ClientTimeout(
                                total=self._timeout)) as response:
                        contents = await response.text()
                        ips = get_all_ip(contents)

                        if len(ips) == 1:
                            self._ip = ips.pop()
                            return self._ip
                except (UnicodeDecodeError, asyncio.TimeoutError,
                        aiohttp.ClientOSError, aiohttp.ClientResponseError,
                        aiohttp.ServerDisconnectedError):
                    pass

        return self._ip

    async def _get_anonymity_level(self, proxy_address):
        judge = random.choice(JUDGES)
        ip = await self._get_real_ip()

        try:
            async with aiohttp.request(url=judge,
                                       method='GET',
                                       proxy=proxy_address,
                                       timeout=aiohttp.ClientTimeout(
                                           total=self._timeout)) as response:
                contents = (await response.text()).lower()
                contained_ips = get_all_ip(contents)

                if ip in contained_ips:
                    return 'Transparent'
                elif 'via' in contents or 'proxy' in contents:
                    return 'Anonymous'
                else:
                    return 'Elite'
        except (UnicodeDecodeError, asyncio.TimeoutError,
                aiohttp.ClientOSError, aiohttp.ClientResponseError,
                aiohttp.ServerDisconnectedError):
            return 'None'

    def _populate_providers(self):
        for provider in self._providers:
            self._pending_providers.put_nowait(provider)

    async def _can_connect_to_test_url(self, proxy_address):
        try:
            async with aiohttp.request(url=self._check_url,
                                       method='GET',
                                       proxy=proxy_address,
                                       timeout=aiohttp.ClientTimeout(
                                           total=self._timeout)) as response:
                await response.text()
                return True
        except (UnicodeDecodeError, asyncio.TimeoutError,
                aiohttp.ClientOSError, aiohttp.ClientResponseError,
                aiohttp.ServerDisconnectedError):
            return False

    async def _populate_proxies(self):
        if self._pending_providers.empty():
            self._populate_providers()

        provider = self._pending_providers.get_nowait()
        proxies = await provider.get_proxies()

        for proxy in proxies:
            self._proxies.put_nowait(proxy)

        self._pending_providers.task_done()

    async def _try_verify_one_proxy(self):
        if self._proxies.empty():
            await self._populate_proxies()
            return

        (host, port, types) = self._proxies.get_nowait()
        proxy_address = 'http://%s:%s' % (host, port)

        if await self._get_anonymity_level(proxy_address) in self._allowed_anonymity_levels and \
                await self._can_connect_to_test_url(proxy_address):
            self._verified_proxies[proxy_address] = deque()
            self._errors[proxy_address] = 0

        self._proxies.task_done()

    @staticmethod
    def _flush_history(history):
        executions_removed = 0
        earliest_time = time.monotonic()

        while len(history) > 0:
            earliest_time = history.popleft()

            if time.monotonic() - earliest_time < 1:
                history.appendleft(earliest_time)
                break

            executions_removed += 1

        return executions_removed, earliest_time

    def _flush_throttled_proxies(self):
        while not self._throttled_proxies.empty():
            (_, proxy_url, history) = self._throttled_proxies.get_nowait()
            executions_removed, earliest_time = self._flush_history(history)
            if executions_removed == 0:
                self._throttled_proxies.put_nowait(
                    (earliest_time, proxy_url, history))
                self._throttled_proxies.task_done()
                return

            self._verified_proxies[proxy_url] = history
            self._throttled_proxies.task_done()

    def mark_successful(self, proxy_url):
        if proxy_url not in self._errors:
            return

        self._errors[proxy_url] = max(0, self._errors[proxy_url] - 1)

    def mark_failure(self, proxy_url):
        if proxy_url not in self._errors:
            return

        self._errors[proxy_url] += 1

    async def random_proxy(self):
        while True:
            self._flush_throttled_proxies()

            if not self._verified_proxies:
                await self._try_verify_one_proxy()

            while self._verified_proxies:
                proxy_url = random.choice(list(self._verified_proxies.keys()))

                if self._errors[proxy_url] >= self._max_consecutive_failures:
                    del self._errors[proxy_url]
                    del self._verified_proxies[proxy_url]
                    continue

                history = self._verified_proxies[proxy_url]
                _, earliest_time = self._flush_history(history)

                if len(history) < self._qps_per_proxy:
                    history.append(time.monotonic())
                    return proxy_url

                del self._verified_proxies[proxy_url]
                self._throttled_proxies.put_nowait(
                    (earliest_time, proxy_url, history))
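# --- Usage sketch (not part of the original module) ---
# Minimal driver showing how the broker is meant to be used: ask for a
# verified, rate-limited proxy, make a request through it, and report back so
# the consecutive-failure counter stays accurate. The target URL is illustrative.
async def _example_fetch_via_proxy():
    broker = AsyncProxyBroker(check_url='http://example.com/')
    proxy_url = await broker.random_proxy()  # loops until a verified proxy is available

    try:
        async with aiohttp.request(url='http://example.com/',
                                   method='GET',
                                   proxy=proxy_url,
                                   timeout=aiohttp.ClientTimeout(total=5)) as response:
            body = await response.text()
        broker.mark_successful(proxy_url)  # decrements the consecutive-failure count
        return body
    except aiohttp.ClientError:
        broker.mark_failure(proxy_url)  # too many failures evicts the proxy
        raise


# asyncio.run(_example_fetch_via_proxy())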