Example #1
# Imports inferred from usage; URL is assumed to come from yarl, and Counter
# is assumed to be a namedtuple (with default values) defined elsewhere.
import itertools
from asyncio import PriorityQueue
from collections import defaultdict, namedtuple
from typing import Iterable, Union

from yarl import URL


class UrlsManager:
    """
    Tracks the run state of every url internally
    and stores the final results.
    """
    class _counter:
        """
        Internal counting helper;
        maintains a namedtuple [Counter].
        """
        def __new__(cls, fields):
            # namedtuple's defaults mapping (named _field_defaults since 3.8)
            cls.fields = fields._field_defaults
            return super().__new__(cls)

        def __init__(self, fields: Counter):
            # _dict holds the actual counts
            f = fields._fields
            self._dict = dict.fromkeys(f, 0)

        def __setattr__(self, name, value):
            """Called on every attribute assignment."""
            # Block direct reassignment of _dict once it exists
            if (name == '_dict' and hasattr(self, '_dict')
                    and isinstance(getattr(self, '_dict'), dict)):
                raise ValueError(f' Forbidden to modify attribute:[{name}]')
            # Anything other than updating a counter field is rejected,
            # mimicking the exception raised by namedtuple
            if name == '_dict':
                super().__setattr__(name, value)
            elif name in self._dict:
                self._dict[name] = value
            else:
                raise ValueError(f' Got unexpected field names:[{name}]')

        def __getattribute__(self, name):
            """
            __getattribute__ is invoked for every attribute lookup (including
            special attributes), so the lookups below must go through super(),
            otherwise they would recurse infinitely.
            __getattr__, by contrast, is only called when normal lookup on the
            instance and its class fails.
            """
            # Beware of special attributes and the class's own attributes
            # in real-world use.
            if name in super().__getattribute__('_dict'):
                return super().__getattribute__('_dict')[name]
            else:
                return super().__getattribute__(name)

        def __delattr__(self, name):
            """Intercepts every attribute deletion."""
            raise ValueError(f' Forbidden to delete attribute:[{name}]')

        def add(self, n: Counter):
            """
            Add the given values to the counters cumulatively.
            """
            for key in n._fields:
                v = getattr(n, key)
                if v:
                    self._dict[key] += v

        def update(self, n: Counter):
            """Overwrite counters with the non-zero values given."""
            for key in n._fields:
                v = getattr(n, key)
                if v:
                    self._dict[key] = v

        @property
        def values(self):
            return Counter(**self._dict)

    def __init__(self, max_tried_times: int = 3):
        """
        Args: max_tried_times is the maximum number of attempts before a url
        is marked as failed; must be an integer greater than 0.
        """
        self.max_tried_times = max_tried_times
        # The Queue below must be created after the main event loop is
        # established: its constructor is Queue(maxsize=0, *, loop=None), and
        # when loop is omitted the internally acquired event loop may differ
        # from the main one, in which case join() raises a RuntimeError.
        self.__queue_todo = None  # holds the pending tasks
        self.__total_key_urls = defaultdict(
            int)  # de-duplicates urls; key is the url, value is its processing count
        self.__done_urls = set()  # completed urls
        self.__failed_urls = set()  # permanently failed urls
        self.__working_urls = set()  # urls currently being processed
        self.__discarded_urls = set()  # discarded urls
        self.__counter = self._counter(Counter)  # internal processing counters

    async def prepare(self) -> bool:
        """Initialisation or resource acquisition whose timing must be controlled."""
        self.__queue_todo = PriorityQueue()
        return True

    async def put(self, url: Union[str, URL]) -> Counter:
        """
        Send one url to the UrlsManager.
        Args: url as a string or URL.
        Returns: a Counter indicating how this url was handled.
        Internally a url is handled in one of two ways: put into the task
        queue, or discarded.
        """
        url = URL(url)
        put = discarded = 0  # passed on to the _counter
        times = self.__total_key_urls[url]
        _todo = {url: rank
                 for rank, url in self.__queue_todo._queue}  # snapshot the priority queue as a dict
        # Discard the url if the retry limit is reached or it is already known
        if (times >= self.max_tried_times or url in itertools.chain(
                _todo, self.__working_urls, self.__done_urls)):
            self.__discarded_urls.add(url)
            discarded = 1
        else:
            self.__queue_todo.put_nowait((times, url))
            put = 1
        c = Counter(count=1, put=put, discarded=discarded)
        self.__counter.add(c)  # update the counters
        return c

    async def put_urls(self, urls: Iterable) -> Counter:
        """
        Send multiple urls to the UrlsManager.
        Args: urls as a list, tuple or set.
        Returns: a Counter summarising how the urls were handled.
        """
        c = self._counter(Counter)
        for url in urls:
            c.add(await self.put(url))
        return c

    async def task_done(self, url: Union[str, URL], is_OK=True) -> Counter:
        """
        Notify the UrlsManager that processing of this url has finished.
        Args: is_OK indicates whether processing completed successfully.
        Internally the url is marked as done or failed and removed from the
        working pool.
        """
        self.__working_urls.remove(url)
        self.__total_key_urls[url] = self.__total_key_urls[url] + 1
        if is_OK:
            self.__done_urls.add(url)
            c = Counter(done=1)
            self.__counter.add(c)
            self.__queue_todo.task_done()
            return c
        else:
            times = self.__total_key_urls[url]
            if times >= self.max_tried_times:
                self.__failed_urls.add(url)
                c = Counter(failed=1)
                self.__counter.add(c)
            else:
                c = await self.put(url)
            self.__queue_todo.task_done()
            return c

    async def get(self) -> str:
        """Fetch one url from the UrlsManager."""
        urlItem = await self.__queue_todo.get()  # the priority queue stores tuples
        self.__working_urls.add(urlItem[1])
        return urlItem[1]

    async def get_urls(self, qty: int) -> tuple:
        """Fetch multiple urls."""

    async def join(self) -> bool:
        """Block the caller until every url in the UrlsManager has been fetched and processed."""
        await self.__queue_todo.join()
        return True

    async def get_todo(self):
        """Return the pending urls as a tuple."""
        _todo = self.__queue_todo._queue
        _todo = (url for v, url in _todo)
        return tuple(_todo)

    async def get_results(self):
        #count put discarded done failed todo working
        _todo = self.__queue_todo._queue
        _todo = (url for v, url in _todo)
        _todo_urls = set(_todo)
        results = namedtuple(
            'results',
            'key_urls discarded_urls done_urls failed_urls todo_urls working_urls'
        )
        results = results(self.__total_key_urls, self.__discarded_urls,
                          self.__done_urls, self.__failed_urls, _todo_urls,
                          self.__working_urls)
        return results

    async def get_count(self) -> Counter:
        todo = len(self.__queue_todo._queue)
        working = len(self.__working_urls)
        ikeys = len(self.__total_key_urls)
        isum = sum(self.__total_key_urls.values())
        # default=0 guards min()/max() against an empty mapping
        imin = min(self.__total_key_urls.values(), default=0)
        imax = max(self.__total_key_urls.values(), default=0)
        c = Counter(todo=todo,
                    working=working,
                    keys=ikeys,
                    sum=isum,
                    min=imin,
                    max=imax)
        self.__counter.update(c)
        return self.__counter.values
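
A minimal, hypothetical driver for the class above. The fetch/worker names are illustrative, and Counter is assumed to be defined elsewhere as a namedtuple with zero defaults for all of its fields:

# Assumed definition, for reference only:
# Counter = namedtuple('Counter',
#                      'count put discarded done failed todo working keys sum min max',
#                      defaults=[0] * 11)
import asyncio

async def fetch(url) -> bool:
    """Placeholder for the real download logic."""
    return True

async def worker(manager: UrlsManager) -> None:
    while True:
        url = await manager.get()
        await manager.task_done(url, is_OK=await fetch(url))

async def main() -> None:
    manager = UrlsManager(max_tried_times=3)
    await manager.prepare()  # bind the internal queue to the running loop
    await manager.put_urls(['http://a.example', 'http://b.example'])
    workers = [asyncio.create_task(worker(manager)) for _ in range(2)]
    await manager.join()     # block until every url has been processed
    for w in workers:
        w.cancel()
    print(await manager.get_count())

asyncio.run(main())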
Example #2
# Imports omitted: this excerpt depends on asyncio and the minos framework;
# names such as BrokerPublisher, EnrouteBuilder, Cursor and the *_QUERY
# constants come from the surrounding project.
class BrokerHandler(BrokerHandlerSetup):
    """Broker Handler class."""

    __slots__ = "_handlers", "_records", "_retry", "_queue", "_consumers", "_consumer_concurrency"

    def __init__(
        self,
        records: int,
        handlers: dict[str, Optional[Callable]],
        retry: int,
        publisher: BrokerPublisher,
        consumer_concurrency: int = 15,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self._handlers = handlers
        self._records = records
        self._retry = retry

        self._queue = PriorityQueue(maxsize=self._records)
        self._consumers: list[Task] = list()
        self._consumer_concurrency = consumer_concurrency

        self._publisher = publisher

    @classmethod
    def _from_config(cls, config: MinosConfig, **kwargs) -> BrokerHandler:
        kwargs["handlers"] = cls._get_handlers(config, **kwargs)
        kwargs["publisher"] = cls._get_publisher(**kwargs)
        # noinspection PyProtectedMember
        return cls(**config.broker.queue._asdict(), **kwargs)

    @staticmethod
    def _get_handlers(
        config: MinosConfig, handlers: dict[str, Optional[Callable]] = None, **kwargs
    ) -> dict[str, Callable[[BrokerRequest], Awaitable[Optional[BrokerResponse]]]]:
        if handlers is None:
            builder = EnrouteBuilder(*config.services, middleware=config.middleware)
            decorators = builder.get_broker_command_query_event(config=config, **kwargs)
            handlers = {decorator.topic: fn for decorator, fn in decorators.items()}
        return handlers

    # noinspection PyUnusedLocal
    @staticmethod
    @inject
    def _get_publisher(
        publisher: Optional[BrokerPublisher] = None,
        broker_publisher: BrokerPublisher = Provide["broker_publisher"],
        **kwargs,
    ) -> BrokerPublisher:
        if publisher is None:
            publisher = broker_publisher
        if publisher is None or isinstance(publisher, Provide):
            raise NotProvidedException(f"A {BrokerPublisher!r} object must be provided.")
        return publisher

    async def _setup(self) -> None:
        await super()._setup()
        await self._create_consumers()

    async def _destroy(self) -> None:
        await self._destroy_consumers()
        await super()._destroy()

    async def _create_consumers(self):
        while len(self._consumers) < self._consumer_concurrency:
            self._consumers.append(create_task(self._consume()))

    async def _destroy_consumers(self):
        for consumer in self._consumers:
            consumer.cancel()
        await gather(*self._consumers, return_exceptions=True)
        self._consumers = list()

        while not self._queue.empty():
            entry = self._queue.get_nowait()
            await self.submit_query(self._queries["update_not_processed"], (entry.id,))

    async def _consume(self) -> None:
        while True:
            await self._consume_one()

    async def _consume_one(self) -> None:
        entry = await self._queue.get()
        try:
            await self._dispatch_one(entry)
        finally:
            self._queue.task_done()

    @property
    def publisher(self) -> BrokerPublisher:
        """Get the publisher instance.

        :return: A ``BrokerPublisher`` instance.
        """
        return self._publisher

    @property
    def consumers(self) -> list[Task]:
        """Get the consumers.

        :return: A list of ``Task`` instances.
        """
        return self._consumers

    @property
    def handlers(self) -> dict[str, Optional[Callable]]:
        """Handlers getter.

        :return: A dictionary in which the keys are topics and the values are the handler.
        """
        return self._handlers

    @property
    def topics(self) -> KeysView[str]:
        """Get an iterable containing the topic names.

        :return: An ``Iterable`` of ``str``.
        """
        return self.handlers.keys()

    async def dispatch_forever(self, max_wait: Optional[float] = 60.0) -> NoReturn:
        """Dispatch the items in the consuming queue forever.

        :param max_wait: Maximum seconds to wait for notifications. If ``None`` the wait is performed until infinity.
        :return: This method does not return anything.
        """
        async with self.cursor() as cursor:
            await self._listen_entries(cursor)
            try:
                while True:
                    await self._wait_for_entries(cursor, max_wait)
                    await self.dispatch(cursor, background_mode=True)
            finally:
                await self._unlisten_entries(cursor)

    async def _listen_entries(self, cursor: Cursor):
        for topic in self.topics:
            # noinspection PyTypeChecker
            await cursor.execute(_LISTEN_QUERY.format(Identifier(topic)))

    async def _unlisten_entries(self, cursor: Cursor) -> None:
        for topic in self.topics:
            # noinspection PyTypeChecker
            await cursor.execute(_UNLISTEN_QUERY.format(Identifier(topic)))

    async def _wait_for_entries(self, cursor: Cursor, max_wait: Optional[float]) -> None:
        if await self._get_count(cursor):
            return

        while True:
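            # Block on database notifications; on timeout, re-check the count
            # in case a notification was missed.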
            try:
                return await wait_for(consume_queue(cursor.connection.notifies, self._records), max_wait)
            except TimeoutError:
                if await self._get_count(cursor):
                    return

    async def _get_count(self, cursor) -> int:
        if not len(self.topics):
            return 0
        await cursor.execute(_COUNT_NOT_PROCESSED_QUERY, (self._retry, tuple(self.topics)))
        count = (await cursor.fetchone())[0]
        return count

    async def dispatch(self, cursor: Optional[Cursor] = None, background_mode: bool = False) -> None:
        """Dispatch a batch of ``HandlerEntry`` instances from the database's queue.

        :param cursor: The cursor to interact with the database. If ``None`` is provided a new one is acquired.
        :param background_mode: If ``True`` the entries are dispatched in the background. Otherwise, the call
            blocks until every entry has been processed.
        :return: This method does not return anything.
        """

        is_external_cursor = cursor is not None
        if not is_external_cursor:
            cursor = await self.cursor().__aenter__()

        async with cursor.begin():
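            # Within one transaction, select a batch of unprocessed rows and
            # mark them as being processed so concurrent dispatchers skip them.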
            await cursor.execute(
                self._queries["select_not_processed"], (self._retry, tuple(self.topics), self._records)
            )
            result = await cursor.fetchall()

            if len(result):
                entries = self._build_entries(result)

                await cursor.execute(self._queries["mark_processing"], (tuple(e.id for e in entries),))

                for entry in entries:
                    await self._queue.put(entry)

        if not is_external_cursor:
            await cursor.__aexit__(None, None, None)

        if not background_mode:
            await self._queue.join()

    def _build_entries(self, rows: list[tuple]) -> list[BrokerHandlerEntry]:
        kwargs = {"callback_lookup": self.get_action}
        return [BrokerHandlerEntry(*row, **kwargs) for row in rows]

    async def _dispatch_one(self, entry: BrokerHandlerEntry) -> None:
        logger.debug(f"Dispatching '{entry!r}'...")
        try:
            await self.dispatch_one(entry)
        except (CancelledError, Exception) as exc:
            logger.warning(f"Raised an exception while dispatching {entry!r}: {exc!r}")
            entry.exception = exc
            if isinstance(exc, CancelledError):
                raise exc
        finally:
            query_id = "delete_processed" if entry.success else "update_not_processed"
            await self.submit_query(self._queries[query_id], (entry.id,))

    async def dispatch_one(self, entry: BrokerHandlerEntry) -> None:
        """Dispatch one row.

        :param entry: Entry to be dispatched.
        :return: This method does not return anything.
        """
        logger.info(f"Dispatching '{entry!s}'...")

        fn = self.get_callback(entry.callback)
        message = entry.data
        data, status, headers = await fn(message)

        if message.reply_topic is not None:
            await self.publisher.send(
                data,
                topic=message.reply_topic,
                identifier=message.identifier,
                status=status,
                user=message.user,
                headers=headers,
            )

    @staticmethod
    def get_callback(
        fn: Callable[[BrokerRequest], Union[Optional[BrokerRequest], Awaitable[Optional[BrokerRequest]]]]
    ) -> Callable[[BrokerMessage], Awaitable[tuple[Any, BrokerMessageStatus, dict[str, str]]]]:
        """Get the handler function to be used by the Broker Handler.

        :param fn: The action function.
        :return: A wrapper function around the given one that is compatible with the Broker Handler API.
        """

        @wraps(fn)
        async def _wrapper(raw: BrokerMessage) -> tuple[Any, BrokerMessageStatus, dict[str, str]]:
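            # Expose the request's user and headers to the handler through
            # context variables; the tokens restore them in the finally block.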
            request = BrokerRequest(raw)
            user_token = REQUEST_USER_CONTEXT_VAR.set(request.user)
            headers_token = REQUEST_HEADERS_CONTEXT_VAR.set(raw.headers)

            try:
                response = fn(request)
                if isawaitable(response):
                    response = await response
                if isinstance(response, Response):
                    response = await response.content()
                return response, BrokerMessageStatus.SUCCESS, REQUEST_HEADERS_CONTEXT_VAR.get()
            except ResponseException as exc:
                logger.warning(f"Raised an application exception: {exc!s}")
                return repr(exc), BrokerMessageStatus.ERROR, REQUEST_HEADERS_CONTEXT_VAR.get()
            except Exception as exc:
                logger.exception(f"Raised a system exception: {exc!r}")
                return repr(exc), BrokerMessageStatus.SYSTEM_ERROR, REQUEST_HEADERS_CONTEXT_VAR.get()
            finally:
                REQUEST_USER_CONTEXT_VAR.reset(user_token)
                REQUEST_HEADERS_CONTEXT_VAR.reset(headers_token)

        return _wrapper

    def get_action(self, topic: str) -> Optional[Callable]:
        """Get handling function to be called.

        Gets the instance of the class and method to call.

        Args:
            topic: Kafka topic. Example: "TicketAdded"

        Raises:
            MinosNetworkException: topic TicketAdded has no controller/action configured; please review the
                configuration file.
        """
        if topic not in self._handlers:
            raise MinosActionNotFoundException(
                f"topic {topic} has no controller/action configured, please review the configuration file"
            )

        handler = self._handlers[topic]

        logger.debug(f"Loaded {handler!r} action!")
        return handler

    @cached_property
    def _queries(self) -> dict[str, str]:
        # noinspection PyTypeChecker
        return {
            "count_not_processed": _COUNT_NOT_PROCESSED_QUERY,
            "select_not_processed": _SELECT_NOT_PROCESSED_QUERY,
            "mark_processing": _MARK_PROCESSING_QUERY,
            "delete_processed": _DELETE_PROCESSED_QUERY,
            "update_not_processed": _UPDATE_NOT_PROCESSED_QUERY,
        }
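
The consumer pool built by _create_consumers/_consume above follows a generic asyncio pattern: a fixed number of long-lived tasks drain a shared queue, and cancellation plus gather(return_exceptions=True) gives a clean shutdown. A standalone sketch with illustrative names:

import asyncio

async def consume(queue: asyncio.Queue) -> None:
    while True:
        item = await queue.get()
        try:
            print(f"processing {item}")
        finally:
            queue.task_done()

async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue()
    consumers = [asyncio.create_task(consume(queue)) for _ in range(3)]
    for i in range(10):
        queue.put_nowait(i)
    await queue.join()  # wait until every item has been processed
    for consumer in consumers:
        consumer.cancel()
    await asyncio.gather(*consumers, return_exceptions=True)

asyncio.run(main())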
Example #3
# Imports inferred from usage; PROVIDERS, IP_HOSTS, JUDGES and get_all_ip are
# assumed to be defined elsewhere in the source project.
import asyncio
import random
import time
from asyncio import Lock, PriorityQueue, Queue
from collections import deque

import aiohttp


class AsyncProxyBroker:
    def __init__(self,
                 check_url,
                 allowed_anonymity_levels=None,
                 qps_per_proxy=1,
                 max_consecutive_failures=5,
                 providers=PROVIDERS,
                 timeout=5):
        self._proxies = Queue()
        self._pending_providers = Queue()
        self._providers = providers

        self._verified_proxies = {}
        self._throttled_proxies = PriorityQueue()
        self._errors = {}

        self._check_url = check_url
        self._qps_per_proxy = qps_per_proxy
        self._max_consecutive_failures = max_consecutive_failures
        self._timeout = timeout

        self._ip = None
        self._ip_lock = Lock()

        if not allowed_anonymity_levels:
            self._allowed_anonymity_levels = ['Anonymous', 'Elite']
        else:
            self._allowed_anonymity_levels = allowed_anonymity_levels

    async def _get_real_ip(self):
        while not self._ip:
            async with self._ip_lock:
                if self._ip:
                    return self._ip

                try:
                    async with aiohttp.request(
                            url=random.choice(IP_HOSTS),
                            method='GET',
                            timeout=aiohttp.ClientTimeout(
                                total=self._timeout)) as response:
                        contents = await response.text()
                        ips = get_all_ip(contents)

                        if len(ips) == 1:
                            self._ip = ips.pop()
                            return self._ip
                except (UnicodeDecodeError, asyncio.TimeoutError,
                        aiohttp.ClientOSError, aiohttp.ClientResponseError,
                        aiohttp.ServerDisconnectedError):
                    pass

        return self._ip

    async def _get_anonymity_level(self, proxy_address):
        judge = random.choice(JUDGES)
        ip = await self._get_real_ip()

        try:
            async with aiohttp.request(url=judge,
                                       method='GET',
                                       proxy=proxy_address,
                                       timeout=aiohttp.ClientTimeout(
                                           total=self._timeout)) as response:
                contents = (await response.text()).lower()
                contained_ips = get_all_ip(contents)

                if ip in contained_ips:
                    return 'Transparent'
                elif 'via' in contents or 'proxy' in contents:
                    return 'Anonymous'
                else:
                    return 'Elite'
        except (UnicodeDecodeError, asyncio.TimeoutError,
                aiohttp.ClientOSError, aiohttp.ClientResponseError,
                aiohttp.ServerDisconnectedError):
            return 'None'

    def _populate_providers(self):
        for provider in self._providers:
            self._pending_providers.put_nowait(provider)

    async def _can_connect_to_test_url(self, proxy_address):
        try:
            async with aiohttp.request(url=self._check_url,
                                       method='GET',
                                       proxy=proxy_address,
                                       timeout=aiohttp.ClientTimeout(
                                           total=self._timeout)) as response:
                await response.text()
                return True
        except (UnicodeDecodeError, asyncio.TimeoutError,
                aiohttp.ClientOSError, aiohttp.ClientResponseError,
                aiohttp.ServerDisconnectedError):
            return False

    async def _populate_proxies(self):
        if self._pending_providers.empty():
            self._populate_providers()

        provider = self._pending_providers.get_nowait()
        proxies = await provider.get_proxies()

        for proxy in proxies:
            self._proxies.put_nowait(proxy)

        self._pending_providers.task_done()

    async def _try_verify_one_proxy(self):
        if self._proxies.empty():
            await self._populate_proxies()
            return

        (host, port, types) = self._proxies.get_nowait()
        proxy_address = 'http://%s:%s' % (host, port)

        if (await self._get_anonymity_level(proxy_address) in self._allowed_anonymity_levels
                and await self._can_connect_to_test_url(proxy_address)):
            self._verified_proxies[proxy_address] = deque()
            self._errors[proxy_address] = 0

        self._proxies.task_done()

    @staticmethod
    def _flush_history(history):
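        """Drop timestamps older than one second from the history; return how
        many were dropped and the earliest remaining timestamp (falling back
        to the last dropped one, or the current time if the history is empty)."""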
        executions_removed = 0
        earliest_time = time.monotonic()

        while len(history) > 0:
            earliest_time = history.popleft()
            if time.monotonic() - earliest_time < 1:
                history.appendleft(earliest_time)
                break
            executions_removed += 1

        return executions_removed, earliest_time

    def _flush_throttled_proxies(self):
        while not self._throttled_proxies.empty():
            (_, proxy_url, history) = self._throttled_proxies.get_nowait()
            executions_removed, earliest_time = self._flush_history(history)

            if executions_removed == 0:
                self._throttled_proxies.put_nowait(
                    (earliest_time, proxy_url, history))
                self._throttled_proxies.task_done()
                return

            self._verified_proxies[proxy_url] = history
            self._throttled_proxies.task_done()

    def mark_successful(self, proxy_url):
        if proxy_url not in self._errors:
            return

        self._errors[proxy_url] = max(0, self._errors[proxy_url] - 1)

    def mark_failure(self, proxy_url):
        if proxy_url not in self._errors:
            return

        self._errors[proxy_url] += 1

    async def random_proxy(self):
        while True:
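            # Return throttled proxies whose window has expired to the
            # verified pool, then pick a verified proxy at random.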
            self._flush_throttled_proxies()

            if not self._verified_proxies:
                await self._try_verify_one_proxy()

            while self._verified_proxies:
                proxy_url = random.choice(list(self._verified_proxies.keys()))

                if self._errors[proxy_url] >= self._max_consecutive_failures:
                    del self._errors[proxy_url]
                    del self._verified_proxies[proxy_url]
                    continue

                history = self._verified_proxies[proxy_url]

                _, earliest_time = self._flush_history(history)
                if len(history) < self._qps_per_proxy:
                    history.append(time.monotonic())
                    return proxy_url

                del self._verified_proxies[proxy_url]
                self._throttled_proxies.put_nowait(
                    (earliest_time, proxy_url, history))
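
The throttling in _flush_history/random_proxy above is a sliding-window rate limiter: a deque of monotonic timestamps is trimmed to the last second, and a proxy may be used again only while fewer than qps_per_proxy timestamps remain. A standalone sketch of the same idea, with illustrative names:

import time
from collections import deque

def may_use(history: deque, qps: int) -> bool:
    """Allow a call if fewer than `qps` executions happened in the last second."""
    now = time.monotonic()
    while history and now - history[0] >= 1:
        history.popleft()  # drop executions older than one second
    if len(history) < qps:
        history.append(now)
        return True
    return False

history = deque()
print([may_use(history, qps=2) for _ in range(3)])  # [True, True, False]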