Example #1
0
 async def getmany(self, active_partitions: Set[TP],
                   timeout: float) -> RecordMap:
     """Fetch records for ``active_partitions`` (Fetcher service backend).

     The fetcher's ``fetched_records`` callable is not invoked here
     directly; it is handed to ``self.call_thread`` together with its
     arguments, and the awaited result of that call is returned.

     Arguments:
         active_partitions: partitions to fetch records for.
         timeout: forwarded unchanged as the ``timeout`` keyword of
             ``fetched_records``.

     Returns:
         Whatever ``self.call_thread`` resolves to for the fetch call.

     Raises:
         ConsumerStoppedError: if the consumer or its fetcher is closed.
     """
     consumer = self._ensure_consumer()
     record_fetcher = consumer._fetcher
     if consumer._closed or record_fetcher._closed:
         raise ConsumerStoppedError()
     # The per-poll record limit always comes from the consumer itself.
     records = await self.call_thread(
         record_fetcher.fetched_records,
         active_partitions,
         timeout=timeout,
         max_records=consumer._max_poll_records,
     )
     return records
Example #2
0
    def getmany(self, *partitions, timeout_ms=0, max_records=None):
        """Fetch prefetched messages, grouped by topic-partition.

        Messages already sitting in the prefetch buffer are returned in
        batches keyed by topic-partition. When the buffer holds nothing for
        the requested partitions, the call waits up to `timeout_ms`
        milliseconds for data to arrive.

        Arguments:
            partitions (List[TopicPartition]): The partitions to fetch
                messages for. If none are given, all subscribed partitions
                are used.
            timeout_ms (int, optional): Milliseconds to wait when no data is
                available in the buffer. With 0 the call returns
                immediately with whatever is currently buffered, or empty
                otherwise. Should not be negative (the value is not
                validated here). Default: 0
            max_records (int, optional): Upper bound passed on to the
                fetcher. Must be a positive integer when given; if omitted
                the consumer's ``_max_poll_records`` setting is used.
                Default: None
        Returns:
            dict: topic-partition to list of records fetched since the last
                call, for the requested topics and partitions

        Raises:
            ConsumerStoppedError: if the consumer is closed.
            ValueError: if `max_records` is given but is not a positive
                integer.

        Example usage:


        .. code:: python

            data = await consumer.getmany()
            for tp, messages in data.items():
                topic = tp.topic
                partition = tp.partition
                for message in messages:
                    # Process message
                    print(message.offset, message.key, message.value)

        """
        assert all(isinstance(tp, TopicPartition) for tp in partitions)
        if self._closed:
            raise ConsumerStoppedError()

        if max_records is not None:
            if not isinstance(max_records, int) or max_records < 1:
                raise ValueError("`max_records` must be a positive Integer")

        # The fetcher works with seconds, the public API with milliseconds.
        timeout_sec = timeout_ms / 1000
        record_limit = max_records or self._max_poll_records
        pending_fetch = self._fetcher.fetched_records(
            partitions, timeout_sec, max_records=record_limit)
        return (yield from self._wait_for_data_or_error(
            pending_fetch, shield=False))
Example #3
0
 async def _fetch_records(self,
                          consumer: aiokafka.AIOKafkaConsumer,
                          active_partitions: Set[TP],
                          timeout: float = None,
                          max_records: int = None) -> RecordMap:
     """Fetch records from ``consumer``'s fetcher for the given partitions.

     Arguments:
         consumer: consumer whose ``_fetcher`` performs the fetch.
         active_partitions: partitions to fetch records for.
         timeout: forwarded as the ``timeout`` keyword of
             ``fetched_records``.
         max_records: forwarded as the ``max_records`` keyword of
             ``fetched_records``.

     Returns:
         An empty dict when flow is not active, otherwise the awaited
         result of ``fetcher.fetched_records``.

     Raises:
         ConsumerStoppedError: if ``consumer`` or its fetcher is closed.
     """
     # NOTE: flow control is read from ``self.consumer``, not from the
     # ``consumer`` argument, and is checked before the closed-state guard.
     flow_is_active = self.consumer.flow_active
     if not flow_is_active:
         return {}

     record_fetcher = consumer._fetcher
     if consumer._closed or record_fetcher._closed:
         raise ConsumerStoppedError()

     # The fetch runs inside the subscription state's fetch context.
     fetch_ctx = record_fetcher._subscriptions.fetch_context()
     with fetch_ctx:
         records = await record_fetcher.fetched_records(
             active_partitions,
             timeout=timeout,
             max_records=max_records,
         )
     return records
Example #4
0
    def close(self):
        """Cancel the fetch tasks and fail any pending waiter.

        Steps, in order:
            1. Cancel the main fetch task and wait for it to finish.
            2. If ``_wait_empty_future`` exists and is still pending, fail
               it with ``ConsumerStoppedError``.
            3. Cancel every task in ``_fetch_tasks``, waiting for each one
               before moving to the next.

        ``asyncio.CancelledError`` raised while waiting for a cancelled
        task is expected and swallowed.
        """
        main_task = self._fetch_task
        main_task.cancel()
        try:
            yield from main_task
        except asyncio.CancelledError:
            # Expected outcome of the cancel() just above.
            pass

        # Fail all pending fetchone/fetchall calls
        pending_waiter = self._wait_empty_future
        if not (pending_waiter is None or pending_waiter.done()):
            pending_waiter.set_exception(ConsumerStoppedError())

        for task in self._fetch_tasks:
            task.cancel()
            try:
                yield from task
            except asyncio.CancelledError:
                pass
Example #5
0
    def next_record(self, partitions):
        """ Return a single fetched record

        Compared to fetching in batches this adds a little overhead, since
        more work is done per record:
            * The prefetch routine is notified for every consumed partition
            * The message is assured to be marked for autocommit

        Arguments:
            partitions: partitions to take a record from; when empty, a
                record from any buffered partition may be returned.

        Raises:
            ConsumerStoppedError: if the fetcher is closed.
        """
        while True:
            if self._closed:
                raise ConsumerStoppedError()

            # The background routine keeps fetching until a new assignment
            # is finished, but records that may no longer belong to this
            # instance after the rebalance must not be handed out.
            if self._subscriptions.reassignment_in_progress:
                yield from self._subscriptions.wait_for_assignment()

            # Iterate over a snapshot: entries are deleted along the way.
            for tp in list(self._records.keys()):
                requested = not partitions or tp in partitions
                if not requested:
                    # Cleanup results for unassigned partitions
                    if not self._subscriptions.is_assigned(tp):
                        del self._records[tp]
                    continue

                entry = self._records[tp]
                if type(entry) == FetchResult:
                    message = entry.getone()
                    if message is not None:
                        return message
                    # We already processed all messages, request new ones
                    del self._records[tp]
                    self._notify(self._wait_consume_future)
                    continue

                # Remove error, so we can fetch on partition again
                del self._records[tp]
                self._notify(self._wait_consume_future)
                entry.check_raise()

            # No messages ready. Wait for some to arrive
            yield from self._create_fetch_waiter()
Example #6
0
    def getone(self, *partitions):
        """
        Fetch a single message from Kafka.
        When no message has been prefetched yet, this method waits until
        one becomes available.

        Arguments:
            partitions (List[TopicPartition]): Optional partitions to take
                the message from. If none are given, the message may come
                from any partition the consumer is subscribed to.

        Returns:
            ConsumerRecord

        Raises:
            ConsumerStoppedError: if the consumer is closed.

        The returned value is an instance of

        .. code:: python

            collections.namedtuple(
                "ConsumerRecord",
                ["topic", "partition", "offset", "key", "value"])

        Example usage:


        .. code:: python

            while True:
                message = await consumer.getone()
                topic = message.topic
                partition = message.partition
                # Process message
                print(message.offset, message.key, message.value)

        """
        assert all(isinstance(tp, TopicPartition) for tp in partitions)
        if self._closed:
            raise ConsumerStoppedError()

        pending_record = self._fetcher.next_record(partitions)
        return (yield from self._wait_for_data_or_error(
            pending_record, shield=False))
Example #7
0
 def __aiter__(self):
     """Return ``self`` as the asynchronous iterator.

     Raises:
         ConsumerStoppedError: if the object is already closed.
     """
     if not self._closed:
         return self
     raise ConsumerStoppedError()
Example #8
0
    async def getmany(self,
                      timeout: float) -> AsyncIterator[Tuple[TP, Message]]:
        """Fetch a batch of records and yield them one message at a time.

        Records for the currently active partitions are requested from the
        underlying consumer's fetcher, then yielded as ``(tp, message)``
        pairs while alternating between topics (see the long comment before
        the loop for the reasoning).

        Arguments:
            timeout: forwarded unchanged as the ``timeout`` keyword of
                ``fetcher.fetched_records`` (presumably seconds -- confirm
                against the fetcher implementation).

        Yields:
            Tuple[TP, Message]: the topic-partition together with a message
                built by ``ConsumerMessage`` from the fetched record.

        Raises:
            ConsumerStoppedError: if the consumer or its fetcher is closed.
                As this is an async generator, the check runs when iteration
                starts, not when the method is called.
        """
        # Implementation for the Fetcher service.
        _consumer = self._consumer
        fetcher = _consumer._fetcher
        if _consumer._closed or fetcher._closed:
            raise ConsumerStoppedError()
        active_partitions = self._get_active_partitions()
        _next = next  # localize

        records: RecordMap = {}
        # This lock is acquired by pause_partitions/resume_partitions,
        # but those should never be called when the Fetcher is running.
        with self._partitions_lock:
            if active_partitions:
                # Fetch records only if active partitions to avoid the risk of
                # fetching all partitions in the beginning when none of the
                # partitions is paused/resumed.
                records = await fetcher.fetched_records(
                    active_partitions,
                    timeout=timeout,
                )
            else:
                # We should still release to the event loop
                await self.sleep(0)
        create_message = ConsumerMessage  # localize

        # `records` contains a mapping from TP to list of messages.
        # If there are two agents, consuming from topics t1 and t2,
        # the normal order of iteration would be to process each
        # tp in the dict:
        #    for tp, messages in records.items():
        #        for message in messages:
        #           yield tp, message
        #
        # The problem with this is that if we have prefetched 16k records
        # for one partition, the other partitions won't even start processing
        # before those 16k records are completed.
        #
        # So we try round-robin between the tps instead:
        #
        #    iterators: Dict[TP, Iterator] = {
        #        tp: iter(messages)
        #        for tp, messages in records.items()
        #    }
        #    while iterators:
        #        for tp, messages in iterators.items():
        #            yield tp, next(messages)
        #            # remove from iterators if empty.
        #
        # The problem with this implementation is that
        # the records mapping is ordered by TP, so records.keys()
        # will look like this:
        #
        #  TP(topic='bar', partition=0)
        #  TP(topic='bar', partition=1)
        #  TP(topic='bar', partition=2)
        #  TP(topic='bar', partition=3)
        #  TP(topic='foo', partition=0)
        #  TP(topic='foo', partition=1)
        #  TP(topic='foo', partition=2)
        #  TP(topic='foo', partition=3)
        #
        # If there are 100 partitions for each topic,
        # it will process 100 items in the first topic, then 100 items
        # in the other topic, but even worse if partition counts
        # vary greatly, t1 has 1000 partitions and t2
        # has 1 partition, then t2 will end up being starved most of the time.
        #
        # We solve this by going round-robin through each topic.
        topic_index = self._records_to_topic_index(records, active_partitions)
        to_remove: Set[str] = set()
        # Unique marker so an exhausted iterator can be told apart from any
        # real item without catching StopIteration.
        sentinel = object()
        while topic_index:
            # Drop the topics found exhausted during the previous round.
            # to_remove is never cleared, hence pop() with a default.
            for topic in to_remove:
                topic_index.pop(topic, None)
            # One round: take at most one item from every remaining topic.
            for topic, messages in topic_index.items():
                item = _next(messages, sentinel)
                if item is sentinel:
                    # this topic is now empty,
                    # but we cannot remove from dict while iterating over it,
                    # so move that to the outer loop.
                    to_remove.add(topic)
                    continue
                tp, record = item  # type: ignore
                # Report the partition's current highwater offset to the
                # monitor before handing the message out.
                highwater_mark = self._consumer.highwater(tp)
                self.app.monitor.track_tp_end_offset(tp, highwater_mark)
                yield tp, create_message(
                    record.topic,
                    record.partition,
                    record.offset,
                    record.timestamp / 1000.0,  # presumably ms -> s; confirm
                    record.timestamp_type,
                    record.key,
                    record.value,
                    record.checksum,
                    record.serialized_key_size,
                    record.serialized_value_size,
                    tp,
                )