Beispiel #1
0
 def _on_change_subscription(self):
     """Handle the `group rebalanced` signal.

     When some assigned partitions still have no fetch position, a
     background update of exactly those positions is scheduled on
     ``self._loop``. Nothing is scheduled if all positions are known.
     """
     subscription = self._subscription
     if subscription.has_all_fetch_positions():
         # every subscribed partition already has a known offset
         return
     missing = subscription.missing_fetch_positions()
     ensure_future(self._update_fetch_positions(missing), loop=self._loop)
Beispiel #2
0
 def _on_change_subscription(self):
     """Signal handler run after the group has been rebalanced.

     Starts an update of fetch positions for the partitions reported by
     ``missing_fetch_positions()``; does nothing when every position is
     already known.
     """
     if self._subscription.has_all_fetch_positions():
         return
     # Some partitions we are subscribed to have no known offset yet:
     # look them up in the background.
     partitions = self._subscription.missing_fetch_positions()
     update_coro = self._update_fetch_positions(partitions)
     ensure_future(update_coro, loop=self._loop)
Beispiel #3
0
 def connect(self):
     """Open the connection to ``self.host:self.port`` and start reading.

     Generator-based coroutine. Wires a stream reader/writer pair around
     a custom protocol, limits the connection attempt to
     ``self._request_timeout``, starts the background read task and, when
     ``self._max_idle_ms`` is set, schedules the idle check.

     Returns:
         tuple: the ``(reader, writer)`` pair of the new connection.
     """
     loop = self._loop
     self._closed_fut = asyncio.Future(loop=loop)

     # PLAINTEXT needs no SSL context; the only other accepted protocol
     # is SSL, which must come with a context.
     ssl = None
     if self._secutity_protocol != "PLAINTEXT":
         assert self._secutity_protocol == "SSL"
         assert self._ssl_context is not None
         ssl = self._ssl_context

     # Same wiring as `open_connection`, but with our own protocol class
     reader = asyncio.StreamReader(limit=READER_LIMIT, loop=loop)
     protocol = AIOKafkaProtocol(self._closed_fut, reader, loop=loop)
     connecting = loop.create_connection(
         lambda: protocol, self.host, self.port, ssl=ssl)
     transport, _ = yield from asyncio.wait_for(
         connecting, loop=loop, timeout=self._request_timeout)
     writer = asyncio.StreamWriter(transport, protocol, reader, loop)

     self._reader = reader
     self._writer = writer
     self._protocol = protocol

     # Background task consuming data from the connection
     self._read_task = ensure_future(self._read(), loop=loop)

     # Idle checking is optional and only enabled with a configured limit
     if self._max_idle_ms is not None:
         self._idle_handle = self._loop.call_soon(self._idle_check)
     return reader, writer
Beispiel #4
0
    def bootstrap(self):
        """Try to bootstrap initial cluster metadata.

        Generator-based coroutine. Every entry of ``self.hosts`` is tried in
        order until one node accepts a connection and answers a metadata
        request. After that the API version is resolved (auto-detected when
        configured as ``'auto'``) and normalized to a tuple, and the metadata
        synchronizer task is started unless one already exists.

        Raises:
            ConnectionError: if none of the hosts could be used to bootstrap.
        """
        # using request v0 for bootstrap (because the api version is not
        # detected yet)
        metadata_request = MetadataRequest[0]([])
        for host, port, _ in self.hosts:
            log.debug("Attempting to bootstrap via node at %s:%s", host, port)

            try:
                bootstrap_conn = yield from create_conn(
                    host,
                    port,
                    loop=self._loop,
                    client_id=self._client_id,
                    request_timeout_ms=self._request_timeout_ms,
                    ssl_context=self._ssl_context,
                    security_protocol=self._security_protocol,
                    max_idle_ms=self._connections_max_idle_ms)
            except (OSError, asyncio.TimeoutError) as err:
                log.error('Unable connect to "%s:%s": %s', host, port, err)
                continue

            try:
                metadata = yield from bootstrap_conn.send(metadata_request)
            except KafkaError as err:
                log.warning('Unable to request metadata from "%s:%s": %s',
                            host, port, err)
                bootstrap_conn.close()
                continue

            self.cluster.update_metadata(metadata)

            # A cluster with no topics can return no broker metadata...
            # In that case, we should keep the bootstrap connection till
            # we get a normal cluster layout.
            if not len(self.cluster.brokers()):
                self._conns['bootstrap'] = bootstrap_conn
            else:
                bootstrap_conn.close()

            log.debug('Received cluster metadata: %s', self.cluster)
            break
        else:
            raise ConnectionError('Unable to bootstrap from {}'.format(
                self.hosts))

        # detect api version if needed
        if self._api_version == 'auto':
            self._api_version = yield from self.check_version()
        # isinstance (rather than an exact type comparison) so that tuple
        # subclasses such as named tuples are not sent through str.split
        if not isinstance(self._api_version, tuple):
            self._api_version = tuple(map(int, self._api_version.split('.')))

        if self._sync_task is None:
            # starting metadata synchronizer task
            self._sync_task = ensure_future(self._md_synchronizer(),
                                            loop=self._loop)
Beispiel #5
0
    def start(self):
        """Bootstrap the client and launch the background sender.

        Generator-based coroutine: waits for ``self.client.bootstrap()``,
        asserts that the client's API version supports LZ4 when that
        compression type is configured, starts the sender task and hands the
        API version over to the message accumulator.
        """
        log.debug("Starting the Kafka producer")  # trace
        yield from self.client.bootstrap()

        if self._compression_type == 'lz4':
            required = (0, 8, 2)
            assert self.client.api_version >= required, (
                'LZ4 Requires >= Kafka 0.8.2 Brokers')

        sender = self._sender_routine()
        self._sender_task = ensure_future(sender, loop=self._loop)
        self._message_accumulator.set_api_version(self.client.api_version)
        log.debug("Kafka producer started")
Beispiel #6
0
    def bootstrap(self):
        """Fetch the initial cluster metadata from one of the known hosts.

        Generator-based coroutine. Hosts from ``self.hosts`` are tried in
        order until one accepts a connection and answers a metadata request;
        afterwards the metadata synchronizer task is started if none has
        been created yet.

        Raises:
            ConnectionError: when none of the hosts could be used.
        """
        request = MetadataRequest([])
        for host, port, _ in self.hosts:
            log.debug("Attempting to bootstrap via node at %s:%s", host, port)

            try:
                conn = yield from create_conn(
                    host, port,
                    loop=self._loop,
                    client_id=self._client_id,
                    request_timeout_ms=self._request_timeout_ms)
            except (OSError, asyncio.TimeoutError) as exc:
                log.error('Unable connect to "%s:%s": %s', host, port, exc)
                continue

            try:
                response = yield from conn.send(request)
            except KafkaError as exc:
                log.warning('Unable to request metadata from "%s:%s": %s',
                            host, port, exc)
                conn.close()
                continue

            self.cluster.update_metadata(response)

            # With no topics the cluster may report no brokers at all; in
            # that case the bootstrap connection is kept for later use.
            if len(self.cluster.brokers()):
                conn.close()
            else:
                self._conns['bootstrap'] = conn

            log.debug('Received cluster metadata: %s', self.cluster)
            break
        else:
            message = 'Unable to bootstrap from {}'.format(self.hosts)
            raise ConnectionError(message)

        if self._sync_task is None:
            # the metadata synchronizer is started only once
            synchronizer = self._md_synchronizer()
            self._sync_task = ensure_future(synchronizer, loop=self._loop)
Beispiel #7
0
    def start(self):
        """Connect to Kafka cluster and check server version.

        Generator-based coroutine. Bootstraps the client, resolves the broker
        API version (auto-detected when configured as ``'auto'``), normalizes
        it to a tuple of ints and starts the background sender task.

        Raises:
            AssertionError: if LZ4 compression is configured and the API
                version is lower than 0.8.2.
        """
        log.debug("Starting the Kafka producer")  # trace
        yield from self.client.bootstrap()

        # Check Broker Version if not set explicitly
        if self._api_version == 'auto':
            self._api_version = yield from self.client.check_version()

        # Convert api_version config to tuple for easy comparisons. A value
        # that already is a tuple has no `split()` and needs no conversion,
        # so it is left untouched instead of raising AttributeError.
        if not isinstance(self._api_version, tuple):
            self._api_version = tuple(map(int, self._api_version.split('.')))

        if self._compression_type == 'lz4':
            assert self._api_version >= (0, 8, 2), \
                'LZ4 Requires >= Kafka 0.8.2 Brokers'

        self._sender_task = ensure_future(self._sender_routine(),
                                          loop=self._loop)
        log.debug("Kafka producer started")
Beispiel #8
0
    def start(self):
        """Connect to Kafka cluster and check server version.

        Generator-based coroutine. After the client has bootstrapped, the
        API version is auto-detected if it was configured as ``'auto'`` and
        stored as a tuple of ints; then the background sender task is
        started.

        Raises:
            AssertionError: if LZ4 compression is configured and the API
                version is lower than 0.8.2.
        """
        log.debug("Starting the Kafka producer")  # trace
        yield from self.client.bootstrap()

        # Check Broker Version if not set explicitly
        if self._api_version == 'auto':
            self._api_version = yield from self.client.check_version()

        # Convert api_version config to tuple for easy comparisons.
        # Only non-tuple values are parsed: calling `split()` on a value
        # that was already converted would raise AttributeError.
        if not isinstance(self._api_version, tuple):
            self._api_version = tuple(
                map(int, self._api_version.split('.')))

        if self._compression_type == 'lz4':
            assert self._api_version >= (0, 8, 2), \
                'LZ4 Requires >= Kafka 0.8.2 Brokers'

        self._sender_task = ensure_future(
            self._sender_routine(), loop=self._loop)
        log.debug("Kafka producer started")
Beispiel #9
0
    def _sender_routine(self):
        """Background task that sends message batches to Kafka brokers.

        Generator-based coroutine running until cancelled. On every pass it
        drains the accumulator per node (skipping ``self._in_flight`` nodes),
        starts one produce task per node and then waits for the first event
        that makes another pass worthwhile.
        """
        pending = set()
        try:
            while True:
                drained, has_unknown_leaders = \
                    self._message_accumulator.drain_by_nodes(
                        ignore_nodes=self._in_flight)

                # one produce task per node that has batches to send
                for node_id, node_batches in drained.items():
                    pending.add(ensure_future(
                        self._send_produce_req(node_id, node_batches),
                        loop=self._loop))

                if has_unknown_leaders:
                    # at least one partition has no known leader: ask for
                    # fresh metadata and back off before the next pass
                    self.client.force_metadata_update()
                    # guarantees at least 1 awaitable in the wait() call
                    wakeup = asyncio.sleep(
                        self._retry_backoff, loop=self._loop)
                else:
                    wakeup = self._message_accumulator.data_waiter()
                waiters = pending.union([wakeup])

                # wake up when:
                # * at least one produce task is finished
                # * data for a new partition arrived
                # * the backoff after a metadata update request elapsed
                done, _ = yield from asyncio.wait(
                    waiters,
                    return_when=asyncio.FIRST_COMPLETED,
                    loop=self._loop)
                pending -= done

        except asyncio.CancelledError:
            pass
        except Exception:  # pragma: no cover
            log.error("Unexpected error in sender routine", exc_info=True)
Beispiel #10
0
    def bootstrap(self):
        """Obtain initial cluster metadata from the first usable host.

        Generator-based coroutine. Each host of ``self.hosts`` is contacted
        in turn; hosts that cannot be connected to, or that fail the
        metadata request, are logged and skipped. Once metadata has been
        received the metadata synchronizer task is created, unless one
        exists already.

        Raises:
            ConnectionError: if every host was skipped.
        """
        request = MetadataRequest([])
        for host, port, _ in self.hosts:
            log.debug("Attempting to bootstrap via node at %s:%s", host, port)

            try:
                connection = yield from create_conn(
                    host,
                    port,
                    loop=self._loop,
                    client_id=self._client_id,
                    request_timeout_ms=self._request_timeout_ms)
            except (OSError, asyncio.TimeoutError) as error:
                log.error('Unable connect to "%s:%s": %s', host, port, error)
                continue

            try:
                cluster_metadata = yield from connection.send(request)
            except KafkaError as error:
                log.warning('Unable to request metadata from "%s:%s": %s',
                            host, port, error)
                connection.close()
                continue

            self.cluster.update_metadata(cluster_metadata)

            # A cluster without topics can answer with no broker metadata;
            # the bootstrap connection is kept in that case.
            known_brokers = self.cluster.brokers()
            if len(known_brokers) == 0:
                self._conns['bootstrap'] = connection
            else:
                connection.close()

            log.debug('Received cluster metadata: %s', self.cluster)
            break
        else:
            raise ConnectionError(
                'Unable to bootstrap from {}'.format(self.hosts))

        if self._sync_task is None:
            # first successful bootstrap: start the metadata synchronizer
            synchronizer = self._md_synchronizer()
            self._sync_task = ensure_future(synchronizer, loop=self._loop)
Beispiel #11
0
    def _sender_routine(self):
        """Background task that sends message batches to Kafka brokers.

        Generator-based coroutine that loops until it is cancelled: batches
        are drained from the accumulator grouped by node (nodes listed in
        ``self._in_flight`` are ignored), a produce task is created per
        node, and the loop sleeps until one of the awaited events fires.
        """
        produce_tasks = set()
        try:
            while True:
                accumulator = self._message_accumulator
                by_node, leaders_unknown = accumulator.drain_by_nodes(
                    ignore_nodes=self._in_flight)

                # create a produce task for every node with batches
                for node_id, node_batches in by_node.items():
                    request = self._send_produce_req(node_id, node_batches)
                    produce_tasks.add(
                        ensure_future(request, loop=self._loop))

                if leaders_unknown:
                    # a partition leader is unknown: request a metadata
                    # update and wait for the backoff time
                    self.client.force_metadata_update()
                    # also keeps wait() from being called with no futures
                    extra = asyncio.sleep(
                        self._retry_backoff, loop=self._loop)
                else:
                    extra = self._message_accumulator.data_waiter()
                waiters = produce_tasks.union([extra])

                # resume when:
                # * at least one produce task is finished
                # * data for a new partition arrived
                finished, _ = yield from asyncio.wait(
                    waiters,
                    return_when=asyncio.FIRST_COMPLETED,
                    loop=self._loop)
                produce_tasks -= finished

        except asyncio.CancelledError:
            pass
        except Exception:  # noqa
            log.error("Unexpected error in sender routine", exc_info=True)
Beispiel #12
0
    def _on_change_subscription(self):
        """ `group rebalanced` signal handler: schedules an update of the
            fetch positions that are still missing and tracks the task in
            `self._pending_position_fetches` until it completes.
        """
        if self._closed:  # pragma: no cover
            return
        if self._subscription.has_all_fetch_positions():
            # every subscribed partition already has a known offset
            return

        missing = self._subscription.missing_fetch_positions()
        update_task = ensure_future(
            self._update_fetch_positions(missing), loop=self._loop)
        self._pending_position_fetches.add(update_task)

        def _forget_and_report(finished,
                               pending=self._pending_position_fetches):
            # stop tracking the task, then log its failure (if any)
            pending.discard(finished)
            try:
                finished.result()
            except Exception as exc:  # pragma: no cover
                log.error("Failed to update fetch positions: %r", exc)
        update_task.add_done_callback(_forget_and_report)
Beispiel #13
0
    def __init__(self, client, subscriptions, *, loop,
                 key_deserializer=None,
                 value_deserializer=None,
                 fetch_min_bytes=1,
                 fetch_max_wait_ms=500,
                 max_partition_fetch_bytes=1048576,
                 check_crcs=True,
                 fetcher_timeout=0.2,
                 prefetch_backoff=0.1):
        """Set up a Kafka message fetcher and start its background routine.

        Parameters:
            client (AIOKafkaClient): kafka client
            subscriptions (SubscriptionState): instance of SubscriptionState
                located in kafka.consumer.subscription_state
            loop: event loop on which the background fetch routine is
                scheduled.
            key_deserializer (callable): Any callable that takes a
                raw message key and returns a deserialized key.
            value_deserializer (callable, optional): Any callable that takes
                a raw message value and returns a deserialized value.
            fetch_min_bytes (int): Minimum amount of data the server should
                return for a fetch request; with less data available it
                waits up to fetch_max_wait_ms for more to accumulate.
                Default: 1.
            fetch_max_wait_ms (int): The maximum time in milliseconds the
                server will block before answering a fetch request when
                there is not enough data to satisfy fetch_min_bytes
                immediately. Default: 500.
            max_partition_fetch_bytes (int): The maximum amount of data
                per-partition the server will return. The maximum total
                memory used for a request is
                #partitions * max_partition_fetch_bytes. It must be at least
                as large as the maximum message size the server allows,
                otherwise the producer may send messages larger than the
                consumer can fetch and the consumer can get stuck on such a
                message in a partition. Default: 1048576.
            check_crcs (bool): Automatically check the CRC32 of the records
                consumed, ensuring no on-the-wire or on-disk corruption of
                the messages occurred. The check adds some overhead, so it
                may be disabled when seeking extreme performance.
                Default: True
            fetcher_timeout (float): Maximum polling interval in the
                background fetching routine. Default: 0.2
            prefetch_backoff (float): number of seconds to wait until
                consumption of partition is paused. Paused partitions will
                not request new data from Kafka server (will not be included
                in next poll request). Default: 0.1
        """
        # collaborators
        self._loop = loop
        self._client = client
        self._subscriptions = subscriptions

        # deserialization and validation settings
        self._key_deserializer = key_deserializer
        self._value_deserializer = value_deserializer
        self._check_crcs = check_crcs

        # fetch request tuning
        self._fetch_min_bytes = fetch_min_bytes
        self._fetch_max_wait_ms = fetch_max_wait_ms
        self._max_partition_fetch_bytes = max_partition_fetch_bytes
        self._fetcher_timeout = fetcher_timeout
        self._prefetch_backoff = prefetch_backoff

        # runtime state, empty until the fetch routine fills it
        self._records = collections.OrderedDict()
        self._in_flight = set()
        self._fetch_tasks = set()
        self._wait_consume_future = None
        self._wait_empty_future = None

        # started last, once all state above is in place
        routine = self._fetch_requests_routine()
        self._fetch_task = ensure_future(routine, loop=loop)
Beispiel #14
0
    def _fetch_requests_routine(self):
        """ Background task that keeps prefetching the next result page.

        How it works:
        * Partitions are grouped by the node that is their leader.
        * If all partitions of a node need prefetch, it is done right away.
        * If any partition still has data (in `self._records`), wait up to
          `prefetch_backoff` so the application can consume it.
        * If that data is not consumed within `prefetch_backoff`, request
          data for the other partitions of the node anyway.

        Requests are shaped this way because Kafka blocks the connection
        when a FetchRequest is performed and there is not enough data, so a
        FetchRequest has to cover as many partitions of a node as possible.

        The original Java Kafka client behaves differently: it prefetches
        only after all messages were handed to the application (i.e. when
        `self._records` is empty). That is not done here because partitions
        may be processed separately (by passing a `partitions` list to the
        consumer's `getall()` call), which could lead to a long wait when
        some partitions (or topics) are processed slower than others.

        """
        try:
            while True:
                # A fresh future per pass signals consumption of records
                self._wait_consume_future = asyncio.Future(loop=self._loop)

                # Build the fetch requests and send those for ready nodes
                requests, timeout = self._create_fetch_requests()
                for node_id, request in requests:
                    node_ready = yield from self._client.ready(node_id)
                    if node_ready:
                        log.debug(
                            "Sending FetchRequest to node %s", node_id)
                        fetch_task = ensure_future(
                            self._proc_fetch_request(node_id, request),
                            loop=self._loop)
                        self._fetch_tasks.add(fetch_task)
                        self._in_flight.add(node_id)
                    # nodes that are not ready are retried on the next pass

                done_set, _ = yield from asyncio.wait(
                    chain(self._fetch_tasks, [self._wait_consume_future]),
                    loop=self._loop,
                    timeout=timeout,
                    return_when=asyncio.FIRST_COMPLETED)

                # Handle results of the fetch tasks that finished, if any
                done_fetches = self._fetch_tasks.intersection(done_set)
                if not done_fetches:
                    continue
                has_new_data = False
                for finished in done_fetches:
                    if finished.result():
                        has_new_data = True
                        break
                if has_new_data:
                    # messages were added to self._records, wake up getters
                    self._notify(self._wait_empty_future)
                self._fetch_tasks -= done_fetches
        except asyncio.CancelledError:
            pass
        except Exception:  # noqa
            log.error("Unexpected error in fetcher routine", exc_info=True)
Beispiel #15
0
    def _sender_routine(self):
        """ Background task that sends pending batches to the leader node of
        each batch's partition. It carries the same logic as Java's `Sender`
        background thread, but because asyncio is used it is an event based
        loop rather than one that counts down a timeout to the next
        possible event like in Java.

            The procedure:
            * Group pending batches by partition leaders (write nodes)
            * Skip nodes that are not ready (disconnected) and nodes that
              already have a pending request.
            * If leaders are unknown for some partitions, request a
              metadata update.
            * Wait for any event that can change the above, like new
              metadata or a pending send finishing so that a new one can
              be done.
        """
        in_progress = set()
        try:
            while True:
                drained, leaders_unknown = \
                    self._message_accumulator.drain_by_nodes(
                        ignore_nodes=self._in_flight)

                # create a produce task for every node with batches
                for node_id, node_batches in drained.items():
                    produce = ensure_future(
                        self._send_produce_req(node_id, node_batches),
                        loop=self._loop)
                    self._in_flight.add(node_id)
                    in_progress.add(produce)

                if leaders_unknown:
                    # at least one partition leader is unknown: request a
                    # cluster metadata update and wait on its result
                    wakeup = self.client.force_metadata_update()
                else:
                    wakeup = self._message_accumulator.data_waiter()
                waiters = in_progress.union([wakeup])

                # resume when:
                # * At least one of produce task is finished
                # * Data for new partition arrived
                # * Metadata update if partition leader unknown
                done, _ = yield from asyncio.wait(
                    waiters,
                    return_when=asyncio.FIRST_COMPLETED,
                    loop=self._loop)

                # finished tasks are not expected to fail; if one does it
                # is a bug, and result() re-raises it here
                for finished in done:
                    finished.result()

                in_progress -= done

        except asyncio.CancelledError:
            pass
        except Exception:  # pragma: no cover
            log.error("Unexpected error in sender routine", exc_info=True)
Beispiel #16
0
 def connect(self):
     """Open a stream connection to ``self.host:self.port``.

     Generator-based coroutine. The connection attempt is limited by
     ``self._request_timeout``; on success the stream reader and writer
     are stored on the instance and the background read task is started.
     """
     opening = asyncio.open_connection(
         self.host, self.port, loop=self._loop)
     streams = yield from asyncio.wait_for(
         opening, timeout=self._request_timeout, loop=self._loop)
     self._reader, self._writer = streams
     reading = self._read()
     self._read_task = ensure_future(reading, loop=self._loop)
Beispiel #17
0
    def __init__(self,
                 client,
                 subscription,
                 *,
                 loop,
                 group_id='aiokafka-default-group',
                 session_timeout_ms=30000,
                 heartbeat_interval_ms=3000,
                 retry_backoff_ms=100,
                 enable_auto_commit=True,
                 auto_commit_interval_ms=5000,
                 assignors=(RoundRobinPartitionAssignor, )):
        """Initialize the coordination manager.

        Besides storing the configuration, the constructor registers a
        metadata listener on the client's cluster and starts the heartbeat
        task (always) and the auto-commit task (only when enabled).

        Parameters:
            client (AIOKafkaClient): kafka client
            subscription (SubscriptionState): instance of SubscriptionState
                located in kafka.consumer.subscription_state
            loop: event loop used for the lock, the futures and the
                background tasks created here.
            group_id (str): name of the consumer group to join for dynamic
                partition assignment (if enabled), and to use for fetching and
                committing offsets. Default: 'aiokafka-default-group'
            enable_auto_commit (bool): If true the consumer's offset will be
                periodically committed in the background. Default: True.
            auto_commit_interval_ms (int): milliseconds between automatic
                offset commits, if enable_auto_commit is True. Default: 5000.
            assignors (tuple): Objects to use to distribute partition
                ownership amongst consumer instances when group management is
                used. Default: (RoundRobinPartitionAssignor, )
            heartbeat_interval_ms (int): The expected time in milliseconds
                between heartbeats to the consumer coordinator when using
                Kafka's group management feature. Heartbeats are used to ensure
                that the consumer's session stays active and to facilitate
                rebalancing when new consumers join or leave the group. The
                value must be set lower than session_timeout_ms, but typically
                should be set no higher than 1/3 of that value. It can be
                adjusted even lower to control the expected time for normal
                rebalances. Default: 3000
            session_timeout_ms (int): The timeout used to detect failures when
                using Kafka's group management facilities. Default: 30000
            retry_backoff_ms (int): Milliseconds to backoff when retrying on
                errors. Default: 100.
        """
        self._client = client
        self._session_timeout_ms = session_timeout_ms
        self._heartbeat_interval_ms = heartbeat_interval_ms
        self._retry_backoff_ms = retry_backoff_ms
        # group membership starts from the protocol's default/unknown values
        self.generation = OffsetCommitRequest.DEFAULT_GENERATION_ID
        self.member_id = JoinGroupRequest.UNKNOWN_MEMBER_ID
        self.group_id = group_id
        self.coordinator_id = None
        self.rejoin_needed = True
        self.needs_join_prepare = True
        self.loop = loop
        # rejoining the group can be triggered in parallel
        # (from the consumer and from the heartbeat task), so a lock is needed
        self._rejoin_lock = asyncio.Lock(loop=loop)
        self._enable_auto_commit = enable_auto_commit
        self._auto_commit_interval_ms = auto_commit_interval_ms
        self._assignors = assignors
        self._subscription = subscription
        self._partitions_per_topic = {}
        self._cluster = client.cluster
        self._auto_commit_task = None
        # the _closing future signals the heartbeat task to finish ASAP
        self._closing = asyncio.Future(loop=loop)
        # update the subscription state using the currently known metadata,
        # then keep receiving later metadata updates through the listener
        self._handle_metadata_update(client.cluster)
        self._cluster.add_listener(self._handle_metadata_update)
        # NOTE(review): assigned only after _handle_metadata_update() has
        # already run once above - confirm the handler never reads it
        self._group_rebalanced_callback = None

        # the heartbeat task is started unconditionally
        self.heartbeat_task = ensure_future(self._heartbeat_task_routine(),
                                            loop=loop)

        if self._enable_auto_commit:
            # the interval is configured in milliseconds; the routine is
            # given the value divided by 1000
            interval = self._auto_commit_interval_ms / 1000
            self._auto_commit_task = ensure_future(
                self.auto_commit_routine(interval), loop=loop)
Beispiel #18
0
    def _fetch_requests_routine(self):
        """ Background task that always prefetches the next result page.

        Algorithm:
        * Partitions are grouped per node that is their leader.
        * When every partition of a node needs prefetch, it is requested
          right away.
        * When some partition still holds data (in `self._records`), the
          routine waits up to `prefetch_backoff` to let the application
          consume it.
        * When the data in `self._records` has not been consumed within
          `prefetch_backoff`, data is requested for the node's other
          partitions only.

        Data is requested in this manner because Kafka blocks the
        connection if a FetchRequest is performed without enough data being
        available, so one FetchRequest must include as many partitions of a
        node as possible.

        The original Java Kafka client processes data differently: it only
        prefetches once all messages were given to the application (i.e.
        `self._records` is empty). This routine does not, because partitions
        can be processed separately (by passing a `partitions` list to the
        `getall()` call of the consumer), which can end up in a long wait
        if some partitions (or topics) are processed slower than others.

        """
        try:
            while True:
                # Replace the consuming signal future for this pass
                self._wait_consume_future = asyncio.Future(loop=self._loop)

                # Create the fetch requests and send what can be sent
                requests, timeout = self._create_fetch_requests()
                for node_id, request in requests:
                    node_ready = yield from self._client.ready(node_id)
                    if not node_ready:
                        # left for the next pass of the routine
                        continue
                    log.debug("Sending FetchRequest to node %s", node_id)
                    processing = self._proc_fetch_request(node_id, request)
                    task = ensure_future(processing, loop=self._loop)
                    self._fetch_tasks.add(task)
                    self._in_flight.add(node_id)

                awaited = chain(
                    self._fetch_tasks, [self._wait_consume_future])
                done_set, _ = yield from asyncio.wait(
                    awaited,
                    loop=self._loop,
                    timeout=timeout,
                    return_when=asyncio.FIRST_COMPLETED)

                # Only finished fetch tasks carry results to process
                finished_fetches = self._fetch_tasks.intersection(done_set)
                if not finished_fetches:
                    continue
                if any(fetch.result() for fetch in finished_fetches):
                    # some messages were added to self._records,
                    # wake up getters
                    self._notify(self._wait_empty_future)
                self._fetch_tasks -= finished_fetches
        except asyncio.CancelledError:
            pass
        except Exception:  # pragma: no cover
            log.error("Unexpected error in fetcher routine", exc_info=True)
Beispiel #19
0
    def __init__(self, client, subscriptions, *, loop,
                 key_deserializer=None, value_deserializer=None,
                 fetch_min_bytes=1, fetch_max_wait_ms=500,
                 max_partition_fetch_bytes=1048576, check_crcs=True,
                 fetcher_timeout=0.2, prefetch_backoff=0.1,
                 retry_backoff_ms=100):
        """Set up a Kafka message fetcher and schedule its background routine.

        Besides storing the configuration, the constructor picks the
        FetchRequest version from ``client.api_version`` and immediately
        schedules ``_fetch_requests_routine()`` on ``loop``.

        Parameters:
            client (AIOKafkaClient): kafka client
            subscriptions (SubscriptionState): instance of SubscriptionState
                located in kafka.consumer.subscription_state
            loop: event loop on which the background fetch task is scheduled.
            key_deserializer (callable, optional): Any callable that takes a
                raw message key and returns a deserialized key.
            value_deserializer (callable, optional): Any callable that takes
                a raw message value and returns a deserialized value.
            fetch_min_bytes (int): Minimum amount of data the server should
                return for a fetch request; with less data available it
                waits up to fetch_max_wait_ms for more to accumulate.
                Default: 1.
            fetch_max_wait_ms (int): Upper bound, in milliseconds, on how
                long the server blocks before answering a fetch request
                when fetch_min_bytes cannot be satisfied immediately.
                Default: 500.
            max_partition_fetch_bytes (int): Maximum amount of data the
                server returns per partition. The maximum total memory used
                for a request = #partitions * max_partition_fetch_bytes.
                This must be at least as large as the maximum message size
                the server allows; otherwise a producer may send messages
                larger than the consumer can fetch, and the consumer can
                get stuck on that partition. Default: 1048576.
            check_crcs (bool): Verify the CRC32 of consumed records to
                detect on-the-wire or on-disk corruption. Adds some
                overhead, so it may be disabled when seeking extreme
                performance. Default: True
            fetcher_timeout (float): Maximum polling interval in the
                background fetching routine. Default: 0.2
            prefetch_backoff (float): number of seconds to wait until
                consumption of partition is paused. Paused partitions will
                not request new data from Kafka server (will not be
                included in next poll request). Default: 0.1
            retry_backoff_ms (int): Back-off interval in milliseconds; kept
                internally in seconds as ``_retry_backoff``. Default: 100.
        """
        # Collaborating objects
        self._client = client
        self._subscriptions = subscriptions
        self._loop = loop

        # Optional deserialization hooks
        self._key_deserializer = key_deserializer
        self._value_deserializer = value_deserializer

        # Fetch request tuning
        self._fetch_min_bytes = fetch_min_bytes
        self._fetch_max_wait_ms = fetch_max_wait_ms
        self._max_partition_fetch_bytes = max_partition_fetch_bytes
        self._check_crcs = check_crcs

        # Timing knobs; the retry back-off is converted from ms to seconds
        self._fetcher_timeout = fetcher_timeout
        self._prefetch_backoff = prefetch_backoff
        self._retry_backoff = retry_backoff_ms / 1000

        # Mutable state, all initially empty / unset
        self._records = collections.OrderedDict()
        self._in_flight = set()
        self._fetch_tasks = set()
        self._wait_consume_future = None
        self._wait_empty_future = None

        # Version 2 of the fetch request is used from api_version (0, 10) on
        if client.api_version >= (0, 10):
            fetch_api_version = 2
        else:
            fetch_api_version = 1
        self._fetch_request_class = FetchRequest[fetch_api_version]

        # Schedule the background routine and keep a handle on its task
        routine = self._fetch_requests_routine()
        self._fetch_task = ensure_future(routine, loop=loop)
Beispiel #20
0
    def __init__(self,
                 client,
                 subscription,
                 *,
                 loop,
                 group_id='aiokafka-default-group',
                 session_timeout_ms=30000,
                 heartbeat_interval_ms=3000,
                 retry_backoff_ms=100,
                 enable_auto_commit=True,
                 auto_commit_interval_ms=5000,
                 assignors=(RoundRobinPartitionAssignor,)):
        """Set up the coordination manager and schedule its background tasks.

        The constructor applies the currently known cluster metadata,
        registers itself as a metadata listener, schedules the heartbeat
        routine and, when auto commit is enabled, the auto-commit routine.

        Parameters:
            client (AIOKafkaClient): kafka client
            subscription (SubscriptionState): instance of SubscriptionState
                located in kafka.consumer.subscription_state
            loop: event loop passed to the asyncio primitives and tasks
                created here.
            group_id (str): name of the consumer group to join for dynamic
                partition assignment (if enabled), and to use for fetching
                and committing offsets. Default: 'aiokafka-default-group'
            session_timeout_ms (int): The timeout used to detect failures
                when using Kafka's group management facilities.
                Default: 30000
            heartbeat_interval_ms (int): The expected time in milliseconds
                between heartbeats to the consumer coordinator when using
                Kafka's group management feature. Heartbeats keep the
                consumer's session active and facilitate rebalancing when
                consumers join or leave the group. The value must be lower
                than session_timeout_ms, and typically no higher than 1/3
                of it. It can be set even lower to control the expected
                time for normal rebalances. Default: 3000
            retry_backoff_ms (int): Milliseconds to backoff when retrying
                on errors. Default: 100.
            enable_auto_commit (bool): If true the consumer's offset will
                be periodically committed in the background. Default: True.
            auto_commit_interval_ms (int): milliseconds between automatic
                offset commits, if enable_auto_commit is True.
                Default: 5000.
            assignors (sequence): Objects used to distribute partition
                ownership amongst consumer instances when group management
                is used. Default: (RoundRobinPartitionAssignor,)
        """
        # Collaborators
        self.loop = loop
        self._client = client
        self._cluster = client.cluster
        self._subscription = subscription

        # Configuration
        self.group_id = group_id
        self._session_timeout_ms = session_timeout_ms
        self._heartbeat_interval_ms = heartbeat_interval_ms
        self._retry_backoff_ms = retry_backoff_ms
        self._enable_auto_commit = enable_auto_commit
        self._auto_commit_interval_ms = auto_commit_interval_ms
        self._assignors = assignors

        # Group membership state before the first join
        self.generation = OffsetCommitRequest.DEFAULT_GENERATION_ID
        self.member_id = JoinGroupRequest.UNKNOWN_MEMBER_ID
        self.coordinator_id = None
        self.rejoin_needed = True
        self.needs_join_prepare = True
        self._partitions_per_topic = {}
        self._auto_commit_task = None

        # A rejoin may be requested concurrently by the consumer and by the
        # heartbeat task, hence the lock
        self._rejoin_lock = asyncio.Lock(loop=loop)
        # Future that signals the heartbeat task to finish ASAP
        self._closing = asyncio.Future(loop=loop)

        # Apply the metadata known right now, then subscribe to later updates
        self._handle_metadata_update(client.cluster)
        self._cluster.add_listener(self._handle_metadata_update)
        self._group_rebalanced_callback = None

        self.heartbeat_task = ensure_future(
            self._heartbeat_task_routine(), loop=loop)

        if not self._enable_auto_commit:
            return
        # The auto-commit routine receives its interval in seconds
        commit_period = self._auto_commit_interval_ms / 1000
        self._auto_commit_task = ensure_future(
            self.auto_commit_routine(commit_period), loop=loop)
Beispiel #21
0
 def connect(self):
     """Open a stream connection to ``self.host``:``self.port``.

     Written as a generator-based coroutine (``yield from``). The connection
     attempt is wrapped in ``asyncio.wait_for`` with
     ``self._request_timeout`` as the timeout. On success the resulting
     stream reader and writer are stored in ``self._reader`` and
     ``self._writer``, and ``self._read()`` is scheduled on ``self._loop``
     with the task kept in ``self._read_task``.
     """
     # Not awaited here: wait_for below bounds the whole connection attempt.
     future = asyncio.open_connection(self.host, self.port, loop=self._loop)
     self._reader, self._writer = yield from asyncio.wait_for(
         future, self._request_timeout, loop=self._loop)
     # Schedule the read routine and keep a handle on its task.
     self._read_task = ensure_future(self._read(), loop=self._loop)