Esempio n. 1
0
    def fetch_messages(self):
        """Fetch and yield messages for every configured topic/partition.

        One FetchRequest is built per (topic, partition) in ``self._topics``
        using the current fetch offset, the requests are sent through the
        client, and each message of every successful response is passed
        through the configured `deserializer_class` before being yielded.

        Returns:
            Generator that yields KafkaMessage structs
            (topic, partition, offset, key, deserialized value)

        Raises:
            KafkaConfigurationError: if no topics/partitions are configured
                or no fetch offsets are known.  Because this is a generator,
                the error surfaces on the first iteration, not on the call.

        Note:
            Metadata is refreshed on FailedPayloadsError and
            NotLeaderForPartitionError; on OffsetOutOfRange the fetch offset
            is replaced by the result of `_reset_partition_offset`.
            Responses that hit one of these errors (or RequestTimedOutError)
            are logged and skipped.

        See Also:
            Configuration keys read by this method:
            * `fetch_message_max_bytes`
            * `fetch_wait_max_ms`
            * `fetch_min_bytes`
            * `deserializer_class`
        """
        max_bytes = self._config['fetch_message_max_bytes']
        max_wait_time = self._config['fetch_wait_max_ms']
        min_bytes = self._config['fetch_min_bytes']

        if not self._topics:
            raise KafkaConfigurationError('No topics or partitions configured')

        if not self._offsets.fetch:
            raise KafkaConfigurationError(
                'No fetch offsets found when calling fetch_messages')

        # One request per consumed topic/partition, starting at its current
        # fetch offset.
        fetches = []
        for (topic, partition) in self._topics:
            start_offset = self._offsets.fetch[(topic, partition)]
            fetches.append(
                FetchRequest(topic, partition, start_offset, max_bytes))

        # send_fetch_request will batch topic/partition requests by leader
        responses = self._client.send_fetch_request(
            fetches,
            max_wait_time=max_wait_time,
            min_bytes=min_bytes,
            fail_on_error=False)

        for response in responses:

            # With fail_on_error=False a failure arrives as an item of the
            # response list rather than as a raised exception.
            if isinstance(response, FailedPayloadsError):
                logger.warning('FailedPayloadsError attempting to fetch data')
                self._refresh_metadata_on_error()
                continue

            topic = kafka_bytestring(response.topic)
            partition = response.partition
            tp_key = (topic, partition)

            try:
                check_error(response)
            except OffsetOutOfRangeError:
                logger.warning(
                    'OffsetOutOfRange: topic %s, partition %d, '
                    'offset %d (Highwatermark: %d)', topic, partition,
                    self._offsets.fetch[tp_key],
                    response.highwaterMark)
                # Replace the invalid offset before the next fetch
                self._offsets.fetch[tp_key] = (
                    self._reset_partition_offset(tp_key))
                continue
            except NotLeaderForPartitionError:
                logger.warning(
                    "NotLeaderForPartitionError for %s - %d. "
                    "Metadata may be out of date", topic, partition)
                self._refresh_metadata_on_error()
                continue
            except RequestTimedOutError:
                logger.warning("RequestTimedOutError for %s - %d", topic,
                               partition)
                continue

            # Remember the highwater mark reported by the server
            self._offsets.highwater[tp_key] = response.highwaterMark

            # Exceptions raised while iterating the response messages are
            # deliberately not caught here -- the caller has to handle them.
            for (offset, message) in response.messages:
                # deserializer_class may raise; the fetch offset is then
                # left untouched for this message
                value = self._config['deserializer_class'](message.value)
                kafka_msg = KafkaMessage(topic, partition, offset,
                                         message.key, value)

                # The server may return messages older than the requested
                # offset; those are skipped.
                if offset < self._offsets.fetch[tp_key]:
                    logger.debug(
                        'message offset less than fetched offset '
                        'skipping: %s', kafka_msg)
                    continue

                # Advance the fetch offset only once the message has been
                # received and deserialized, then hand it to the caller.
                self._offsets.fetch[tp_key] = offset + 1
                yield kafka_msg
Esempio n. 2
0
    def set_topic_partitions(self, *topics):
        """
        Set the topic/partitions to consume
        Optionally specify offsets to start from

        Any previous topic/partition selection and all tracked offsets are
        discarded; cluster metadata is reloaded through the client first.

        Accepts types:

        * str (utf-8): topic name (will consume all available partitions)
        * tuple: (topic, partition)
        * dict:
            - { topic: partition }
            - { topic: [partition list] }
            - { topic: (partition tuple,) }

        Optionally, offsets can be specified directly:

        * tuple: (topic, partition, offset)
        * dict:  { (topic, partition): offset, ... }

        Fetch offsets not given explicitly default to the stored commit
        offset (looked up only when a `group_id` is configured) and
        otherwise to the result of `_reset_partition_offset`.

        Raises:
            KafkaConfigurationError: if an argument, a dict key, or a dict
                value for a topic-name key has an unsupported type.

        Example:

        .. code:: python

            kafka = KafkaConsumer()

            # Consume topic1-all; topic2-partition2; topic3-partition0
            kafka.set_topic_partitions("topic1", ("topic2", 2), {"topic3": 0})

            # Consume topic1-0 starting at offset 12, and topic2-1 at offset 45
            # using tuples --
            kafka.set_topic_partitions(("topic1", 0, 12), ("topic2", 1, 45))

            # using dict --
            kafka.set_topic_partitions({ ("topic1", 0): 12, ("topic2", 1): 45 })

        """
        self._topics = []
        self._client.load_metadata_for_topics()

        # Setup offsets
        self._offsets = OffsetsStruct(fetch=dict(),
                                      commit=dict(),
                                      highwater=dict(),
                                      task_done=dict())

        # Handle different topic types
        for arg in topics:

            # Topic name str -- all partitions
            if isinstance(arg, (six.string_types, six.binary_type)):
                topic = kafka_bytestring(arg)

                for partition in self._client.get_partition_ids_for_topic(
                        topic):
                    self._consume_topic_partition(topic, partition)

            # (topic, partition [, offset]) tuple
            elif isinstance(arg, tuple):
                topic = kafka_bytestring(arg[0])
                partition = arg[1]
                self._consume_topic_partition(topic, partition)
                if len(arg) == 3:
                    offset = arg[2]
                    self._offsets.fetch[(topic, partition)] = offset

            # { topic: partitions, ... } dict
            elif isinstance(arg, dict):
                for key, value in six.iteritems(arg):

                    # key can be string (a topic)
                    if isinstance(key, (six.string_types, six.binary_type)):
                        topic = kafka_bytestring(key)

                        # topic: partition
                        # (six.integer_types also accepts py2 ``long``)
                        if isinstance(value, six.integer_types):
                            self._consume_topic_partition(topic, value)

                        # topic: [ partition1, partition2, ... ]
                        elif isinstance(value, (list, tuple)):
                            for partition in value:
                                self._consume_topic_partition(topic, partition)
                        else:
                            raise KafkaConfigurationError(
                                'Unknown topic type '
                                '(dict value must be int or list/tuple of ints)')

                    # (topic, partition): offset
                    elif isinstance(key, tuple):
                        topic = kafka_bytestring(key[0])
                        partition = key[1]
                        self._consume_topic_partition(topic, partition)
                        self._offsets.fetch[(topic, partition)] = value

                    # Anything else used to be dropped silently, leaving the
                    # caller with fewer partitions than requested; fail
                    # loudly like every other unsupported input.
                    else:
                        raise KafkaConfigurationError(
                            'Unknown topic type (dict key must be a topic '
                            'name or (topic, partition) tuple, got %s)' %
                            type(key))

            else:
                raise KafkaConfigurationError('Unknown topic type (%s)' %
                                              type(arg))

        # If we have a consumer group, try to fetch stored offsets
        if self._config['group_id']:
            self._get_commit_offsets()

        # Update missing fetch/commit offsets
        for topic_partition in self._topics:

            # Commit offsets default is None
            if topic_partition not in self._offsets.commit:
                self._offsets.commit[topic_partition] = None

            # Skip if we already have a fetch offset from user args
            if topic_partition not in self._offsets.fetch:

                # Fetch offsets default is (1) commit
                if self._offsets.commit[topic_partition] is not None:
                    self._offsets.fetch[
                        topic_partition] = self._offsets.commit[
                            topic_partition]

                # or (2) auto reset
                else:
                    self._offsets.fetch[
                        topic_partition] = self._reset_partition_offset(
                            topic_partition)

        # highwater marks (received from server on fetch response)
        # and task_done (set locally by user)
        # should always get initialized to None
        self._reset_highwater_offsets()
        self._reset_task_done_offsets()

        # Reset message iterator in case we were in the middle of one
        self._reset_message_iterator()
Esempio n. 3
0
    def commit(self):
        """Commit the offsets marked via task_done() for this consumer group.

        For every topic/partition with a task_done offset, the offset of the
        *next* message to fetch (task_done + 1) is sent to the backend(s)
        selected by the `offset_storage` setting ('zookeeper', 'kafka' or
        'dual' for both).  Partitions whose commit offset would not change
        are left out of the request.

        Returns:
            True if at least one offset was committed, or False if no new
            offsets were found for commit

        Raises:
            KafkaConfigurationError: if no `group_id` is configured.
            Any error raised by `check_error` for a commit response is
            propagated to the caller.

        Note:
            this functionality requires server version >=0.8.1.1
            https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
        """
        group_id = self._config['group_id']
        if not group_id:
            logger.warning('Cannot commit without a group_id!')
            raise KafkaConfigurationError(
                'Attempted to commit offsets '
                'without a configured consumer group (group_id)')

        # The commit API can carry metadata per offset; it is unused for now
        metadata = b''

        pending = []
        for tp_key, done_offset in six.iteritems(self._offsets.task_done):
            # Nothing has been marked done for this partition
            if done_offset is None:
                continue

            # task_done records the last consumed message, so the committed
            # value is the next offset to fetch (consistent with the Java
            # client).
            next_offset = done_offset + 1

            # Already committed -- nothing new to store
            if next_offset == self._offsets.commit[tp_key]:
                continue

            pending.append(
                OffsetCommitRequestPayload(tp_key[0], tp_key[1], next_offset,
                                           metadata))

        if not pending:
            logger.info('No new offsets found to commit in group %s',
                        group_id)
            return False

        logger.info('committing consumer offsets to group %s', group_id)

        storage = self._config['offset_storage']
        responses = []
        if storage in ('zookeeper', 'dual'):
            responses.extend(
                self._client.send_offset_commit_request(
                    group_id,
                    pending,
                    fail_on_error=False,
                ))
        if storage in ('kafka', 'dual'):
            responses.extend(
                self._client.send_offset_commit_request_kafka(
                    group_id,
                    pending,
                    fail_on_error=False,
                ))

        # Record the new commit offset for every response that passes the
        # error check; the first failing response aborts the loop.
        for response in responses:
            check_error(response)
            tp_key = (response.topic, response.partition)
            done_offset = self._offsets.task_done[tp_key]
            self._offsets.commit[tp_key] = (done_offset + 1)

        if self._config['auto_commit_enable']:
            self._reset_auto_commit()

        return True
Esempio n. 4
0
    def configure(self, **configs):
        """Configure the consumer instance

        Every key of ``DEFAULT_CONFIG`` is taken from the keyword arguments
        when given and from the defaults otherwise.  Afterwards the metrics
        objects and the underlying ``SimpleClient`` are created.

        Keyword Arguments:
            bootstrap_servers (list): List of initial broker nodes the consumer
                should contact to bootstrap initial cluster metadata.  This does
                not have to be the full node list.  It just needs to have at
                least one broker that will respond to a Metadata API Request.
            client_id (str): a unique name for this client.  Defaults to
                'kafka.consumer.kafka'.
            group_id (str): the name of the consumer group to join,
                Offsets are fetched / committed to this group name.
            fetch_message_max_bytes (int, optional): Maximum bytes for each
                topic/partition fetch request.  Defaults to 1024*1024.
            fetch_min_bytes (int, optional): Minimum amount of data the server
                should return for a fetch request, otherwise wait up to
                fetch_wait_max_ms for more data to accumulate.  Defaults to 1.
            fetch_wait_max_ms (int, optional): Maximum time for the server to
                block waiting for fetch_min_bytes messages to accumulate.
                Defaults to 100.
            refresh_leader_backoff_ms (int, optional): Milliseconds to backoff
                when refreshing metadata on errors (subject to random jitter).
                Defaults to 200.
            socket_timeout_ms (int, optional): TCP socket timeout in
                milliseconds.  Defaults to 30*1000.
            auto_offset_reset (str, optional): A policy for resetting offsets on
                OffsetOutOfRange errors. 'smallest' will move to the oldest
                available message, 'largest' will move to the most recent.  Any
                other value will raise the exception.  Defaults to 'largest'.
            deserializer_class (callable, optional):  Any callable that takes a
                raw message value and returns a deserialized value.  Defaults to
                lambda msg: msg.
            auto_commit_enable (bool, optional): Enabling auto-commit will cause
                the KafkaConsumer to periodically commit offsets without an
                explicit call to commit().  Requires group_id.  Defaults to
                False.
            auto_commit_interval_ms (int, optional):  If auto_commit_enable,
                the milliseconds between automatic offset commits.  Defaults to
                60 * 1000.
            auto_commit_interval_messages (int, optional): If
                auto_commit_enable, a number of messages consumed between
                automatic offset commits.  Defaults to None (disabled).
            consumer_timeout_ms (int, optional): number of milliseconds after
                which a timeout exception is thrown to the consumer if no
                message is available for consumption.  Defaults to -1 (don't
                throw exception).
            metrics_reporter (callable, optional): if set, it is called with
                no arguments and the result is passed to Metrics as its only
                reporter.

        Raises:
            KafkaConfigurationError: on unknown configuration keys, when
                auto-commit is enabled without a group_id, or when
                bootstrap_servers is empty.

        Configuration parameters are described in more detail at
        http://kafka.apache.org/documentation.html#highlevelconsumerapi
        """
        remaining = self._deprecate_configs(**configs)

        # Pull every known key out of the supplied settings; whatever is
        # left afterwards is not a recognised option.
        defaults = self.DEFAULT_CONFIG
        self._config = {}
        for name in defaults:
            self._config[name] = remaining.pop(name, defaults[name])

        if remaining:
            raise KafkaConfigurationError('Unknown configuration key(s): ' +
                                          str(list(remaining.keys())))

        # Auto-commit needs a consumer group to commit to
        if self._config['auto_commit_enable']:
            if not self._config['group_id']:
                raise KafkaConfigurationError(
                    'KafkaConsumer configured to auto-commit '
                    'without required consumer group (group_id)')
            logger.info("Configuring consumer to auto-commit offsets")
            self._reset_auto_commit()

        if not self._config['bootstrap_servers']:
            raise KafkaConfigurationError(
                'bootstrap_servers required to configure KafkaConsumer')

        # Instantiate the optional reporter and wire up consumer metrics
        if self._config['metrics_reporter']:
            reporters = [self._config['metrics_reporter']()]
        else:
            reporters = []
        metrics = Metrics(reporters=reporters)
        self.metrics = KafkaConsumerMetrics(metrics)

        # socket_timeout_ms is in milliseconds; the client takes seconds
        self._client = SimpleClient(
            self._config['bootstrap_servers'],
            client_id=self._config['client_id'],
            timeout=(self._config['socket_timeout_ms'] / 1000.0),
            metrics=metrics,
        )
Esempio n. 5
0
    def fetch_messages(self):
        """
        Sends FetchRequests for all topic/partitions set for consumption
        Returns a generator that yields KafkaMessage structs
        after deserializing with the configured `deserializer_class`

        Refreshes metadata on errors, and resets fetch offset on
        OffsetOutOfRange, per the configured `auto_offset_reset` policy

        A FailedPayloadsError is handled both when the client raises it and
        when it is returned as an item of the response list (the request is
        sent with ``fail_on_error=False``); in either case it is logged and
        metadata is refreshed.

        Raises KafkaConfigurationError if no fetch offsets are available.
        Because this is a generator, the error surfaces on first iteration.

        Key configuration parameters:

        * `fetch_message_max_bytes`
        * `fetch_wait_max_ms`
        * `fetch_min_bytes`
        * `deserializer_class`
        * `auto_offset_reset`
        """

        max_bytes = self._config['fetch_message_max_bytes']
        max_wait_time = self._config['fetch_wait_max_ms']
        min_bytes = self._config['fetch_min_bytes']

        # Get current fetch offsets
        offsets = self._offsets.fetch
        if not offsets:
            if not self._topics:
                raise KafkaConfigurationError(
                    'No topics or partitions configured')
            raise KafkaConfigurationError(
                'No fetch offsets found when calling fetch_messages')

        # One request per topic/partition, starting at its fetch offset
        fetches = [
            FetchRequest(topic_partition[0], topic_partition[1], offset,
                         max_bytes)
            for topic_partition, offset in six.iteritems(offsets)
        ]

        # client.send_fetch_request will collect topic/partition requests by leader
        # and send each group as a single FetchRequest to the correct broker
        try:
            responses = self._client.send_fetch_request(
                fetches,
                max_wait_time=max_wait_time,
                min_bytes=min_bytes,
                fail_on_error=False)
        except FailedPayloadsError:
            logger.warning(
                'FailedPayloadsError attempting to fetch data from kafka')
            self._refresh_metadata_on_error()
            return

        for resp in responses:
            # With fail_on_error=False a failure may come back as an item of
            # the response list instead of being raised.  Such an item has
            # no topic/partition, so it must be handled before they are read.
            if isinstance(resp, FailedPayloadsError):
                logger.warning(
                    'FailedPayloadsError attempting to fetch data from kafka')
                self._refresh_metadata_on_error()
                continue

            topic_partition = (resp.topic, resp.partition)
            try:
                check_error(resp)
            except OffsetOutOfRangeError:
                logger.warning(
                    'OffsetOutOfRange: topic %s, partition %d, offset %d '
                    '(Highwatermark: %d)', resp.topic, resp.partition,
                    offsets[topic_partition], resp.highwaterMark)
                # Reset offset
                self._offsets.fetch[
                    topic_partition] = self._reset_partition_offset(
                        topic_partition)
                continue

            except NotLeaderForPartitionError:
                logger.warning(
                    "NotLeaderForPartitionError for %s - %d. "
                    "Metadata may be out of date", resp.topic, resp.partition)
                self._refresh_metadata_on_error()
                continue

            except RequestTimedOutError:
                logger.warning("RequestTimedOutError for %s - %d", resp.topic,
                               resp.partition)
                continue

            # Track server highwater mark
            self._offsets.highwater[topic_partition] = resp.highwaterMark

            # Yield each message
            # Kafka-python could raise an exception during iteration
            # we are not catching -- user will need to address
            for (offset, message) in resp.messages:
                # deserializer_class could raise an exception here
                msg = KafkaMessage(
                    resp.topic, resp.partition, offset, message.key,
                    self._config['deserializer_class'](message.value))

                # The server may return messages older than the requested
                # offset; skip them
                if offset < self._offsets.fetch[topic_partition]:
                    logger.debug(
                        'Skipping message %s because its offset is less than the consumer offset',
                        msg)
                    continue
                # Only increment fetch offset if we safely got the message and deserialized
                self._offsets.fetch[topic_partition] = offset + 1

                # Then yield to user
                yield msg