Example #1
    def __init__(self,
                 client,
                 group,
                 topic,
                 partitions=None,
                 auto_commit=True,
                 auto_commit_every_n=AUTO_COMMIT_MSG_COUNT,
                 auto_commit_every_t=AUTO_COMMIT_INTERVAL):

        self.client = client
        self.topic = kafka_bytestring(topic)
        self.group = None if group is None else kafka_bytestring(group)
        self.client.load_metadata_for_topics(topic)
        self.offsets = {}

        if partitions is None:
            partitions = self.client.get_partition_ids_for_topic(topic)
        else:
            assert all(isinstance(x, numbers.Integral) for x in partitions)

        # Variables for handling offset commits
        self.commit_lock = Lock()
        self.commit_timer = None
        self.count_since_commit = 0
        self.auto_commit = auto_commit
        self.auto_commit_every_n = auto_commit_every_n
        self.auto_commit_every_t = auto_commit_every_t

        # Set up the auto-commit timer
        if auto_commit is True and auto_commit_every_t is not None:
            self.commit_timer = ReentrantTimer(auto_commit_every_t,
                                               self.commit)
            self.commit_timer.start()

        # Set initial offsets
        if self.group is not None:
            self.fetch_last_known_offsets(partitions)
        else:
            for partition in partitions:
                self.offsets[partition] = 0

        # Register a cleanup handler
        def cleanup(obj):
            obj.stop()

        self._cleanup_func = cleanup
        atexit.register(cleanup, self)

        self.partition_info = False  # Do not return partition info in msgs
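
A minimal usage sketch for this constructor, assuming the 0.9.x-era kafka-python API (KafkaClient and SimpleConsumer importable from the top-level kafka package) and a broker on localhost:9092 -- both assumptions, not shown in the example:

    from kafka import KafkaClient, SimpleConsumer

    client = KafkaClient('localhost:9092')  # bootstraps metadata on init
    consumer = SimpleConsumer(client, 'my-group', 'my-topic',
                              auto_commit=True,
                              auto_commit_every_n=100,   # commit every 100 msgs
                              auto_commit_every_t=5000)  # ...or every 5000 ms

    for message in consumer:  # yields OffsetAndMessage tuples
        print(message.offset, message.message.value)

    consumer.stop()  # also runs at interpreter exit via the atexit hook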
Example #2
    def _consume_topic_partition(self, topic, partition):
        topic = kafka_bytestring(topic)
        if not isinstance(partition, int):
            raise KafkaConfigurationError('Unknown partition type (%s) '
                                          '-- expected int' % type(partition))

        if topic not in self._client.topic_partitions:
            raise UnknownTopicOrPartitionError("Topic %s not found in broker metadata" % topic)
        if partition not in self._client.get_partition_ids_for_topic(topic):
            raise UnknownTopicOrPartitionError("Partition %d not found in Topic %s "
                                               "in broker metadata" % (partition, topic))
        logger.info("Configuring consumer to fetch topic '%s', partition %d", topic, partition)
        self._topics.append((topic, partition))
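
For illustration only, a sketch of the two checks this helper enforces; `consumer` is assumed to be an instance of the old-style KafkaConsumer with metadata already loaded, and the partition numbers are hypothetical:

    # Accepted: topic exists and the partition is an int known to the broker
    consumer._consume_topic_partition('my-topic', 0)

    # Raises KafkaConfigurationError: partition given as str, not int
    consumer._consume_topic_partition('my-topic', '0')

    # Raises UnknownTopicOrPartitionError: partition not in broker metadata
    consumer._consume_topic_partition('my-topic', 9999)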
Example #3
    def __init__(self, hosts, client_id=CLIENT_ID,
                 timeout=DEFAULT_SOCKET_TIMEOUT_SECONDS,
                 correlation_id=0):
        # We need one connection to bootstrap
        self.client_id = kafka_bytestring(client_id)
        self.timeout = timeout
        self.hosts = collect_hosts(hosts)
        self.correlation_id = correlation_id

        # create connections only when we need them
        self.conns = {}
        self.brokers = {}            # broker_id -> BrokerMetadata
        self.topics_to_brokers = {}  # TopicAndPartition -> BrokerMetadata
        self.topic_partitions = {}   # topic -> partition -> PartitionMetadata

        self.load_metadata_for_topics()  # bootstrap with all metadata
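
A short construction sketch; collect_hosts in this era of the client accepts a comma-separated "host:port" string (and, in most versions, a list of such strings) -- hedged, check your version:

    from kafka import KafkaClient

    # Single bootstrap broker
    client = KafkaClient('localhost:9092')

    # Several brokers; one reachable host is enough to bootstrap,
    # since load_metadata_for_topics() discovers the rest
    client = KafkaClient('broker1:9092,broker2:9092')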
Example #4
    def send_messages(self, topic, partition, *msg):
        """
        Helper method to send produce requests
        @param: topic, name of topic for produce request -- type str
        @param: partition, partition number for produce request -- type int
        @param: *msg, one or more message payloads -- type bytes
        @returns: ResponseRequest returned by server
        raises on error

        Note that msg type *must* be encoded to bytes by user.
        Passing unicode message will not work, for example
        you should encode before calling send_messages via
        something like `unicode_message.encode('utf-8')`

        All messages produced via this method will set the message 'key' to Null
        """
        topic = kafka_bytestring(topic)
        return self._send_messages(topic, partition, *msg)
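
A usage sketch emphasizing the bytes requirement from the docstring. It assumes the base Producer class (which takes an explicit partition, as in the signature above) lives at kafka.producer.base -- an assumption about the import path:

    from kafka import KafkaClient
    from kafka.producer.base import Producer  # assumed import path

    client = KafkaClient('localhost:9092')
    producer = Producer(client)

    # Payloads must already be bytes; encode unicode explicitly
    producer.send_messages('my-topic', 0,
                           u'héllo wörld'.encode('utf-8'),
                           b'already-bytes')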
Example #5
    def _get_commit_offsets(self):
        logger.info("Consumer fetching stored offsets")
        for topic_partition in self._topics:
            (resp,) = self._client.send_offset_fetch_request(
                kafka_bytestring(self._config['group_id']),
                [OffsetFetchRequest(topic_partition[0], topic_partition[1])],
                fail_on_error=False)
            try:
                check_error(resp)
            # API spec says the server won't set an error here,
            # but 0.8.1.1 actually does
            except UnknownTopicOrPartitionError:
                pass

            # -1 offset signals no commit is currently stored
            if resp.offset == -1:
                self._offsets.commit[topic_partition] = None

            # Otherwise we committed the stored offset
            # and need to fetch the next one
            else:
                self._offsets.commit[topic_partition] = resp.offset
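
The -1 handling above encodes a protocol convention worth stating in isolation; a minimal restatement (the helper name is mine, not the library's):

    def stored_offset_or_none(resp_offset):
        # The broker returns -1 when the group has no committed offset
        # for a partition; normalize that sentinel to None
        return None if resp_offset == -1 else resp_offset

    assert stored_offset_or_none(-1) is None
    assert stored_offset_or_none(42) == 42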
Example #6
    def commit(self):
        """Store consumed message offsets (marked via task_done())
        to kafka cluster for this consumer_group.

        Returns:
            True on success, or False if no offsets were found for commit

        Note:
            this functionality requires server version >=0.8.1.1
            https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
        """
        if not self._config['group_id']:
            logger.warning('Cannot commit without a group_id!')
            raise KafkaConfigurationError(
                'Attempted to commit offsets '
                'without a configured consumer group (group_id)'
            )

        # API supports storing metadata with each commit
        # but for now it is unused
        metadata = b''

        offsets = self._offsets.task_done
        commits = []
        for topic_partition, task_done_offset in six.iteritems(offsets):

            # Skip if None
            if task_done_offset is None:
                continue

            # Commit offsets as the next offset to fetch
            # which is consistent with the Java Client
            # task_done is marked by messages consumed,
            # so add one to mark the next message for fetching
            commit_offset = (task_done_offset + 1)

            # Skip if no change from previous committed
            if commit_offset == self._offsets.commit[topic_partition]:
                continue

            commits.append(
                OffsetCommitRequest(topic_partition[0], topic_partition[1],
                                    commit_offset, metadata)
            )

        if commits:
            logger.info('committing consumer offsets to group %s', self._config['group_id'])
            resps = self._client.send_offset_commit_request(
                kafka_bytestring(self._config['group_id']), commits,
                fail_on_error=False
            )

            for r in resps:
                check_error(r)
                topic_partition = (r.topic, r.partition)
                task_done = self._offsets.task_done[topic_partition]
                self._offsets.commit[topic_partition] = (task_done + 1)

            if self._config['auto_commit_enable']:
                self._reset_auto_commit()

            return True

        else:
            logger.info('No new offsets found to commit in group %s', self._config['group_id'])
            return False
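
A consumption loop matching this commit model, assuming the old-style KafkaConsumer whose config keys appear above (group_id, auto_commit_enable) and a hypothetical process() handler:

    consumer = KafkaConsumer('my-topic',
                             group_id='my-group',
                             bootstrap_servers=['localhost:9092'],
                             auto_commit_enable=False)

    for msg in consumer:
        process(msg)             # hypothetical application handler
        consumer.task_done(msg)  # mark consumed; commit() stores offset + 1

    consumer.commit()  # returns False if there is nothing new to commit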
Example #7
    def fetch_messages(self):
        """Sends FetchRequests for all topic/partitions set for consumption

        Returns:
            Generator that yields KafkaMessage structs
            after deserializing with the configured `deserializer_class`

        Note:
            Refreshes metadata on errors, and resets fetch offset on
            OffsetOutOfRange, per the configured `auto_offset_reset` policy

        See Also:
            Key KafkaConsumer configuration parameters:
            * `fetch_message_max_bytes`
            * `fetch_wait_max_ms`
            * `fetch_min_bytes`
            * `deserializer_class`
            * `auto_offset_reset`

        """

        max_bytes = self._config['fetch_message_max_bytes']
        max_wait_time = self._config['fetch_wait_max_ms']
        min_bytes = self._config['fetch_min_bytes']

        if not self._topics:
            raise KafkaConfigurationError('No topics or partitions configured')

        if not self._offsets.fetch:
            raise KafkaConfigurationError(
                'No fetch offsets found when calling fetch_messages'
            )

        fetches = [FetchRequest(topic, partition,
                                self._offsets.fetch[(topic, partition)],
                                max_bytes)
                   for (topic, partition) in self._topics]

        # send_fetch_request will batch topic/partition requests by leader
        responses = self._client.send_fetch_request(
            fetches,
            max_wait_time=max_wait_time,
            min_bytes=min_bytes,
            fail_on_error=False
        )

        for resp in responses:

            if isinstance(resp, FailedPayloadsError):
                logger.warning('FailedPayloadsError attempting to fetch data')
                self._refresh_metadata_on_error()
                continue

            topic = kafka_bytestring(resp.topic)
            partition = resp.partition
            try:
                check_error(resp)
            except OffsetOutOfRangeError:
                logger.warning('OffsetOutOfRange: topic %s, partition %d, '
                               'offset %d (Highwatermark: %d)',
                               topic, partition,
                               self._offsets.fetch[(topic, partition)],
                               resp.highwaterMark)
                # Reset offset
                self._offsets.fetch[(topic, partition)] = (
                    self._reset_partition_offset((topic, partition))
                )
                continue

            except NotLeaderForPartitionError:
                logger.warning("NotLeaderForPartitionError for %s - %d. "
                               "Metadata may be out of date",
                               topic, partition)
                self._refresh_metadata_on_error()
                continue

            except RequestTimedOutError:
                logger.warning("RequestTimedOutError for %s - %d",
                               topic, partition)
                continue

            # Track server highwater mark
            self._offsets.highwater[(topic, partition)] = resp.highwaterMark

            # Yield each message
            # kafka-python could raise an exception during iteration
            # that we are not catching -- the user will need to address it
            for (offset, message) in resp.messages:
                # deserializer_class could raise an exception here
                val = self._config['deserializer_class'](message.value)
                msg = KafkaMessage(topic, partition, offset, message.key, val)

                # in some cases the server will return earlier messages
                # than we requested. skip them per kafka spec
                if offset < self._offsets.fetch[(topic, partition)]:
                    logger.debug('message offset less than fetched offset '
                                 'skipping: %s', msg)
                    continue
                # Only increment fetch offset
                # if we safely got the message and deserialized
                self._offsets.fetch[(topic, partition)] = offset + 1

                # Then yield to user
                yield msg
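
The generator is consumed like any other iterable; a short sketch using the KafkaMessage field order from the code above:

    for msg in consumer.fetch_messages():
        # KafkaMessage(topic, partition, offset, key, value)
        print(msg.topic, msg.partition, msg.offset, msg.value)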
Example #8
    def set_topic_partitions(self, *topics):
        """
        Set the topic/partitions to consume
        Optionally specify offsets to start from

        Accepts types:

        * str (utf-8): topic name (will consume all available partitions)
        * tuple: (topic, partition)
        * dict:
            - { topic: partition }
            - { topic: [partition list] }
            - { topic: (partition tuple,) }

        Optionally, offsets can be specified directly:

        * tuple: (topic, partition, offset)
        * dict:  { (topic, partition): offset, ... }

        Example:

        .. code:: python

            kafka = KafkaConsumer()

            # Consume topic1-all; topic2-partition2; topic3-partition0
            kafka.set_topic_partitions("topic1", ("topic2", 2), {"topic3": 0})

            # Consume topic1-0 starting at offset 12, and topic2-1 at offset 45
            # using tuples --
            kafka.set_topic_partitions(("topic1", 0, 12), ("topic2", 1, 45))

            # using dict --
            kafka.set_topic_partitions({ ("topic1", 0): 12, ("topic2", 1): 45 })

        """
        self._topics = []
        self._client.load_metadata_for_topics()

        # Setup offsets
        self._offsets = OffsetsStruct(fetch=dict(),
                                      commit=dict(),
                                      highwater=dict(),
                                      task_done=dict())

        # Handle different topic types
        for arg in topics:

            # Topic name str -- all partitions
            if isinstance(arg, (six.string_types, six.binary_type)):
                topic = kafka_bytestring(arg)

                for partition in self._client.get_partition_ids_for_topic(topic):
                    self._consume_topic_partition(topic, partition)

            # (topic, partition [, offset]) tuple
            elif isinstance(arg, tuple):
                topic = kafka_bytestring(arg[0])
                partition = arg[1]
                self._consume_topic_partition(topic, partition)
                if len(arg) == 3:
                    offset = arg[2]
                    self._offsets.fetch[(topic, partition)] = offset

            # { topic: partitions, ... } dict
            elif isinstance(arg, dict):
                for key, value in six.iteritems(arg):

                    # key can be string (a topic)
                    if isinstance(key, (six.string_types, six.binary_type)):
                        topic = kafka_bytestring(key)

                        # topic: partition
                        if isinstance(value, int):
                            self._consume_topic_partition(topic, value)

                        # topic: [ partition1, partition2, ... ]
                        elif isinstance(value, (list, tuple)):
                            for partition in value:
                                self._consume_topic_partition(topic, partition)
                        else:
                            raise KafkaConfigurationError(
                                'Unknown partition type '
                                '(dict value must be int or list/tuple of ints)'
                            )

                    # (topic, partition): offset
                    elif isinstance(key, tuple):
                        topic = kafka_bytestring(key[0])
                        partition = key[1]
                        self._consume_topic_partition(topic, partition)
                        self._offsets.fetch[(topic, partition)] = value

            else:
                raise KafkaConfigurationError('Unknown topic type (%s)' % type(arg))

        # If we have a consumer group, try to fetch stored offsets
        if self._config['group_id']:
            self._get_commit_offsets()

        # Update missing fetch/commit offsets
        for topic_partition in self._topics:

            # Commit offsets default is None
            if topic_partition not in self._offsets.commit:
                self._offsets.commit[topic_partition] = None

            # Skip if we already have a fetch offset from user args
            if topic_partition not in self._offsets.fetch:

                # Fetch offsets default is (1) commit
                if self._offsets.commit[topic_partition] is not None:
                    self._offsets.fetch[topic_partition] = self._offsets.commit[topic_partition]

                # or (2) auto reset
                else:
                    self._offsets.fetch[topic_partition] = self._reset_partition_offset(topic_partition)

        # highwater marks (received from server on fetch response)
        # and task_done (set locally by user)
        # should always get initialized to None
        self._reset_highwater_offsets()
        self._reset_task_done_offsets()

        # Reset message iterator in case we were in the middle of one
        self._reset_message_iterator()
Example #9
    def load_metadata_for_topics(self, *topics):
        """
        Fetch broker and topic-partition metadata from the server,
        and update internal data:
        broker list, topic/partition list, and topic/partition -> broker map

        This method should be called after receiving any error

        Arguments:
            *topics (optional): If a list of topics is provided,
                the metadata refresh will be limited to the specified topics only.

        Exceptions:
        If the broker is configured to not auto-create topics,
        expect UnknownTopicOrPartitionError for topics that don't exist

        If the broker is configured to auto-create topics,
        expect LeaderNotAvailableError for new topics
        until partitions have been initialized.

        Exceptions *will not* be raised in a full refresh (i.e. no topic list).
        In this case, error codes will be logged as errors.

        Partition-level errors will also not be raised here
        (a single partition w/o a leader, for example)
        """
        topics = [kafka_bytestring(t) for t in topics]

        if topics:
            for topic in topics:
                self.reset_topic_metadata(topic)
        else:
            self.reset_all_metadata()

        resp = self.send_metadata_request(topics)

        log.debug('Updating broker metadata: %s', resp.brokers)
        log.debug('Updating topic metadata: %s', resp.topics)

        self.brokers = dict([(broker.nodeId, broker)
                             for broker in resp.brokers])

        for topic_metadata in resp.topics:
            topic = topic_metadata.topic
            partitions = topic_metadata.partitions

            # Errors expected for new topics
            try:
                kafka_common.check_error(topic_metadata)
            except (UnknownTopicOrPartitionError,
                    LeaderNotAvailableError) as e:

                # Raise if the topic was passed in explicitly
                if topic in topics:
                    raise

                # Otherwise, log the error and skip this topic
                log.error('Error loading topic metadata for %s: %s', topic,
                          type(e))
                continue

            self.topic_partitions[topic] = {}
            for partition_metadata in partitions:
                partition = partition_metadata.partition
                leader = partition_metadata.leader

                self.topic_partitions[topic][partition] = partition_metadata

                # Populate topics_to_brokers dict
                topic_part = TopicAndPartition(topic, partition)

                # Check for partition errors
                try:
                    kafka_common.check_error(partition_metadata)

                # If No Leader, topics_to_brokers topic_partition -> None
                except LeaderNotAvailableError:
                    log.error('No leader for topic %s partition %d', topic,
                              partition)
                    self.topics_to_brokers[topic_part] = None
                    continue
                # If one of the replicas is unavailable -- ignore
                # this error code is provided for admin purposes only
                # we never talk to replicas, only the leader
                except ReplicaNotAvailableError:
                    log.debug(
                        'Some (non-leader) replicas not available for topic %s partition %d',
                        topic, partition)

                # If Known Broker, topic_partition -> BrokerMetadata
                if leader in self.brokers:
                    self.topics_to_brokers[topic_part] = self.brokers[leader]

                # If unknown broker, fake BrokerMetadata so we don't lose the id
                # (not sure how this could happen; the server could be in a bad state)
                else:
                    self.topics_to_brokers[topic_part] = BrokerMetadata(
                        leader, None, None)
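
Per the docstring, a freshly auto-created topic raises LeaderNotAvailableError until its partitions are initialized; a hedged retry pattern (the sleep interval is arbitrary):

    import time
    from kafka.common import LeaderNotAvailableError

    while True:
        try:
            client.load_metadata_for_topics('brand-new-topic')
            break
        except LeaderNotAvailableError:
            time.sleep(0.5)  # leader election still in progress; retry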
Example #10
    def get_partition_ids_for_topic(self, topic):
        topic = kafka_bytestring(topic)
        if topic not in self.topic_partitions:
            return []

        return sorted(list(self.topic_partitions[topic]))
Example #11
    def has_metadata_for_topic(self, topic):
        topic = kafka_bytestring(topic)
        return topic in self.topic_partitions and len(
            self.topic_partitions[topic]) > 0
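
These two accessors are typically used together; a sketch assuming metadata has already been loaded on `client`:

    if client.has_metadata_for_topic('my-topic'):
        for partition in client.get_partition_ids_for_topic('my-topic'):
            print('my-topic', partition)
    else:
        client.load_metadata_for_topics('my-topic')  # refresh, then retry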