Example 1
    def _get_commit_offsets(self):
        logger.info("Consumer fetching stored offsets")
        for topic_partition in self._topics:
            resps = []
            if self._config['offset_storage'] in ('zookeeper', 'dual'):
                resps += self._client.send_offset_fetch_request(
                    kafka_bytestring(self._config['group_id']),
                    [OffsetFetchRequest(topic_partition[0], topic_partition[1])],
                    fail_on_error=False)
            if self._config['offset_storage'] in ('kafka', 'dual'):
                resps += self._client.send_offset_fetch_request_kafka(
                    kafka_bytestring(self._config['group_id']),
                    [OffsetFetchRequest(topic_partition[0], topic_partition[1])],
                    fail_on_error=False)
            try:
                for r in resps:
                    check_error(r)
            # API spec says the server won't set an error here,
            # but 0.8.1.1 actually does...
            except UnknownTopicOrPartitionError:
                pass

            # -1 offset signals no commit is currently stored
            max_offset = max(r.offset for r in resps)
            if max_offset == -1:
                self._offsets.commit[topic_partition] = None

            # Otherwise we committed the stored offset
            # and need to fetch the next one
            else:
                self._offsets.commit[topic_partition] = max_offset
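A brief worked note on the max() above, with illustrative values: in 'dual' mode one response comes back per store, so taking the larger offset keeps whichever store is further ahead.

# Illustrative values only: zookeeper has offset 100 committed,
# kafka has nothing stored yet (-1)
resp_offsets = [100, -1]
max_offset = max(resp_offsets)  # 100 -- the zookeeper commit wins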
Example 2
def _get_topic_offsets(topics, latest):
    """
    :param topics: list of topics
    :param latest: True to fetch latest offsets, False to fetch earliest available
    :return: dict: { (topic, partition): offset, ... }
    """

    # https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetRequest
    # https://cfchou.github.io/blog/2015/04/23/a-closer-look-at-kafka-offsetrequest/
    assert set(topics) <= set(ALL)
    client = get_kafka_client()
    partition_meta = client.topic_partitions

    # only return the offset of the latest message in the partition
    num_offsets = 1
    time_value = -1 if latest else -2

    offsets = {}
    offset_requests = []
    for topic in topics:
        partitions = list(partition_meta.get(topic, {}))
        for partition in partitions:
            offsets[(kafka_bytestring(topic), partition)] = None
            offset_requests.append(
                OffsetRequest(kafka_bytestring(topic), partition, time_value,
                              num_offsets))

    responses = client.send_offset_request(offset_requests)
    for r in responses:
        offsets[(kafka_bytestring(r.topic), r.partition)] = r.offsets[0]

    return offsets
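A minimal usage sketch, assuming get_kafka_client() is configured and 'topic1' is a known topic (the topic name is invented for illustration):

latest = _get_topic_offsets(['topic1'], latest=True)     # time_value -1: newest offsets
earliest = _get_topic_offsets(['topic1'], latest=False)  # time_value -2: oldest offsets
for (topic, partition), offset in latest.items():
    print(topic, partition, offset)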
Example 3
def _commit_offsets_to_watermark(
    kafka_client,
    group,
    topics,
    watermark,
    raise_on_error,
    offset_storage,
):
    topics = _verify_topics_and_partitions(kafka_client, topics, raise_on_error)

    watermark_offsets = get_topics_watermarks(kafka_client, topics, raise_on_error)

    if watermark == HIGH_WATERMARK:
        group_offset_reqs = [
            OffsetCommitRequest(
                kafka_bytestring(topic), partition,
                watermark_offsets[topic][partition].highmark, None
            )
            for topic, partitions in topics.iteritems()
            for partition in partitions
        ]
    elif watermark == LOW_WATERMARK:
        group_offset_reqs = [
            OffsetCommitRequest(
                kafka_bytestring(topic), partition,
                watermark_offsets[topic][partition].lowmark, None
            )
            for topic, partitions in topics.iteritems()
            for partition in partitions
        ]
    else:
        raise ValueError(
            "Unknown watermark: {watermark}".format(watermark=watermark)
        )

    if offset_storage == 'zookeeper' or not offset_storage:
        send_api = kafka_client.send_offset_commit_request
    elif offset_storage == 'kafka':
        send_api = kafka_client.send_offset_commit_request_kafka
    else:
        raise InvalidOffsetStorageError(offset_storage)

    status = []
    if group_offset_reqs:
        status = send_api(
            kafka_bytestring(group),
            group_offset_reqs,
            raise_on_error,
            callback=_check_commit_response_error
        )

    return filter(None, status)
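A hypothetical call, reusing the HIGH_WATERMARK constant and the connected kafka_client from the snippet; the group and topic names are invented. The return value is the list of per-partition commit errors, empty on full success:

errors = _commit_offsets_to_watermark(
    kafka_client, 'my-group', {'topic1': [0, 1]},
    HIGH_WATERMARK, raise_on_error=False, offset_storage='kafka')
for error in errors:
    print(error)  # one entry per partition whose commit failed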
Example 4
    def test_get_partitions_set(self, partitioner):
        with mock.patch('yelp_kafka.partitioner.get_kafka_topics',
                        autospec=True) as mock_topics:
            mock_topics.return_value = {
                kafka_bytestring('topic1'): [0, 1, 2, 3],
                kafka_bytestring('topic2'): [0, 1, 2],
                kafka_bytestring('topic3'): [0, 1, 2, 3],
            }
            actual = partitioner.get_partitions_set()
            assert actual == set([
                'topic1-0', 'topic1-1', 'topic1-2', 'topic1-3', 'topic2-0',
                'topic2-1', 'topic2-2'
            ])
Example 5
    def __init__(self,
                 client,
                 group,
                 topic,
                 partitions=None,
                 auto_commit=True,
                 auto_commit_every_n=AUTO_COMMIT_MSG_COUNT,
                 auto_commit_every_t=AUTO_COMMIT_INTERVAL,
                 offset_storage='zookeeper'):

        self.client = client
        self.topic = kafka_bytestring(topic)
        self.group = None if group is None else kafka_bytestring(group)
        self.client.load_metadata_for_topics(topic)
        self.offsets = {}

        if partitions is None:
            partitions = self.client.get_partition_ids_for_topic(topic)
        else:
            assert all(isinstance(x, numbers.Integral) for x in partitions)

        # Variables for handling offset commits
        self.commit_lock = Lock()
        self.commit_timer = None
        self.count_since_commit = 0
        self.auto_commit = auto_commit
        self.auto_commit_every_n = auto_commit_every_n
        self.auto_commit_every_t = auto_commit_every_t
        self.offset_storage = offset_storage

        # Set up the auto-commit timer
        if auto_commit is True and auto_commit_every_t is not None:
            self.commit_timer = ReentrantTimer(auto_commit_every_t,
                                               self.commit)
            self.commit_timer.start()

        # Set initial offsets
        if self.group is not None:
            self.fetch_last_known_offsets(partitions)
        else:
            for partition in partitions:
                self.offsets[partition] = 0

        # Register a cleanup handler
        def cleanup(obj):
            obj.stop()

        self._cleanup_func = cleanup
        atexit.register(cleanup, self)

        self.partition_info = False  # Do not return partition info in msgs
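The count-based companion to this timer is not shown above; a minimal sketch of what it usually looks like, with the method name assumed:

def _auto_commit(self):
    # Hypothetical helper: commit once enough messages have been consumed
    # since the last commit (count_since_commit is incremented per message)
    if self.auto_commit and self.auto_commit_every_n is not None:
        if self.count_since_commit >= self.auto_commit_every_n:
            self.commit()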
Example 6
def _commit_offsets_to_watermark(
    kafka_client,
    group,
    topics,
    watermark,
    raise_on_error,
    offset_storage,
):
    topics = _verify_topics_and_partitions(kafka_client, topics,
                                           raise_on_error)

    watermark_offsets = get_topics_watermarks(kafka_client, topics,
                                              raise_on_error)

    if watermark == HIGH_WATERMARK:
        group_offset_reqs = [
            OffsetCommitRequest(kafka_bytestring(topic), partition,
                                watermark_offsets[topic][partition].highmark,
                                None)
            for topic, partitions in topics.iteritems()
            for partition in partitions
        ]
    elif watermark == LOW_WATERMARK:
        group_offset_reqs = [
            OffsetCommitRequest(kafka_bytestring(topic), partition,
                                watermark_offsets[topic][partition].lowmark,
                                None)
            for topic, partitions in topics.iteritems()
            for partition in partitions
        ]
    else:
        raise ValueError(
            "Unknown watermark: {watermark}".format(watermark=watermark))

    if offset_storage == 'zookeeper' or not offset_storage:
        send_api = kafka_client.send_offset_commit_request
    elif offset_storage == 'kafka':
        send_api = kafka_client.send_offset_commit_request_kafka
    else:
        raise InvalidOffsetStorageError(offset_storage)

    status = []
    if group_offset_reqs:
        status = send_api(kafka_bytestring(group),
                          group_offset_reqs,
                          raise_on_error,
                          callback=_check_commit_response_error)

    return filter(None, status)
Example 7
    def _send_offset_commit_requests(self, offset_commit_request_list):
        if len(offset_commit_request_list) > 0:
            retry_on_exception(self._consumer_retry_policy,
                               (FailedPayloadsError,),
                               self.kafka_client.send_offset_commit_request,
                               group=kafka_bytestring(self.client_name),
                               payloads=offset_commit_request_list)
Example 8
    def run(self, topic, message, hosts=None):
        """
        Simple round-robin synchronous producer to send one message to one topic.

        :param hosts: Kafka hostname(s) to connect in host:port format.
                      Comma-separated for several hosts.
        :type hosts: ``str``
        :param topic: Kafka Topic to publish the message on.
        :type topic: ``str``
        :param message: The message to publish.
        :type message: ``str``

        :returns: Response data: `topic`, target `partition` where message was sent,
                  `offset` number and `error` code (hopefully 0).
        :rtype: ``dict``
        """

        if hosts:
            _hosts = hosts
        elif self.config.get('hosts', None):
            _hosts = self.config['hosts']
        else:
            raise ValueError("Need to define 'hosts' in either action or in config")

        # set default for empty value
        _client_id = self.config.get('client_id') or self.DEFAULT_CLIENT_ID

        client = KafkaClient(_hosts, client_id=_client_id)
        client.ensure_topic_exists(topic)
        producer = SimpleProducer(client)
        result = producer.send_messages(topic, kafka_bytestring(message))

        if result[0]:
            return result[0].__dict__
Example 9
    def commit_offsets(self, topic_to_partition_offset_map):
        """Commits offset information to kafka.  Allows lower-level control for
        committing offsets.  In general, :meth:`commit_message` or
        :meth:`commit_messages` should be used, but this can be useful when paired with
        :meth:`data_pipeline.position_data.PositionData.topic_to_last_position_info_map`.

        **Example**:
            The `topic_to_partition_offset_map` should be formatted like::

                {
                  'topic1': {0: 83854, 1: 8943892},
                  'topic2': {0: 190898}
                }

        Args:
            topic_to_partition_offset_map (Dict[str, Dict[int, int]]): Maps from
                topics to a partition and offset map for each topic.
        """
        topic_to_partition_offset_map = self._get_offsets_map_to_be_committed(
            topic_to_partition_offset_map)
        return self._send_offset_commit_requests(offset_commit_request_list=[
            OffsetCommitRequest(topic=kafka_bytestring(topic),
                                partition=partition,
                                offset=offset,
                                metadata=None) for topic, partition_map in
            topic_to_partition_offset_map.iteritems()
            for partition, offset in partition_map.iteritems()
        ])
Example 10
    def get_partitions_set(self):
        """ Load partitions metadata from kafka and create
        a set containing "<topic>-<partition_id>"

        :returns: partitions for user topics
        :rtype: set
        :raises PartitionerError: if no partitions have been found
        """
        topic_partitions = get_kafka_topics(self.kafka_client)
        partitions = []
        missing_topics = set()
        for topic in self.topics:
            kafka_topic = kafka_bytestring(topic)
            if kafka_topic not in topic_partitions:
                missing_topics.add(topic)
            else:
                partitions += [
                    "{0}-{1}".format(topic, p)
                    for p in topic_partitions[kafka_topic]
                ]
        if missing_topics:
            self.log.info("Missing topics: %s", missing_topics)
        if not partitions:
            self.release_and_finish()
            raise PartitionerError(
                "No partitions found for topics: {topics}".format(
                    topics=self.topics))
        return set(partitions)
Example 11
    def run(self, topic, message, hosts=None):
        """
        Simple round-robin synchronous producer to send one message to one topic.

        :param hosts: Kafka hostname(s) to connect in host:port format.
                      Comma-separated for several hosts.
        :type hosts: ``str``
        :param topic: Kafka Topic to publish the message on.
        :type topic: ``str``
        :param message: The message to publish.
        :type message: ``str``

        :returns: Response data: `topic`, target `partition` where message was sent,
                  `offset` number and `error` code (hopefully 0).
        :rtype: ``dict``
        """

        if hosts:
            _hosts = hosts
        elif self.config.get('hosts', None):
            _hosts = self.config['hosts']
        else:
            raise ValueError(
                "Need to define 'hosts' in either action or in config")

        # set default for empty value
        _client_id = self.config.get('client_id') or self.DEFAULT_CLIENT_ID

        client = KafkaClient(_hosts, client_id=_client_id)
        client.ensure_topic_exists(topic)
        producer = SimpleProducer(client)
        result = producer.send_messages(topic, kafka_bytestring(message))

        if result[0]:
            return result[0].__dict__
Example 12
    def commit_offsets(self, topic_to_partition_offset_map):
        """Commits offset information to kafka.  Allows lower-level control for
        committing offsets.  In general, :meth:`commit_message` or
        :meth:`commit_messages` should be used, but this can be useful when paired with
        :meth:`data_pipeline.position_data.PositionData.topic_to_last_position_info_map`.

        **Example**:
            The `topic_to_partition_offset_map` should be formatted like::

                {
                  'topic1': {0: 83854, 1: 8943892},
                  'topic2': {0: 190898}
                }

        Args:
            topic_to_partition_offset_map (Dict[str, Dict[int, int]]): Maps from
                topics to a partition and offset map for each topic.
        """
        topic_to_partition_offset_map = self._get_offsets_map_to_be_committed(
            topic_to_partition_offset_map
        )
        return self._send_offset_commit_requests(
            offset_commit_request_list=[
                OffsetCommitRequest(
                    topic=kafka_bytestring(topic),
                    partition=partition,
                    offset=offset,
                    metadata=None
                ) for topic, partition_map in topic_to_partition_offset_map.iteritems()
                for partition, offset in partition_map.iteritems()
            ]
        )
Example 13
    def commit(self):
        """Store consumed message offsets (marked via task_done())
        to kafka cluster for this consumer_group.

        Returns:
            True on success, or False if no offsets were found for commit

        Note:
            this functionality requires server version >=0.8.1.1
            https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
        """
        if not self._config['group_id']:
            logger.warning('Cannot commit without a group_id!')
            raise KafkaConfigurationError('Attempted to commit offsets without a configured consumer group (group_id)')

        # API supports storing metadata with each commit
        # but for now it is unused
        metadata = b''

        offsets = self._offsets.task_done
        commits = []
        for topic_partition, task_done_offset in six.iteritems(offsets):

            # Skip if None
            if task_done_offset is None:
                continue

            # Commit offsets as the next offset to fetch,
            # which is consistent with the Java client:
            # task_done marks messages as consumed,
            # so add one to point at the next message to fetch
            commit_offset = (task_done_offset + 1)

            # Skip if no change from previous committed
            if commit_offset == self._offsets.commit[topic_partition]:
                continue

            commits.append(OffsetCommitRequest(topic_partition[0], topic_partition[1], commit_offset, metadata))

        if commits:
            logger.info('committing consumer offsets to group %s', self._config['group_id'])
            resps = self._client.send_offset_commit_request(kafka_bytestring(self._config['group_id']),
                                                            commits,
                                                            fail_on_error=False)

            for r in resps:
                check_error(r)
                topic_partition = (r.topic, r.partition)
                task_done = self._offsets.task_done[topic_partition]
                self._offsets.commit[topic_partition] = (task_done + 1)

            if self._config['auto_commit_enable']:
                self._reset_auto_commit()

            return True

        else:
            logger.info('No new offsets found to commit in group %s', self._config['group_id'])
            return False
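The off-by-one convention above deserves a worked example: offsets are committed as the next offset to fetch, so a restart resumes exactly after the last message marked done.

# Illustrative values: the last message marked via task_done() had offset 41
task_done_offset = 41
commit_offset = task_done_offset + 1  # 42 is committed, matching the Java client
# after a restart, fetching begins at offset 42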
Example 14
    def commit(self):
        """Store consumed message offsets (marked via task_done())
        to kafka cluster for this consumer_group.

        Returns:
            True on success, or False if no offsets were found for commit

        Note:
            this functionality requires server version >=0.8.1.1
            https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
        """
        if not self._config['group_id']:
            logger.warning('Cannot commit without a group_id!')
            raise KafkaConfigurationError('Attempted to commit offsets without a configured consumer group (group_id)')

        # API supports storing metadata with each commit
        # but for now it is unused
        metadata = b''

        offsets = self._offsets.task_done
        commits = []
        for topic_partition, task_done_offset in six.iteritems(offsets):

            # Skip if None
            if task_done_offset is None:
                continue

            # Commit offsets as the next offset to fetch,
            # which is consistent with the Java client:
            # task_done marks messages as consumed,
            # so add one to point at the next message to fetch
            commit_offset = (task_done_offset + 1)

            # Skip if no change from previous committed
            if commit_offset == self._offsets.commit[topic_partition]:
                continue

            commits.append(OffsetCommitRequest(topic_partition[0], topic_partition[1], commit_offset, metadata))

        if commits:
            logger.info('committing consumer offsets to group %s', self._config['group_id'])
            resps = self._client.send_offset_commit_request(kafka_bytestring(self._config['group_id']),
                                                            commits,
                                                            fail_on_error=False)

            for r in resps:
                check_error(r)
                topic_partition = (r.topic, r.partition)
                task_done = self._offsets.task_done[topic_partition]
                self._offsets.commit[topic_partition] = (task_done + 1)

            if self._config['auto_commit_enable']:
                self._reset_auto_commit()

            return True

        else:
            logger.info('No new offsets found to commit in group %s', self._config['group_id'])
            return False
Example 15
    def __init__(self, client, group, topic, partitions=None, auto_commit=True,
                 auto_commit_every_n=AUTO_COMMIT_MSG_COUNT,
                 auto_commit_every_t=AUTO_COMMIT_INTERVAL,
                 offset_storage='zookeeper'):

        self.client = client
        self.topic = kafka_bytestring(topic)
        self.group = None if group is None else kafka_bytestring(group)
        self.client.load_metadata_for_topics(topic)
        self.offsets = {}

        if partitions is None:
            partitions = self.client.get_partition_ids_for_topic(topic)
        else:
            assert all(isinstance(x, numbers.Integral) for x in partitions)

        # Variables for handling offset commits
        self.commit_lock = Lock()
        self.commit_timer = None
        self.count_since_commit = 0
        self.auto_commit = auto_commit
        self.auto_commit_every_n = auto_commit_every_n
        self.auto_commit_every_t = auto_commit_every_t
        self.offset_storage = offset_storage

        # Set up the auto-commit timer
        if auto_commit is True and auto_commit_every_t is not None:
            self.commit_timer = ReentrantTimer(auto_commit_every_t,
                                               self.commit)
            self.commit_timer.start()

        # Set initial offsets
        if self.group is not None:
            self.fetch_last_known_offsets(partitions)
        else:
            for partition in partitions:
                self.offsets[partition] = 0

        # Register a cleanup handler
        def cleanup(obj):
            obj.stop()
        self._cleanup_func = cleanup
        atexit.register(cleanup, self)

        self.partition_info = False     # Do not return partition info in msgs
Example 16
    def _send_offset_commit_requests(self, offset_commit_request_list):
        if len(offset_commit_request_list) > 0:
            retry_on_exception(
                self._consumer_retry_policy,
                (FailedPayloadsError,),
                self.kafka_client.send_offset_commit_request,
                group=kafka_bytestring(self.client_name),
                payloads=offset_commit_request_list
            )
Example 17
    def current_offset(self, topic, partition):
        try:
            offsets, = self.client.send_offset_request(
                [OffsetRequest(kafka_bytestring(topic), partition, -1, 1)])
        except Exception:
            # XXX: We've seen some UnknownErrors here and can't debug w/o server logs
            self.zk.child.dump_logs()
            self.server.child.dump_logs()
            raise
        else:
            return offsets.offsets[0]
Example 18
    def __init__(self, topic, config, partitions=None):
        self.log = logging.getLogger(self.__class__.__name__)
        if not isinstance(topic, six.string_types):
            raise TypeError("Topic must be a string")
        self.topic = kafka_bytestring(topic)
        if partitions and not isinstance(partitions, list):
            raise TypeError("Partitions must be a list")
        self.partitions = partitions
        self.kafka_consumer = None
        self.config = config
Example 19
    def _consume_topic_partition(self, topic, partition):
        topic = kafka_bytestring(topic)
        if not isinstance(partition, int):
            raise KafkaConfigurationError('Unknown partition type (%s) '
                                          '-- expected int' % type(partition))

        if topic not in self._client.topic_partitions:
            raise UnknownTopicOrPartitionError("Topic %s not found in broker metadata" % topic)
        if partition not in self._client.get_partition_ids_for_topic(topic):
            raise UnknownTopicOrPartitionError("Partition %d not found in Topic %s "
                                               "in broker metadata" % (partition, topic))
        logger.info("Configuring consumer to fetch topic '%s', partition %d", topic, partition)
        self._topics.append((topic, partition))
Example 20
    def _consume_topic_partition(self, topic, partition):
        topic = kafka_bytestring(topic)
        if not isinstance(partition, int):
            raise KafkaConfigurationError('Unknown partition type (%s) '
                                          '-- expected int' % type(partition))

        if topic not in self._client.topic_partitions:
            raise UnknownTopicOrPartitionError("Topic %s not found in broker metadata" % topic)
        if partition not in self._client.get_partition_ids_for_topic(topic):
            raise UnknownTopicOrPartitionError("Partition %d not found in Topic %s "
                                               "in broker metadata" % (partition, topic))
        logger.info("Configuring consumer to fetch topic '%s', partition %d", topic, partition)
        self._topics.append((topic, partition))
Example 21
    def __init__(self, hosts, client_id=CLIENT_ID, timeout=DEFAULT_SOCKET_TIMEOUT_SECONDS, correlation_id=0):
        # We need one connection to bootstrap
        self.client_id = kafka_bytestring(client_id)
        self.timeout = timeout
        self.hosts = collect_hosts(hosts)
        self.correlation_id = correlation_id

        # create connections only when we need them
        self.conns = {}
        self.brokers = {}  # broker_id -> BrokerMetadata
        self.topics_to_brokers = {}  # TopicAndPartition -> BrokerMetadata
        self.topic_partitions = {}  # topic -> partition -> PartitionMetadata

        self.load_metadata_for_topics()  # bootstrap with all metadata
Example 22
    def __init__(self, hosts, client_id=CLIENT_ID,
                 timeout=DEFAULT_SOCKET_TIMEOUT_SECONDS,
                 correlation_id=0):
        # We need one connection to bootstrap
        self.client_id = kafka_bytestring(client_id)
        self.timeout = timeout
        self.hosts = collect_hosts(hosts)
        self.correlation_id = correlation_id

        # create connections only when we need them
        self.conns = {}
        self.brokers = {}            # broker_id -> BrokerMetadata
        self.topics_to_brokers = {}  # TopicAndPartition -> BrokerMetadata
        self.topic_partitions = {}   # topic -> partition -> PartitionMetadata

        self.load_metadata_for_topics()  # bootstrap with all metadata
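A hypothetical bootstrap of this client; the broker addresses and client id are invented, and the host string format follows the docstrings elsewhere in this collection:

client = KafkaClient('broker1:9092,broker2:9092', client_id='my-client')
# metadata for all topics was loaded by __init__, so lookups work immediately
partitions = client.topic_partitions.get(kafka_bytestring('topic1'), {})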
Example 23
def get_kafka_ucr_pillow(pillow_id='kafka-ucr-main', ucr_division=None,
                         include_ucrs=None, exclude_ucrs=None, topics=None,
                         num_processes=1, process_num=0, **kwargs):
    topics = topics or KAFKA_TOPICS
    topics = [kafka_bytestring(t) for t in topics]
    return ConfigurableReportKafkaPillow(
        processor=ConfigurableReportPillowProcessor(
            data_source_provider=DynamicDataSourceProvider(),
            auto_repopulate_tables=False,
            ucr_division=ucr_division,
            include_ucrs=include_ucrs,
            exclude_ucrs=exclude_ucrs,
        ),
        pillow_name=pillow_id,
        topics=topics,
        num_processes=num_processes,
        process_num=process_num,
    )
Example 24
def validate_offsets(expected_offsets):
    """
    Takes in a dictionary of offsets (topics to checkpoint numbers) and ensures they are all available
    in the current kafka feed
    """
    if expected_offsets:
        topics = {kafka_bytestring(x[0]) for x in expected_offsets.keys()}
        available_offsets = get_multi_topic_first_available_offsets(topics)
        for topic_partition, offset in expected_offsets.items():
            topic, partition = topic_partition
            if topic_partition not in available_offsets:
                raise UnavailableKafkaOffset("Invalid partition '{}' for topic '{}'".format(partition, topic))

            if expected_offsets[topic_partition] < available_offsets[topic_partition]:
                message = (
                    'First available topic offset for {}:{} is {} but needed {}.'
                ).format(topic, partition, available_offsets[topic_partition], expected_offsets[topic_partition])
                raise UnavailableKafkaOffset(message)
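A hypothetical call; keys are (topic, partition) tuples, matching the unpacking in the loop above, and the topic names and offsets are invented:

validate_offsets({('case', 0): 1053, ('form', 0): 2291})
# raises UnavailableKafkaOffset if any expected offset is older than the
# first offset still available in the feed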
Example 25
def get_kafka_ucr_static_pillow(pillow_id='kafka-ucr-static', ucr_division=None,
                                include_ucrs=None, exclude_ucrs=None, topics=None,
                                num_processes=1, process_num=0, **kwargs):
    topics = topics or KAFKA_TOPICS
    topics = [kafka_bytestring(t) for t in topics]
    return ConfigurableReportKafkaPillow(
        processor=ConfigurableReportPillowProcessor(
            data_source_provider=StaticDataSourceProvider(),
            auto_repopulate_tables=True,
            ucr_division=ucr_division,
            include_ucrs=include_ucrs,
            exclude_ucrs=exclude_ucrs,
            bootstrap_interval=7 * 24 * 60 * 60  # 1 week
        ),
        pillow_name=pillow_id,
        topics=topics,
        num_processes=num_processes,
        process_num=process_num,
        retry_errors=True
    )
Example 26
    def _get_commit_offsets(self):
        logger.info("Consumer fetching stored offsets")
        for topic_partition in self._topics:
            (resp, ) = self._client.send_offset_fetch_request(
                kafka_bytestring(self._config['group_id']),
                [OffsetFetchRequest(topic_partition[0], topic_partition[1])],
                fail_on_error=False)
            try:
                check_error(resp)
            # API spec says the server won't set an error here,
            # but 0.8.1.1 actually does...
            except UnknownTopicOrPartitionError:
                pass

            # -1 offset signals no commit is currently stored
            if resp.offset == -1:
                self._offsets.commit[topic_partition] = None

            # Otherwise we committed the stored offset
            # and need to fetch the next one
            else:
                self._offsets.commit[topic_partition] = resp.offset
Example 27
    def _get_commit_offsets(self):
        logger.info("Consumer fetching stored offsets")
        for topic_partition in self._topics:
            (resp,) = self._client.send_offset_fetch_request(
                kafka_bytestring(self._config['group_id']),
                [OffsetFetchRequest(topic_partition[0], topic_partition[1])],
                fail_on_error=False)
            try:
                check_error(resp)
            # API spec says the server won't set an error here,
            # but 0.8.1.1 actually does...
            except UnknownTopicOrPartitionError:
                pass

            # -1 offset signals no commit is currently stored
            if resp.offset == -1:
                self._offsets.commit[topic_partition] = None

            # Otherwise we committed the stored offset
            # and need to fetch the next one
            else:
                self._offsets.commit[topic_partition] = resp.offset
Example 28
    def test_switch_leader_keyed_producer(self):
        topic = self.topic

        producer = KeyedProducer(self.client, async=False)

        # Send 10 random messages
        for _ in range(10):
            key = random_string(3).encode('utf-8')
            msg = random_string(10).encode('utf-8')
            producer.send_messages(topic, key, msg)

        # kill leader for partition 0
        self._kill_leader(topic, 0)

        recovered = False
        started = time.time()
        timeout = 60
        while not recovered and (time.time() - started) < timeout:
            try:
                key = random_string(3).encode('utf-8')
                msg = random_string(10).encode('utf-8')
                producer.send_messages(topic, key, msg)
                if producer.partitioners[kafka_bytestring(topic)].partition(
                        key) == 0:
                    recovered = True
            except (FailedPayloadsError, ConnectionError):
                log.debug("caught exception sending message -- will retry")
                continue

        # Verify we successfully sent the message
        self.assertTrue(recovered)

        # send some more messages just to make sure no more exceptions
        for _ in range(10):
            key = random_string(3).encode('utf-8')
            msg = random_string(10).encode('utf-8')
            producer.send_messages(topic, key, msg)
Example 29
    def test_switch_leader_keyed_producer(self):
        topic = self.topic

        producer = KeyedProducer(self.client, async=False)

        # Send 10 random messages
        for _ in range(10):
            key = random_string(3)
            msg = random_string(10)
            producer.send_messages(topic, key, msg)

        # kill leader for partition 0
        self._kill_leader(topic, 0)

        recovered = False
        started = time.time()
        timeout = 60
        while not recovered and (time.time() - started) < timeout:
            try:
                key = random_string(3)
                msg = random_string(10)
                producer.send_messages(topic, key, msg)
                if producer.partitioners[kafka_bytestring(topic)].partition(key) == 0:
                    recovered = True
            except (FailedPayloadsError, ConnectionError):
                logging.debug("caught exception sending message -- will retry")
                continue

        # Verify we successfully sent the message
        self.assertTrue(recovered)

        # send some more messages just to make sure no more exceptions
        for _ in range(10):
            key = random_string(3)
            msg = random_string(10)
            producer.send_messages(topic, key, msg)
Example 30
def get_topics_watermarks(kafka_client, topics, raise_on_error=True):
    """ Get current topic watermarks.

    NOTE: This method does not refresh client metadata. It is up to the caller
    to avoid using stale metadata.

    If any partition leader is not available, the request fails for all the
    other topics as well. This is the tradeoff of sending all topic requests
    in a single batch, which saves on both performance and Kafka load.

    :param kafka_client: a connected KafkaToolClient
    :param topics: topic list or dict {<topic>: [partitions]}
    :param raise_on_error: if False the method ignores missing topics
      and missing partitions. It still may fail on the request send.
    :returns: a dict topic: partition: PartitionOffsets
    :raises:
      :py:class:`~kafka_utils.util.error.UnknownTopic`: upon missing
      topics and raise_on_error=True

      :py:class:`~kafka_utils.util.error.UnknownPartition`: upon missing
      partitions and raise_on_error=True

      FailedPayloadsError: upon send request error.
    """
    topics = _verify_topics_and_partitions(
        kafka_client,
        topics,
        raise_on_error,
    )
    highmark_offset_reqs = []
    lowmark_offset_reqs = []

    for topic, partitions in topics.iteritems():
        # Batch watermark requests
        for partition in partitions:
            # Request the latest offset
            highmark_offset_reqs.append(
                OffsetRequest(
                    kafka_bytestring(topic), partition, -1, max_offsets=1
                )
            )
            # Request the earliest offset
            lowmark_offset_reqs.append(
                OffsetRequest(
                    kafka_bytestring(topic), partition, -2, max_offsets=1
                )
            )

    watermark_offsets = {}

    if not (len(highmark_offset_reqs) + len(lowmark_offset_reqs)):
        return watermark_offsets

    # fail_on_error = False does not prevent network errors
    highmark_resps = kafka_client.send_offset_request(
        highmark_offset_reqs,
        fail_on_error=False,
        callback=_check_fetch_response_error,
    )
    lowmark_resps = kafka_client.send_offset_request(
        lowmark_offset_reqs,
        fail_on_error=False,
        callback=_check_fetch_response_error,
    )

    # At this point highmark and lowmark should ideally have the same length.
    assert len(highmark_resps) == len(lowmark_resps)
    aggregated_offsets = defaultdict(lambda: defaultdict(dict))
    for resp in highmark_resps:
        aggregated_offsets[resp.topic][resp.partition]['highmark'] = \
            resp.offsets[0]
    for resp in lowmark_resps:
        aggregated_offsets[resp.topic][resp.partition]['lowmark'] = \
            resp.offsets[0]

    for topic, partition_watermarks in aggregated_offsets.iteritems():
        for partition, watermarks in partition_watermarks.iteritems():
            watermark_offsets.setdefault(
                topic,
                {},
            )[partition] = PartitionOffsets(
                topic,
                partition,
                watermarks['highmark'],
                watermarks['lowmark'],
            )
    return watermark_offsets
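A usage sketch under the snippet's own assumptions (connected kafka_client, existing topic; the topic name is invented); each value is a PartitionOffsets carrying highmark and lowmark fields:

watermarks = get_topics_watermarks(kafka_client, ['topic1'])
for topic, partitions in watermarks.items():
    for partition, offsets in partitions.items():
        # number of messages currently retained in the partition
        print(topic, partition, offsets.highmark - offsets.lowmark)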
Example 31
    def has_metadata_for_topic(self, topic):
        topic = kafka_bytestring(topic)
        return (topic in self.topic_partitions
                and len(self.topic_partitions[topic]) > 0)
Example 32
    def fetch_messages(self):
        """Sends FetchRequests for all topic/partitions set for consumption

        Returns:
            Generator that yields KafkaMessage structs
            after deserializing with the configured `deserializer_class`

        Note:
            Refreshes metadata on errors, and resets fetch offset on
            OffsetOutOfRange, per the configured `auto_offset_reset` policy

        See Also:
            Key KafkaConsumer configuration parameters:
            * `fetch_message_max_bytes`
            * `fetch_wait_max_ms`
            * `fetch_min_bytes`
            * `deserializer_class`
            * `auto_offset_reset`

        """

        max_bytes = self._config['fetch_message_max_bytes']
        max_wait_time = self._config['fetch_wait_max_ms']
        min_bytes = self._config['fetch_min_bytes']

        if not self._topics:
            raise KafkaConfigurationError('No topics or partitions configured')

        if not self._offsets.fetch:
            raise KafkaConfigurationError(
                'No fetch offsets found when calling fetch_messages')

        fetches = [
            FetchRequest(topic, partition,
                         self._offsets.fetch[(topic, partition)], max_bytes)
            for (topic, partition) in self._topics
        ]

        # send_fetch_request will batch topic/partition requests by leader
        responses = self._client.send_fetch_request(
            fetches,
            max_wait_time=max_wait_time,
            min_bytes=min_bytes,
            fail_on_error=False)

        for resp in responses:

            if isinstance(resp, FailedPayloadsError):
                logger.warning('FailedPayloadsError attempting to fetch data')
                self._refresh_metadata_on_error()
                continue

            topic = kafka_bytestring(resp.topic)
            partition = resp.partition
            try:
                check_error(resp)
            except OffsetOutOfRangeError:
                logger.warning(
                    'OffsetOutOfRange: topic %s, partition %d, '
                    'offset %d (Highwatermark: %d)', topic, partition,
                    self._offsets.fetch[(topic, partition)],
                    resp.highwaterMark)
                # Reset offset
                self._offsets.fetch[(topic, partition)] = (
                    self._reset_partition_offset((topic, partition)))
                continue

            except NotLeaderForPartitionError:
                logger.warning(
                    "NotLeaderForPartitionError for %s - %d. "
                    "Metadata may be out of date", topic, partition)
                self._refresh_metadata_on_error()
                continue

            except RequestTimedOutError:
                logger.warning("RequestTimedOutError for %s - %d", topic,
                               partition)
                continue

            # Track server highwater mark
            self._offsets.highwater[(topic, partition)] = resp.highwaterMark

            # Yield each message
            # Kafka-python could raise an exception during iteration
            # we are not catching -- user will need to address
            for (offset, message) in resp.messages:
                # deserializer_class could raise an exception here
                val = self._config['deserializer_class'](message.value)
                msg = KafkaMessage(topic, partition, offset, message.key, val)

                # in some cases the server will return earlier messages
                # than we requested. skip them per kafka spec
                if offset < self._offsets.fetch[(topic, partition)]:
                    logger.debug(
                        'message offset less than fetched offset '
                        'skipping: %s', msg)
                    continue
                # Only increment fetch offset
                # if we safely got the message and deserialized
                self._offsets.fetch[(topic, partition)] = offset + 1

                # Then yield to user
                yield msg
Example 33
    def fetch_messages(self):
        """Sends FetchRequests for all topic/partitions set for consumption

        Returns:
            Generator that yields KafkaMessage structs
            after deserializing with the configured `deserializer_class`

        Note:
            Refreshes metadata on errors, and resets fetch offset on
            OffsetOutOfRange, per the configured `auto_offset_reset` policy

        See Also:
            Key KafkaConsumer configuration parameters:
            * `fetch_message_max_bytes`
            * `fetch_wait_max_ms`
            * `fetch_min_bytes`
            * `deserializer_class`
            * `auto_offset_reset`

        """

        max_bytes = self._config['fetch_message_max_bytes']
        max_wait_time = self._config['fetch_wait_max_ms']
        min_bytes = self._config['fetch_min_bytes']

        if not self._topics:
            raise KafkaConfigurationError('No topics or partitions configured')

        if not self._offsets.fetch:
            raise KafkaConfigurationError(
                'No fetch offsets found when calling fetch_messages'
            )

        fetches = [FetchRequest(topic, partition,
                                self._offsets.fetch[(topic, partition)],
                                max_bytes)
                   for (topic, partition) in self._topics]

        # send_fetch_request will batch topic/partition requests by leader
        responses = self._client.send_fetch_request(
            fetches,
            max_wait_time=max_wait_time,
            min_bytes=min_bytes,
            fail_on_error=False
        )

        for resp in responses:

            if isinstance(resp, FailedPayloadsError):
                logger.warning('FailedPayloadsError attempting to fetch data')
                self._refresh_metadata_on_error()
                continue

            topic = kafka_bytestring(resp.topic)
            partition = resp.partition
            try:
                check_error(resp)
            except OffsetOutOfRangeError:
                logger.warning('OffsetOutOfRange: topic %s, partition %d, '
                               'offset %d (Highwatermark: %d)',
                               topic, partition,
                               self._offsets.fetch[(topic, partition)],
                               resp.highwaterMark)
                # Reset offset
                self._offsets.fetch[(topic, partition)] = (
                    self._reset_partition_offset((topic, partition))
                )
                continue

            except NotLeaderForPartitionError:
                logger.warning("NotLeaderForPartitionError for %s - %d. "
                               "Metadata may be out of date",
                               topic, partition)
                self._refresh_metadata_on_error()
                continue

            except RequestTimedOutError:
                logger.warning("RequestTimedOutError for %s - %d",
                               topic, partition)
                continue

            # Track server highwater mark
            self._offsets.highwater[(topic, partition)] = resp.highwaterMark

            # Yield each message
            # Kafka-python could raise an exception during iteration
            # we are not catching -- user will need to address
            for (offset, message) in resp.messages:
                # deserializer_class could raise an exception here
                val = self._config['deserializer_class'](message.value)
                msg = KafkaMessage(topic, partition, offset, message.key, val)

                # in some cases the server will return earlier messages
                # than we requested. skip them per kafka spec
                if offset < self._offsets.fetch[(topic, partition)]:
                    logger.debug('message offset less than fetched offset '
                                 'skipping: %s', msg)
                    continue
                # Only increment fetch offset
                # if we safely got the message and deserialized
                self._offsets.fetch[(topic, partition)] = offset + 1

                # Then yield to user
                yield msg
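A minimal consumption loop over this generator, assuming a configured consumer instance; task_done() is the marker that commit() (Example 13) relies on, and handle() is a placeholder:

for msg in consumer.fetch_messages():
    handle(msg)              # msg is a KafkaMessage (topic, partition, offset, key, value)
    consumer.task_done(msg)  # mark consumed so commit() can later store offset + 1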
Example 34
    def load_metadata_for_topics(self, *topics):
        """
        Fetch broker and topic-partition metadata from the server,
        and update internal data:
        broker list, topic/partition list, and topic/partition -> broker map

        This method should be called after receiving any error

        Arguments:
            *topics (optional): If a list of topics is provided,
                the metadata refresh will be limited to the specified topics only.

        Exceptions:
        ----------
        If the broker is configured to not auto-create topics,
        expect UnknownTopicOrPartitionError for topics that don't exist

        If the broker is configured to auto-create topics,
        expect LeaderNotAvailableError for new topics
        until partitions have been initialized.

        Exceptions *will not* be raised in a full refresh (i.e. no topic list)
        In this case, error codes will be logged as errors

        Partition-level errors will also not be raised here
        (a single partition w/o a leader, for example)
        """
        topics = [kafka_bytestring(t) for t in topics]

        if topics:
            for topic in topics:
                self.reset_topic_metadata(topic)
        else:
            self.reset_all_metadata()

        resp = self.send_metadata_request(topics)

        log.debug('Updating broker metadata: %s', resp.brokers)
        log.debug('Updating topic metadata: %s', resp.topics)

        self.brokers = dict([(broker.nodeId, broker)
                             for broker in resp.brokers])

        for topic_metadata in resp.topics:
            topic = topic_metadata.topic
            partitions = topic_metadata.partitions

            # Errors expected for new topics
            try:
                kafka.common.check_error(topic_metadata)
            except (UnknownTopicOrPartitionError, LeaderNotAvailableError) as e:

                # Raise if the topic was passed in explicitly
                if topic in topics:
                    raise

                # Otherwise, just log a warning
                log.error('Error loading topic metadata for %s: %s', topic, type(e))
                continue

            self.topic_partitions[topic] = {}
            for partition_metadata in partitions:
                partition = partition_metadata.partition
                leader = partition_metadata.leader

                self.topic_partitions[topic][partition] = partition_metadata

                # Populate topics_to_brokers dict
                topic_part = TopicAndPartition(topic, partition)

                # Check for partition errors
                try:
                    kafka.common.check_error(partition_metadata)

                # If No Leader, topics_to_brokers topic_partition -> None
                except LeaderNotAvailableError:
                    log.error('No leader for topic %s partition %d', topic, partition)
                    self.topics_to_brokers[topic_part] = None
                    continue
                # If one of the replicas is unavailable -- ignore
                # this error code is provided for admin purposes only
                # we never talk to replicas, only the leader
                except ReplicaNotAvailableError:
                    log.debug('Some (non-leader) replicas not available for topic %s partition %d', topic, partition)

                # If Known Broker, topic_partition -> BrokerMetadata
                if leader in self.brokers:
                    self.topics_to_brokers[topic_part] = self.brokers[leader]

                # If Unknown Broker, fake BrokerMetadata so we don't lose the id
                # (not sure how this could happen. server could be in bad state)
                else:
                    self.topics_to_brokers[topic_part] = BrokerMetadata(
                        leader, None, None
                    )
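Per the docstring above, the error behaviour differs between an explicit topic list and a full refresh; a short sketch ('new-topic' is an invented name):

client.load_metadata_for_topics('new-topic')  # may raise UnknownTopicOrPartitionError
client.load_metadata_for_topics()             # full refresh: errors are logged, not raised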
Example 35
    def has_metadata_for_topic(self, topic):
        topic = kafka_bytestring(topic)
        return (
            topic in self.topic_partitions
            and len(self.topic_partitions[topic]) > 0
        )
Example 36
    def __init__(self, group_id, cluster, **config):
        self.log = logging.getLogger(self.__class__.__name__)
        self._config = config
        self.cluster = cluster
        self.group_id = kafka_bytestring(group_id)
Example 37
    def current_offset(self, topic, partition):
        offsets, = self.client.send_offset_request(
            [OffsetRequest(kafka_bytestring(topic), partition, -1, 1)])
        return offsets.offsets[0]
Example 38
def set_consumer_offsets(
    kafka_client,
    group,
    new_offsets,
    raise_on_error=True,
    offset_storage='zookeeper',
):
    """Set consumer offsets to the specified offsets.

    This method does not validate the specified offsets, it is up to
    the caller to specify valid offsets within a topic partition.

    If any partition leader is not available, the request fails for all the
    other topics as well. This is the tradeoff of sending all topic requests
    in a single batch, which saves on both performance and Kafka load.

    :param kafka_client: a connected KafkaToolClient
    :param group: kafka group_id
    :param new_offsets: dict {<topic>: {<partition>: <offset>}}
    :param raise_on_error: if False the method does not raise exceptions
      on errors encountered. It may still fail on the request send.
    :param offset_storage: String, one of {zookeeper, kafka}.
    :returns: a list of errors for each partition offset update that failed.
    :rtype: list [OffsetCommitError]
    :raises:
      :py:class:`kafka_utils.util.error.UnknownTopic`: upon missing
      topics and raise_on_error=True

      :py:class:`kafka_utils.util.error.UnknownPartition`: upon missing
      partitions and raise_on_error=True

      :py:class:`exceptions.TypeError`: upon badly formatted input
      new_offsets

      :py:class:`kafka_utils.util.error.InvalidOffsetStorageError`: upon
      unknown offset_storage choice.

      FailedPayloadsError: upon send request error.
    """
    valid_new_offsets = _verify_commit_offsets_requests(
        kafka_client, new_offsets, raise_on_error)

    group_offset_reqs = [
        OffsetCommitRequest(kafka_bytestring(topic), partition, offset, None)
        for topic, new_partition_offsets in valid_new_offsets.iteritems()
        for partition, offset in new_partition_offsets.iteritems()
    ]

    if offset_storage == 'zookeeper' or not offset_storage:
        send_api = kafka_client.send_offset_commit_request
    elif offset_storage == 'kafka':
        send_api = kafka_client.send_offset_commit_request_kafka
    else:
        raise InvalidOffsetStorageError(offset_storage)

    status = []
    if group_offset_reqs:
        status = send_api(kafka_bytestring(group),
                          group_offset_reqs,
                          raise_on_error,
                          callback=_check_commit_response_error)

    return filter(None, status)
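A hypothetical rewind of a consumer group using this function; the offsets map follows the {<topic>: {<partition>: <offset>}} shape from the docstring, with invented names and offsets:

errors = set_consumer_offsets(
    kafka_client, 'my-group',
    {'topic1': {0: 100, 1: 120}},
    raise_on_error=False,
    offset_storage='kafka')
# errors is a list of OffsetCommitError, one per partition whose commit failed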
Example 39
    def current_offset(self, topic, partition):
        offsets, = self.client.send_offset_request(
            [OffsetRequest(kafka_bytestring(topic), partition, -1, 1)])
        return offsets.offsets[0]
Example 40
    def _kill_leader(self, topic, partition):
        leader = self.client.topics_to_brokers[
            TopicAndPartition(kafka_bytestring(topic), partition)]
        broker = self.brokers[leader.nodeId]
        broker.close()
        return broker
Example 41
def get_current_consumer_offsets(
    kafka_client,
    group,
    topics,
    raise_on_error=True,
    offset_storage='zookeeper',
):
    """ Get current consumer offsets.

    NOTE: This method does not refresh client metadata. It is up to the caller
    to avoid using stale metadata.

    If any partition leader is not available, the request fails for all the
    other topics as well. This is the tradeoff of sending all topic requests
    in a single batch, which saves on both performance and Kafka load.

    :param kafka_client: a connected KafkaToolClient
    :param group: kafka group_id
    :param topics: topic list or dict {<topic>: [partitions]}
    :param raise_on_error: if False the method ignores missing topics and
      missing partitions. It still may fail on the request send.
    :param offset_storage: String, one of {zookeeper, kafka}.
    :returns: a dict topic: partition: offset
    :raises:
      :py:class:`kafka_utils.util.error.UnknownTopic`: upon missing
      topics and raise_on_error=True

      :py:class:`kafka_utils.util.error.UnknownPartition`: upon missing
      partitions and raise_on_error=True

      :py:class:`kafka_utils.util.error.InvalidOffsetStorageError`: upon
      unknown offset_storage choice.

      FailedPayloadsError: upon send request error.
    """

    topics = _verify_topics_and_partitions(kafka_client, topics, raise_on_error)

    group_offset_reqs = [
        OffsetFetchRequest(kafka_bytestring(topic), partition)
        for topic, partitions in topics.iteritems()
        for partition in partitions
    ]

    group_offsets = {}

    if offset_storage == 'zookeeper':
        send_api = kafka_client.send_offset_fetch_request
    elif offset_storage == 'kafka':
        send_api = kafka_client.send_offset_fetch_request_kafka
    else:
        raise InvalidOffsetStorageError(offset_storage)

    if group_offset_reqs:
        # fail_on_error = False does not prevent network errors
        group_resps = send_api(
            group=kafka_bytestring(group),
            payloads=group_offset_reqs,
            fail_on_error=False,
            callback=pluck_topic_offset_or_zero_on_unknown,
        )
        for resp in group_resps:
            group_offsets.setdefault(
                resp.topic,
                {},
            )[resp.partition] = resp.offset

    return group_offsets
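Pairing this with get_topics_watermarks (Example 30) gives consumer lag; a sketch under the same assumptions (connected client, existing group and topic, names invented):

current = get_current_consumer_offsets(kafka_client, 'my-group', ['topic1'])
marks = get_topics_watermarks(kafka_client, ['topic1'])
lag = {
    (topic, partition): marks[topic][partition].highmark - offset
    for topic, partitions in current.items()
    for partition, offset in partitions.items()
}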
Example 42
    def send(self, topic, key, msg):
        topic = kafka_bytestring(topic)
        partition = self._next_partition(topic, key)
        return self._send_messages(topic, partition, msg, key=key)
Example 43
    def _kill_leader(self, topic, partition):
        leader = self.client.topics_to_brokers[TopicAndPartition(
            kafka_bytestring(topic), partition)]
        broker = self.brokers[leader.nodeId]
        broker.close()
        return broker
Example 44
    def get_partition_ids_for_topic(self, topic):
        topic = kafka_bytestring(topic)
        if topic not in self.topic_partitions:
            return []

        return sorted(list(self.topic_partitions[topic]))
Example 45
def get_topics_watermarks(kafka_client, topics, raise_on_error=True):
    """ Get current topic watermarks.

    NOTE: This method does not refresh client metadata. It is up to the caller
    to avoid using stale metadata.

    If any partition leader is not available, the request fails for all the
    other topics as well. This is the tradeoff of sending all topic requests
    in a single batch, which saves on both performance and Kafka load.

    :param kafka_client: a connected KafkaToolClient
    :param topics: topic list or dict {<topic>: [partitions]}
    :param raise_on_error: if False the method ignores missing topics
      and missing partitions. It still may fail on the request send.
    :returns: a dict topic: partition: PartitionOffsets
    :raises:
      :py:class:`~kafka_utils.util.error.UnknownTopic`: upon missing
      topics and raise_on_error=True

      :py:class:`~kafka_utils.util.error.UnknownPartition`: upon missing
      partitions and raise_on_error=True

      FailedPayloadsError: upon send request error.
    """
    topics = _verify_topics_and_partitions(
        kafka_client,
        topics,
        raise_on_error,
    )
    highmark_offset_reqs = []
    lowmark_offset_reqs = []

    for topic, partitions in topics.iteritems():
        # Batch watermark requests
        for partition in partitions:
            # Request the latest offset
            highmark_offset_reqs.append(
                OffsetRequest(kafka_bytestring(topic),
                              partition,
                              -1,
                              max_offsets=1))
            # Request the earliest offset
            lowmark_offset_reqs.append(
                OffsetRequest(kafka_bytestring(topic),
                              partition,
                              -2,
                              max_offsets=1))

    watermark_offsets = {}

    if not (len(highmark_offset_reqs) + len(lowmark_offset_reqs)):
        return watermark_offsets

    # fail_on_error = False does not prevent network errors
    highmark_resps = kafka_client.send_offset_request(
        highmark_offset_reqs,
        fail_on_error=False,
        callback=_check_fetch_response_error,
    )
    lowmark_resps = kafka_client.send_offset_request(
        lowmark_offset_reqs,
        fail_on_error=False,
        callback=_check_fetch_response_error,
    )

    # Each partition was queried for both watermarks, so the two response
    # lists must have the same length.
    assert len(highmark_resps) == len(lowmark_resps)
    aggregated_offsets = defaultdict(lambda: defaultdict(dict))
    for resp in highmark_resps:
        aggregated_offsets[resp.topic][resp.partition]['highmark'] = \
            resp.offsets[0]
    for resp in lowmark_resps:
        aggregated_offsets[resp.topic][resp.partition]['lowmark'] = \
            resp.offsets[0]

    for topic, partition_watermarks in aggregated_offsets.iteritems():
        for partition, watermarks in partition_watermarks.iteritems():
            watermark_offsets.setdefault(
                topic,
                {},
            )[partition] = PartitionOffsets(
                topic,
                partition,
                watermarks['highmark'],
                watermarks['lowmark'],
            )
    return watermark_offsets
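A hedged sketch of a common use of the watermarks above: estimating per-partition consumer lag by combining the highmark with the group's committed offset. The client, group, and topic names are assumptions.

# Hypothetical lag estimate; falls back to the lowmark when no offset
# has been committed yet for a partition.
watermarks = get_topics_watermarks(client, ['my-topic'])
committed = get_current_consumer_offsets(client, 'my-consumer-group', ['my-topic'])
for topic, partitions in watermarks.items():
    for partition, marks in partitions.items():
        current = committed.get(topic, {}).get(partition)
        if current is None:
            current = marks.lowmark
        print(topic, partition, 'lag:', marks.highmark - current)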
Example no. 46
    def set_topic_partitions(self, *topics):
        """
        Set the topic/partitions to consume
        Optionally specify offsets to start from

        Accepts types:

        * str (utf-8): topic name (will consume all available partitions)
        * tuple: (topic, partition)
        * dict:
            - { topic: partition }
            - { topic: [partition list] }
            - { topic: (partition tuple,) }

        Optionally, offsets can be specified directly:

        * tuple: (topic, partition, offset)
        * dict:  { (topic, partition): offset, ... }

        Example:

        .. code:: python

            kafka = KafkaConsumer()

            # Consume topic1-all; topic2-partition2; topic3-partition0
            kafka.set_topic_partitions("topic1", ("topic2", 2), {"topic3": 0})

            # Consume topic1-0 starting at offset 12, and topic2-1 at offset 45
            # using tuples --
            kafka.set_topic_partitions(("topic1", 0, 12), ("topic2", 1, 45))

            # using dict --
            kafka.set_topic_partitions({ ("topic1", 0): 12, ("topic2", 1): 45 })

        """
        self._topics = []
        self._client.load_metadata_for_topics()

        # Setup offsets
        self._offsets = OffsetsStruct(fetch=dict(),
                                      commit=dict(),
                                      highwater=dict(),
                                      task_done=dict())

        # Handle different topic types
        for arg in topics:

            # Topic name str -- all partitions
            if isinstance(arg, (six.string_types, six.binary_type)):
                topic = kafka_bytestring(arg)

                for partition in self._client.get_partition_ids_for_topic(topic):
                    self._consume_topic_partition(topic, partition)

            # (topic, partition [, offset]) tuple
            elif isinstance(arg, tuple):
                topic = kafka_bytestring(arg[0])
                partition = arg[1]
                self._consume_topic_partition(topic, partition)
                if len(arg) == 3:
                    offset = arg[2]
                    self._offsets.fetch[(topic, partition)] = offset

            # { topic: partitions, ... } dict
            elif isinstance(arg, dict):
                for key, value in six.iteritems(arg):

                    # key can be string (a topic)
                    if isinstance(key, (six.string_types, six.binary_type)):
                        topic = kafka_bytestring(key)

                        # topic: partition
                        if isinstance(value, int):
                            self._consume_topic_partition(topic, value)

                        # topic: [ partition1, partition2, ... ]
                        elif isinstance(value, (list, tuple)):
                            for partition in value:
                                self._consume_topic_partition(topic, partition)
                        else:
                            raise KafkaConfigurationError(
                                'Unknown partition type '
                                '(dict value must be int or list/tuple of ints)'
                            )

                    # (topic, partition): offset
                    elif isinstance(key, tuple):
                        topic = kafka_bytestring(key[0])
                        partition = key[1]
                        self._consume_topic_partition(topic, partition)
                        self._offsets.fetch[(topic, partition)] = value

            else:
                raise KafkaConfigurationError('Unknown topic type (%s)' % type(arg))

        # If we have a consumer group, try to fetch stored offsets
        if self._config['group_id']:
            self._get_commit_offsets()

        # Update missing fetch/commit offsets
        for topic_partition in self._topics:

            # Commit offsets default is None
            if topic_partition not in self._offsets.commit:
                self._offsets.commit[topic_partition] = None

            # Skip if we already have a fetch offset from user args
            if topic_partition not in self._offsets.fetch:

                # Fetch offsets default is (1) commit
                if self._offsets.commit[topic_partition] is not None:
                    self._offsets.fetch[topic_partition] = self._offsets.commit[topic_partition]

                # or (2) auto reset
                else:
                    self._offsets.fetch[topic_partition] = self._reset_partition_offset(topic_partition)

        # highwater marks (received from server on fetch response)
        # and task_done (set locally by user)
        # should always get initialized to None
        self._reset_highwater_offsets()
        self._reset_task_done_offsets()

        # Reset message iterator in case we were in the middle of one
        self._reset_message_iterator()
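Every topic argument above is normalized with kafka_bytestring, so callers can freely mix text and byte strings. A small sketch of that flexibility; the consumer construction, broker address, and topic names are assumptions:

# Hypothetical call mixing the accepted argument shapes; all topic names
# end up as bytes internally via kafka_bytestring.
consumer = KafkaConsumer(group_id='my-consumer-group',
                         bootstrap_servers=['localhost:9092'])
consumer.set_topic_partitions(
    'topic-a',               # str: consume all partitions
    (b'topic-b', 0),         # bytes: a single partition
    {('topic-c', 2): 42},    # start partition 2 of topic-c at offset 42
)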
Example no. 48
        raise TypeError("Can't get Kafka port from environment variable")
    try:
        env_kafka_port = int(env_kafka_port)
    except ValueError:
        raise TypeError("Couldn't turn Kafka port into a number")

    env_kafka_topic = os.environ.get('TOPIC')
    if env_kafka_topic is None:
        raise TypeError("Can't get Kafka topic from environment variable")

    env_consumer_group = os.environ.get('CONSUMER_GROUP')
    if env_consumer_group is None:
        raise TypeError("Can't get Kafka consumer group from environment variable")

    # Kafka settings
    consumer_group = kafka_bytestring(env_consumer_group)
    topic = kafka_bytestring(env_kafka_topic)
    
    kafka_host = "{0}:{1}".format(env_kafka_addr, env_kafka_port)

    # Create client
    print("Starting Kafka client: ", kafka_host, topic, consumer_group, flush=True)
    try:
        client = KafkaClient(kafka_host)
    except (LeaderNotAvailableError, ConnectionError, KafkaUnavailableError,) as leader_err:
        num_tries = 0
        max_tries = 10
        is_connected = False
        while not is_connected:
            print("Retrying to start consumer client: {0!s}".format(num_tries), flush=True)
            time.sleep(5)
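The fragment above is cut off inside the retry loop; purely as an assumption about the original intent, a completed bounded-retry version might look like this:

# Hypothetical completion of the truncated retry loop; the retry bound,
# sleep interval, and final error are assumptions, not source code.
while not is_connected and num_tries < max_tries:
    print("Retrying to start consumer client: {0!s}".format(num_tries), flush=True)
    time.sleep(5)
    num_tries += 1
    try:
        client = KafkaClient(kafka_host)
        is_connected = True
    except (LeaderNotAvailableError, ConnectionError, KafkaUnavailableError):
        continue
if not is_connected:
    raise KafkaUnavailableError("Could not connect to " + kafka_host)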
Example no. 51
def set_consumer_offsets(
    kafka_client,
    group,
    new_offsets,
    raise_on_error=True,
    offset_storage='zookeeper',
):
    """Set consumer offsets to the specified offsets.

    This method does not validate the specified offsets, it is up to
    the caller to specify valid offsets within a topic partition.

    If any partition leader is not available, the request fails for all the
    other topics. This is the tradeoff of sending all topic requests in a
    single batch, which saves on both latency and Kafka load.

    :param kafka_client: a connected KafkaToolClient
    :param group: kafka group_id
    :param new_offsets: dict {<topic>: {<partition>: <offset>}}
    :param raise_on_error: if False the method does not raise exceptions
      on errors encountered. It may still fail on the request send.
    :param offset_storage: String, one of {zookeeper, kafka}.
    :returns: a list of errors for each partition offset update that failed.
    :rtype: list [OffsetCommitError]
    :raises:
      :py:class:`kafka_utils.util.error.UnknownTopic`: upon missing
      topics and raise_on_error=True

      :py:class:`kafka_utils.util.error.UnknownPartition`: upon missing
      partitions and raise_on_error=True

      :py:class:`exceptions.TypeError`: upon badly formatted input
      new_offsets

      :py:class:`kafka_utils.util.error.InvalidOffsetStorageError`: upon unknown
      offset_storage choice.

      FailedPayloadsError: upon send request error.
    """
    valid_new_offsets = _verify_commit_offsets_requests(
        kafka_client,
        new_offsets,
        raise_on_error
    )

    group_offset_reqs = [
        OffsetCommitRequest(
            kafka_bytestring(topic),
            partition,
            offset,
            None
        )
        for topic, new_partition_offsets in valid_new_offsets.iteritems()
        for partition, offset in new_partition_offsets.iteritems()
    ]

    if offset_storage == 'zookeeper' or not offset_storage:
        send_api = kafka_client.send_offset_commit_request
    elif offset_storage == 'kafka':
        send_api = kafka_client.send_offset_commit_request_kafka
    else:
        raise InvalidOffsetStorageError(offset_storage)

    status = []
    if group_offset_reqs:
        status = send_api(
            kafka_bytestring(group),
            group_offset_reqs,
            raise_on_error,
            callback=_check_commit_response_error
        )

    return filter(None, status)
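A hedged usage sketch for set_consumer_offsets; the group, topic, and offset values are illustrative, and the client is assumed to exist with fresh metadata.

# Hypothetical rewind of a consumer group to fixed offsets.
new_offsets = {'my-topic': {0: 100, 1: 150}}
errors = set_consumer_offsets(
    client,
    group='my-consumer-group',
    new_offsets=new_offsets,
    offset_storage='kafka',
)
for error in errors:
    print("Failed to commit offset:", error)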
Example no. 52
    def load_metadata_for_topics(self, *topics):
        """
        Fetch broker and topic-partition metadata from the server,
        and update internal data:
        broker list, topic/partition list, and topic/partition -> broker map

        This method should be called after receiving any error.

        Arguments:
            *topics (optional): If a list of topics is provided,
                the metadata refresh will be limited to the specified topics only.

        Exceptions:
        If the broker is configured to not auto-create topics,
        expect UnknownTopicOrPartitionError for topics that don't exist

        If the broker is configured to auto-create topics,
        expect LeaderNotAvailableError for new topics
        until partitions have been initialized.

        Exceptions *will not* be raised in a full refresh (i.e. no topic list).
        In this case, error codes will be logged as errors.

        Partition-level errors will also not be raised here
        (a single partition w/o a leader, for example)
        """
        topics = [kafka_bytestring(t) for t in topics]

        if topics:
            for topic in topics:
                self.reset_topic_metadata(topic)
        else:
            self.reset_all_metadata()

        resp = self.send_metadata_request(topics)

        log.debug('Updating broker metadata: %s', resp.brokers)
        log.debug('Updating topic metadata: %s', resp.topics)

        self.brokers = dict([(broker.nodeId, broker)
                             for broker in resp.brokers])

        for topic_metadata in resp.topics:
            topic = topic_metadata.topic
            partitions = topic_metadata.partitions

            # Errors expected for new topics
            try:
                kafka.common.check_error(topic_metadata)
            except (UnknownTopicOrPartitionError,
                    LeaderNotAvailableError) as e:

                # Raise if the topic was passed in explicitly
                if topic in topics:
                    raise

                # Otherwise, just log a warning
                log.error('Error loading topic metadata for %s: %s', topic,
                          type(e))
                continue

            self.topic_partitions[topic] = {}
            for partition_metadata in partitions:
                partition = partition_metadata.partition
                leader = partition_metadata.leader

                self.topic_partitions[topic][partition] = partition_metadata

                # Populate topics_to_brokers dict
                topic_part = TopicAndPartition(topic, partition)

                # Check for partition errors
                try:
                    kafka.common.check_error(partition_metadata)

                # If No Leader, topics_to_brokers topic_partition -> None
                except LeaderNotAvailableError:
                    log.error('No leader for topic %s partition %d', topic,
                              partition)
                    self.topics_to_brokers[topic_part] = None
                    continue
                # If one of the replicas is unavailable -- ignore
                # this error code is provided for admin purposes only
                # we never talk to replicas, only the leader
                except ReplicaNotAvailableError:
                    log.debug(
                        'Some (non-leader) replicas not available for topic %s partition %d',
                        topic, partition)

                # If Known Broker, topic_partition -> BrokerMetadata
                if leader in self.brokers:
                    self.topics_to_brokers[topic_part] = self.brokers[leader]

                # If Unknown Broker, fake BrokerMetadata so we don't lose the id
                # (not sure how this could happen. server could be in bad state)
                else:
                    self.topics_to_brokers[topic_part] = BrokerMetadata(
                        leader, None, None)
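A short sketch of the refresh-then-inspect pattern this method enables, combined with get_partition_ids_for_topic from the earlier examples; the topic name is an assumption.

# Hypothetical metadata refresh for a single topic, followed by a lookup
# against the freshly populated topic_partitions map.
client.load_metadata_for_topics('my-topic')
print("Partitions:", client.get_partition_ids_for_topic('my-topic'))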