Esempio n. 1
0
 def test_get_topics_watermarks_invalid_partition_subset(
         self, kafka_client_mock):
     """Asking for partitions [1, 99] of topic1 must raise UnknownPartitions."""
     requested = {'topic1': [1, 99]}
     with pytest.raises(UnknownPartitions):
         get_topics_watermarks(kafka_client_mock, requested)
Esempio n. 2
0
 def test_get_topics_watermarks_unknown_topic_no_fail(self, kafka_client_mock):
     """With raise_on_error=False an unknown topic produces a falsy result."""
     missing_topics = ["something that doesn't exist"]
     watermarks = get_topics_watermarks(
         kafka_client_mock, missing_topics, raise_on_error=False,
     )
     assert not watermarks
Esempio n. 3
0
 def test_get_topics_watermarks_unknown_partitions_no_fail(self, kafka_client_mock):
     """With raise_on_error=False, asking only for partition 99 gives a falsy result."""
     requested = {'topic1': [99]}
     watermarks = get_topics_watermarks(
         kafka_client_mock, requested, raise_on_error=False,
     )
     assert not watermarks
Esempio n. 4
0
def get_watermark_for_regex(
    kafka_client,
    topic_regex,
):
    """Fetch watermarks for every known topic that matches a regex.

    Steps:
        * refresh the client metadata (retried once on KafkaUnavailableError)
        * keep the topics for which ``re.search(topic_regex, topic)`` matches
        * fetch the watermarks of those topics

    :param kafka_client: KafkaToolClient instance
    :param topic_regex: the topic regex
    :returns: the result of ``get_topics_watermarks`` for the matching topics
    """
    # The metadata refresh deliberately takes no topic list: we don't want
    # to accidentally create a topic that does not exist. A single retry is
    # attempted if Kafka is unavailable.
    try:
        kafka_client.load_metadata_for_topics()
    except KafkaUnavailableError:
        kafka_client.load_metadata_for_topics()

    matching_topics = [
        name
        for name in kafka_client.topic_partitions
        if re.search(topic_regex, name)
    ]
    return get_topics_watermarks(kafka_client, matching_topics)
Esempio n. 5
0
 def _verify_offset_ranges(self):
     """Check that every requested offset lies within its partition's watermarks.

     This is to clarify and enforce only using offsets inside of our actual
     offset range to avoid confusing errors such as those found in
     DATAPIPE-628.

     For each topic in ``self.topic_to_offsets_map`` that has a consumer
     topic state, every (partition, offset) pair is compared against the
     low and high watermarks fetched from Kafka. An offset outside
     ``[lowmark, highmark]`` is reported through ``self.option_parser.error``.
     Topics mapped to ``None`` are not checked.
     """
     topic_to_partition_offset_map = {
         topic: None if consumer_topic_state is None
         else consumer_topic_state.partition_offset_map
         for topic, consumer_topic_state in self.topic_to_offsets_map.items()
     }
     # If we import get_topics_watermarks directly from offsets, then mock will not properly patch it in testing.
     watermarks = offsets.get_topics_watermarks(
         self.kafka_client,
         topic_to_partition_offset_map,
         # We do not raise on error as we do this verification later on and we
         # want to keep the error message clear
         raise_on_error=False
     )
     # ``items()`` works on both Python 2 and Python 3 dicts, and matches the
     # call already used to build the map above.
     for topic, partition_offset_map in topic_to_partition_offset_map.items():
         if partition_offset_map is None:
             continue
         for partition, offset in partition_offset_map.items():
             partition_watermarks = watermarks[topic][partition]
             highmark = partition_watermarks.highmark
             lowmark = partition_watermarks.lowmark
             if offset < lowmark or offset > highmark:
                 self.option_parser.error(
                     "Offset ({}) for topic: {} (partition: {}) is out of range ({}-{})".format(
                         offset,
                         topic,
                         partition,
                         lowmark,
                         highmark
                     )
                 )
Esempio n. 6
0
def get_watermark_for_regex(
    kafka_client,
    topic_regex,
):
    """Return the watermarks of all known topics matching ``topic_regex``.

    The client metadata is refreshed first (one retry if Kafka is
    unavailable). Each topic known to the client is then tested with
    ``re.search`` and the watermarks of the matching ones are fetched.

    :param kafka_client: KafkaToolClient instance
    :param topic_regex: the topic regex
    :returns: the result of ``get_topics_watermarks`` for the matching topics
    """
    # No topic list is passed to the refresh on purpose: we don't want to
    # accidentally create the topic if it does not exist. If Kafka is
    # unavailable the load is attempted a second time.
    try:
        kafka_client.load_metadata_for_topics()
    except KafkaUnavailableError:
        kafka_client.load_metadata_for_topics()

    selected = [
        candidate
        for candidate in kafka_client.topic_partitions
        if re.search(topic_regex, candidate)
    ]
    return get_topics_watermarks(kafka_client, selected)
Esempio n. 7
0
def get_watermark_for_regex(
    kafka_client,
    topic_regex,
):
    """Fetch watermarks for every known topic whose name matches a regex.

    Steps:
        * refresh the client metadata (retried once on KafkaUnavailableError)
        * compile ``topic_regex`` and keep the topics for which
          ``pattern.match(topic)`` succeeds (``match`` anchors at the start
          of the topic name)
        * fetch the watermarks of those topics

    :param kafka_client: KafkaToolClient instance
    :param topic_regex: the topic regex
    :returns: the result of ``get_topics_watermarks`` for the matching topics
    """
    # The refresh takes no topic list so that we never accidentally create
    # a topic that does not exist. One retry if Kafka is unavailable.
    try:
        kafka_client.load_metadata_for_topics()
    except KafkaUnavailableError:
        kafka_client.load_metadata_for_topics()

    pattern = re.compile(topic_regex)
    matching_topics = [
        name
        for name in kafka_client.topic_partitions
        if pattern.match(name)
    ]
    return get_topics_watermarks(kafka_client, matching_topics)
Esempio n. 8
0
 def _all_topic_watermarks(self):
     """Return the watermarks of ``self._kafka_topics``.

     The Kafka client is obtained from ``self._kafka_client()`` and used as
     a context manager for the duration of the fetch.
     """
     with self._kafka_client() as client:
         watermarks = offsets.get_topics_watermarks(client, self._kafka_topics)
     return watermarks
Esempio n. 9
0
 def test_get_topics_watermarks_unknown_partitions_no_fail(self, kafka_client_mock):
     """Partition 99 of topic1 yields a falsy result when errors are not raised."""
     only_unknown_partition = {'topic1': [99]}
     result = get_topics_watermarks(
         kafka_client_mock, only_unknown_partition, raise_on_error=False,
     )
     assert not result
Esempio n. 10
0
 def test_get_topics_watermarks_unknown_topic_no_fail(self, kafka_client_mock):
     """An unknown topic yields a falsy result when errors are not raised."""
     unknown_topics = ["something that doesn't exist"]
     result = get_topics_watermarks(
         kafka_client_mock, unknown_topics, raise_on_error=False,
     )
     assert not result
Esempio n. 11
0
 def test_get_topics_watermarks_invalid_partition_subset_no_fail(self, kafka_client_mock):
     """With raise_on_error=False, partition 1 is returned and partition 99 is left out."""
     requested = {'topic1': [1, 99]}
     watermarks = get_topics_watermarks(
         kafka_client_mock, requested, raise_on_error=False,
     )
     assert watermarks['topic1'][1] == PartitionOffsets('topic1', 1, 30, 5)
     assert 99 not in watermarks['topic1']
Esempio n. 12
0
 def test_get_topics_watermarks_invalid_partition_subset_no_fail(self, kafka_client_mock):
     """Partition 1 of topic1 is reported; partition 99 is absent from the result."""
     partitions_by_topic = {'topic1': [1, 99]}
     result = get_topics_watermarks(
         kafka_client_mock, partitions_by_topic, raise_on_error=False,
     )
     assert result['topic1'][1] == PartitionOffsets('topic1', 1, 30, 5)
     assert 99 not in result['topic1']
Esempio n. 13
0
 def test_get_topics_watermarks_commit_error(self, topics, kafka_client_mock):
     """After set_offset_request_error(), partition 0 is reported with marks -1/-1."""
     kafka_client_mock.set_offset_request_error()
     expected = {
         'topic1': {0: PartitionOffsets('topic1', 0, -1, -1)},
     }
     actual = get_topics_watermarks(kafka_client_mock, {'topic1': [0]})
     assert actual == expected
Esempio n. 14
0
 def get_current_watermarks(self):
     """Return the watermarks of the non-empty CONSUMER_OFFSET_TOPIC partitions.

     The consumer's client metadata is refreshed, the watermarks of
     CONSUMER_OFFSET_TOPIC are fetched, and only the partitions whose
     highmark is strictly greater than their lowmark are kept.

     :returns: dict mapping partition to its watermark entry
     """
     client = self.consumer._client
     client.load_metadata_for_topics()
     topic_watermarks = get_topics_watermarks(
         client,
         [CONSUMER_OFFSET_TOPIC],
     )[CONSUMER_OFFSET_TOPIC]
     # ``items()`` rather than the Python-2-only ``iteritems()`` so this
     # runs on both Python 2 and Python 3.
     return {
         partition: offset
         for partition, offset in topic_watermarks.items()
         if offset.highmark > offset.lowmark
     }
Esempio n. 15
0
 def test_get_topics_watermarks_commit_error(self, topics, kafka_client_mock):
     """When the mock fails offset requests, topic1 partition 0 comes back as (-1, -1)."""
     kafka_client_mock.set_offset_request_error()
     failed_partition = PartitionOffsets('topic1', 0, -1, -1)
     result = get_topics_watermarks(kafka_client_mock, {'topic1': [0]})
     assert result == {'topic1': {0: failed_partition}}
Esempio n. 16
0
 def test_get_topics_watermarks(self, topics, kafka_client_mock):
     """The ``topics`` fixture resolves to the three expected topic1 partitions."""
     expected = {
         'topic1': {
             0: PartitionOffsets('topic1', 0, 30, 10),
             1: PartitionOffsets('topic1', 1, 30, 5),
             2: PartitionOffsets('topic1', 2, 30, 3),
         },
     }
     actual = get_topics_watermarks(kafka_client_mock, topics)
     assert actual == expected
Esempio n. 17
0
 def test_get_topics_watermarks(self, topics, kafka_client_mock):
     """Watermarks for the ``topics`` fixture cover partitions 0, 1 and 2 of topic1."""
     topic1_partitions = {
         0: PartitionOffsets('topic1', 0, 30, 10),
         1: PartitionOffsets('topic1', 1, 30, 5),
         2: PartitionOffsets('topic1', 2, 30, 3),
     }
     result = get_topics_watermarks(kafka_client_mock, topics)
     assert result == {'topic1': topic1_partitions}
Esempio n. 18
0
 def get_current_watermarks(self, partitions=None):
     """Return watermarks for the non-empty partitions of CONSUMER_OFFSET_TOPIC.

     A KafkaToolClient is created from ``self.kafka_config.broker_list``,
     its metadata for CONSUMER_OFFSET_TOPIC is loaded, and the topic's
     watermarks are fetched. Only partitions whose highmark is strictly
     greater than their lowmark are kept.

     :param partitions: optional iterable of objects exposing a
         ``partition`` attribute. ``None`` (the default) means no
         filtering; an empty iterable selects no partition at all.
     :returns: dict mapping partition to its watermark entry
     """
     client = KafkaToolClient(self.kafka_config.broker_list)
     client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
     offsets = get_topics_watermarks(
         client,
         [CONSUMER_OFFSET_TOPIC],
     )
     # Only ``None`` disables filtering. The set is built for any other
     # value, including an empty collection, so the membership test below
     # never runs against ``None`` (which raised TypeError before).
     wanted = None if partitions is None else {tp.partition for tp in partitions}
     return {
         part: offset
         for part, offset in six.iteritems(offsets[CONSUMER_OFFSET_TOPIC])
         if offset.highmark > offset.lowmark and
         (wanted is None or part in wanted)
     }
Esempio n. 19
0
 def get_current_watermarks(self, partitions=None):
     """Return watermarks for the non-empty partitions of CONSUMER_OFFSET_TOPIC.

     Builds a KafkaToolClient from ``self.kafka_config.broker_list``, loads
     the metadata for CONSUMER_OFFSET_TOPIC and fetches its watermarks.
     Partitions whose highmark is not strictly greater than their lowmark
     are dropped.

     :param partitions: optional iterable of objects exposing a
         ``partition`` attribute. ``None`` (the default) means no
         filtering; an empty iterable selects no partition at all.
     :returns: dict mapping partition to its watermark entry
     """
     client = KafkaToolClient(self.kafka_config.broker_list)
     client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
     offsets = get_topics_watermarks(
         client,
         [CONSUMER_OFFSET_TOPIC],
     )
     # The filter is disabled only for ``None``. Previously an empty
     # ``partitions`` left the set as ``None`` while the condition still
     # evaluated ``part in partitions_set``, raising TypeError.
     partitions_set = (
         None if partitions is None
         else {tp.partition for tp in partitions}
     )
     return {
         part: offset
         for part, offset in six.iteritems(offsets[CONSUMER_OFFSET_TOPIC])
         if offset.highmark > offset.lowmark and
         (partitions_set is None or part in partitions_set)
     }
Esempio n. 20
0
 def get_current_watermarks(self):
     """Return the watermarks of the non-empty CONSUMER_OFFSET_TOPIC partitions.

     Refreshes the metadata of the consumer's client, fetches the
     watermarks of CONSUMER_OFFSET_TOPIC and keeps the partitions whose
     highmark is strictly greater than their lowmark.

     :returns: dict mapping partition to its watermark entry
     """
     self.consumer._client.load_metadata_for_topics()
     offsets = get_topics_watermarks(
         self.consumer._client,
         [CONSUMER_OFFSET_TOPIC],
     )
     # ``items()`` is available on both Python 2 and Python 3 dicts,
     # whereas ``iteritems()`` exists only on Python 2.
     return {
         partition: offset
         for partition, offset in
         offsets[CONSUMER_OFFSET_TOPIC].items()
         if offset.highmark > offset.lowmark
     }
Esempio n. 21
0
def get_consumer_offsets_metadata(
    kafka_client,
    group,
    topics,
    raise_on_error=True,
    offset_storage='kafka',
):
    """This method:
        * refreshes metadata for the kafka client
        * fetches group offsets
        * fetches watermarks

    :param kafka_client: KafkaToolClient instance
    :param group: group id
    :param topics: list of topics
    :param raise_on_error: if False the method ignores missing topics and
      missing partitions. It still may fail on the request send.
    :param offset_storage: String, one of {zookeeper, kafka, dual}.
    :returns: dict <topic>: [ConsumerPartitionOffsets]
    :raises:
      :py:class:`kafka_utils.util.error.InvalidOffsetStorageError: upon unknown
      offset_storage choice.
    """
    # Refresh client metadata. We do not use the topic list, because we
    # don't want to accidentally create the topic if it does not exist.
    # If Kafka is unavailable, let's retry loading client metadata
    try:
        kafka_client.load_metadata_for_topics()
    except KafkaUnavailableError:
        kafka_client.load_metadata_for_topics()

    group_offsets = get_current_offsets(kafka_client, group, topics,
                                        raise_on_error, offset_storage)

    watermarks = get_topics_watermarks(kafka_client, topics, raise_on_error)

    result = {}
    # ``items()`` instead of the Python-2-only ``iteritems()`` keeps this
    # loop working on both Python 2 and Python 3.
    for topic, partitions in group_offsets.items():
        result[topic] = [
            ConsumerPartitionOffsets(
                topic=topic,
                partition=partition,
                current=group_offsets[topic][partition],
                highmark=watermarks[topic][partition].highmark,
                lowmark=watermarks[topic][partition].lowmark,
            ) for partition in partitions
        ]
    return result
Esempio n. 22
0
    def _verify_position_and_highwatermarks(self, topics, producer,
                                            message_count):
        """Assert that the producer's tracked offsets and position match Kafka.

        For every topic, the high watermark of partition 0 fetched from
        Kafka must equal the offset stored in the producer's position
        tracker. The producer's last published position must equal
        ``message_count``.
        """
        topics_details = get_topics_watermarks(
            kafka_client=producer._kafka_producer.kafka_client,
            topics=topics,
            raise_on_error=True)
        position_tracker = producer._kafka_producer.position_data_tracker

        for topic in topics:
            # Partition 0's high watermark, read through the named field
            # instead of positional index 2 of the entry.
            actual_hwm = topics_details[topic][0].highmark
            expected_hwm = position_tracker.topic_to_kafka_offset_map[topic]
            assert actual_hwm == expected_hwm

        position_info = producer.get_checkpoint_position_data()
        last_position = position_info.last_published_message_position_info
        assert last_position['position'] == message_count
Esempio n. 23
0
    def _populate_topics_to_offset_map(self, responses):
        """Map each produced-to topic to the high watermark of its partition 0.

        Only ``ProduceResponse`` items in ``responses`` contribute topics;
        anything else is ignored. Watermarks are fetched with
        ``raise_on_error=True``.

        :param responses: iterable of responses from a produce request
        :returns: dict mapping topic to the highmark of partition 0
        """
        topics_from_responses = [
            response.topic for response in responses
            if isinstance(response, ProduceResponse)
        ]

        topics_watermarks = get_topics_watermarks(
            kafka_client=self.kafka_client,
            topics=topics_from_responses,
            raise_on_error=True
        )
        # ``items()`` works on both Python 2 and Python 3 dicts, unlike the
        # Python-2-only ``iteritems()``.
        return {
            topic: partition_offsets[0].highmark
            for topic, partition_offsets in topics_watermarks.items()
        }
Esempio n. 24
0
def get_consumer_offsets_metadata(
    kafka_client,
    group,
    topics,
    raise_on_error=True,
):
    """Combine a group's committed offsets with the topic watermarks.

    Steps:
        * refresh the client metadata (retried once on KafkaUnavailableError)
        * fetch the group's current offsets
        * fetch the watermarks
        * build one ConsumerPartitionOffsets per (topic, partition) found
          in the group offsets

    :param kafka_client: KafkaToolClient instance
    :param group: group id
    :param topics: list of topics
    :param raise_on_error: if False the method ignores missing topics and
      missing partitions. It still may fail on the request send.
    :returns: dict <topic>: [ConsumerPartitionOffsets]
    """
    # The refresh takes no topic list so that we never accidentally create
    # a topic that does not exist. One retry if Kafka is unavailable.
    try:
        kafka_client.load_metadata_for_topics()
    except KafkaUnavailableError:
        kafka_client.load_metadata_for_topics()

    group_offsets = get_current_consumer_offsets(
        kafka_client, group, topics, raise_on_error
    )
    watermarks = get_topics_watermarks(kafka_client, topics, raise_on_error)

    metadata = {}
    for topic, partitions in six.iteritems(group_offsets):
        entries = []
        for partition in partitions:
            entries.append(ConsumerPartitionOffsets(
                topic=topic,
                partition=partition,
                current=group_offsets[topic][partition],
                highmark=watermarks[topic][partition].highmark,
                lowmark=watermarks[topic][partition].lowmark,
            ))
        metadata[topic] = entries
    return metadata
Esempio n. 25
0
def get_consumer_offsets_metadata(
    kafka_client,
    group,
    topics,
    raise_on_error=True,
):
    """Return, per topic, the group's offset and watermarks of each partition.

    The client metadata is refreshed (one retry if Kafka is unavailable),
    then the group's current offsets and the topic watermarks are fetched
    and merged into ConsumerPartitionOffsets entries.

    :param kafka_client: KafkaToolClient instance
    :param group: group id
    :param topics: list of topics
    :param raise_on_error: if False the method ignores missing topics and
      missing partitions. It still may fail on the request send.
    :returns: dict <topic>: [ConsumerPartitionOffsets]
    """
    # No topic list is passed to the refresh on purpose: we don't want to
    # accidentally create the topic if it does not exist.
    try:
        kafka_client.load_metadata_for_topics()
    except KafkaUnavailableError:
        kafka_client.load_metadata_for_topics()

    group_offsets = get_current_consumer_offsets(
        kafka_client,
        group,
        topics,
        raise_on_error,
    )
    watermarks = get_topics_watermarks(kafka_client, topics, raise_on_error)

    offsets_by_topic = {}
    for topic, partitions in six.iteritems(group_offsets):
        topic_entries = []
        for partition in partitions:
            topic_entries.append(ConsumerPartitionOffsets(
                topic=topic,
                partition=partition,
                current=group_offsets[topic][partition],
                highmark=watermarks[topic][partition].highmark,
                lowmark=watermarks[topic][partition].lowmark,
            ))
        offsets_by_topic[topic] = topic_entries
    return offsets_by_topic
Esempio n. 26
0
def get_first_offset_at_or_after_start_timestamp(kafka_client, topics, start_timestamp):
    """Find, for each topic, the first offset at or after ``start_timestamp``.

    The search is a binary search over each topic's watermark range. When
    several messages share a timestamp, the first one (closest to the low
    mark) is returned. The resulting map can be used to set offsets.

    Watermarks are fetched with ``raise_on_error=False``. The loop below
    runs until the map of pending topics is empty, so it relies on the
    helper functions removing topics from that map as they are resolved.

    :param kafka_client: kafka client to be used for getting watermarks and binary search.
    :param topics: a list of topics. eg. ['test_topic_1', 'test_topic_2']
    :param start_timestamp: epoch timestamp eg. 1463086536

    :returns: a dict mapping each topic to a consumer topic state holding
              the offsets found.
              eg.
              {'test_topic_1': ConsumerTopicState({0: 43}, None),
              'test_topic_2': ConsumerTopicState({0: 55, 1: 32}, None)}
    """
    topic_watermarks = offsets.get_topics_watermarks(
        kafka_client, topics, raise_on_error=False
    )

    pending_states = _build_topic_to_consumer_topic_state_map(topic_watermarks)
    search_ranges = _build_topic_to_range_map(topic_watermarks)
    resolved_states = _build_empty_topic_to_consumer_topic_state_map(topics)

    # Topics that need no searching are moved to the result straight away.
    _move_finished_topics_to_result_map(
        pending_states, search_ranges, resolved_states
    )
    while pending_states:
        _get_message_and_alter_range(
            start_timestamp, pending_states, search_ranges, resolved_states
        )

    logger.info(
        "Got topic offsets based on start-date: {}".format(resolved_states)
    )
    return resolved_states
Esempio n. 27
0
def get_actual_published_messages_count(
    kafka_client,
    topics,
    topic_tracked_offset_map,
    raise_on_error=True,
):
    """Get the actual number of published messages of specified topics.

    The count of a topic is the high watermark of its partition 0 minus
    the tracked offset of that topic (0 when the topic is not tracked).

    Args:
        kafka_client (kafka.client.KafkaClient): kafka client
        topics ([str]): List of topic names to get message count
        topic_tracked_offset_map (dict(str, int)): dictionary which
            contains each topic and its current stored offset value.
        raise_on_error (Optional[bool]): if False,  the function ignores
            missing topics and missing partitions. It still may fail on
            the request send.  Default to True.

    Returns:
        dict(str, int): Each topic and its actual published messages count
            since last offset.  If a topic or partition is missing when
            `raise_on_error` is False, the returned dict will not contain
            the missing topic.

    Raises:
        :class:`~yelp_kafka.error.UnknownTopic`: upon missing topics and
            raise_on_error=True
        :class:`~yelp_kafka.error.UnknownPartition`: upon missing partitions
        and raise_on_error=True
        FailedPayloadsError: upon send request error.
    """
    topic_watermarks = get_topics_watermarks(
        kafka_client,
        topics,
        raise_on_error=raise_on_error
    )

    # ``items()`` works on both Python 2 and Python 3 dicts, unlike the
    # Python-2-only ``iteritems()``.
    return {
        topic: partition_offsets[0].highmark - topic_tracked_offset_map.get(topic, 0)
        for topic, partition_offsets in topic_watermarks.items()
    }
Esempio n. 28
0
def get_actual_published_messages_count(
    kafka_client,
    topics,
    topic_tracked_offset_map,
    raise_on_error=True,
):
    """Get the actual number of published messages of specified topics.

    For each topic returned by the watermark lookup, the count is the high
    watermark of partition 0 minus the topic's tracked offset (0 when the
    topic has no tracked offset).

    Args:
        kafka_client (kafka.client.KafkaClient): kafka client
        topics ([str]): List of topic names to get message count
        topic_tracked_offset_map (dict(str, int)): dictionary which
            contains each topic and its current stored offset value.
        raise_on_error (Optional[bool]): if False,  the function ignores
            missing topics and missing partitions. It still may fail on
            the request send.  Default to True.

    Returns:
        dict(str, int): Each topic and its actual published messages count
            since last offset.  If a topic or partition is missing when
            `raise_on_error` is False, the returned dict will not contain
            the missing topic.

    Raises:
        :class:`~yelp_kafka.error.UnknownTopic`: upon missing topics and
            raise_on_error=True
        :class:`~yelp_kafka.error.UnknownPartition`: upon missing partitions
        and raise_on_error=True
        FailedPayloadsError: upon send request error.
    """
    topic_watermarks = get_topics_watermarks(kafka_client,
                                             topics,
                                             raise_on_error=raise_on_error)

    topic_to_published_msgs_count = {}
    # ``items()`` instead of the Python-2-only ``iteritems()`` so the loop
    # also runs on Python 3.
    for topic, partition_offsets in topic_watermarks.items():
        high_watermark = partition_offsets[0].highmark
        offset = topic_tracked_offset_map.get(topic, 0)
        topic_to_published_msgs_count[topic] = high_watermark - offset

    return topic_to_published_msgs_count
Esempio n. 29
0
    def _verify_position_and_highwatermarks(
        self,
        topics,
        producer,
        message_count
    ):
        """Assert that the producer's tracked offsets and position match Kafka.

        For every topic, the high watermark of partition 0 fetched from
        Kafka must equal the offset recorded by the producer's position
        tracker. The last published position reported by the producer must
        equal ``message_count``.
        """
        topics_details = get_topics_watermarks(
            kafka_client=producer._kafka_producer.kafka_client,
            topics=topics,
            raise_on_error=True
        )
        position_tracker = producer._kafka_producer.position_data_tracker

        for topic in topics:
            # Read partition 0's high watermark by field name rather than
            # by positional index 2.
            actual_hwm = topics_details[topic][0].highmark
            expected_hwm = position_tracker.topic_to_kafka_offset_map[topic]
            assert actual_hwm == expected_hwm

        position_info = producer.get_checkpoint_position_data()
        last_position = position_info.last_published_message_position_info
        assert last_position['position'] == message_count
Esempio n. 30
0
def get_watermark_for_topic(
    kafka_client,
    topic,
):
    """Fetch the watermarks of a single topic.

    The client metadata is refreshed first (one retry if Kafka is
    unavailable), then the watermarks of ``topic`` are requested.

    :param kafka_client: KafkaToolClient instance
    :param topic: the topic
    :returns: the result of ``get_topics_watermarks`` for ``[topic]``
    """
    # The refresh takes no topic list so that we never accidentally create
    # a topic that does not exist.
    try:
        kafka_client.load_metadata_for_topics()
    except KafkaUnavailableError:
        kafka_client.load_metadata_for_topics()

    single_topic = [topic]
    return get_topics_watermarks(kafka_client, single_topic)
Esempio n. 31
0
def get_watermark_for_topic(
    kafka_client,
    topic,
):
    """Return the watermarks of ``topic``.

    Steps:
        * refresh the client metadata (retried once on KafkaUnavailableError)
        * fetch the watermarks of the one requested topic

    :param kafka_client: KafkaToolClient instance
    :param topic: the topic
    :returns: the result of ``get_topics_watermarks`` for ``[topic]``
    """
    # No topic list is passed to the refresh on purpose: we don't want to
    # accidentally create the topic if it does not exist.
    try:
        kafka_client.load_metadata_for_topics()
    except KafkaUnavailableError:
        kafka_client.load_metadata_for_topics()

    requested = [topic]
    return get_topics_watermarks(kafka_client, requested)
Esempio n. 32
0
 def test_get_topics_watermarks_unknown_topic(self, kafka_client_mock):
     """Requesting an unknown topic must raise UnknownTopic."""
     unknown_topics = ["something that doesn't exist"]
     with pytest.raises(UnknownTopic):
         get_topics_watermarks(kafka_client_mock, unknown_topics)
Esempio n. 33
0
 def test_get_topics_watermarks_unknown_partitions(self, kafka_client_mock):
     """Requesting partition 99 of topic1 must raise UnknownPartitions."""
     requested = {'topic1': [99]}
     with pytest.raises(UnknownPartitions):
         get_topics_watermarks(kafka_client_mock, requested)
Esempio n. 34
0
 def test_get_topics_watermarks_unknown_partitions(self, kafka_client_mock):
     """A request for only partition 99 of topic1 is expected to raise UnknownPartitions."""
     only_unknown_partition = {'topic1': [99]}
     with pytest.raises(UnknownPartitions):
         get_topics_watermarks(kafka_client_mock, only_unknown_partition)
Esempio n. 35
0
 def test_get_topics_watermarks_invalid_arguments(self, kafka_client_mock):
     """Passing a plain string as the topics argument must raise TypeError."""
     not_a_collection = "this should be a list or dict"
     with pytest.raises(TypeError):
         get_topics_watermarks(kafka_client_mock, not_a_collection)
Esempio n. 36
0
 def test_get_topics_watermarks_invalid_arguments(self, kafka_client_mock):
     """A string in place of the topics list/dict is expected to raise TypeError."""
     bad_topics = "this should be a list or dict"
     with pytest.raises(TypeError):
         get_topics_watermarks(kafka_client_mock, bad_topics)
Esempio n. 37
0
 def test_get_topics_watermarks_unknown_topic(self, kafka_client_mock):
     """An unknown topic is expected to raise UnknownTopic by default."""
     missing_topics = ["something that doesn't exist"]
     with pytest.raises(UnknownTopic):
         get_topics_watermarks(kafka_client_mock, missing_topics)
Esempio n. 38
0
 def _all_topic_watermarks(self):
     """Fetch the watermarks of ``self._kafka_topics``.

     The client comes from ``self._kafka_client()`` and is used as a
     context manager around the fetch.
     """
     with self._kafka_client() as client:
         watermarks = offsets.get_topics_watermarks(client, self._kafka_topics)
     return watermarks
Esempio n. 39
0
 def test_get_topics_watermarks_invalid_partition_subset(self, kafka_client_mock):
     """A partition list of [1, 99] for topic1 is expected to raise UnknownPartitions."""
     partitions_by_topic = {'topic1': [1, 99]}
     with pytest.raises(UnknownPartitions):
         get_topics_watermarks(kafka_client_mock, partitions_by_topic)