def test_get_topics_watermarks_invalid_partition_subset(self, kafka_client_mock):
    with pytest.raises(UnknownPartitions):
        get_topics_watermarks(
            kafka_client_mock,
            {'topic1': [1, 99]},
        )
def test_get_topics_watermarks_unknown_topic_no_fail(self, kafka_client_mock):
    actual = get_topics_watermarks(
        kafka_client_mock,
        ["something that doesn't exist"],
        raise_on_error=False,
    )
    assert not actual
def test_get_topics_watermarks_unknown_partitions_no_fail(self, kafka_client_mock):
    actual = get_topics_watermarks(
        kafka_client_mock,
        {'topic1': [99]},
        raise_on_error=False,
    )
    assert not actual
def get_watermark_for_regex(
    kafka_client,
    topic_regex,
):
    """This method:
        * refreshes metadata for the kafka client
        * fetches watermarks

    :param kafka_client: KafkaToolClient instance
    :param topic_regex: the topic regex
    :returns: dict <topic>: [ConsumerPartitionOffsets]
    """
    # Refresh client metadata. We do not use the topic list, because we
    # don't want to accidentally create the topic if it does not exist.
    # If Kafka is unavailable, let's retry loading client metadata.
    try:
        kafka_client.load_metadata_for_topics()
    except KafkaUnavailableError:
        kafka_client.load_metadata_for_topics()
    topics_to_be_considered = []
    for topic in kafka_client.topic_partitions:
        if re.search(topic_regex, topic):
            topics_to_be_considered.append(topic)
    watermarks = get_topics_watermarks(
        kafka_client, topics_to_be_considered
    )
    return watermarks
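# A minimal usage sketch for get_watermark_for_regex. The broker list and the
# topic pattern below are illustrative assumptions, not taken from the
# original code.
def example_regex_watermarks():
    kafka_client = KafkaToolClient(['kafka-broker:9092'])
    watermarks = get_watermark_for_regex(kafka_client, r'payments\.')
    for topic, partition_offsets in watermarks.items():
        for partition, marks in partition_offsets.items():
            print(topic, partition, marks.lowmark, marks.highmark)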
def _verify_offset_ranges(self):
    """Enforce that only offsets inside our actual offset range are used,
    to avoid confusing errors such as those found in DATAPIPE-628."""
    topic_to_partition_offset_map = {
        topic: None if consumer_topic_state is None
        else consumer_topic_state.partition_offset_map
        for topic, consumer_topic_state in self.topic_to_offsets_map.items()
    }
    # If we import get_topics_watermarks directly from offsets, then mock
    # will not properly patch it in testing.
    watermarks = offsets.get_topics_watermarks(
        self.kafka_client,
        topic_to_partition_offset_map,
        # We do not raise on error since we do this verification later on
        # and we want to keep the error message clear.
        raise_on_error=False
    )
    for topic, partition_offset_map in topic_to_partition_offset_map.iteritems():
        if partition_offset_map is not None:
            for partition, offset in partition_offset_map.iteritems():
                highmark = watermarks[topic][partition].highmark
                lowmark = watermarks[topic][partition].lowmark
                if offset < lowmark or offset > highmark:
                    self.option_parser.error(
                        "Offset ({}) for topic: {} (partition: {}) is out of "
                        "range ({}-{})".format(
                            offset, topic, partition, lowmark, highmark
                        )
                    )
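# The same range check as above, extracted into a standalone sketch. The
# watermarks argument is assumed to have the shape returned by
# get_topics_watermarks (topic -> partition -> record with lowmark/highmark
# attributes); the function name and the ValueError are illustrative.
def check_offset_in_range(watermarks, topic, partition, offset):
    marks = watermarks[topic][partition]
    if offset < marks.lowmark or offset > marks.highmark:
        raise ValueError(
            "Offset ({}) for topic: {} (partition: {}) is out of range "
            "({}-{})".format(offset, topic, partition, marks.lowmark, marks.highmark)
        )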
def get_watermark_for_regex(
    kafka_client,
    topic_regex,
):
    """This method:
        * refreshes metadata for the kafka client
        * fetches watermarks

    :param kafka_client: KafkaToolClient instance
    :param topic_regex: the topic regex
    :returns: dict <topic>: [ConsumerPartitionOffsets]
    """
    # Refresh client metadata. We do not use the topic list, because we
    # don't want to accidentally create the topic if it does not exist.
    # If Kafka is unavailable, let's retry loading client metadata.
    try:
        kafka_client.load_metadata_for_topics()
    except KafkaUnavailableError:
        kafka_client.load_metadata_for_topics()
    topics_to_be_considered = []
    topic_regex = re.compile(topic_regex)
    for topic in kafka_client.topic_partitions:
        if topic_regex.match(topic):
            topics_to_be_considered.append(topic)
    watermarks = get_topics_watermarks(kafka_client, topics_to_be_considered)
    return watermarks
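# Note: the two variants of get_watermark_for_regex above differ in anchoring.
# re.search matches the pattern anywhere in the topic name, while the compiled
# pattern's .match only matches at the start. A quick illustration (the topic
# name below is made up):
import re

assert re.search(r'payments', 'billing.payments.events')          # matches anywhere
assert re.match(r'payments', 'billing.payments.events') is None   # anchored at start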
def _all_topic_watermarks(self):
    topics = self._kafka_topics
    with self._kafka_client() as kafka_client:
        return offsets.get_topics_watermarks(
            kafka_client,
            topics
        )
def test_get_topics_watermarks_invalid_partition_subset_no_fail(self, kafka_client_mock):
    actual = get_topics_watermarks(
        kafka_client_mock,
        {'topic1': [1, 99]},
        raise_on_error=False,
    )
    assert actual['topic1'][1] == PartitionOffsets('topic1', 1, 30, 5)
    assert 99 not in actual['topic1']
def test_get_topics_watermarks_commit_error(self, topics, kafka_client_mock):
    kafka_client_mock.set_offset_request_error()
    actual = get_topics_watermarks(
        kafka_client_mock,
        {'topic1': [0]},
    )
    assert actual == {'topic1': {
        0: PartitionOffsets('topic1', 0, -1, -1),
    }}
def get_current_watermarks(self):
    self.consumer._client.load_metadata_for_topics()
    offsets = get_topics_watermarks(
        self.consumer._client,
        [CONSUMER_OFFSET_TOPIC],
    )
    return {
        partition: offset
        for partition, offset in offsets[CONSUMER_OFFSET_TOPIC].iteritems()
        if offset.highmark > offset.lowmark
    }
def test_get_topics_watermarks(self, topics, kafka_client_mock):
    actual = get_topics_watermarks(
        kafka_client_mock,
        topics,
    )
    assert actual == {'topic1': {
        0: PartitionOffsets('topic1', 0, 30, 10),
        1: PartitionOffsets('topic1', 1, 30, 5),
        2: PartitionOffsets('topic1', 2, 30, 3),
    }}
def get_current_watermarks(self, partitions=None):
    client = KafkaToolClient(self.kafka_config.broker_list)
    client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
    offsets = get_topics_watermarks(
        client,
        [CONSUMER_OFFSET_TOPIC],
    )
    partitions_set = set(tp.partition for tp in partitions) if partitions else None
    return {
        part: offset
        for part, offset in six.iteritems(offsets[CONSUMER_OFFSET_TOPIC])
        if offset.highmark > offset.lowmark and
        (partitions is None or part in partitions_set)
    }
def get_consumer_offsets_metadata(
    kafka_client,
    group,
    topics,
    raise_on_error=True,
    offset_storage='kafka',
):
    """This method:
        * refreshes metadata for the kafka client
        * fetches group offsets
        * fetches watermarks

    :param kafka_client: KafkaToolClient instance
    :param group: group id
    :param topics: list of topics
    :param raise_on_error: if False the method ignores missing topics and
        missing partitions. It still may fail on the request send.
    :param offset_storage: String, one of {zookeeper, kafka, dual}.
    :returns: dict <topic>: [ConsumerPartitionOffsets]
    :raises:
        :py:class:`kafka_utils.util.error.InvalidOffsetStorageError`: upon
        unknown offset_storage choice.
    """
    # Refresh client metadata. We do not use the topic list, because we
    # don't want to accidentally create the topic if it does not exist.
    # If Kafka is unavailable, let's retry loading client metadata.
    try:
        kafka_client.load_metadata_for_topics()
    except KafkaUnavailableError:
        kafka_client.load_metadata_for_topics()
    group_offsets = get_current_offsets(
        kafka_client, group, topics, raise_on_error, offset_storage
    )
    watermarks = get_topics_watermarks(kafka_client, topics, raise_on_error)
    result = {}
    for topic, partitions in group_offsets.iteritems():
        result[topic] = [
            ConsumerPartitionOffsets(
                topic=topic,
                partition=partition,
                current=group_offsets[topic][partition],
                highmark=watermarks[topic][partition].highmark,
                lowmark=watermarks[topic][partition].lowmark,
            )
            for partition in partitions
        ]
    return result
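# A usage sketch for get_consumer_offsets_metadata; the group id and topic
# name are illustrative. offset_storage picks where the group offsets are
# read from ('zookeeper', 'kafka', or 'dual'), per the docstring above.
def example_consumer_offsets(kafka_client):
    metadata = get_consumer_offsets_metadata(
        kafka_client,
        'my-consumer-group',
        ['topic1'],
        offset_storage='kafka',
    )
    for p in metadata['topic1']:
        print(p.partition, p.current, p.lowmark, p.highmark)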
def _verify_position_and_highwatermarks(self, topics, producer, message_count):
    topics_details = get_topics_watermarks(
        kafka_client=producer._kafka_producer.kafka_client,
        topics=topics,
        raise_on_error=True)
    position_tracker = producer._kafka_producer.position_data_tracker
    for topic in topics:
        # Index 2 of the PartitionOffsets tuple is the highmark of partition 0.
        actual_hwm = topics_details[topic][0][2]
        expected_hwm = position_tracker.topic_to_kafka_offset_map[topic]
        assert actual_hwm == expected_hwm
    position_info = producer.get_checkpoint_position_data()
    last_position = position_info.last_published_message_position_info
    assert last_position['position'] == message_count
def _populate_topics_to_offset_map(self, responses):
    topics_from_responses = [
        response.topic
        for response in responses
        if isinstance(response, ProduceResponse)
    ]
    topics_watermarks = get_topics_watermarks(
        kafka_client=self.kafka_client,
        topics=topics_from_responses,
        raise_on_error=True
    )
    topics_watermarks = {
        topic: partition_offsets[0].highmark
        for topic, partition_offsets in topics_watermarks.iteritems()
    }
    return topics_watermarks
def get_consumer_offsets_metadata(
    kafka_client,
    group,
    topics,
    raise_on_error=True,
):
    """This method:
        * refreshes metadata for the kafka client
        * fetches group offsets
        * fetches watermarks

    :param kafka_client: KafkaToolClient instance
    :param group: group id
    :param topics: list of topics
    :param raise_on_error: if False the method ignores missing topics and
        missing partitions. It still may fail on the request send.
    :returns: dict <topic>: [ConsumerPartitionOffsets]
    """
    # Refresh client metadata. We do not use the topic list, because we
    # don't want to accidentally create the topic if it does not exist.
    # If Kafka is unavailable, let's retry loading client metadata.
    try:
        kafka_client.load_metadata_for_topics()
    except KafkaUnavailableError:
        kafka_client.load_metadata_for_topics()
    group_offsets = get_current_consumer_offsets(
        kafka_client, group, topics, raise_on_error
    )
    watermarks = get_topics_watermarks(
        kafka_client, topics, raise_on_error
    )
    result = {}
    for topic, partitions in six.iteritems(group_offsets):
        result[topic] = [
            ConsumerPartitionOffsets(
                topic=topic,
                partition=partition,
                current=group_offsets[topic][partition],
                highmark=watermarks[topic][partition].highmark,
                lowmark=watermarks[topic][partition].lowmark,
            )
            for partition in partitions
        ]
    return result
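# A sketch of deriving per-partition consumer lag from the result above.
# Defining lag as highmark minus current offset is an assumption consistent
# with the ConsumerPartitionOffsets fields, not something this code asserts.
def example_consumer_lag(kafka_client, group, topics):
    metadata = get_consumer_offsets_metadata(kafka_client, group, topics)
    return {
        (p.topic, p.partition): p.highmark - p.current
        for partitions in metadata.values()
        for p in partitions
    }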
def get_first_offset_at_or_after_start_timestamp(kafka_client, topics, start_timestamp):
    """Uses binary search to find the first offset that comes after
    start_timestamp for each topic in topics. If multiple items are present
    for a timestamp, the first one (closest to low_mark) is returned.

    Outputs a result_topic_to_consumer_topic_state_map which can be used to
    set offsets.

    :param kafka_client: kafka client to be used for getting watermarks and
        binary search.
    :param topics: a list of topics. eg. ['test_topic_1', 'test_topic_2']
    :param start_timestamp: epoch timestamp eg. 1463086536
    :returns: a dict mapping each topic to a ConsumerTopicState holding the
        first offset at or after start_timestamp per partition.
        eg. {'test_topic_1': ConsumerTopicState({0: 43}, None),
        'test_topic_2': ConsumerTopicState({0: 55, 1: 32}, None)}
    """
    watermarks = offsets.get_topics_watermarks(
        kafka_client,
        topics,
        raise_on_error=False
    )
    topic_to_consumer_topic_state_map = _build_topic_to_consumer_topic_state_map(watermarks)
    topic_to_range_map = _build_topic_to_range_map(watermarks)
    result_topic_to_consumer_topic_state_map = _build_empty_topic_to_consumer_topic_state_map(topics)
    _move_finished_topics_to_result_map(
        topic_to_consumer_topic_state_map,
        topic_to_range_map,
        result_topic_to_consumer_topic_state_map
    )
    while topic_to_consumer_topic_state_map:
        _get_message_and_alter_range(
            start_timestamp,
            topic_to_consumer_topic_state_map,
            topic_to_range_map,
            result_topic_to_consumer_topic_state_map
        )
    logger.info(
        "Got topic offsets based on start-date: {}".format(
            result_topic_to_consumer_topic_state_map
        )
    )
    return result_topic_to_consumer_topic_state_map
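# A usage sketch for the timestamp-based search; the topic names and the
# epoch timestamp come from the docstring's own examples. Each returned
# ConsumerTopicState exposes a partition_offset_map of partition -> first
# offset at or after start_timestamp.
def example_offsets_for_start_date(kafka_client):
    state_map = get_first_offset_at_or_after_start_timestamp(
        kafka_client,
        ['test_topic_1', 'test_topic_2'],
        start_timestamp=1463086536,
    )
    for topic, state in state_map.items():
        print(topic, state.partition_offset_map)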
def get_actual_published_messages_count(
    kafka_client,
    topics,
    topic_tracked_offset_map,
    raise_on_error=True,
):
    """Get the actual number of published messages of specified topics.

    Args:
        kafka_client (kafka.client.KafkaClient): kafka client
        topics ([str]): List of topic names to get message count
        topic_tracked_offset_map (dict(str, int)): dictionary which contains
            each topic and its current stored offset value.
        raise_on_error (Optional[bool]): if False, the function ignores
            missing topics and missing partitions. It still may fail on the
            request send. Defaults to True.

    Returns:
        dict(str, int): Each topic and its actual published messages count
        since the last offset. If a topic or partition is missing when
        `raise_on_error` is False, the returned dict will not contain the
        missing topic.

    Raises:
        :class:`~yelp_kafka.error.UnknownTopic`: upon missing topics and
            raise_on_error=True
        :class:`~yelp_kafka.error.UnknownPartition`: upon missing partitions
            and raise_on_error=True
        FailedPayloadsError: upon send request error.
    """
    topic_watermarks = get_topics_watermarks(
        kafka_client,
        topics,
        raise_on_error=raise_on_error
    )
    topic_to_published_msgs_count = {}
    for topic, partition_offsets in topic_watermarks.iteritems():
        high_watermark = partition_offsets[0].highmark
        offset = topic_tracked_offset_map.get(topic, 0)
        topic_to_published_msgs_count[topic] = high_watermark - offset
    return topic_to_published_msgs_count
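# A usage sketch: the tracked offsets below are illustrative values standing
# in for offsets persisted by the caller.
def example_published_counts(kafka_client):
    counts = get_actual_published_messages_count(
        kafka_client,
        ['topic1', 'topic2'],
        topic_tracked_offset_map={'topic1': 25, 'topic2': 0},
    )
    # e.g. {'topic1': <highmark - 25>, 'topic2': <highmark - 0>}
    return counts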
def get_watermark_for_topic(
    kafka_client,
    topic,
):
    """This method:
        * refreshes metadata for the kafka client
        * fetches watermarks

    :param kafka_client: KafkaToolClient instance
    :param topic: the topic
    :returns: dict <topic>: [ConsumerPartitionOffsets]
    """
    # Refresh client metadata. We do not use the topic list, because we
    # don't want to accidentally create the topic if it does not exist.
    # If Kafka is unavailable, let's retry loading client metadata.
    try:
        kafka_client.load_metadata_for_topics()
    except KafkaUnavailableError:
        kafka_client.load_metadata_for_topics()
    watermarks = get_topics_watermarks(kafka_client, [topic])
    return watermarks
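# A usage sketch for get_watermark_for_topic; the topic name is illustrative.
def example_topic_watermarks(kafka_client):
    watermarks = get_watermark_for_topic(kafka_client, 'topic1')
    return {
        partition: (marks.lowmark, marks.highmark)
        for partition, marks in watermarks['topic1'].items()
    }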
def test_get_topics_watermarks_unknown_topic(self, kafka_client_mock):
    with pytest.raises(UnknownTopic):
        get_topics_watermarks(
            kafka_client_mock,
            ["something that doesn't exist"],
        )
def test_get_topics_watermarks_unknown_partitions(self, kafka_client_mock):
    with pytest.raises(UnknownPartitions):
        get_topics_watermarks(
            kafka_client_mock,
            {'topic1': [99]},
        )
def test_get_topics_watermarks_invalid_arguments(self, kafka_client_mock):
    with pytest.raises(TypeError):
        get_topics_watermarks(
            kafka_client_mock,
            "this should be a list or dict",
        )