def test__should_keep_trying_no_timeout(self, cluster):
    """A negative consumer_timeout_ms disables the retry timeout entirely."""
    config = KafkaConsumerConfig(self.group, cluster, consumer_timeout_ms=-1)
    group = KafkaConsumerGroup([], config)
    # Even a start time far in the past must not trip the (disabled) timeout.
    started_long_ago = time.time() - 1000
    assert group._should_keep_trying(started_long_ago)
def test__acquire_has_no_consumer(self, mock_consumer, cluster, example_partitions):
    """_acquire builds a fresh KafkaConsumer when none exists yet."""
    group_config = KafkaConsumerConfig(self.group, cluster)
    group = KafkaConsumerGroup([], group_config)
    group._acquire(example_partitions)
    # The mocked KafkaConsumer class is instantiated exactly once, with the
    # acquired partitions plus the group's config expanded as kwargs.
    mock_consumer.assert_called_once_with(example_partitions, **group.config)
def consume():
    """Drain messages from `topic` into `queue` until the consumer times out."""
    group = KafkaConsumerGroup([topic], config)
    with group:
        # A ConsumerTimeout raised anywhere in the loop body ends consumption;
        # any other exception propagates, exactly as before.
        try:
            while True:
                msg = group.next()
                queue.put(msg)
                group.task_done(msg)
        except ConsumerTimeout:
            return
def test__should_keep_trying_timed_out(self, mock_time, cluster):
    """_should_keep_trying returns False once consumer_timeout_ms elapsed."""
    mock_time.return_value = 0
    config = KafkaConsumerConfig(self.group, cluster, consumer_timeout_ms=1000)
    group = KafkaConsumerGroup([], config)
    # A start time more than one second before "now" exceeds the 1000ms budget.
    started_at = time.time() - 1.2
    assert not group._should_keep_trying(started_at)
def test__release_retry(self, cluster):
    """_release retries the partition release once before re-raising."""
    config = KafkaConsumerConfig(self.group, cluster, auto_commit_enable=True)
    group = KafkaConsumerGroup([], config)
    failing_consumer = mock.Mock()
    failing_consumer.set_topic_partitions.side_effect = KafkaUnavailableError
    group.consumer = failing_consumer
    with pytest.raises(KafkaUnavailableError):
        group._release({})
    # One initial attempt plus exactly one retry.
    assert failing_consumer.set_topic_partitions.call_count == 2
def test__acquire_has_consumer(self, cluster, example_partitions, mock_post_rebalance_cb):
    """With an existing consumer, _acquire just re-points its partitions."""
    config = KafkaConsumerConfig(
        self.group,
        cluster,
        post_rebalance_callback=mock_post_rebalance_cb,
    )
    group = KafkaConsumerGroup([], config)
    group.consumer = mock.Mock()
    group._acquire(example_partitions)
    # No new KafkaConsumer is built; the existing one is re-pointed, and the
    # post-rebalance callback fires with the newly acquired partitions.
    group.consumer.set_topic_partitions.assert_called_once_with(example_partitions)
    mock_post_rebalance_cb.assert_called_once_with(example_partitions)
def test__release(self, cluster, example_partitions, mock_pre_rebalance_cb):
    """_release commits offsets, drops all partitions and runs the callback."""
    config = KafkaConsumerConfig(
        self.group,
        cluster,
        auto_commit_enable=True,
        pre_rebalance_callback=mock_pre_rebalance_cb,
    )
    group = KafkaConsumerGroup([], config)
    fake_consumer = mock.Mock()
    group.consumer = fake_consumer
    group._release(example_partitions)
    # Auto-commit enabled => offsets committed exactly once, then the
    # consumer is pointed at an empty partition set.
    fake_consumer.commit.assert_called_once_with()
    fake_consumer.set_topic_partitions.assert_called_once_with({})
    mock_pre_rebalance_cb.assert_called_once_with(example_partitions)
def test_next(self, mock_consumer, mock_partitioner, cluster):
    """next() surfaces ConsumerTimeout when the wrapped consumer is too slow."""
    config = KafkaConsumerConfig(self.group, cluster, consumer_timeout_ms=500)
    group = KafkaConsumerGroup([], config)
    group.partitioner = mock_partitioner()
    group.consumer = mock_consumer()

    def slow_next():
        # Takes longer than consumer_timeout_ms (500ms) before raising, so
        # KafkaConsumerGroup must surface a ConsumerTimeout of its own.
        time.sleep(1)
        raise ConsumerTimeout()

    group.consumer.next.side_effect = slow_next
    with pytest.raises(ConsumerTimeout):
        group.next()
    group.consumer.next.assert_called_once_with()
    group.partitioner.refresh.assert_called_once_with()
class Consumer(BaseConsumer):
    """ The Consumer uses an iterator to get messages that need to be consumed
    from Kafka.

    Args:
        consumer_name (str): See parameter `client_name` in
            :class:`data_pipeline.client.Client`.  The `consumer_name` will
            be registered with Kafka to commit offsets.
        team_name (str): See parameter `team_name` in
            :class:`data_pipeline.client.Client`.
        expected_frequency_seconds (int, ExpectedFrequency): See parameter
            `expected_frequency_seconds` in
            :class:`data_pipeline.client.Client`.
        topic_to_consumer_topic_state_map ({str:Optional(ConsumerTopicState)}):
            A map of topic names to `ConsumerTopicState` objects which define
            the offsets to start from.  The ConsumerTopicState of a topic may
            be `None`, in which case the committed kafka offset for the
            consumer_name is used.  If there is no committed kafka offset for
            the consumer_name the consumer will begin from the
            `auto_offset_reset` offset in the topic.
        consumer_source (ConsumerSource): Object to specify the topics this
            consumer consumes messages from.  It must be a
            :class:`data_pipeline.consumer_source.ConsumerSource` object.  For
            example, to process messages from a fixed set of topics, use
            :class:`data_pipeline.consumer_source.FixedTopics`.  In case of
            FixedSchema consumer source, at most one schema_id can be provided
            per topic.  Consumer would use that schema as reader schema to
            decode the message.  In case no schema id for a topic is
            specified, then the Consumer would use the schema id that the
            message was encoded with to decode the message.
        auto_offset_reset (str): automatically resets the offset when there is
            no initial offset in Zookeeper or if an offset is out of range.
            If 'largest', reset the offset to the latest available message
            (tail).  If 'smallest' reset from the earliest (head).
        partitioner_cooldown (float): Waiting time (in seconds) for the
            consumer to acquire the partitions.  See
            yelp_kafka/yelp_kafka/partitioner.py for more details
        use_group_sha (Optional[boolean]): Used by partitioner to establish
            group membership.  If false, consumer group with same name will be
            treated as the same group; otherwise, they will be different since
            group sha is different.  Default is true.
        pre_rebalance_callback (Optional[Callable[{str:list[int]}, None]]):
            Optional callback which is passed a dict of topic as key and list
            of partitions as value.  It's important to note this may be called
            multiple times in a single repartition, so any actions taken as a
            result must be idempotent.  You are guaranteed that no messages
            will be consumed between this callback and the
            post_rebalance_callback.
        post_rebalance_callback (Optional[Callable[{str:list[int]}, None]]):
            Optional callback which is passed a dict of topic as key and list
            of partitions as value which were acquired in a repartition.  You
            are guaranteed that no messages will be consumed between the
            pre_rebalance_callback and this callback.
        fetch_offsets_for_topics:
            (Optional[Callable[List[str], Dict[str, Optional[Dict[int, int]]]]]):
            Optional callback which is passed a list of topics, and should
            return a dictionary where keys are topic names and values are
            either None if no offset should be manually set, or a map from
            partition to offset.  If implemented, this function will be called
            every time consumer refreshes the topics, so that the consumer can
            provide a map of partitions to offsets for each topic, or None if
            the default behavior should be employed instead.  The default
            behavior is picking up the last committed offsets of topics.  This
            method must be implemented if topic state is to be stored in some
            system other than Kafka, for example when writing data from Kafka
            into a transactional store.
        pre_topic_refresh_callback:
            (Optional[Callable[[set[str], set[str]], Any]]): Optional callback
            that gets executed right before the consumer is about to refresh
            the topics.  The callback function is passed in a set of topic
            names Consumer is currently consuming from (current_topics) and a
            set of topic names Consumer will be consuming from
            (refreshed_topics).  The return value of the function is ignored.

    Note:
        The Consumer leverages the yelp_kafka `KafkaConsumerGroup`.

    **Examples**:
      A simple example can be a consumer with name 'my_consumer' that consumes
      a message from multiple topics, processes it and commits the offset and
      this process continues::

        with Consumer(
            consumer_name='my_consumer',
            team_name='bam',
            expected_frequency_seconds=12345,
            topic_to_consumer_topic_state_map={'topic_a': None, 'topic_b': None}
        ) as consumer:
            while True:
                message = consumer.get_message()
                if message is not None:
                    ... do stuff with message ...
                    consumer.commit_message(message)

      Note: Recommended to avoid calling `commit_message(message)` after every
      message, as it is relatively expensive.

      Another example can be a consumer which consumes multiple messages (with
      maximum number of messages in batch as `count`) from 2 topics 'topic_a'
      and 'topic_b', processes them and commits them::

        with Consumer(
            consumer_name='my_consumer',
            team_name='bam',
            expected_frequency_seconds=12345,
            topic_to_consumer_topic_state_map={'topic_a': None, 'topic_b': None}
        ) as consumer:
            while True:
                messages = consumer.get_messages(
                    count=batch_size,
                    blocking=True,
                    timeout=batch_timeout
                )
                if messages:
                    ... do stuff with messages ...
                    consumer.commit_messages(messages)

      Note: It's recommended to retrieve messages in batches via
      `get_messages(..)`, do your work with them, and then commit them as a
      group with a single call to `commit_messages(..)`
    """

    def _start(self):
        # Build a fresh consumer group over the topics currently tracked in
        # topic_to_partition_map and join it.
        self.consumer_group = KafkaConsumerGroup(
            topics=self.topic_to_partition_map.keys(),
            config=self._kafka_consumer_config
        )
        self.consumer_group.start()

    def _stop(self):
        self.consumer_group.stop()

    def get_messages(
        self,
        count,
        blocking=False,
        timeout=get_config().consumer_get_messages_timeout_default
    ):
        """ Retrieve a list of messages from the message buffer, optionally
        blocking until the requested number of messages has been retrieved.

        Warning:
            If `blocking` is True and `timeout` is None this will block until
            the requested number of messages is retrieved, potentially
            blocking forever.  Please be absolutely sure this is what you are
            intending if you use these options!

        Args:
            count (int): Number of messages to retrieve
            blocking (boolean): Set to True to block while waiting for
                messages if the buffer has been depleted.  Otherwise returns
                immediately if the buffer reaches depletion.  Default is
                False.
            timeout (double): Maximum time (in seconds) to wait if blocking is
                set to True.  Set to None to wait indefinitely.

        Returns:
            ([data_pipeline.message.Message]): List of Message objects with
            maximum size `count`, but may be smaller or empty depending on
            how many messages were retrieved within the timeout.
        """
        # TODO(tajinder|DATAPIPE-1231): Consumer should refresh topics
        # periodically even if NO timeout is provided and there are no
        # messages to consume.
        messages = []
        has_timeout = timeout is not None
        # BUGFIX: max_time was previously only assigned when has_timeout was
        # True, yet passed unconditionally below, raising NameError for the
        # documented timeout=None ("wait indefinitely") case.  It stays None
        # here; _break_consume_loop only compares it when has_timeout is True.
        max_time = time() + timeout if has_timeout else None
        while len(messages) < count:
            # Consumer refreshes the topics periodically only if
            # consumer_source is specified and would use the
            # `fetch_offsets_for_topics` callback to get the partition offsets
            # corresponding to the topics.
            if self.consumer_source:
                self._refresh_source_topics_if_necessary()
            # Capture the group's default before entering the try block so the
            # finally clause can never hit an unbound name.
            default_iter_timeout = self.consumer_group.iter_timeout
            try:
                if has_timeout:
                    # Converting seconds to milliseconds.  BUGFIX: previously
                    # this ran unconditionally, so timeout=None raised
                    # TypeError on `None * 1000`; with no timeout we now leave
                    # the group's own iter_timeout in place.
                    self.consumer_group.iter_timeout = timeout * 1000
                kafka_message = self._get_next_kafka_message(
                    blocking, has_timeout, max_time
                )
                # It's possible kafka_message is None if we used all our time
                # stuck getting EINTR IOErrors
                if kafka_message:
                    message = create_from_kafka_message(
                        kafka_message,
                        self._envelope,
                        self.force_payload_decode,
                        reader_schema_id=self._topic_to_reader_schema_map.get(
                            kafka_message.topic
                        )
                    )
                    messages.append(message)
                    # Update state in registrar for Producer/Consumer
                    # registration in milliseconds
                    self.registrar.update_schema_last_used_timestamp(
                        message.reader_schema_id,
                        timestamp_in_milliseconds=long(1000 * time())
                    )
                if self._break_consume_loop(blocking, has_timeout, max_time):
                    break
            except ConsumerTimeout:
                break
            finally:
                self.consumer_group.iter_timeout = default_iter_timeout
        return messages

    def _get_next_kafka_message(
        self,
        blocking,
        has_timeout,
        max_time
    ):
        """ Helper function which will retry when encountering an IOError with
        the errno of EINTR.  This is now standard behavior in Python3.5.
        For more details see https://www.python.org/dev/peps/pep-0475/

        Returns None if the loop budget was exhausted before a message
        arrived.
        """
        while not self._break_consume_loop(blocking, has_timeout, max_time):
            try:
                return self.consumer_group.next()
            except IOError as e:
                # Only swallow interrupted system calls; re-raise real errors.
                if e.errno != errno.EINTR:
                    raise
        return None

    def _break_consume_loop(self, blocking, has_timeout, max_time):
        # Stop when non-blocking, or when the blocking deadline has passed.
        # max_time is only evaluated when has_timeout is True (short-circuit),
        # so it may safely be None in the indefinite-wait case.
        return not blocking or (has_timeout and time() > max_time)

    def _refresh_source_topics_if_necessary(self):
        # Periodically re-read the topic list from consumer_source; when it
        # differs from what we currently consume, commit offsets, retarget the
        # partition map, and restart the underlying consumer.
        if not self._refresh_timer.should_tick():
            return
        current_topics = set(self.topic_to_partition_map.keys())
        refreshed_topics = set(self.consumer_source.get_topics())
        if current_topics == refreshed_topics:
            return
        all_topics_to_state_map = self._get_topic_to_offset_map(
            current_topics.union(refreshed_topics)
        )
        refreshed_topics_to_state_map = {
            topic: all_topics_to_state_map.get(topic)
            for topic in refreshed_topics
        }
        if self.pre_topic_refresh_callback:
            self.pre_topic_refresh_callback(current_topics, refreshed_topics)
        self.stop()
        self._commit_topic_offsets(all_topics_to_state_map)
        self._set_topic_to_partition_map(refreshed_topics_to_state_map)
        self._start_consumer()
def _start(self):
    """Create the KafkaConsumerGroup for the tracked topics and join it."""
    group = KafkaConsumerGroup(
        topics=self.topic_to_partition_map.keys(),
        config=self._kafka_consumer_config
    )
    self.consumer_group = group
    group.start()
class Consumer(BaseConsumer):
    """ The Consumer uses an iterator to get messages that need to be consumed
    from Kafka.

    Args:
        consumer_name (str): See parameter `client_name` in
            :class:`data_pipeline.client.Client`.  The `consumer_name` will
            be registered with Kafka to commit offsets.
        team_name (str): See parameter `team_name` in
            :class:`data_pipeline.client.Client`.
        expected_frequency_seconds (int, ExpectedFrequency): See parameter
            `expected_frequency_seconds` in
            :class:`data_pipeline.client.Client`.
        topic_to_consumer_topic_state_map ({str:Optional(ConsumerTopicState)}):
            A map of topic names to `ConsumerTopicState` objects which define
            the offsets to start from.  The ConsumerTopicState of a topic may
            be `None`, in which case the committed kafka offset for the
            consumer_name is used.  If there is no committed kafka offset for
            the consumer_name the consumer will begin from the
            `auto_offset_reset` offset in the topic.
        consumer_source (ConsumerSource): Object to specify the topics this
            consumer consumes messages from.  It must be a
            :class:`data_pipeline.consumer_source.ConsumerSource` object.  For
            example, to process messages from a fixed set of topics, use
            :class:`data_pipeline.consumer_source.FixedTopics`.  In case of
            FixedSchema consumer source, at most one schema_id can be provided
            per topic.  Consumer would use that schema as reader schema to
            decode the message.  In case no schema id for a topic is
            specified, then the Consumer would use the schema id that the
            message was encoded with to decode the message.
        auto_offset_reset (str): automatically resets the offset when there is
            no initial offset in Zookeeper or if an offset is out of range.
            If 'largest', reset the offset to the latest available message
            (tail).  If 'smallest' reset from the earliest (head).
        partitioner_cooldown (float): Waiting time (in seconds) for the
            consumer to acquire the partitions.  See
            yelp_kafka/yelp_kafka/partitioner.py for more details
        use_group_sha (Optional[boolean]): Used by partitioner to establish
            group membership.  If false, consumer group with same name will be
            treated as the same group; otherwise, they will be different since
            group sha is different.  Default is true.
        pre_rebalance_callback (Optional[Callable[{str:list[int]}, None]]):
            Optional callback which is passed a dict of topic as key and list
            of partitions as value.  It's important to note this may be called
            multiple times in a single repartition, so any actions taken as a
            result must be idempotent.  You are guaranteed that no messages
            will be consumed between this callback and the
            post_rebalance_callback.
        post_rebalance_callback (Optional[Callable[{str:list[int]}, None]]):
            Optional callback which is passed a dict of topic as key and list
            of partitions as value which were acquired in a repartition.  You
            are guaranteed that no messages will be consumed between the
            pre_rebalance_callback and this callback.
        fetch_offsets_for_topics:
            (Optional[Callable[List[str], Dict[str, Optional[Dict[int, int]]]]]):
            Optional callback which is passed a list of topics, and should
            return a dictionary where keys are topic names and values are
            either None if no offset should be manually set, or a map from
            partition to offset.  If implemented, this function will be called
            every time consumer refreshes the topics, so that the consumer can
            provide a map of partitions to offsets for each topic, or None if
            the default behavior should be employed instead.  The default
            behavior is picking up the last committed offsets of topics.  This
            method must be implemented if topic state is to be stored in some
            system other than Kafka, for example when writing data from Kafka
            into a transactional store.
        pre_topic_refresh_callback:
            (Optional[Callable[[set[str], set[str]], Any]]): Optional callback
            that gets executed right before the consumer is about to refresh
            the topics.  The callback function is passed in a set of topic
            names Consumer is currently consuming from (current_topics) and a
            set of topic names Consumer will be consuming from
            (refreshed_topics).  The return value of the function is ignored.

    Note:
        The Consumer leverages the yelp_kafka `KafkaConsumerGroup`.

    **Examples**:
      A simple example can be a consumer with name 'my_consumer' that consumes
      a message from multiple topics, processes it and commits the offset and
      this process continues::

        with Consumer(
            consumer_name='my_consumer',
            team_name='bam',
            expected_frequency_seconds=12345,
            topic_to_consumer_topic_state_map={'topic_a': None, 'topic_b': None}
        ) as consumer:
            while True:
                message = consumer.get_message()
                if message is not None:
                    ... do stuff with message ...
                    consumer.commit_message(message)

      Note: Recommended to avoid calling `commit_message(message)` after every
      message, as it is relatively expensive.

      Another example can be a consumer which consumes multiple messages (with
      maximum number of messages in batch as `count`) from 2 topics 'topic_a'
      and 'topic_b', processes them and commits them::

        with Consumer(
            consumer_name='my_consumer',
            team_name='bam',
            expected_frequency_seconds=12345,
            topic_to_consumer_topic_state_map={'topic_a': None, 'topic_b': None}
        ) as consumer:
            while True:
                messages = consumer.get_messages(
                    count=batch_size,
                    blocking=True,
                    timeout=batch_timeout
                )
                if messages:
                    ... do stuff with messages ...
                    consumer.commit_messages(messages)

      Note: It's recommended to retrieve messages in batches via
      `get_messages(..)`, do your work with them, and then commit them as a
      group with a single call to `commit_messages(..)`
    """

    def _start(self):
        # Build a fresh consumer group over the topics currently tracked in
        # topic_to_partition_map and join it.
        self.consumer_group = KafkaConsumerGroup(
            topics=self.topic_to_partition_map.keys(),
            config=self._kafka_consumer_config
        )
        self.consumer_group.start()

    def _stop(self):
        self.consumer_group.stop()

    def get_messages(
        self,
        count,
        blocking=False,
        timeout=get_config().consumer_get_messages_timeout_default
    ):
        """ Retrieve a list of messages from the message buffer, optionally
        blocking until the requested number of messages has been retrieved.

        Warning:
            If `blocking` is True and `timeout` is None this will block until
            the requested number of messages is retrieved, potentially
            blocking forever.  Please be absolutely sure this is what you are
            intending if you use these options!

        Args:
            count (int): Number of messages to retrieve
            blocking (boolean): Set to True to block while waiting for
                messages if the buffer has been depleted.  Otherwise returns
                immediately if the buffer reaches depletion.  Default is
                False.
            timeout (double): Maximum time (in seconds) to wait if blocking is
                set to True.  Set to None to wait indefinitely.

        Returns:
            ([data_pipeline.message.Message]): List of Message objects with
            maximum size `count`, but may be smaller or empty depending on
            how many messages were retrieved within the timeout.
        """
        # TODO(tajinder|DATAPIPE-1231): Consumer should refresh topics
        # periodically even if NO timeout is provided and there are no
        # messages to consume.
        messages = []
        has_timeout = timeout is not None
        if has_timeout:
            max_time = time() + timeout
        while len(messages) < count:
            # Consumer refreshes the topics periodically only if
            # consumer_source is specified and would use the
            # `fetch_offsets_for_topics` callback to get the partition offsets
            # corresponding to the topics.
            if self.consumer_source:
                self._refresh_source_topics_if_necessary()
            # Capture the group's default before entering the try block so the
            # finally clause can never hit an unbound name.
            default_iter_timeout = self.consumer_group.iter_timeout
            try:
                if has_timeout:
                    # Converting seconds to milliseconds.  BUGFIX: previously
                    # this ran unconditionally, so the documented timeout=None
                    # ("wait indefinitely") case raised TypeError on
                    # `None * 1000`; with no timeout we now leave the group's
                    # own iter_timeout in place.
                    self.consumer_group.iter_timeout = timeout * 1000
                message = self.consumer_group.next()
            except ConsumerTimeout:
                break
            finally:
                self.consumer_group.iter_timeout = default_iter_timeout
            message = create_from_kafka_message(
                message,
                self._envelope,
                self.force_payload_decode,
                reader_schema_id=self._topic_to_reader_schema_map.get(
                    message.topic
                )
            )
            messages.append(message)
            # Update state in registrar for Producer/Consumer registration in
            # milliseconds
            self.registrar.update_schema_last_used_timestamp(
                message.reader_schema_id,
                timestamp_in_milliseconds=long(1000 * time())
            )
            # max_time is only evaluated when has_timeout is True
            # (short-circuit), so it may be unset in the indefinite-wait case.
            if not blocking or (has_timeout and time() > max_time):
                break
        return messages

    def _refresh_source_topics_if_necessary(self):
        # Periodically re-read the topic list from consumer_source; when it
        # differs from what we currently consume, commit offsets, retarget the
        # partition map, and restart the underlying consumer.
        if not self._refresh_timer.should_tick():
            return
        current_topics = set(self.topic_to_partition_map.keys())
        refreshed_topics = set(self.consumer_source.get_topics())
        if current_topics == refreshed_topics:
            return
        all_topics_to_state_map = self._get_topic_to_offset_map(
            current_topics.union(refreshed_topics)
        )
        refreshed_topics_to_state_map = {
            topic: all_topics_to_state_map.get(topic)
            for topic in refreshed_topics
        }
        if self.pre_topic_refresh_callback:
            self.pre_topic_refresh_callback(current_topics, refreshed_topics)
        self.stop()
        self._commit_topic_offsets(all_topics_to_state_map)
        self._set_topic_to_partition_map(refreshed_topics_to_state_map)
        self._start_consumer()
def test__auto_commit_enabled_not_enabled(self, cluster):
    """_auto_commit_enabled reflects auto_commit_enable=False in the config."""
    config = KafkaConsumerConfig(self.group, cluster, auto_commit_enable=False)
    group = KafkaConsumerGroup([], config)
    assert not group._auto_commit_enabled()
def test___init__string_topics(self):
    """Passing a bare string instead of a list of topics must be rejected."""
    # Call-form of pytest.raises: equivalent to the context-manager form.
    pytest.raises(AssertionError, KafkaConsumerGroup, self.topic, None)