def test_basic_api():
    """ Basic API tests, these won't really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test', 'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke,
                 on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = list(map(lambda part: TopicPartition("test", part),
                          range(0, 100, 3)))
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5,
                                          cached=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._WAIT_COORD,
                                    KafkaError.LEADER_NOT_AVAILABLE), \
            str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions
                if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    kc.close()
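
# The test above only exercises the API surface without a broker. For
# contrast, here is a minimal sketch of the same Consumer API against a live
# broker; the broker address, group id, and topic name are illustrative
# assumptions, not part of the test suite.
def example_live_consumer_loop():
    consumer = Consumer({
        'bootstrap.servers': 'localhost:9092',  # hypothetical broker
        'group.id': 'example-group',
        'auto.offset.reset': 'earliest',
    })
    consumer.subscribe(['test'])
    try:
        while True:
            msg = consumer.poll(timeout=1.0)
            if msg is None:
                continue  # poll() timed out, as exercised in the test above
            if msg.error():
                raise KafkaException(msg.error())
            print('consumed message at offset %d' % msg.offset())
    finally:
        consumer.close()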
class SynchronizedConsumer:
    """
    This class implements the framework for a consumer that is intended to
    only consume messages that have already been consumed and committed by
    members of another consumer group.

    This works similarly to the Kafka built-in ``__consumer_offsets`` topic.
    The consumer group that is being "followed" (the one that must make
    progress for our consumer here to make progress, identified by the
    ``synchronize_commit_group`` constructor parameter/instance attribute)
    must report its offsets to a topic (identified by the
    ``commit_log_topic`` constructor parameter/instance attribute). This
    consumer subscribes to both the commit log topic and the topic(s) that
    we are actually interested in consuming messages from. The messages
    received from the commit log topic control whether consumption from
    partitions belonging to the main topic is paused, resumed, or allowed to
    continue in its current state without changes.

    The furthest point in any partition that this consumer should ever
    consume to is the maximum offset that has been recorded to the commit
    log topic for that partition. If the offsets recorded to that topic move
    non-monotonically (due to an intentional offset rollback, for instance)
    this consumer *may* consume up to the highest watermark point. (The
    implementation here tries to pause consuming from the partition as soon
    as possible, but this makes no explicit guarantees about that behavior.)
    """

    initial_offset_reset_strategies = {
        "earliest": get_earliest_offset,
        "latest": get_latest_offset,
    }

    def __init__(
        self,
        cluster_name,
        consumer_group,
        commit_log_topic,
        synchronize_commit_group,
        initial_offset_reset="latest",
        on_commit=None,
    ):
        self.cluster_name = cluster_name
        self.consumer_group = consumer_group
        self.commit_log_topic = commit_log_topic
        self.synchronize_commit_group = synchronize_commit_group
        self.initial_offset_reset = self.initial_offset_reset_strategies[
            initial_offset_reset]

        self.__partition_state_manager = SynchronizedPartitionStateManager(
            self.__on_partition_state_change)

        (
            self.__commit_log_consumer,
            self.__commit_log_consumer_stop_request,
        ) = self.__start_commit_log_consumer()

        self.__positions = {}

        def commit_callback(error, partitions):
            if on_commit is not None:
                return on_commit(error, partitions)

        consumer_configuration = kafka_config.get_kafka_consumer_cluster_options(
            cluster_name,
            override_params={
                "group.id": self.consumer_group,
                "enable.auto.commit": "false",
                "enable.auto.offset.store": "true",
                "enable.partition.eof": "false",
                "default.topic.config": {"auto.offset.reset": "error"},
                "on_commit": commit_callback,
            },
        )

        self.__consumer = Consumer(consumer_configuration)

    def __start_commit_log_consumer(self, timeout=None):
        """
        Starts running the commit log consumer.
        """
        stop_request_event = threading.Event()
        start_event = threading.Event()
        result = execute(
            functools.partial(
                run_commit_log_consumer,
                cluster_name=self.cluster_name,
                consumer_group=f"{self.consumer_group}:sync:{uuid.uuid1().hex}",
                commit_log_topic=self.commit_log_topic,
                synchronize_commit_group=self.synchronize_commit_group,
                partition_state_manager=self.__partition_state_manager,
                start_event=start_event,
                stop_request_event=stop_request_event,
            ))
        start_event.wait(timeout)
        return result, stop_request_event

    def __check_commit_log_consumer_running(self):
        if not self.__commit_log_consumer.running():
            try:
                result = self.__commit_log_consumer.result(timeout=0)  # noqa
            except TimeoutError:
                pass  # not helpful

            raise Exception("Commit log consumer unexpectedly exited!")

    def __on_partition_state_change(self, topic, partition,
                                    previous_state_and_offsets,
                                    current_state_and_offsets):
        """
        Callback that is invoked when a partition state changes.
        """
        logger.debug(
            "State change for %r: %r to %r",
            (topic, partition),
            previous_state_and_offsets,
            current_state_and_offsets,
        )

        current_state, current_offsets = current_state_and_offsets
        if current_offsets.local is None:
            # It only makes sense to manipulate the consumer if we've got an
            # assignment. (This block should only be entered at startup if the
            # remote offsets are retrieved from the commit log before the local
            # consumer has received its assignment.)
            return

        # TODO: This will be called from the commit log consumer thread, so we
        # need to verify that calling the ``consumer.{pause,resume}`` methods
        # is thread safe!
        if current_state in (
            SynchronizedPartitionState.UNKNOWN,
            SynchronizedPartitionState.SYNCHRONIZED,
            SynchronizedPartitionState.REMOTE_BEHIND,
        ):
            self.__consumer.pause(
                [TopicPartition(topic, partition, current_offsets.local)])
        elif current_state is SynchronizedPartitionState.LOCAL_BEHIND:
            self.__consumer.resume(
                [TopicPartition(topic, partition, current_offsets.local)])
        else:
            raise NotImplementedError(
                f"Unexpected partition state: {current_state}")

    def subscribe(self, topics, on_assign=None, on_revoke=None):
        """
        Subscribe to a topic.
        """
        self.__check_commit_log_consumer_running()

        def assignment_callback(consumer, assignment):
            # Since ``auto.offset.reset`` is set to ``error`` to force human
            # interaction on an offset reset, we have to explicitly specify
            # the starting offset if no offset has been committed for this
            # topic during the ``__consumer_offsets`` topic retention period.
            assignment = {(i.topic, i.partition): self.__positions.get(
                (i.topic, i.partition)) for i in assignment}

            for i in self.__consumer.committed([
                    TopicPartition(topic, partition)
                    for (topic, partition), offset in assignment.items()
                    if offset is None
            ]):
                k = (i.topic, i.partition)
                if i.offset > -1:
                    assignment[k] = i.offset
                else:
                    assignment[k] = self.initial_offset_reset(
                        consumer, i.topic, i.partition)

            self.__consumer.assign([
                TopicPartition(topic, partition, offset)
                for (topic, partition), offset in assignment.items()
            ])

            for (topic, partition), offset in assignment.items():
                # Setting the local offsets will either cause the partition to
                # be paused (if the remote offset is unknown or the local
                # offset is not trailing the remote offset) or resumed.
                self.__partition_state_manager.set_local_offset(
                    topic, partition, offset)
                self.__positions[(topic, partition)] = offset

            if on_assign is not None:
                on_assign(
                    self,
                    [
                        TopicPartition(topic, partition)
                        for topic, partition in assignment.keys()
                    ],
                )

        def revocation_callback(consumer, assignment):
            for item in assignment:
                # TODO: This should probably also be removed from the state manager.
                self.__positions.pop((item.topic, item.partition))

            if on_revoke is not None:
                on_revoke(self, assignment)

        self.__consumer.subscribe(topics,
                                  on_assign=assignment_callback,
                                  on_revoke=revocation_callback)

    def poll(self, timeout):
        self.__check_commit_log_consumer_running()

        message = self.__consumer.poll(timeout)
        if message is None:
            return

        if message.error() is not None:
            return message

        self.__partition_state_manager.validate_local_message(
            message.topic(), message.partition(), message.offset())
        self.__partition_state_manager.set_local_offset(
            message.topic(), message.partition(), message.offset() + 1)
        self.__positions[(message.topic(),
                          message.partition())] = message.offset() + 1

        return message

    def commit(self, *args, **kwargs):
        self.__check_commit_log_consumer_running()
        return self.__consumer.commit(*args, **kwargs)

    def close(self):
        self.__check_commit_log_consumer_running()

        self.__commit_log_consumer_stop_request.set()
        try:
            self.__consumer.close()
        finally:
            self.__commit_log_consumer.result()
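
# A minimal usage sketch for the class above; the cluster name, group names,
# and topic are placeholder assumptions for illustration.
def example_synchronized_consumption():
    consumer = SynchronizedConsumer(
        cluster_name="default",
        consumer_group="follower-group",
        commit_log_topic="commit-log",
        synchronize_commit_group="leader-group",
        initial_offset_reset="latest",
    )
    consumer.subscribe(["events"])
    try:
        while True:
            message = consumer.poll(timeout=1.0)
            if message is None:
                continue
            # ... process the message; consumption never runs past the
            # offsets committed by "leader-group" to the commit log topic ...
            consumer.commit(asynchronous=False)
    finally:
        consumer.close()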
class SynchronizedConsumer(object):
    """
    This class implements the framework for a consumer that is intended to
    only consume messages that have already been consumed and committed by
    members of another consumer group.

    This works similarly to the Kafka built-in ``__consumer_offsets`` topic.
    The consumer group that is being "followed" (the one that must make
    progress for our consumer here to make progress, identified by the
    ``synchronize_commit_group`` constructor parameter/instance attribute)
    must report its offsets to a topic (identified by the
    ``commit_log_topic`` constructor parameter/instance attribute). This
    consumer subscribes to both the commit log topic and the topic(s) that
    we are actually interested in consuming messages from. The messages
    received from the commit log topic control whether consumption from
    partitions belonging to the main topic is paused, resumed, or allowed to
    continue in its current state without changes.

    The furthest point in any partition that this consumer should ever
    consume to is the maximum offset that has been recorded to the commit
    log topic for that partition. If the offsets recorded to that topic move
    non-monotonically (due to an intentional offset rollback, for instance)
    this consumer *may* consume up to the highest watermark point. (The
    implementation here tries to pause consuming from the partition as soon
    as possible, but this makes no explicit guarantees about that behavior.)
    """
    initial_offset_reset_strategies = {
        'earliest': get_earliest_offset,
        'latest': get_latest_offset,
    }

    def __init__(self, bootstrap_servers, consumer_group, commit_log_topic,
                 synchronize_commit_group, initial_offset_reset='latest', on_commit=None):
        self.bootstrap_servers = bootstrap_servers
        self.consumer_group = consumer_group
        self.commit_log_topic = commit_log_topic
        self.synchronize_commit_group = synchronize_commit_group
        self.initial_offset_reset = self.initial_offset_reset_strategies[initial_offset_reset]

        self.__partition_state_manager = SynchronizedPartitionStateManager(
            self.__on_partition_state_change)

        self.__commit_log_consumer, self.__commit_log_consumer_stop_request = \
            self.__start_commit_log_consumer()

        self.__positions = {}

        def commit_callback(error, partitions):
            if on_commit is not None:
                return on_commit(error, partitions)

        consumer_configuration = {
            'bootstrap.servers': self.bootstrap_servers,
            'group.id': self.consumer_group,
            'enable.auto.commit': 'false',
            'enable.auto.offset.store': 'true',
            'enable.partition.eof': 'false',
            'default.topic.config': {
                'auto.offset.reset': 'error',
            },
            'on_commit': commit_callback,
        }

        self.__consumer = Consumer(consumer_configuration)

    def __start_commit_log_consumer(self, timeout=None):
        """
        Starts running the commit log consumer.
        """
        stop_request_event = threading.Event()
        start_event = threading.Event()
        result = execute(
            functools.partial(
                run_commit_log_consumer,
                bootstrap_servers=self.bootstrap_servers,
                consumer_group='{}:sync:{}'.format(self.consumer_group, uuid.uuid1().hex),
                commit_log_topic=self.commit_log_topic,
                synchronize_commit_group=self.synchronize_commit_group,
                partition_state_manager=self.__partition_state_manager,
                start_event=start_event,
                stop_request_event=stop_request_event,
            ),
        )
        start_event.wait(timeout)
        return result, stop_request_event

    def __check_commit_log_consumer_running(self):
        if not self.__commit_log_consumer.running():
            try:
                result = self.__commit_log_consumer.result(timeout=0)  # noqa
            except TimeoutError:
                pass  # not helpful

            raise Exception('Commit log consumer unexpectedly exited!')

    def __on_partition_state_change(
            self, topic, partition, previous_state_and_offsets, current_state_and_offsets):
        """
        Callback that is invoked when a partition state changes.
        """
        logger.debug('State change for %r: %r to %r', (topic, partition),
                     previous_state_and_offsets, current_state_and_offsets)

        current_state, current_offsets = current_state_and_offsets
        if current_offsets.local is None:
            # It only makes sense to manipulate the consumer if we've got an
            # assignment. (This block should only be entered at startup if the
            # remote offsets are retrieved from the commit log before the local
            # consumer has received its assignment.)
            return

        # TODO: This will be called from the commit log consumer thread, so we
        # need to verify that calling the ``consumer.{pause,resume}`` methods
        # is thread safe!
        if current_state in (SynchronizedPartitionState.UNKNOWN,
                             SynchronizedPartitionState.SYNCHRONIZED,
                             SynchronizedPartitionState.REMOTE_BEHIND):
            self.__consumer.pause([TopicPartition(topic, partition, current_offsets.local)])
        elif current_state is SynchronizedPartitionState.LOCAL_BEHIND:
            self.__consumer.resume([TopicPartition(topic, partition, current_offsets.local)])
        else:
            raise NotImplementedError('Unexpected partition state: %s' % (current_state,))

    def subscribe(self, topics, on_assign=None, on_revoke=None):
        """
        Subscribe to a topic.
        """
        self.__check_commit_log_consumer_running()

        def assignment_callback(consumer, assignment):
            # Since ``auto.offset.reset`` is set to ``error`` to force human
            # interaction on an offset reset, we have to explicitly specify
            # the starting offset if no offset has been committed for this
            # topic during the ``__consumer_offsets`` topic retention period.
            assignment = {
                (i.topic, i.partition): self.__positions.get((i.topic, i.partition))
                for i in assignment
            }

            for i in self.__consumer.committed([TopicPartition(topic, partition) for (
                    topic, partition), offset in assignment.items() if offset is None]):
                k = (i.topic, i.partition)
                if i.offset > -1:
                    assignment[k] = i.offset
                else:
                    assignment[k] = self.initial_offset_reset(consumer, i.topic, i.partition)

            self.__consumer.assign([TopicPartition(topic, partition, offset)
                                    for (topic, partition), offset in assignment.items()])

            for (topic, partition), offset in assignment.items():
                # Setting the local offsets will either cause the partition to
                # be paused (if the remote offset is unknown or the local
                # offset is not trailing the remote offset) or resumed.
                self.__partition_state_manager.set_local_offset(topic, partition, offset)
                self.__positions[(topic, partition)] = offset

            if on_assign is not None:
                on_assign(self, [TopicPartition(topic, partition)
                                 for topic, partition in assignment.keys()])

        def revocation_callback(consumer, assignment):
            for item in assignment:
                # TODO: This should probably also be removed from the state manager.
                self.__positions.pop((item.topic, item.partition))

            if on_revoke is not None:
                on_revoke(self, assignment)

        self.__consumer.subscribe(
            topics, on_assign=assignment_callback, on_revoke=revocation_callback)

    def poll(self, timeout):
        self.__check_commit_log_consumer_running()

        message = self.__consumer.poll(timeout)
        if message is None:
            return

        if message.error() is not None:
            return message

        self.__partition_state_manager.validate_local_message(
            message.topic(), message.partition(), message.offset())
        self.__partition_state_manager.set_local_offset(
            message.topic(), message.partition(), message.offset() + 1)
        self.__positions[(message.topic(), message.partition())] = message.offset() + 1

        return message

    def commit(self, *args, **kwargs):
        self.__check_commit_log_consumer_running()
        return self.__consumer.commit(*args, **kwargs)

    def close(self):
        self.__check_commit_log_consumer_running()

        self.__commit_log_consumer_stop_request.set()
        try:
            self.__consumer.close()
        finally:
            self.__commit_log_consumer.result()
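
# Usage of this variant differs from the one above only in construction: it
# takes broker addresses directly instead of a named cluster configuration.
# The addresses and names below are illustrative assumptions.
def example_synchronized_consumer_from_brokers():
    return SynchronizedConsumer(
        bootstrap_servers='kafka:9092',
        consumer_group='follower-group',
        commit_log_topic='commit-log',
        synchronize_commit_group='leader-group',
    )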
class KafkaConsumer(Consumer[TPayload]):
    """
    The behavior of this consumer differs slightly from the Confluent
    consumer during rebalancing operations. Whenever a partition is assigned
    to this consumer, offsets are *always* automatically reset to the
    committed offset for that partition (or if no offsets have been
    committed for that partition, the offset is reset in accordance with the
    ``auto.offset.reset`` configuration value.) This causes partitions that
    are maintained across a rebalance to have the same offset management
    behavior as a partition that is moved from one consumer to another. To
    prevent uncommitted messages from being consumed multiple times,
    ``commit`` should be called in the partition revocation callback.

    The behavior of ``auto.offset.reset`` also differs slightly from the
    Confluent consumer: offsets are only reset during initial assignment or
    subsequent rebalancing operations. Any other circumstances that would
    otherwise lead to a preemptive offset reset (e.g. the consumer tries to
    read a message that is before the earliest offset, or the consumer
    attempts to read a message that is after the latest offset) will cause
    an exception to be thrown rather than resetting the offset, as this
    could lead to chunks of messages being replayed or skipped, depending on
    the circumstances. This also means that if the committed offset is no
    longer available (such as when reading older messages from the log and
    those messages expire, or reading newer messages from the log and the
    leader crashes and partition ownership fails over to an out-of-date
    replica), the consumer will fail-stop rather than reset to the value of
    ``auto.offset.reset``.
    """

    # Set of logical offsets that do not correspond to actual log positions.
    # These offsets should be considered an implementation detail of the
    # Kafka consumer and not used publicly.
    # https://github.com/confluentinc/confluent-kafka-python/blob/443177e1c83d9b66ce30f5eb8775e062453a738b/tests/test_enums.py#L22-L25
    LOGICAL_OFFSETS = frozenset(
        [OFFSET_BEGINNING, OFFSET_END, OFFSET_STORED, OFFSET_INVALID])

    def __init__(
        self,
        configuration: Mapping[str, Any],
        codec: Codec[KafkaPayload, TPayload],
        *,
        commit_retry_policy: Optional[RetryPolicy] = None,
    ) -> None:
        if commit_retry_policy is None:
            commit_retry_policy = NoRetryPolicy()

        auto_offset_reset = configuration.get("auto.offset.reset", "largest")
        if auto_offset_reset in {"smallest", "earliest", "beginning"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_earliest)
        elif auto_offset_reset in {"largest", "latest", "end"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_latest)
        elif auto_offset_reset == "error":
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_error)
        else:
            raise ValueError(
                "invalid value for 'auto.offset.reset' configuration")

        if (as_kafka_configuration_bool(
                configuration.get("enable.auto.commit", "true")) is not False):
            raise ValueError(
                "invalid value for 'enable.auto.commit' configuration")

        if (as_kafka_configuration_bool(
                configuration.get("enable.auto.offset.store", "true"))
                is not False):
            raise ValueError(
                "invalid value for 'enable.auto.offset.store' configuration")

        # NOTE: Offsets are explicitly managed as part of the assignment
        # callback, so preemptively resetting offsets is not enabled.
        self.__consumer = ConfluentConsumer({
            **configuration, "auto.offset.reset": "error"
        })

        self.__codec = codec

        self.__offsets: MutableMapping[Partition, int] = {}
        self.__staged_offsets: MutableMapping[Partition, int] = {}
        self.__paused: Set[Partition] = set()

        self.__commit_retry_policy = commit_retry_policy

        self.__state = KafkaConsumerState.CONSUMING

    def __resolve_partition_offset_earliest(
            self, partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition, low)

    def __resolve_partition_offset_latest(
            self, partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition, high)

    def __resolve_partition_offset_error(
            self, partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        raise ConsumerError("unable to resolve partition offsets")

    def subscribe(
        self,
        topics: Sequence[Topic],
        on_assign: Optional[Callable[[Mapping[Partition, int]], None]] = None,
        on_revoke: Optional[Callable[[Sequence[Partition]], None]] = None,
    ) -> None:
        """
        Subscribe to topics. This replaces a previous subscription.

        This method does not block. The subscription may not be fulfilled
        immediately: instead, the ``on_assign`` and ``on_revoke`` callbacks
        are called when the subscription state changes with the updated
        assignment for this consumer.

        If provided, the ``on_assign`` callback is called with a mapping of
        partitions to their offsets (at this point, the working offset and
        the committed offset are the same for each partition) on each
        subscription change. Similarly, the ``on_revoke`` callback (if
        provided) is called with a sequence of partitions that are being
        removed from this consumer's assignment. (This callback does not
        include the offsets, as the working offset and committed offset may
        differ, in some cases by a substantial margin.)

        Raises an ``InvalidState`` exception if called on a closed consumer.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        def assignment_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.ASSIGNING

            try:
                assignment: MutableSequence[ConfluentTopicPartition] = []

                for partition in self.__consumer.committed(partitions):
                    if partition.offset >= 0:
                        assignment.append(partition)
                    elif partition.offset == OFFSET_INVALID:
                        assignment.append(
                            self.__resolve_partition_starting_offset(partition))
                    else:
                        raise ValueError("received unexpected offset")

                offsets: MutableMapping[Partition, int] = {
                    Partition(Topic(i.topic), i.partition): i.offset
                    for i in assignment
                }
                self.__seek(offsets)

                # Ensure that all partitions are resumed on assignment to
                # avoid carrying over state from a previous assignment.
                self.__consumer.resume([
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset)
                    for partition, offset in offsets.items()
                ])

                for partition in offsets:
                    self.__paused.discard(partition)
            except Exception:
                self.__state = KafkaConsumerState.ERROR
                raise

            try:
                if on_assign is not None:
                    on_assign(offsets)
            finally:
                self.__state = KafkaConsumerState.CONSUMING

        def revocation_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.REVOKING

            partitions = [Partition(Topic(i.topic), i.partition) for i in partitions]

            try:
                if on_revoke is not None:
                    on_revoke(partitions)
            finally:
                for partition in partitions:
                    # Staged offsets are deleted during partition revocation
                    # to prevent later committing offsets for partitions that
                    # are no longer owned by this consumer.
                    if partition in self.__staged_offsets:
                        logger.warning(
                            "Dropping staged offset for revoked partition (%r)!",
                            partition,
                        )
                        del self.__staged_offsets[partition]

                    try:
                        self.__offsets.pop(partition)
                    except KeyError:
                        # If there was an error during assignment, this
                        # partition may have never been added to the offsets
                        # mapping.
                        logger.warning(
                            "failed to delete offset for unknown partition: %r",
                            partition,
                        )

                    self.__paused.discard(partition)

                self.__state = KafkaConsumerState.CONSUMING

        self.__consumer.subscribe(
            [topic.name for topic in topics],
            on_assign=assignment_callback,
            on_revoke=revocation_callback,
        )

    def unsubscribe(self) -> None:
        """
        Unsubscribe from topics.

        Raises an ``InvalidState`` exception if called on a closed consumer.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        self.__consumer.unsubscribe()

    def poll(self, timeout: Optional[float] = None) -> Optional[Message[TPayload]]:
        """
        Return the next message available to be consumed, if one is
        available. If no message is available, this method will block up to
        the ``timeout`` value before returning ``None``. A timeout of
        ``0.0`` represents "do not block", while a timeout of ``None``
        represents "block until a message is available (or forever)".

        Calling this method may also invoke subscription state change
        callbacks.

        This method may also raise an ``EndOfPartition`` error (a subtype of
        ``ConsumerError``) when the consumer has reached the end of a
        partition that it is subscribed to and no additional messages are
        available. The ``partition`` attribute of the raised exception
        specifies which partition has reached its end. (Since this consumer
        is multiplexing a set of partitions, this exception does not mean
        that *all* of the partitions that the consumer is subscribed to do
        not have any messages, just that it has reached the end of one of
        them. This also does not mean that additional messages won't be
        available in future poll calls.) Not every backend implementation
        supports this feature or is configured to raise in this scenario.

        Raises an ``InvalidState`` exception if called on a closed consumer.

        Raises a ``TransportError`` for various other consumption-related
        errors.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        message: Optional[ConfluentMessage] = self.__consumer.poll(
            *[timeout] if timeout is not None else [])
        if message is None:
            return None

        error: Optional[KafkaError] = message.error()
        if error is not None:
            code = error.code()
            if code == KafkaError._PARTITION_EOF:
                raise EndOfPartition(
                    Partition(Topic(message.topic()), message.partition()),
                    message.offset(),
                )
            elif code == KafkaError._TRANSPORT:
                raise TransportError(str(error))
            else:
                raise ConsumerError(str(error))

        headers: Optional[Headers] = message.headers()
        result = Message(
            Partition(Topic(message.topic()), message.partition()),
            message.offset(),
            self.__codec.decode(
                KafkaPayload(
                    message.key(),
                    message.value(),
                    headers if headers is not None else [],
                )),
            datetime.utcfromtimestamp(message.timestamp()[1] / 1000.0),
        )

        self.__offsets[result.partition] = result.get_next_offset()

        return result

    def tell(self) -> Mapping[Partition, int]:
        """
        Return the read offsets for all assigned partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR}:
            raise InvalidState(self.__state)

        return self.__offsets

    def __validate_offsets(self, offsets: Mapping[Partition, int]) -> None:
        invalid_offsets: Mapping[Partition, int] = {
            partition: offset
            for partition, offset in offsets.items() if offset < 0
        }

        if invalid_offsets:
            raise ConsumerError(f"invalid offsets: {invalid_offsets!r}")

    def __seek(self, offsets: Mapping[Partition, int]) -> None:
        self.__validate_offsets(offsets)

        if self.__state is KafkaConsumerState.ASSIGNING:
            # Calling ``seek`` on the Confluent consumer from an assignment
            # callback will throw an "Erroneous state" error. Instead,
            # partition offsets have to be initialized by calling ``assign``.
            self.__consumer.assign([
                ConfluentTopicPartition(partition.topic.name, partition.index,
                                        offset)
                for partition, offset in offsets.items()
            ])
        else:
            for partition, offset in offsets.items():
                self.__consumer.seek(
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset))

        self.__offsets.update(offsets)

    def seek(self, offsets: Mapping[Partition, int]) -> None:
        """
        Change the read offsets for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR}:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError("cannot seek on unassigned partitions")

        self.__seek(offsets)

    def pause(self, partitions: Sequence[Partition]) -> None:
        """
        Pause the consumption of messages for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR}:
            raise InvalidState(self.__state)

        if set(partitions) - self.__offsets.keys():
            raise ConsumerError("cannot pause unassigned partitions")

        self.__consumer.pause([
            ConfluentTopicPartition(partition.topic.name, partition.index)
            for partition in partitions
        ])

        self.__paused.update(partitions)

        # XXX: Seeking to a specific partition offset and immediately pausing
        # that partition causes the seek to be ignored for some reason.
        self.seek({
            partition: offset
            for partition, offset in self.__offsets.items()
            if partition in partitions
        })

    def resume(self, partitions: Sequence[Partition]) -> None:
        """
        Resume the consumption of messages for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR}:
            raise InvalidState(self.__state)

        if set(partitions) - self.__offsets.keys():
            raise ConsumerError("cannot resume unassigned partitions")

        self.__consumer.resume([
            ConfluentTopicPartition(partition.topic.name, partition.index)
            for partition in partitions
        ])

        for partition in partitions:
            self.__paused.discard(partition)

    def paused(self) -> Sequence[Partition]:
        if self.__state in {KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR}:
            raise InvalidState(self.__state)

        return [*self.__paused]

    def stage_offsets(self, offsets: Mapping[Partition, int]) -> None:
        if self.__state in {KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR}:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError("cannot stage offsets for unassigned partitions")

        self.__validate_offsets(offsets)

        # TODO: Maybe log a warning if these offsets exceed the current
        # offsets, since that's probably a side effect of an incorrect usage
        # pattern?
        self.__staged_offsets.update(offsets)

    def __commit(self) -> Mapping[Partition, int]:
        if self.__state in {KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR}:
            raise InvalidState(self.__state)

        result: Optional[Sequence[ConfluentTopicPartition]]

        if self.__staged_offsets:
            result = self.__consumer.commit(
                offsets=[
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset)
                    for partition, offset in self.__staged_offsets.items()
                ],
                asynchronous=False,
            )
        else:
            result = []

        assert result is not None  # synchronous commit should return result immediately

        self.__staged_offsets.clear()

        offsets: MutableMapping[Partition, int] = {}

        for value in result:
            # The Confluent Kafka Consumer will include logical offsets in
            # the sequence of ``Partition`` objects returned by ``commit``.
            # These are an implementation detail of the Kafka Consumer, so we
            # don't expose them here.
            # NOTE: These should no longer be seen now that we are forcing
            # offsets to be set as part of the assignment callback.
            if value.offset in self.LOGICAL_OFFSETS:
                continue

            assert value.offset >= 0, "expected non-negative offset"
            offsets[Partition(Topic(value.topic), value.partition)] = value.offset

        return offsets

    def commit_offsets(self) -> Mapping[Partition, int]:
        """
        Commit staged offsets for all partitions that this consumer is
        assigned to. The return value of this method is a mapping of
        partitions with their committed offsets as values.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        return self.__commit_retry_policy.call(self.__commit)

    def close(self, timeout: Optional[float] = None) -> None:
        """
        Close the consumer. This stops consuming messages, *may* commit
        staged offsets (depending on the configuration), and ends its
        subscription.

        Raises an ``InvalidState`` if the consumer is unable to be closed
        before the timeout is reached.
        """
        try:
            self.__consumer.close()
        except RuntimeError:
            pass

        self.__state = KafkaConsumerState.CLOSED

    @property
    def closed(self) -> bool:
        return self.__state is KafkaConsumerState.CLOSED
class Reader:
    """Consumes traceroute data from Kafka"""

    def __init__(self, start, end, timetrack_converter,
                 msm_ids=[5001, 5004, 5005],
                 probe_ids=[1, 2, 3, 4, 5, 6, 7, 8],
                 chunk_size=900, config=None):
        self.msm_ids = msm_ids
        self.probe_ids = probe_ids
        self.start = int(calendar.timegm(start.timetuple())) * 1000
        self.end = int(calendar.timegm(end.timetuple())) * 1000
        self.chunk_size = chunk_size
        self.params = []
        self.timetrack_converter = timetrack_converter
        self.consumer = None
        self.config = config
        self.topic = self.config.get('io', 'kafka_topic')
        self.partition_total = 0
        self.partition_paused = 0

    def __enter__(self):
        """Set up the Kafka consumer"""
        self.consumer = Consumer({
            'bootstrap.servers': 'kafka1:9092, kafka2:9092, kafka3:9092',
            'group.id': 'ihr_raclette_' + str(self.start),
            'auto.offset.reset': 'earliest',
            'max.poll.interval.ms': 1800 * 1000,
        })

        # Set offsets according to the start time
        topic_info = self.consumer.list_topics(self.topic)
        partitions = [TopicPartition(self.topic, partition_id, self.start)
                      for partition_id in topic_info.topics[self.topic].partitions.keys()]
        offsets = self.consumer.offsets_for_times(partitions)

        # Remove empty partitions
        offsets = [part for part in offsets if part.offset > 0]
        self.partition_total = len(offsets)
        self.partition_paused = 0
        self.consumer.assign(offsets)

        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.consumer.close()
        logging.info("closed the consumer")

    def read(self):
        logging.info("Start consuming data")
        while True:
            msg = self.consumer.poll(1.0)

            if msg is None:
                continue

            if msg.error():
                logging.error("Consumer error: {}".format(msg.error()))
                continue

            # Filter with start and end times
            ts = msg.timestamp()
            if ts[0] == confluent_kafka.TIMESTAMP_CREATE_TIME and ts[1] < self.start:
                continue

            if ts[0] == confluent_kafka.TIMESTAMP_CREATE_TIME and ts[1] >= self.end:
                # Pause this partition; stop only once all partitions are paused.
                self.consumer.pause([TopicPartition(self.topic, msg.partition())])
                self.partition_paused += 1
                if self.partition_paused < self.partition_total:
                    continue
                else:
                    break

            traceroute = msgpack.unpackb(msg.value(), raw=False)

            # Needed? The consumer is not filtering the msm or probe ids.
            # if (self.probe_ids is not None and traceroute['prb_id'] not in self.probe_ids) or \
            #         (self.msm_ids is not None and traceroute['msm_id'] not in self.msm_ids):
            #     pass

            yield self.timetrack_converter.traceroute2timetrack(traceroute)
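
# A usage sketch for the Reader above; ``config``, ``converter``, the time
# range, and the downstream ``process`` handler are placeholder assumptions.
import datetime


def example_read_timetracks(config, converter, process):
    start = datetime.datetime(2020, 1, 1)
    end = datetime.datetime(2020, 1, 2)

    # The context manager assigns partition offsets for the start time on
    # entry and closes the consumer on exit.
    with Reader(start, end, converter, config=config) as reader:
        for timetrack in reader.read():
            process(timetrack)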