def get_partition_offsets(self, topic, partition, request_time_ms, max_num_offsets):
    """Request available fetch offsets for a single topic/partition

    Keyword Arguments:
        topic (str): topic for offset request
        partition (int): partition for offset request
        request_time_ms (int): Used to ask for all messages before a
            certain time (ms). There are two special values.
            Specify -1 to receive the latest offset (i.e. the offset of the
            next coming message) and -2 to receive the earliest available
            offset. Note that because offsets are pulled in descending
            order, asking for the earliest offset will always return you a
            single element.
        max_num_offsets (int): Maximum offsets to include in the
            OffsetResponse

    Returns:
        a list of offsets in the OffsetResponse submitted for the provided
        topic / partition. See:
        https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetAPI
    """
    reqs = [OffsetRequest(topic, partition, request_time_ms, max_num_offsets)]

    (resp,) = self._client.send_offset_request(reqs)

    check_error(resp)

    # Just for sanity..
    # probably unnecessary
    assert resp.topic == topic
    assert resp.partition == partition

    return resp.offsets
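# Usage sketch for get_partition_offsets() (topic name and broker address
# are assumptions, not taken from the code above): the special
# request_time_ms values -1 and -2 return the latest and earliest
# available offsets respectively.
from kafka import KafkaConsumer

consumer = KafkaConsumer('my-topic',
                         bootstrap_servers=['localhost:9092'])

# -1 -> offset of the next message to be produced (latest)
(latest,) = consumer.get_partition_offsets('my-topic', 0, -1, 1)

# -2 -> earliest offset still available (always a single element)
(earliest,) = consumer.get_partition_offsets('my-topic', 0, -2, 1)

print('partition 0 spans offsets [%d, %d)' % (earliest, latest))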
def fetch_last_known_offsets(self, partitions=None):
    if self.group is None:
        raise ValueError('KafkaClient.group must not be None')

    if partitions is None:
        partitions = self.client.get_partition_ids_for_topic(self.topic)

    responses = self.client.send_offset_fetch_request(
        self.group,
        [OffsetFetchRequest(self.topic, p) for p in partitions],
        fail_on_error=False
    )

    for resp in responses:
        try:
            check_error(resp)
        # API spec says server won't set an error here
        # but 0.8.1.1 does actually...
        except UnknownTopicOrPartitionError:
            pass

        # -1 offset signals no commit is currently stored
        if resp.offset == -1:
            self.offsets[resp.partition] = 0

        # Otherwise we committed the stored offset
        # and need to fetch the next one
        else:
            self.offsets[resp.partition] = resp.offset
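# Usage sketch: re-syncing a SimpleConsumer-style object with the group's
# last committed offsets (broker address, group, and topic names are
# assumptions; note group/topic are bytes in this API).
from kafka import KafkaClient, SimpleConsumer

client = KafkaClient('localhost:9092')
consumer = SimpleConsumer(client, b'my-group', b'my-topic')

consumer.fetch_last_known_offsets()
print(consumer.offsets)  # {partition: offset} after the refresh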
def _raise_on_response_error(self, resp):

    # Response can be an unraised exception object (FailedPayloadsError)
    if isinstance(resp, Exception):
        raise resp

    # Or a server api error response
    try:
        kafka_common.check_error(resp)
    except (UnknownTopicOrPartitionError, NotLeaderForPartitionError):
        self.reset_topic_metadata(resp.topic)
        raise

    # Return False if no error to enable list comprehensions
    return False
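# Sketch of the list-comprehension pattern the trailing `return False`
# enables (names here are illustrative, not library code): a checker that
# raises on bad items and returns False otherwise can serve as a filter
# predicate without dropping any good items.
def _raise_on_error(resp):
    if isinstance(resp, Exception):
        raise resp
    return False

responses = ['ok-1', 'ok-2']
good = [resp for resp in responses if not _raise_on_error(resp)]
assert good == responses  # nothing raised, nothing filtered out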
def commit_partition_offsets(self, partition_offsets):
    """
    Commit explicit partition/offset pairs.
    """
    self.logger.debug("Committing partition offsets: %s", partition_offsets)

    commit_requests = [
        OffsetCommitRequest(self.consumer.topic, partition, offset, None)
        for partition, offset in partition_offsets.items()
    ]
    commit_responses = self.consumer.client.send_offset_commit_request(
        self.consumer.group,
        commit_requests,
    )
    for commit_response in commit_responses:
        check_error(commit_response)
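# Sketch of the equivalent commit using the low-level client directly
# (broker address, group, and topic names are assumptions). The mapping
# is {partition: offset}, as in commit_partition_offsets() above.
from kafka import KafkaClient
from kafka.common import OffsetCommitRequest, check_error

client = KafkaClient('localhost:9092')
partition_offsets = {0: 1250, 1: 980}

requests = [OffsetCommitRequest(b'my-topic', partition, offset, None)
            for partition, offset in partition_offsets.items()]
for resp in client.send_offset_commit_request(b'my-group', requests):
    check_error(resp)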
def _get_coordinator_for_group(self, group):
    """
    Returns the coordinator broker for a consumer group.

    ConsumerCoordinatorNotAvailableCode will be raised if the coordinator
    does not currently exist for the group.

    OffsetsLoadInProgressCode is raised if the coordinator is available
    but is still loading offsets from the internal topic.
    """
    resp = self.send_consumer_metadata_request(group)

    # If there's a problem with finding the coordinator, raise the
    # provided error
    kafka_common.check_error(resp)

    # Otherwise return the BrokerMetadata
    return BrokerMetadata(resp.nodeId, resp.host, resp.port)
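# Usage sketch: resolving a consumer group's coordinator, retrying while
# the broker has no coordinator yet or is still loading offsets. The
# group name, broker address, and retry policy are assumptions, and this
# calls a private method for illustration only.
import time

from kafka import KafkaClient
from kafka.common import (ConsumerCoordinatorNotAvailableCode,
                          OffsetsLoadInProgressCode)

client = KafkaClient('localhost:9092')
while True:
    try:
        coordinator = client._get_coordinator_for_group(b'my-group')
        break
    except (ConsumerCoordinatorNotAvailableCode, OffsetsLoadInProgressCode):
        time.sleep(1)  # coordinator missing or still loading; retry

print('coordinator: %s:%d' % (coordinator.host, coordinator.port))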
def _get_commit_offsets(self):
    logger.info("Consumer fetching stored offsets")
    for topic_partition in self._topics:
        (resp,) = self._client.send_offset_fetch_request(
            kafka_bytestring(self._config['group_id']),
            [OffsetFetchRequest(topic_partition[0], topic_partition[1])],
            fail_on_error=False)
        try:
            check_error(resp)
        # API spec says server won't set an error here
        # but 0.8.1.1 does actually...
        except UnknownTopicOrPartitionError:
            pass

        # -1 offset signals no commit is currently stored
        if resp.offset == -1:
            self._offsets.commit[topic_partition] = None

        # Otherwise we committed the stored offset
        # and need to fetch the next one
        else:
            self._offsets.commit[topic_partition] = resp.offset
def commit(self):
    """Store consumed message offsets (marked via task_done())
    to kafka cluster for this consumer_group.

    Returns:
        True on success, or False if no offsets were found for commit

    Note:
        this functionality requires server version >=0.8.1.1
        https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
    """
    if not self._config['group_id']:
        logger.warning('Cannot commit without a group_id!')
        raise KafkaConfigurationError(
            'Attempted to commit offsets '
            'without a configured consumer group (group_id)'
        )

    # API supports storing metadata with each commit
    # but for now it is unused
    metadata = b''

    offsets = self._offsets.task_done
    commits = []
    for topic_partition, task_done_offset in six.iteritems(offsets):

        # Skip if None
        if task_done_offset is None:
            continue

        # Commit offsets as the next offset to fetch
        # which is consistent with the Java Client
        # task_done is marked by messages consumed,
        # so add one to mark the next message for fetching
        commit_offset = (task_done_offset + 1)

        # Skip if no change from previous committed
        if commit_offset == self._offsets.commit[topic_partition]:
            continue

        commits.append(
            OffsetCommitRequest(topic_partition[0], topic_partition[1],
                                commit_offset, metadata)
        )

    if commits:
        logger.info('committing consumer offsets to group %s',
                    self._config['group_id'])
        resps = self._client.send_offset_commit_request(
            kafka_bytestring(self._config['group_id']),
            commits,
            fail_on_error=False
        )

        for r in resps:
            check_error(r)
            topic_partition = (r.topic, r.partition)
            task_done = self._offsets.task_done[topic_partition]
            self._offsets.commit[topic_partition] = (task_done + 1)

        if self._config['auto_commit_enable']:
            self._reset_auto_commit()

        return True

    else:
        logger.info('No new offsets found to commit in group %s',
                    self._config['group_id'])
        return False
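# Usage sketch of the task_done()/commit() flow (topic, group, and broker
# address are assumptions): offsets are committed as last-done + 1, i.e.
# the next message to fetch, matching the Java client convention.
from kafka import KafkaConsumer

consumer = KafkaConsumer('my-topic',
                         group_id='my-group',
                         bootstrap_servers=['localhost:9092'],
                         auto_commit_enable=False)

for msg in consumer.fetch_messages():
    print(msg.offset, msg.value)  # placeholder for application logic
    consumer.task_done(msg)       # mark as processed; not yet committed

consumer.commit()  # stores task_done offsets + 1 for the group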
def fetch_messages(self):
    """Sends FetchRequests for all topic/partitions set for consumption

    Returns:
        Generator that yields KafkaMessage structs
        after deserializing with the configured `deserializer_class`

    Note:
        Refreshes metadata on errors, and resets fetch offset on
        OffsetOutOfRange, per the configured `auto_offset_reset` policy

    See Also:
        Key KafkaConsumer configuration parameters:
        * `fetch_message_max_bytes`
        * `fetch_wait_max_ms`
        * `fetch_min_bytes`
        * `deserializer_class`
        * `auto_offset_reset`
    """
    max_bytes = self._config['fetch_message_max_bytes']
    max_wait_time = self._config['fetch_wait_max_ms']
    min_bytes = self._config['fetch_min_bytes']

    if not self._topics:
        raise KafkaConfigurationError('No topics or partitions configured')

    if not self._offsets.fetch:
        raise KafkaConfigurationError(
            'No fetch offsets found when calling fetch_messages'
        )

    fetches = [FetchRequest(topic, partition,
                            self._offsets.fetch[(topic, partition)],
                            max_bytes)
               for (topic, partition) in self._topics]

    # send_fetch_request will batch topic/partition requests by leader
    responses = self._client.send_fetch_request(
        fetches,
        max_wait_time=max_wait_time,
        min_bytes=min_bytes,
        fail_on_error=False
    )

    for resp in responses:

        if isinstance(resp, FailedPayloadsError):
            logger.warning('FailedPayloadsError attempting to fetch data')
            self._refresh_metadata_on_error()
            continue

        topic = kafka_bytestring(resp.topic)
        partition = resp.partition
        try:
            check_error(resp)
        except OffsetOutOfRangeError:
            logger.warning('OffsetOutOfRange: topic %s, partition %d, '
                           'offset %d (Highwatermark: %d)',
                           topic, partition,
                           self._offsets.fetch[(topic, partition)],
                           resp.highwaterMark)
            # Reset offset
            self._offsets.fetch[(topic, partition)] = (
                self._reset_partition_offset((topic, partition))
            )
            continue

        except NotLeaderForPartitionError:
            logger.warning("NotLeaderForPartitionError for %s - %d. "
                           "Metadata may be out of date",
                           topic, partition)
            self._refresh_metadata_on_error()
            continue

        except RequestTimedOutError:
            logger.warning("RequestTimedOutError for %s - %d",
                           topic, partition)
            continue

        # Track server highwater mark
        self._offsets.highwater[(topic, partition)] = resp.highwaterMark

        # Yield each message
        # Kafka-python could raise an exception during iteration
        # we are not catching -- user will need to address
        for (offset, message) in resp.messages:
            # deserializer_class could raise an exception here
            val = self._config['deserializer_class'](message.value)
            msg = KafkaMessage(topic, partition, offset, message.key, val)

            # in some cases the server will return earlier messages
            # than we requested. skip them per kafka spec
            if offset < self._offsets.fetch[(topic, partition)]:
                logger.debug('message offset less than fetched offset '
                             'skipping: %s', msg)
                continue
            # Only increment fetch offset
            # if we safely got the message and deserialized
            self._offsets.fetch[(topic, partition)] = offset + 1

            # Then yield to user
            yield msg
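# Sketch of tuning the fetch parameters listed in the docstring (values
# and broker address are illustrative assumptions, not recommendations):
from kafka import KafkaConsumer

consumer = KafkaConsumer('my-topic',
                         bootstrap_servers=['localhost:9092'],
                         fetch_message_max_bytes=1024 * 1024,
                         fetch_wait_max_ms=500,
                         fetch_min_bytes=1,
                         auto_offset_reset='smallest')

for msg in consumer.fetch_messages():
    print(msg.topic, msg.partition, msg.offset, msg.value)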
def load_metadata_for_topics(self, *topics):
    """
    Fetch broker and topic-partition metadata from the server,
    and update internal data:
    broker list, topic/partition list, and topic/partition -> broker map

    This method should be called after receiving any error

    Arguments:
        *topics (optional): If a list of topics is provided,
            the metadata refresh will be limited to the specified topics
            only.

    Exceptions:
    ----------
    If the broker is configured to not auto-create topics,
    expect UnknownTopicOrPartitionError for topics that don't exist

    If the broker is configured to auto-create topics,
    expect LeaderNotAvailableError for new topics
    until partitions have been initialized.

    Exceptions *will not* be raised in a full refresh (i.e. no topic list)
    In this case, error codes will be logged as errors

    Partition-level errors will also not be raised here
    (a single partition w/o a leader, for example)
    """
    topics = [kafka_bytestring(t) for t in topics]

    if topics:
        for topic in topics:
            self.reset_topic_metadata(topic)
    else:
        self.reset_all_metadata()

    resp = self.send_metadata_request(topics)

    log.debug('Updating broker metadata: %s', resp.brokers)
    log.debug('Updating topic metadata: %s', resp.topics)

    self.brokers = dict([(broker.nodeId, broker)
                         for broker in resp.brokers])

    for topic_metadata in resp.topics:
        topic = topic_metadata.topic
        partitions = topic_metadata.partitions

        # Errors expected for new topics
        try:
            kafka_common.check_error(topic_metadata)
        except (UnknownTopicOrPartitionError, LeaderNotAvailableError) as e:

            # Raise if the topic was passed in explicitly
            if topic in topics:
                raise

            # Otherwise, just log an error and continue
            log.error('Error loading topic metadata for %s: %s',
                      topic, type(e))
            continue

        self.topic_partitions[topic] = {}
        for partition_metadata in partitions:
            partition = partition_metadata.partition
            leader = partition_metadata.leader

            self.topic_partitions[topic][partition] = partition_metadata

            # Populate topics_to_brokers dict
            topic_part = TopicAndPartition(topic, partition)

            # Check for partition errors
            try:
                kafka_common.check_error(partition_metadata)

            # If No Leader, topics_to_brokers topic_partition -> None
            except LeaderNotAvailableError:
                log.error('No leader for topic %s partition %d',
                          topic, partition)
                self.topics_to_brokers[topic_part] = None
                continue

            # If one of the replicas is unavailable -- ignore
            # this error code is provided for admin purposes only
            # we never talk to replicas, only the leader
            except ReplicaNotAvailableError:
                log.debug('Some (non-leader) replicas not available '
                          'for topic %s partition %d', topic, partition)

            # If Known Broker, topic_partition -> BrokerMetadata
            if leader in self.brokers:
                self.topics_to_brokers[topic_part] = self.brokers[leader]

            # If Unknown Broker, fake BrokerMetadata so we don't lose the id
            # (not sure how this could happen.
            #  server could be in a bad state)
            else:
                self.topics_to_brokers[topic_part] = BrokerMetadata(
                    leader, None, None
                )
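# Usage sketch: a full metadata refresh only logs topic-level errors,
# while naming a topic explicitly re-raises them (broker address and
# topic name are assumptions; behavior with a missing topic also depends
# on the broker's auto-create setting, per the docstring above).
from kafka import KafkaClient
from kafka.common import UnknownTopicOrPartitionError

client = KafkaClient('localhost:9092')

client.load_metadata_for_topics()  # full refresh: errors logged, not raised

try:
    client.load_metadata_for_topics('nonexistent-topic')
except UnknownTopicOrPartitionError:
    print('topic does not exist and broker auto-create is disabled')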