def commit(self, partitions=None):
    """Commit consumed offsets for the given partitions (or for all
    partitions if none are given) to the broker for this consumer group."""
    # Short circuit if nothing happened. This check is kept outside the lock
    # to prevent unnecessarily acquiring a lock for checking the state
    if self._count_since_commit == 0:
        return

    with (yield from self._commit_lock):
        # Do this check again, just in case the state has changed
        # during the lock acquiring timeout
        if self._count_since_commit == 0:
            return

        reqs = []
        if not partitions:  # commit all partitions
            partitions = self._offsets.keys()

        for partition in partitions:
            offset = self._offsets[partition]
            log.debug("Commit offset %d in SimpleConsumer: "
                      "group=%s, topic=%s, partition=%s" %
                      (offset, self._group, self._topic, partition))

            reqs.append(OffsetCommitRequest(self._topic, partition,
                                            offset, None))

        resps = yield from self._client.send_offset_commit_request(
            self._group, reqs)
        for resp in resps:
            check_error(resp)

        self._count_since_commit = 0
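The coroutine above only takes the commit lock when there is actually something to commit. A minimal sketch of driving it from asyncio might look like the following; the `consumer` object, its event loop, and the partition numbers are assumptions for illustration, not part of the snippet above.

import asyncio

@asyncio.coroutine
def commit_some_partitions(consumer, partitions=(0, 1)):
    # `consumer` is assumed to expose the commit() coroutine shown above;
    # passing an explicit partition list commits only those partitions.
    yield from consumer.commit(partitions=list(partitions))

# loop = asyncio.get_event_loop()
# loop.run_until_complete(commit_some_partitions(consumer))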
def get_partition_offsets(self, topic, partition,
                          request_time_ms, max_num_offsets):
    """
    Request available fetch offsets for a single topic/partition

    Arguments:
        topic (str)
        partition (int)
        request_time_ms (int): Used to ask for all messages before a
            certain time (ms). There are two special values.
            Specify -1 to receive the latest offset (i.e. the offset of the
            next coming message) and -2 to receive the earliest available
            offset. Note that because offsets are pulled in descending
            order, asking for the earliest offset will always return you a
            single element.
        max_num_offsets (int)

    Returns:
        offsets (list)
    """
    reqs = [
        OffsetRequest(topic, partition, request_time_ms, max_num_offsets)
    ]

    (resp,) = self._client.send_offset_request(reqs)

    check_error(resp)

    # Just for sanity..
    # probably unnecessary
    assert resp.topic == topic
    assert resp.partition == partition

    return resp.offsets
def get_partition_offsets(self, topic, partition,
                          request_time_ms, max_num_offsets):
    """Request available fetch offsets for a single topic/partition

    Arguments:
        topic (str): topic for offset request
        partition (int): partition for offset request
        request_time_ms (int): Used to ask for all messages before a
            certain time (ms). There are two special values.
            Specify -1 to receive the latest offset (i.e. the offset of the
            next coming message) and -2 to receive the earliest available
            offset. Note that because offsets are pulled in descending
            order, asking for the earliest offset will always return you a
            single element.
        max_num_offsets (int): Maximum offsets to include in the
            OffsetResponse

    Returns:
        a list of offsets in the OffsetResponse submitted for the provided
        topic / partition. See:
        https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetAPI
    """
    reqs = [OffsetRequest(topic, partition, request_time_ms,
                          max_num_offsets)]

    (resp,) = self._client.send_offset_request(reqs)

    check_error(resp)

    # Just for sanity..
    # probably unnecessary
    assert resp.topic == topic
    assert resp.partition == partition

    return resp.offsets
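A rough usage sketch of the two special request_time_ms values documented above; the helper name, the `consumer` object, and the idea of computing a backlog are assumptions for illustration only.

def partition_backlog(consumer, topic, partition):
    """Hypothetical helper: approximate messages retained for one partition."""
    LATEST, EARLIEST = -1, -2
    # Latest = offset of the next message to be produced;
    # earliest = oldest offset still available on the broker.
    (head,) = consumer.get_partition_offsets(topic, partition, LATEST, 1)
    (tail,) = consumer.get_partition_offsets(topic, partition, EARLIEST, 1)
    return head - tail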
def fetch_last_known_offsets(self, partitions=None):
    if self.group is None:
        raise ValueError('KafkaClient.group must not be None')

    if partitions is None:
        partitions = self.client.get_partition_ids_for_topic(self.topic)

    responses = self.client.send_offset_fetch_request(
        self.group,
        [OffsetFetchRequest(self.topic, p) for p in partitions],
        fail_on_error=False
    )

    for resp in responses:
        try:
            check_error(resp)
        # API spec says server won't set an error here
        # but 0.8.1.1 does actually...
        except UnknownTopicOrPartitionError:
            pass

        # -1 offset signals no commit is currently stored
        if resp.offset == -1:
            self.offsets[resp.partition] = 0

        # Otherwise we committed the stored offset
        # and need to fetch the next one
        else:
            self.offsets[resp.partition] = resp.offset
def _get_commit_offsets(self):
    logger.info("Consumer fetching stored offsets")
    for topic_partition in self._topics:
        resps = []
        if self._config['offset_storage'] in ('zookeeper', 'dual'):
            resps += self._client.send_offset_fetch_request(
                self._config['group_id'],
                [OffsetFetchRequestPayload(topic_partition[0],
                                           topic_partition[1])],
                fail_on_error=False)
        if self._config['offset_storage'] in ('kafka', 'dual'):
            resps += self._client.send_offset_fetch_request_kafka(
                self._config['group_id'],
                [OffsetFetchRequestPayload(topic_partition[0],
                                           topic_partition[1])],
                fail_on_error=False)
        try:
            for r in resps:
                check_error(r)
        # API spec says server won't set an error here
        # but 0.8.1.1 does actually...
        except UnknownTopicOrPartitionError:
            pass

        # -1 offset signals no commit is currently stored
        max_offset = max(r.offset for r in resps)
        if max_offset == -1:
            self._offsets.commit[topic_partition] = None

        # Otherwise we committed the stored offset
        # and need to fetch the next one
        else:
            self._offsets.commit[topic_partition] = max_offset
def commit(self): """Store consumed message offsets (marked via task_done()) to kafka cluster for this consumer_group. Returns: True on success, or False if no offsets were found for commit Note: this functionality requires server version >=0.8.1.1 https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI """ if not self._config['group_id']: logger.warning('Cannot commit without a group_id!') raise KafkaConfigurationError('Attempted to commit offsets without a configured consumer group (group_id)') # API supports storing metadata with each commit # but for now it is unused metadata = b'' offsets = self._offsets.task_done commits = [] for topic_partition, task_done_offset in six.iteritems(offsets): # Skip if None if task_done_offset is None: continue # Commit offsets as the next offset to fetch # which is consistent with the Java Client # task_done is marked by messages consumed, # so add one to mark the next message for fetching commit_offset = (task_done_offset + 1) # Skip if no change from previous committed if commit_offset == self._offsets.commit[topic_partition]: continue commits.append(OffsetCommitRequest(topic_partition[0], topic_partition[1], commit_offset, metadata)) if commits: logger.info('committing consumer offsets to group %s', self._config['group_id']) resps = self._client.send_offset_commit_request(kafka_bytestring(self._config['group_id']), commits, fail_on_error=False) for r in resps: check_error(r) topic_partition = (r.topic, r.partition) task_done = self._offsets.task_done[topic_partition] self._offsets.commit[topic_partition] = (task_done + 1) if self._config['auto_commit_enable']: self._reset_auto_commit() return True else: logger.info('No new offsets found to commit in group %s', self._config['group_id']) return False
def _check_commit_response_error(resp):
    try:
        check_error(resp)
    except BrokerResponseError as e:
        exception = OffsetCommitError(
            resp.topic,
            resp.partition,
            e.message
        )
        return exception
def _check_fetch_response_error(resp):
    try:
        check_error(resp)
    except BrokerResponseError:
        # In case of error we set the offset to (-1,)
        return OffsetResponse(
            resp.topic, resp.partition,
            resp.error, (-1,),
        )
    return resp
def _update_group_offsets(self):
    logger.info("Consumer fetching stored offsets")
    for partition in self._client.get_partition_ids_for_topic(self._topic):
        (resp,) = self._client.send_offset_fetch_request(
            self._group_id,
            [OffsetFetchRequest(self._topic, partition)],
            fail_on_error=False)
        try:
            check_error(resp)
        except UnknownTopicOrPartitionError:
            pass

        if resp.offset == -1:
            self._offsets.commit[partition] = None
        else:
            self._offsets.commit[partition] = resp.offset
def commit_partition_offsets(self, partition_offsets):
    """
    Commit explicit partition/offset pairs.
    """
    self.logger.debug("Committing partition offsets: %s", partition_offsets)

    commit_requests = [
        OffsetCommitRequest(self.consumer.topic, partition, offset, None)
        for partition, offset in partition_offsets.items()
    ]
    commit_responses = self.consumer.client.send_offset_commit_request(
        self.consumer.group,
        commit_requests,
    )
    for commit_response in commit_responses:
        check_error(commit_response)
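A hypothetical call site for commit_partition_offsets(), committing offsets collected elsewhere; the `monitor` instance, the `consumed` mapping, and the example offsets are made up for illustration.

def checkpoint(monitor, consumed):
    """Hypothetical helper: commit the last processed offset per partition."""
    # `consumed` maps partition -> last processed offset, e.g. {0: 4201, 1: 3977}
    monitor.commit_partition_offsets(dict(consumed))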
def _update_produced_offsets(self):
    """
    Update the produced offset for every partition of the topic by
    requesting the latest offset (the request uses request_time_ms=-1,
    i.e. the offset of the next coming message, and asks for a single
    offset per partition).
    """
    for partition in self._client.get_partition_ids_for_topic(self._topic):
        reqs = [OffsetRequest(self._topic, partition, -1, 1)]

        (resp,) = self._client.send_offset_request(reqs)

        check_error(resp)
        assert resp.topic == self._topic
        assert resp.partition == partition
        self._offsets.produced[partition] = resp.offsets[0]
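Combined with the stored commit offsets fetched elsewhere in this section, the produced offsets above can be used to estimate consumer lag. A rough sketch, assuming an `offsets` object with `produced` and `commit` dicts keyed by partition as in the snippets (the helper itself is hypothetical):

def estimate_lag(offsets):
    """Hypothetical helper: per-partition lag = produced offset - committed offset."""
    # A missing or None commit (no stored offset) is treated as starting from 0.
    return {partition: produced - (offsets.commit.get(partition) or 0)
            for partition, produced in offsets.produced.items()}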
def pluck_topic_offset_or_zero_on_unknown(resp):
    try:
        check_error(resp)
    except UnknownTopicOrPartitionError:
        # If the server doesn't have any committed offsets by this group
        # for this topic, assume it's zero.
        pass
    # The API spec says the server won't set an error, but 0.8.1.1 does.
    # The actual check is if the offset is -1.
    if resp.offset == -1:
        return OffsetFetchResponse(
            resp.topic,
            resp.partition,
            0,
            resp.metadata,
            0,
        )
    return resp
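A sketch of how the helper above might be applied across an offset fetch, so partitions with no stored commit fall back to offset 0; the helper name and the `client`, `group`, and `topic` arguments are assumptions, while the request/response calls follow the snippets in this section.

def stored_offsets_or_zero(client, group, topic, partitions):
    """Hypothetical helper: map partition -> committed offset, defaulting to 0."""
    requests = [OffsetFetchRequest(topic, p) for p in partitions]
    responses = client.send_offset_fetch_request(group, requests,
                                                 fail_on_error=False)
    return {resp.partition: pluck_topic_offset_or_zero_on_unknown(resp).offset
            for resp in responses}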
def _get_commit_offsets(self):
    logger.info("Consumer fetching stored offsets")
    for topic_partition in self._topics:
        (resp,) = self._client.send_offset_fetch_request(
            kafka_bytestring(self._config['group_id']),
            [OffsetFetchRequest(topic_partition[0], topic_partition[1])],
            fail_on_error=False)
        try:
            check_error(resp)
        # API spec says server won't set an error here
        # but 0.8.1.1 does actually...
        except UnknownTopicOrPartitionError:
            pass

        # -1 offset signals no commit is currently stored
        if resp.offset == -1:
            self._offsets.commit[topic_partition] = None

        # Otherwise we committed the stored offset
        # and need to fetch the next one
        else:
            self._offsets.commit[topic_partition] = resp.offset
def reset_partition_offset(self, partition):
    LATEST = -1
    EARLIEST = -2
    if self.auto_offset_reset == 'largest':
        reqs = [OffsetRequest(self.topic, partition, LATEST, 1)]
    elif self.auto_offset_reset == 'smallest':
        reqs = [OffsetRequest(self.topic, partition, EARLIEST, 1)]
    else:
        # Let's raise a reasonable exception type if user calls
        # outside of an exception context
        if sys.exc_info() == (None, None, None):
            raise OffsetOutOfRangeError('Cannot reset partition offsets without a '
                                        'valid auto_offset_reset setting '
                                        '(largest|smallest)')
        # Otherwise we should re-raise the upstream exception
        # b/c it typically includes additional data about
        # the request that triggered it, and we do not want to drop that
        raise

    # send_offset_request
    (resp,) = self.client.send_offset_request(reqs)
    check_error(resp)
    self.offsets[partition] = resp.offsets[0]
    self.fetch_offsets[partition] = resp.offsets[0]
def _fetch(self):
    # Create fetch request payloads for all the partitions
    partitions = dict((p, self.buffer_size)
                      for p in self.fetch_offsets.keys())
    while partitions:
        requests = []
        for partition, buffer_size in six.iteritems(partitions):
            requests.append(FetchRequestPayload(self.topic, partition,
                                                self.fetch_offsets[partition],
                                                buffer_size))
        # Send request
        responses = self.client.send_fetch_request(
            requests,
            max_wait_time=int(self.fetch_max_wait_time),
            min_bytes=self.fetch_min_bytes,
            fail_on_error=False)

        retry_partitions = {}
        for resp in responses:
            try:
                check_error(resp)
            except UnknownTopicOrPartitionError:
                log.error('UnknownTopicOrPartitionError for %s:%d',
                          resp.topic, resp.partition)
                self.client.reset_topic_metadata(resp.topic)
                raise
            except NotLeaderForPartitionError:
                log.error('NotLeaderForPartitionError for %s:%d',
                          resp.topic, resp.partition)
                self.client.reset_topic_metadata(resp.topic)
                continue
            except OffsetOutOfRangeError:
                log.warning('OffsetOutOfRangeError for %s:%d. '
                            'Resetting partition offset...',
                            resp.topic, resp.partition)
                self.reset_partition_offset(resp.partition)
                # Retry this partition
                retry_partitions[resp.partition] = partitions[resp.partition]
                continue
            except FailedPayloadsError as e:
                log.warning('FailedPayloadsError for %s:%d',
                            e.payload.topic, e.payload.partition)
                # Retry this partition
                retry_partitions[e.payload.partition] = partitions[e.payload.partition]
                continue

            partition = resp.partition
            buffer_size = partitions[partition]

            # Check for partial message
            if resp.messages and isinstance(resp.messages[-1].message,
                                            PartialMessage):

                # If buffer is at max and all we got was a partial message
                # raise ConsumerFetchSizeTooSmall
                if (self.max_buffer_size is not None and
                        buffer_size == self.max_buffer_size and
                        len(resp.messages) == 1):

                    log.error('Max fetch size %d too small',
                              self.max_buffer_size)
                    raise ConsumerFetchSizeTooSmall()

                if self.max_buffer_size is None:
                    buffer_size *= 2
                else:
                    buffer_size = min(buffer_size * 2, self.max_buffer_size)
                log.warning('Fetch size too small, increase to %d (2x) '
                            'and retry', buffer_size)
                retry_partitions[partition] = buffer_size
                resp.messages.pop()

            for message in resp.messages:
                if message.offset < self.fetch_offsets[partition]:
                    log.debug('Skipping message %s because its offset is less than the '
                              'consumer offset', message)
                    continue
                # Put the message in our queue
                self.queue.put((partition, message))
                self.fetch_offsets[partition] = message.offset + 1
        partitions = retry_partitions
def _raise_on_response_error(self, resp):
    try:
        check_error(resp)
    except (UnknownTopicOrPartitionError, NotLeaderForPartitionError):
        self.reset_topic_metadata(resp.topic)
        raise
def fetch_messages(self):
    """Sends FetchRequests for all topic/partitions set for consumption

    Returns:
        Generator that yields KafkaMessage structs
        after deserializing with the configured `deserializer_class`

    Note:
        Refreshes metadata on errors, and resets fetch offset on
        OffsetOutOfRange, per the configured `auto_offset_reset` policy

    See Also:
        Key KafkaConsumer configuration parameters:
        * `fetch_message_max_bytes`
        * `fetch_max_wait_ms`
        * `fetch_min_bytes`
        * `deserializer_class`
        * `auto_offset_reset`
    """
    max_bytes = self._config['fetch_message_max_bytes']
    max_wait_time = self._config['fetch_wait_max_ms']
    min_bytes = self._config['fetch_min_bytes']

    if not self._topics:
        raise KafkaConfigurationError('No topics or partitions configured')

    if not self._offsets.fetch:
        raise KafkaConfigurationError(
            'No fetch offsets found when calling fetch_messages')

    fetches = [FetchRequest(topic, partition,
                            self._offsets.fetch[(topic, partition)],
                            max_bytes)
               for (topic, partition) in self._topics]

    # send_fetch_request will batch topic/partition requests by leader
    responses = self._client.send_fetch_request(
        fetches,
        max_wait_time=max_wait_time,
        min_bytes=min_bytes,
        fail_on_error=False)

    for resp in responses:

        if isinstance(resp, FailedPayloadsError):
            logger.warning('FailedPayloadsError attempting to fetch data')
            self._refresh_metadata_on_error()
            continue

        topic = kafka_bytestring(resp.topic)
        partition = resp.partition
        try:
            check_error(resp)
        except OffsetOutOfRangeError:
            logger.warning('OffsetOutOfRange: topic %s, partition %d, '
                           'offset %d (Highwatermark: %d)',
                           topic, partition,
                           self._offsets.fetch[(topic, partition)],
                           resp.highwaterMark)
            # Reset offset
            self._offsets.fetch[(topic, partition)] = (
                self._reset_partition_offset((topic, partition)))
            continue

        except NotLeaderForPartitionError:
            logger.warning("NotLeaderForPartitionError for %s - %d. "
                           "Metadata may be out of date",
                           topic, partition)
            self._refresh_metadata_on_error()
            continue

        except RequestTimedOutError:
            logger.warning("RequestTimedOutError for %s - %d",
                           topic, partition)
            continue

        # Track server highwater mark
        self._offsets.highwater[(topic, partition)] = resp.highwaterMark

        # Yield each message
        # Kafka-python could raise an exception during iteration
        # we are not catching -- user will need to address
        for (offset, message) in resp.messages:
            # deserializer_class could raise an exception here
            val = self._config['deserializer_class'](message.value)
            msg = KafkaMessage(topic, partition, offset, message.key, val)

            # in some cases the server will return earlier messages
            # than we requested. skip them per kafka spec
            if offset < self._offsets.fetch[(topic, partition)]:
                logger.debug('message offset less than fetched offset '
                             'skipping: %s', msg)
                continue
            # Only increment fetch offset
            # if we safely got the message and deserialized
            self._offsets.fetch[(topic, partition)] = offset + 1

            # Then yield to user
            yield msg
def load_metadata_for_topics(self, *topics):
    """
    Fetch broker and topic-partition metadata from the server,
    and update internal data:
    broker list, topic/partition list, and topic/partition -> broker map

    This method should be called after receiving any error

    @param: *topics (optional)
        If a list of topics is provided, the metadata refresh will be
        limited to the specified topics only.

    Exceptions:
    ----------
    If the broker is configured to not auto-create topics,
    expect UnknownTopicOrPartitionError for topics that don't exist

    If the broker is configured to auto-create topics,
    expect LeaderNotAvailableError for new topics
    until partitions have been initialized.

    Exceptions *will not* be raised in a full refresh (i.e. no topic list)
    In this case, error codes will be logged as errors

    Partition-level errors will also not be raised here
    (a single partition w/o a leader, for example)
    """
    resp = yield from self.send_metadata_request(topics)

    log.debug("Broker metadata: %s", resp.brokers)
    log.debug("Topic metadata: %s", resp.topics)

    self._brokers = dict([(broker.nodeId, broker)
                          for broker in resp.brokers])

    for topic_metadata in resp.topics:
        topic = topic_metadata.topic
        partitions = topic_metadata.partitions

        self.reset_topic_metadata(topic)

        # Errors expected for new topics
        try:
            check_error(topic_metadata)
        except (UnknownTopicOrPartitionError, LeaderNotAvailableError) as e:

            # Raise if the topic was passed in explicitly
            if topic in topics:
                raise

            # Otherwise, just log a warning
            log.error("Error loading topic metadata for %s: %s",
                      topic, type(e))
            continue

        self._topic_partitions[topic] = {}
        for partition_metadata in partitions:
            partition = partition_metadata.partition
            leader = partition_metadata.leader

            self._topic_partitions[topic][partition] = partition_metadata

            # Populate topics_to_brokers dict
            topic_part = TopicAndPartition(topic, partition)

            # Check for partition errors
            try:
                check_error(partition_metadata)

            # If No Leader, topics_to_brokers topic_partition -> None
            except LeaderNotAvailableError:
                log.error('No leader for topic %s partition %d',
                          topic, partition)
                self._topics_to_brokers[topic_part] = None
                continue
            # If one of the replicas is unavailable -- ignore
            # this error code is provided for admin purposes only
            # we never talk to replicas, only the leader
            except ReplicaNotAvailableError:
                log.warning('Some (non-leader) replicas not available '
                            'for topic %s partition %d',
                            topic, partition)

            # If Known Broker, topic_partition -> BrokerMetadata
            if leader in self._brokers:
                self._topics_to_brokers[topic_part] = self._brokers[leader]

            # If Unknown Broker, fake BrokerMetadata so we don't lose the id
            # (not sure how this could happen.
            #  server could be in bad state)
            else:
                self._topics_to_brokers[topic_part] = BrokerMetadata(
                    leader, None, None
                )
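A minimal sketch of driving the metadata-refresh coroutine above, reflecting the docstring's distinction between an explicit topic list (topic errors raised) and a full refresh (errors only logged). The `client` instance, the wrapper name, and the topic name are assumptions.

import asyncio

@asyncio.coroutine
def refresh_metadata(client, *topics):
    # With explicit topics, UnknownTopicOrPartitionError / LeaderNotAvailableError
    # propagate to the caller; with no topics, errors are only logged.
    yield from client.load_metadata_for_topics(*topics)

# loop = asyncio.get_event_loop()
# loop.run_until_complete(refresh_metadata(client, 'my-topic'))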
def fetch_messages(self):
    """
    Sends FetchRequests for all topic/partitions set for consumption
    Returns a generator that yields KafkaMessage structs
    after deserializing with the configured `deserializer_class`

    Refreshes metadata on errors, and resets fetch offset on
    OffsetOutOfRange, per the configured `auto_offset_reset` policy

    Key configuration parameters:
    * `fetch_message_max_bytes`
    * `fetch_max_wait_ms`
    * `fetch_min_bytes`
    * `deserializer_class`
    * `auto_offset_reset`
    """
    max_bytes = self._config['fetch_message_max_bytes']
    max_wait_time = self._config['fetch_wait_max_ms']
    min_bytes = self._config['fetch_min_bytes']

    # Get current fetch offsets
    offsets = self._offsets.fetch
    if not offsets:
        if not self._topics:
            raise KafkaConfigurationError('No topics or partitions configured')
        raise KafkaConfigurationError(
            'No fetch offsets found when calling fetch_messages')

    fetches = []
    for topic_partition, offset in six.iteritems(offsets):
        fetches.append(FetchRequest(topic_partition[0], topic_partition[1],
                                    offset, max_bytes))

    # client.send_fetch_request will collect topic/partition requests by leader
    # and send each group as a single FetchRequest to the correct broker
    try:
        responses = self._client.send_fetch_request(
            fetches,
            max_wait_time=max_wait_time,
            min_bytes=min_bytes,
            fail_on_error=False)
    except FailedPayloadsError:
        logger.warning('FailedPayloadsError attempting to fetch data from kafka')
        self._refresh_metadata_on_error()
        return

    for resp in responses:
        topic_partition = (resp.topic, resp.partition)
        try:
            check_error(resp)
        except OffsetOutOfRangeError:
            logger.warning('OffsetOutOfRange: topic %s, partition %d, offset %d '
                           '(Highwatermark: %d)',
                           resp.topic, resp.partition,
                           offsets[topic_partition], resp.highwaterMark)
            # Reset offset
            self._offsets.fetch[topic_partition] = (
                self._reset_partition_offset(topic_partition))
            continue
        except NotLeaderForPartitionError:
            logger.warning("NotLeaderForPartitionError for %s - %d. "
                           "Metadata may be out of date",
                           resp.topic, resp.partition)
            self._refresh_metadata_on_error()
            continue
        except RequestTimedOutError:
            logger.warning("RequestTimedOutError for %s - %d",
                           resp.topic, resp.partition)
            continue

        # Track server highwater mark
        self._offsets.highwater[topic_partition] = resp.highwaterMark

        # Yield each message
        # Kafka-python could raise an exception during iteration
        # we are not catching -- user will need to address
        for (offset, message) in resp.messages:
            # deserializer_class could raise an exception here
            msg = KafkaMessage(resp.topic, resp.partition, offset,
                               message.key,
                               self._config['deserializer_class'](message.value))

            # Only increment fetch offset if we safely got the message and deserialized
            self._offsets.fetch[topic_partition] = offset + 1

            # Then yield to user
            yield msg
def _fetch(self):
    # Create fetch request payloads for all the partitions
    partitions = dict((p, self.buffer_size)
                      for p in self.fetch_offsets.keys())
    while partitions:
        requests = []
        for partition, buffer_size in six.iteritems(partitions):
            requests.append(FetchRequest(self.topic, partition,
                                         self.fetch_offsets[partition],
                                         buffer_size))
        # Send request
        responses = self.client.send_fetch_request(
            requests,
            max_wait_time=int(self.fetch_max_wait_time),
            min_bytes=self.fetch_min_bytes,
            fail_on_error=False
        )

        retry_partitions = {}
        for resp in responses:

            try:
                check_error(resp)
            except (UnknownTopicOrPartitionError, NotLeaderForPartitionError):
                self.client.reset_topic_metadata(resp.topic)
                raise
            except OffsetOutOfRangeError:
                log.warning("OffsetOutOfRangeError for %s - %d. "
                            "Resetting partition offset...",
                            resp.topic, resp.partition)
                self.reset_partition_offset(resp.partition)
                # Retry this partition
                retry_partitions[resp.partition] = partitions[resp.partition]
                continue

            partition = resp.partition
            buffer_size = partitions[partition]
            try:
                for message in resp.messages:
                    if message.offset < self.fetch_offsets[partition]:
                        log.debug('Skipping message %s because its offset is less than '
                                  'the consumer offset', message)
                        continue
                    # Put the message in our queue
                    self.queue.put((partition, message))
                    self.fetch_offsets[partition] = message.offset + 1
            except ConsumerFetchSizeTooSmall:
                if (self.max_buffer_size is not None and
                        buffer_size == self.max_buffer_size):
                    log.error("Max fetch size %d too small",
                              self.max_buffer_size)
                    raise
                if self.max_buffer_size is None:
                    buffer_size *= 2
                else:
                    buffer_size = min(buffer_size * 2, self.max_buffer_size)
                log.warning("Fetch size too small, increase to %d (2x) "
                            "and retry", buffer_size)
                retry_partitions[partition] = buffer_size
            except ConsumerNoMoreData as e:
                log.debug("Iteration was ended by %r", e)
            except StopIteration:
                # Stop iterating through this partition
                log.debug("Done iterating over partition %s", partition)
        partitions = retry_partitions
def fetch_messages(self):
    """Sends FetchRequests for all topic/partitions set for consumption

    Returns:
        Generator that yields KafkaMessage structs
        after deserializing with the configured `deserializer_class`

    Note:
        Refreshes metadata on errors, and resets fetch offset on
        OffsetOutOfRange, per the configured `auto_offset_reset` policy

    See Also:
        Key KafkaConsumer configuration parameters:
        * `fetch_message_max_bytes`
        * `fetch_max_wait_ms`
        * `fetch_min_bytes`
        * `deserializer_class`
        * `auto_offset_reset`
    """
    max_bytes = self._config['fetch_message_max_bytes']
    max_wait_time = self._config['fetch_wait_max_ms']
    min_bytes = self._config['fetch_min_bytes']

    # Get current fetch offsets
    offsets = self._offsets.fetch
    if not offsets:
        if not self._topics:
            raise KafkaConfigurationError('No topics or partitions configured')
        raise KafkaConfigurationError('No fetch offsets found when calling fetch_messages')

    fetches = []
    for topic_partition, offset in six.iteritems(offsets):
        fetches.append(FetchRequest(topic_partition[0], topic_partition[1],
                                    offset, max_bytes))

    # client.send_fetch_request will collect topic/partition requests by leader
    # and send each group as a single FetchRequest to the correct broker
    try:
        responses = self._client.send_fetch_request(fetches,
                                                    max_wait_time=max_wait_time,
                                                    min_bytes=min_bytes,
                                                    fail_on_error=False)
    except FailedPayloadsError:
        logger.warning('FailedPayloadsError attempting to fetch data from kafka')
        self._refresh_metadata_on_error()
        return

    for resp in responses:
        topic_partition = (resp.topic, resp.partition)
        try:
            check_error(resp)
        except OffsetOutOfRangeError:
            logger.warning('OffsetOutOfRange: topic %s, partition %d, offset %d '
                           '(Highwatermark: %d)',
                           resp.topic, resp.partition,
                           offsets[topic_partition], resp.highwaterMark)
            # Reset offset
            self._offsets.fetch[topic_partition] = self._reset_partition_offset(topic_partition)
            continue
        except NotLeaderForPartitionError:
            logger.warning("NotLeaderForPartitionError for %s - %d. "
                           "Metadata may be out of date",
                           resp.topic, resp.partition)
            self._refresh_metadata_on_error()
            continue
        except RequestTimedOutError:
            logger.warning("RequestTimedOutError for %s - %d",
                           resp.topic, resp.partition)
            continue

        # Track server highwater mark
        self._offsets.highwater[topic_partition] = resp.highwaterMark

        # Yield each message
        # Kafka-python could raise an exception during iteration
        # we are not catching -- user will need to address
        for (offset, message) in resp.messages:
            # deserializer_class could raise an exception here
            msg = KafkaMessage(resp.topic, resp.partition, offset,
                               message.key,
                               self._config['deserializer_class'](message.value))

            if offset < self._offsets.fetch[topic_partition]:
                logger.debug('Skipping message %s because its offset is less than the '
                             'consumer offset', msg)
                continue

            # Only increment fetch offset if we safely got the message and deserialized
            self._offsets.fetch[topic_partition] = offset + 1

            # Then yield to user
            yield msg
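Finally, a hypothetical driver for the fetch_messages() generator variants above; `consumer` and `handle` are stand-ins for a configured consumer instance and user-supplied processing code.

def drain(consumer, handle):
    """Hypothetical driver: pull every currently fetchable message once."""
    for msg in consumer.fetch_messages():
        # msg is a KafkaMessage(topic, partition, offset, key, value);
        # the consumer's fetch offsets advance as messages are yielded.
        handle(msg)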