def test_encode_fetch_request(self):
    """Check the encoded form of a two-payload fetch request.

    Two ``FetchRequest`` payloads are passed to
    ``KafkaProtocol.encode_fetch_request`` and the result is compared with
    a hard-coded reference encoding.
    """
    payloads = [
        FetchRequest("topic1", 0, 10, 1024),
        FetchRequest("topic2", 1, 20, 100),
    ]

    # Reference encoding, kept as adjacent literals so the individual
    # pieces stay readable.
    reference = (
        '\x00\x00\x00Y\x00\x01\x00\x00\x00\x00\x00\x03\x00\x07'
        'client1\xff\xff\xff\xff\x00\x00\x00\x02\x00\x00\x00d\x00'
        '\x00\x00\x02\x00\x06topic1\x00\x00\x00\x01\x00\x00\x00\x00'
        '\x00\x00\x00\x00\x00\x00\x00\n\x00\x00\x04\x00\x00\x06'
        'topic2\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x00\x00'
        '\x00\x00\x14\x00\x00\x00d'
    )

    actual = KafkaProtocol.encode_fetch_request(
        "client1", 3, payloads, 2, 100)

    self.assertEqual(actual, reference)
def __iter_partition__(self, partition, offset):
    """
    Iterate over the messages in a partition. Create a FetchRequest
    to get back a batch of messages, yield them one at a time.
    After a batch is exhausted, start a new batch unless we've reached
    the end of this partition.

    :param partition: partition of ``self.topic`` to read from
    :param offset: offset stored in the consumer for that partition
    :returns: generator yielding the messages of each fetch response

    ``self.fetch_started[partition]`` and ``self.offsets[partition]`` are
    updated as a side effect, before each message is yielded.
    """
    # The offset that is stored in the consumer is the offset that
    # we have consumed. In subsequent iterations, we are supposed to
    # fetch the next message (that is from the next offset)
    # However, for the 0th message, the offset should be as-is.
    # An OffsetFetchRequest to Kafka gives 0 for a new queue. This is
    # problematic, since 0 is offset of a message which we have not yet
    # consumed.
    if self.fetch_started[partition]:
        offset += 1

    fetch_size = self.fetch_min_bytes

    while True:
        # use MaxBytes = client's bufsize since we're only
        # fetching one topic + partition
        req = FetchRequest(
            self.topic, partition, offset, self.client.bufsize)

        (resp,) = self.client.send_fetch_request(
            [req],
            max_wait_time=self.fetch_max_wait_time,
            min_bytes=fetch_size)

        assert resp.topic == self.topic
        assert resp.partition == partition

        next_offset = None
        try:
            for message in resp.messages:
                next_offset = message.offset

                # update the offset before the message is yielded. This
                # is so that the consumer state is not lost in certain
                # cases.
                #
                # For eg: the message is yielded and consumed by the
                # caller, but the caller does not come back into the
                # generator again. The message will be consumed but the
                # status will not be updated in the consumer
                self.fetch_started[partition] = True
                self.offsets[partition] = message.offset
                yield message
        except ConsumerFetchSizeTooSmall:
            # Keep the size an integer (it is sent as a byte count) and
            # make sure it really grows, even for very small values.
            fetch_size = max(fetch_size + 1, int(fetch_size * 1.5))
            log.warning(
                "Fetch size too small, increasing to %d (1.5x) and retry",
                fetch_size)
            # Do not re-deliver anything that was already yielded from
            # this response before the error was raised.
            if next_offset is not None:
                offset = next_offset + 1
            continue
        except ConsumerNoMoreData as e:
            log.debug("Iteration was ended by %r", e)

        # An empty response means there is nothing left to read right now;
        # otherwise resume right after the last message that was yielded.
        if next_offset is None:
            break
        offset = next_offset + 1
def test_consume_none(self):
    """Fetch from offset 0 of partition 0 and expect an empty, error-free reply.

    Written as a ``yield from`` coroutine: the client call is delegated to
    and its single response is unpacked from the result.
    """
    fetch = FetchRequest(self.topic, 0, 0, 1024)

    # Unpacking into a 1-tuple fails if more or fewer than one response
    # comes back.
    (fetch_resp,) = yield from self.client.send_fetch_request([fetch])

    # assertEqual instead of the assertEquals alias, which newer unittest
    # versions no longer provide.
    self.assertEqual(fetch_resp.error, 0)
    self.assertEqual(fetch_resp.topic, self.topic)
    self.assertEqual(fetch_resp.partition, 0)

    messages = list(fetch_resp.messages)
    self.assertEqual(len(messages), 0)
def assert_fetch_offset(self, partition, start_offset, expected_messages):
    """Fetch from ``start_offset`` and verify the returned message values.

    Sends one FetchRequest for ``self.bytes_topic`` / ``partition`` and
    asserts that the response has error code 0, refers to the requested
    partition, carries exactly ``expected_messages`` as message values and
    reports a high-water mark of ``start_offset + len(expected_messages)``.
    """
    fetch = FetchRequest(self.bytes_topic, partition, start_offset, 1024)

    # Exactly one response is expected; the 1-tuple unpacking raises if
    # the server answers with any other number of responses.
    (response,) = self.client.send_fetch_request([fetch])

    self.assertEqual(response.error, 0)
    self.assertEqual(response.partition, partition)

    values = []
    for entry in response.messages:
        values.append(entry.message.value)
    self.assertEqual(values, expected_messages)

    self.assertEqual(response.highwaterMark,
                     start_offset + len(expected_messages))
def test_encode_fetch_request(self):
    """Check the encoded fetch request against a field-by-field expectation.

    The expected bytes are assembled with ``struct.pack``. The two topic
    sections may appear in either order, so both orderings are accepted.
    """
    requests = [
        FetchRequest("topic1", 0, 10, 1024),
        FetchRequest("topic2", 1, 20, 100),
    ]

    def pack_fields(fields):
        # Pack every (format, values) pair and concatenate the pieces
        # in the order given.
        return "".join([struct.pack(fmt, *values) for fmt, values in fields])

    header = pack_fields([
        ('>i', (89,)),              # The length of the message overall
        ('>h', (1,)),               # Msg Header, Message type = Fetch
        ('>h', (0,)),               # Msg Header, API version
        ('>i', (3,)),               # Msg Header, Correlation ID
        ('>h7s', (7, "client1")),   # Msg Header, The client ID
        ('>i', (-1,)),              # Replica Id
        ('>i', (2,)),               # Max wait time
        ('>i', (100,)),             # Min bytes
        ('>i', (2,)),               # Num requests
    ])

    topic1 = pack_fields([
        ('>h6s', (6, 'topic1')),    # Topic
        ('>i', (1,)),               # Num Payloads
        ('>i', (0,)),               # Partition 0
        ('>q', (10,)),              # Offset
        ('>i', (1024,)),            # Max Bytes
    ])

    topic2 = pack_fields([
        ('>h6s', (6, 'topic2')),    # Topic
        ('>i', (1,)),               # Num Payloads
        ('>i', (1,)),               # Partition 1
        ('>q', (20,)),              # Offset
        ('>i', (100,)),             # Max Bytes
    ])

    acceptable = [
        "".join([header, topic1, topic2]),
        "".join([header, topic2, topic1]),
    ]

    encoded = KafkaProtocol.encode_fetch_request(
        "client1", 3, requests, 2, 100)

    self.assertIn(encoded, acceptable)
def get_timestamp(k, p, current):
    """Return one field of the first message fetched at offset ``current``.

    ``k`` is used to send a single FetchRequest for ``p['topic']``
    (UTF-8 encoded) and ``p['partition']``. The value of the first message
    found is parsed as JSON and ``field_name`` is looked up in it; when
    ``in_array`` is truthy the lookup happens inside element
    ``array_index`` of the parsed document. ``in_array``, ``array_index``
    and ``field_name`` are taken from the enclosing scope.

    Returns ``None`` when no response contains a message.
    """
    buffer_size = 1024
    request = FetchRequest(
        p['topic'].encode('utf-8'), p['partition'], current, buffer_size)

    for resp in k.send_fetch_request([request]):
        for message in resp.messages:
            # Only the very first message is inspected.
            record = (json.loads(message.message.value)[array_index]
                      if in_array
                      else json.loads(message.message.value))
            return record[field_name]

    return None
def __iter_partition__(self, partition, offset):
    """
    Generate the messages of one partition, batch by batch.

    Each pass sends a FetchRequest for ``self.topic`` / ``partition`` at
    the current offset and yields the returned messages one at a time.
    A new batch is requested after the previous one is exhausted; the
    generator finishes when a fetch returns no messages.
    """
    # The consumer stores the offset of the last message it consumed, so
    # later iterations must start from the following offset. The very
    # first fetch is the exception: an OffsetFetchRequest reports 0 for a
    # new queue, and offset 0 is a message that has not been consumed yet.
    if self.fetch_started[partition]:
        offset += 1

    while True:
        # TODO: configure fetch size
        request = FetchRequest(self.topic, partition, offset, 1024)
        responses = self.client.send_fetch_request(
            [request],
            max_wait_time=self.fetch_max_wait_time,
            min_bytes=self.fetch_min_bytes)
        (response,) = responses

        assert response.topic == self.topic
        assert response.partition == partition

        last_offset = None
        for message in response.messages:
            last_offset = message.offset
            # Record progress *before* handing the message out. If the
            # caller consumes the message but never resumes this
            # generator, the consumer state still reflects it.
            self.fetch_started[partition] = True
            self.offsets[partition] = message.offset
            yield message

        if last_offset is None:
            # Nothing came back: stop iterating this partition.
            break
        offset = last_offset + 1
def _fetch(self):
    """Fetch messages for every partition and put them on ``self.queue``.

    One FetchRequest per partition in ``self.fetch_offsets`` is sent,
    starting at that partition's stored offset. Each received message is
    queued as ``(partition, message)`` and the partition's fetch offset is
    moved to ``message.offset + 1``.

    When a response raises ``ConsumerFetchSizeTooSmall`` the partition is
    retried with a doubled buffer, capped at ``self.max_buffer_size`` when
    that is set. The loop ends once no partition needs a retry.

    Raises:
        ConsumerFetchSizeTooSmall: re-raised when the buffer already
            equals ``self.max_buffer_size`` and is still too small.
    """
    # Create fetch request payloads for all the partitions
    partitions = dict(
        (p, self.buffer_size) for p in self.fetch_offsets.keys())

    while partitions:
        requests = [
            FetchRequest(self.topic, partition,
                         self.fetch_offsets[partition], buffer_size)
            for partition, buffer_size in six.iteritems(partitions)
        ]

        # Send request
        responses = self.client.send_fetch_request(
            requests,
            max_wait_time=int(self.fetch_max_wait_time),
            min_bytes=self.fetch_min_bytes)

        retry_partitions = {}
        for resp in responses:
            partition = resp.partition
            buffer_size = partitions[partition]
            try:
                for message in resp.messages:
                    # Put the message in our queue
                    self.queue.put((partition, message))
                    self.fetch_offsets[partition] = message.offset + 1
            except ConsumerFetchSizeTooSmall:
                if (self.max_buffer_size is not None and
                        buffer_size == self.max_buffer_size):
                    log.error("Max fetch size %d too small",
                              self.max_buffer_size)
                    raise
                if self.max_buffer_size is None:
                    buffer_size *= 2
                else:
                    buffer_size = min(buffer_size * 2,
                                      self.max_buffer_size)
                # log.warning: the log.warn spelling is a deprecated alias.
                log.warning(
                    "Fetch size too small, increase to %d (2x) "
                    "and retry", buffer_size)
                retry_partitions[partition] = buffer_size
            except ConsumerNoMoreData as e:
                log.debug("Iteration was ended by %r", e)
            except StopIteration:
                # Stop iterating through this partition
                log.debug("Done iterating over partition %s", partition)

        # Only the partitions whose buffer was too small go another round.
        partitions = retry_partitions
def fetch_messages(self):
    """Send FetchRequests for all topic/partitions set for consumption.

    Returns:
        Generator that yields KafkaMessage structs after deserializing
        each message value with the configured `deserializer_class`.

    Raises:
        KafkaConfigurationError: if no topics are configured or no fetch
            offsets are available.

    Note:
        Metadata is refreshed on FailedPayloadsError and
        NotLeaderForPartitionError responses. On OffsetOutOfRangeError the
        fetch offset is replaced by the result of
        `_reset_partition_offset`. RequestTimedOutError responses are
        logged and skipped.

    Configuration keys read here:
        * `fetch_message_max_bytes`
        * `fetch_wait_max_ms`
        * `fetch_min_bytes`
        * `deserializer_class`
    """
    max_bytes = self._config['fetch_message_max_bytes']
    max_wait_time = self._config['fetch_wait_max_ms']
    min_bytes = self._config['fetch_min_bytes']

    if not self._topics:
        raise KafkaConfigurationError('No topics or partitions configured')

    if not self._offsets.fetch:
        raise KafkaConfigurationError(
            'No fetch offsets found when calling fetch_messages')

    payloads = []
    for (topic, partition) in self._topics:
        start = self._offsets.fetch[(topic, partition)]
        payloads.append(FetchRequest(topic, partition, start, max_bytes))

    # send_fetch_request will batch topic/partition requests by leader
    responses = self._client.send_fetch_request(
        payloads,
        max_wait_time=max_wait_time,
        min_bytes=min_bytes,
        fail_on_error=False)

    for resp in responses:
        # With fail_on_error=False a failed payload shows up as an
        # exception instance in the response list.
        if isinstance(resp, FailedPayloadsError):
            logger.warning('FailedPayloadsError attempting to fetch data')
            self._refresh_metadata_on_error()
            continue

        topic = kafka_bytestring(resp.topic)
        partition = resp.partition
        key = (topic, partition)

        try:
            check_error(resp)
        except OffsetOutOfRangeError:
            logger.warning(
                'OffsetOutOfRange: topic %s, partition %d, '
                'offset %d (Highwatermark: %d)',
                topic, partition,
                self._offsets.fetch[key], resp.highwaterMark)
            # Reset offset
            self._offsets.fetch[key] = self._reset_partition_offset(key)
            continue
        except NotLeaderForPartitionError:
            logger.warning(
                "NotLeaderForPartitionError for %s - %d. "
                "Metadata may be out of date",
                topic, partition)
            self._refresh_metadata_on_error()
            continue
        except RequestTimedOutError:
            logger.warning("RequestTimedOutError for %s - %d",
                           topic, partition)
            continue

        # Track server highwater mark
        self._offsets.highwater[key] = resp.highwaterMark

        # Iterating resp.messages may itself raise; that is deliberately
        # not caught here and is left for the caller to handle.
        for (offset, message) in resp.messages:
            # deserializer_class could raise an exception here
            val = self._config['deserializer_class'](message.value)
            msg = KafkaMessage(topic, partition, offset, message.key, val)

            # The server can return messages earlier than the requested
            # offset; those are skipped per the kafka spec.
            if offset < self._offsets.fetch[key]:
                logger.debug(
                    'message offset less than fetched offset '
                    'skipping: %s', msg)
                continue

            # Advance the fetch offset only once the message has been
            # received and deserialized, then hand it to the user.
            self._offsets.fetch[key] = offset + 1
            yield msg
def fetch_messages(self):
    """
    Sends FetchRequests for all topic/partitions set for consumption

    Returns a generator that yields KafkaMessage structs
    after deserializing with the configured `deserializer_class`

    Refreshes metadata on FailedPayloadsError and
    NotLeaderForPartitionError, and on OffsetOutOfRange replaces the fetch
    offset with the result of `_reset_partition_offset`.

    Messages whose offset is lower than the current fetch offset of their
    topic/partition are skipped, so a message is not yielded again and the
    fetch offset never moves backwards.

    Raises KafkaConfigurationError when there are no fetch offsets.

    Configuration keys read here:

    * `fetch_message_max_bytes`
    * `fetch_wait_max_ms`
    * `fetch_min_bytes`
    * `deserializer_class`
    """
    max_bytes = self._config['fetch_message_max_bytes']
    max_wait_time = self._config['fetch_wait_max_ms']
    min_bytes = self._config['fetch_min_bytes']

    # Get current fetch offsets
    offsets = self._offsets.fetch
    if not offsets:
        if not self._topics:
            raise KafkaConfigurationError(
                'No topics or partitions configured')
        raise KafkaConfigurationError(
            'No fetch offsets found when calling fetch_messages')

    fetches = [
        FetchRequest(topic_partition[0], topic_partition[1],
                     offset, max_bytes)
        for topic_partition, offset in six.iteritems(offsets)
    ]

    # client.send_fetch_request will collect topic/partition requests by
    # leader and send each group as a single FetchRequest to the correct
    # broker
    try:
        responses = self._client.send_fetch_request(
            fetches,
            max_wait_time=max_wait_time,
            min_bytes=min_bytes,
            fail_on_error=False)
    except FailedPayloadsError:
        logger.warning(
            'FailedPayloadsError attempting to fetch data from kafka')
        self._refresh_metadata_on_error()
        return

    for resp in responses:
        topic_partition = (resp.topic, resp.partition)
        try:
            check_error(resp)
        except OffsetOutOfRangeError:
            logger.warning(
                'OffsetOutOfRange: topic %s, partition %d, offset %d '
                '(Highwatermark: %d)',
                resp.topic, resp.partition,
                offsets[topic_partition], resp.highwaterMark)
            # Reset offset
            self._offsets.fetch[topic_partition] = (
                self._reset_partition_offset(topic_partition))
            continue
        except NotLeaderForPartitionError:
            logger.warning(
                "NotLeaderForPartitionError for %s - %d. "
                "Metadata may be out of date",
                resp.topic, resp.partition)
            self._refresh_metadata_on_error()
            continue
        except RequestTimedOutError:
            logger.warning("RequestTimedOutError for %s - %d",
                           resp.topic, resp.partition)
            continue

        # Track server highwater mark
        self._offsets.highwater[topic_partition] = resp.highwaterMark

        # Yield each message
        # Kafka-python could raise an exception during iteration
        # we are not catching -- user will need to address
        for (offset, message) in resp.messages:
            # The server may return messages from before the requested
            # offset; skip them instead of yielding them again. The
            # lookup is done per message because the fetch offset changes
            # as messages are yielded.
            wanted = self._offsets.fetch.get(topic_partition)
            if wanted is not None and offset < wanted:
                logger.debug(
                    'message offset %d less than fetched offset %d '
                    'for %s - %d, skipping',
                    offset, wanted, resp.topic, resp.partition)
                continue

            # deserializer_class could raise an exception here
            msg = KafkaMessage(
                resp.topic, resp.partition, offset, message.key,
                self._config['deserializer_class'](message.value))

            # Only increment fetch offset if we safely got the message
            # and deserialized
            self._offsets.fetch[topic_partition] = offset + 1

            # Then yield to user
            yield msg
def test_fetch_request(self):
    """Encode a single FetchRequest and compare with the reference bytes."""
    req = FetchRequest("my-topic", 0, 0, 1024)
    enc = KafkaClient.encode_fetch_request(req)
    expect = "\x00\x01\x00\x08my-topic\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00"
    # assertEqual instead of the assertEquals alias, which newer unittest
    # versions no longer provide.
    self.assertEqual(enc, expect)
def _fetch(self):
    """Fetch messages for every partition and put them on ``self.queue``.

    One FetchRequest per partition in ``self.fetch_offsets`` is sent with
    ``fail_on_error=False`` and each response is checked individually:

    * UnknownTopicOrPartitionError / NotLeaderForPartitionError: the
      topic metadata is reset on the client and the error is re-raised.
    * OffsetOutOfRangeError: the partition offset is reset and the
      partition is retried with the same buffer size.
    * ConsumerFetchSizeTooSmall while reading messages: the partition is
      retried with a doubled buffer, capped at ``self.max_buffer_size``
      when that is set; re-raised if the buffer already equals that cap.

    Messages with an offset below the partition's current fetch offset
    are skipped. Every other message is queued as ``(partition, message)``
    and the fetch offset is moved to ``message.offset + 1``. The loop ends
    once no partition needs a retry.
    """
    # Create fetch request payloads for all the partitions
    partitions = dict(
        (p, self.buffer_size) for p in self.fetch_offsets.keys())

    while partitions:
        requests = [
            FetchRequest(self.topic, partition,
                         self.fetch_offsets[partition], buffer_size)
            for partition, buffer_size in six.iteritems(partitions)
        ]

        # Send request
        responses = self.client.send_fetch_request(
            requests,
            max_wait_time=int(self.fetch_max_wait_time),
            min_bytes=self.fetch_min_bytes,
            fail_on_error=False
        )

        retry_partitions = {}
        for resp in responses:
            try:
                check_error(resp)
            except (UnknownTopicOrPartitionError,
                    NotLeaderForPartitionError):
                self.client.reset_topic_metadata(resp.topic)
                raise
            except OffsetOutOfRangeError:
                log.warning("OffsetOutOfRangeError for %s - %d. "
                            "Resetting partition offset...",
                            resp.topic, resp.partition)
                self.reset_partition_offset(resp.partition)
                # Retry this partition
                retry_partitions[resp.partition] = partitions[resp.partition]
                continue

            partition = resp.partition
            buffer_size = partitions[partition]
            try:
                for message in resp.messages:
                    if message.offset < self.fetch_offsets[partition]:
                        log.debug('Skipping message %s because its offset is less than the consumer offset',
                                  message)
                        continue
                    # Put the message in our queue
                    self.queue.put((partition, message))
                    self.fetch_offsets[partition] = message.offset + 1
            except ConsumerFetchSizeTooSmall:
                if (self.max_buffer_size is not None and
                        buffer_size == self.max_buffer_size):
                    log.error("Max fetch size %d too small",
                              self.max_buffer_size)
                    raise
                if self.max_buffer_size is None:
                    buffer_size *= 2
                else:
                    buffer_size = min(buffer_size * 2,
                                      self.max_buffer_size)
                # log.warning, matching the OffsetOutOfRange branch above;
                # the log.warn spelling is a deprecated alias.
                log.warning("Fetch size too small, increase to %d (2x) "
                            "and retry", buffer_size)
                retry_partitions[partition] = buffer_size
            except ConsumerNoMoreData as e:
                log.debug("Iteration was ended by %r", e)
            except StopIteration:
                # Stop iterating through this partition
                log.debug("Done iterating over partition %s", partition)

        # Only partitions that asked for a retry go another round.
        partitions = retry_partitions