def test_consumer_commit_offsets(self):
    # Start off by sending messages before the consumer is started
    yield self.send_messages(self.partition, range(0, 100))

    # Create a consumer, allow commit, disable auto-commit
    consumer = self.consumer(consumer_group=self.id(),
                             auto_commit_every_n=0,
                             auto_commit_every_ms=0)

    # Check for messages on the processor
    self.assertFalse(consumer.processor._messages)

    # Start the consumer from the beginning
    start_d = consumer.start(OFFSET_EARLIEST)

    # Send some more messages
    yield self.send_messages(self.partition, range(100, 200))

    # Loop waiting for all the messages to show up
    while len(consumer.processor._messages) < 200:
        # Wait a bit for them to arrive
        yield async_delay()

    # Make sure we got all 200
    self.assertEqual(len(consumer.processor._messages), 200)

    # Stop the consumer
    consumer.stop()
    self.successResultOf(start_d)

    # Commit the offsets
    yield consumer.commit()

    # Send some more messages
    last_batch = yield self.send_messages(self.partition, range(200, 300))

    # Create another consumer
    consumer2 = self.consumer(consumer_group=self.id(),
                              auto_commit_every_n=0,
                              auto_commit_every_ms=0)

    # Start it at the last offset for the group
    start_d2 = consumer2.start(OFFSET_COMMITTED)

    # Loop waiting for all the messages to show up
    while len(consumer2.processor._messages) < 100:
        # Wait a bit for them to arrive
        yield async_delay()

    # Make sure we got all 100, and the right 100
    self.assertEqual(len(consumer2.processor._messages), 100)
    self.assertEqual(
        last_batch,
        [x.message.value for x in consumer2.processor._messages])

    # Stop the consumer
    consumer2.stop()
    self.successResultOf(start_d2)
def test_consumer_restart(self):
    sent_messages = yield self.send_messages(self.partition, range(0, 100))

    # Create & start our default consumer (auto-commit)
    consumer = self.consumer()

    # Check for messages on the processor
    self.assertFalse(consumer.processor._messages)

    # Start the consumer from the beginning
    start_d = consumer.start(OFFSET_EARLIEST)

    # Send some more messages
    sent_messages += yield self.send_messages(
        self.partition, range(100, 200))

    # Loop waiting for all the messages to show up
    while len(consumer.processor._messages) < 200:
        # Wait a bit for them to arrive
        yield async_delay()

    # Make sure we got all 200
    self.assertEqual(len(consumer.processor._messages), 200)

    # Stop the consumer and record offset at which to restart (next after
    # last processed message offset)
    offset = consumer.stop() + 1
    self.successResultOf(start_d)

    # Send some more messages
    sent_messages += yield self.send_messages(
        self.partition, range(200, 250))

    # Restart the consumer at the returned offset
    start_d2 = consumer.start(offset)

    # Loop waiting for the new messages
    while len(consumer.processor._messages) < 250:
        # Wait a bit for them to arrive
        yield async_delay()

    # make sure we got them all
    self.assert_message_count(consumer.processor._messages, 250)
    expected_messages = set(sent_messages)
    actual_messages = set(
        [x.message.value for x in consumer.processor._messages])
    self.assertEqual(expected_messages, actual_messages)

    # Clean up
    consumer.stop()
    self.successResultOf(start_d2)
def test_huge_messages(self):
    # Produce 10 "normal" size messages
    yield self.send_messages(0, [str(x) for x in range(10)])

    # Setup a max buffer size for the consumer, and put a message in
    # Kafka that's bigger than that
    MAX_FETCH_BUFFER_SIZE_BYTES = (256 * 1024) - 10
    huge_message, = yield self.send_messages(
        0, [random_string(MAX_FETCH_BUFFER_SIZE_BYTES + 10)])

    # Create a consumer with the (smallish) max buffer size
    consumer = self.consumer(max_buffer_size=MAX_FETCH_BUFFER_SIZE_BYTES)

    # This consumer fails to get the message, and errbacks the start
    # deferred
    d = consumer.start(OFFSET_EARLIEST)

    # Loop waiting for the errback to be called
    while not d.called:
        # Wait a bit for it to arrive
        yield async_delay()

    # Make sure the failure is as expected
    self.failureResultOf(d, ConsumerFetchSizeTooSmall)

    # Make sure the smaller, earlier messages were delivered
    self.assert_message_count(consumer.processor._messages, 10)

    # last offset seen
    last_offset = consumer.processor._messages[-1].offset

    # Stop the consumer: d already errbacked, but stop still must be called
    consumer.stop()

    # Create a consumer with no fetch size limit
    big_consumer = self.consumer()

    # Start just past the last message processed
    d = big_consumer.start(last_offset + 1)

    # Consume giant message successfully
    while not big_consumer.processor._messages:
        # Wait a bit for it to arrive
        yield async_delay()

    self.assertEqual(big_consumer.processor._messages[0].message.value,
                     huge_message)

    # Clean up
    big_consumer.stop()
    self.successResultOf(d)
def test_large_messages(self):
    # Produce 10 "normal" size messages
    small_messages = yield self.send_messages(
        0, [str(x) for x in range(10)])

    # Produce 10 messages that are large (bigger than default fetch size)
    large_messages = yield self.send_messages(
        partition=0,
        messages=[random_string(FETCH_BUFFER_SIZE_BYTES * 3)
                  for x in range(10)],
    )

    # Consumer should still get all of them
    consumer = self.consumer()

    # Start the consumer from the beginning
    d = consumer.start(OFFSET_EARLIEST)

    # Loop waiting for all the messages to show up
    while len(consumer.processor._messages) < 20:
        # Wait a bit for them to arrive
        yield async_delay()

    expected_messages = set(small_messages + large_messages)
    actual_messages = set(
        [x.message.value for x in consumer.processor._messages])
    self.assertEqual(expected_messages, actual_messages)

    # Clean up
    consumer.stop()
    self.successResultOf(d)
def _count_messages(self, topic):
    log.debug("Counting messages on topic %s", topic)
    messages = []
    client = KafkaClient(self.harness.bootstrap_hosts,
                         clientId="CountMessages", timeout=500,
                         reactor=self.reactor)

    try:
        yield ensure_topic_creation(client, topic, fully_replicated=False)

        # Need to retry this until we have a leader...
        while True:
            # Ask the client to load the latest metadata. This may avoid a
            # NotLeaderForPartitionError I was seeing upon re-start of the
            # broker.
            yield client.load_metadata_for_topics(topic)
            # if there is an error on the metadata for the topic, wait
            errno = client.metadata_error_for_topic(topic)
            if errno == 0:
                break
            else:
                log.debug("Topic %s in error errno=%d", topic, errno)
                yield async_delay(1.0)

        # Ok, should be safe to get the partitions now...
        partitions = client.topic_partitions[topic]

        requests = [
            FetchRequest(topic, part, 0, 1024 * 1024)
            for part in partitions
        ]
        resps = []
        while not resps:
            try:
                log.debug("_count_message: Fetching messages")
                resps = yield client.send_fetch_request(
                    requests, max_wait_time=400)
            except (NotLeaderForPartitionError,
                    UnknownTopicOrPartitionError,
                    KafkaUnavailableError):  # pragma: no cover
                log.debug("_count_message: Metadata err, retrying...")
                yield client.load_metadata_for_topics(topic)
            except FailedPayloadsError as e:  # pragma: no cover
                if not e.args[1][0][1].check(RequestTimedOutError):
                    raise
                log.debug("_count_message: Timed out err, retrying...")
    finally:
        yield client.close()

    for fetch_resp in resps:
        messages.extend(list(fetch_resp.messages))

    log.debug("Got %d messages: %r", len(messages), messages)

    returnValue(len(messages))
def wait_for_assignments(topic, partitions, members):
    deadline = Deadline()
    while True:
        try:
            assert_assignments(
                topic,
                partitions,
                members,
            )
            break
        except AssertionError:
            deadline.check()
            yield async_delay(0.5)
def test_switch_leader(self):
    """
    Produce messages while killing the lead broker for the partition.

    Note that in order to avoid loss of acknowledged writes the producer
    must request acks of -1 (`afkak.common.PRODUCER_ACK_ALL_REPLICAS`).
    """
    producer = Producer(
        self.client,
        req_acks=PRODUCER_ACK_ALL_REPLICAS,
        max_req_attempts=100,
    )
    topic = self.topic
    try:
        for index in range(1, 3):
            # cause the client to establish connections to all the brokers
            log.debug("Pass: %d. Sending 10 random messages", index)
            yield self._send_random_messages(producer, topic, 10)

            # kill leader for partition 0
            log.debug("Killing leader of partition 0")
            broker, kill_time = self._kill_leader(topic, 0)

            log.debug("Sending 1 more message: 'part 1'")
            yield producer.send_messages(topic, msgs=[b'part 1'])
            log.debug("Sending 1 more message: 'part 2'")
            yield producer.send_messages(topic, msgs=[b'part 2'])

            # send to new leader
            log.debug("Sending 10 more messages")
            yield self._send_random_messages(producer, topic, 10)

            # Make sure the ZK ephemeral time (~6 seconds) has elapsed
            wait_time = (kill_time + 6.5) - time.time()
            if wait_time > 0:
                log.debug("Waiting: %4.2f for ZK timeout", wait_time)
                yield async_delay(wait_time)

            # restart the kafka broker
            log.debug("Restarting leader broker %r", broker)
            broker.restart()

            # count number of messages
            log.debug("Getting message count")
            count = yield self._count_messages(topic)
            self.assertGreaterEqual(count, 22 * index)
    finally:
        log.debug("Stopping the producer")
        yield producer.stop()
        log.debug("Producer stopped")

    log.debug("Test complete.")
def test_consumer(self):
    yield self.send_messages(self.partition, range(0, 100))

    # Create a consumer.
    consumer = self.consumer()

    # Check for messages on the processor
    self.assertFalse(consumer.processor._messages)

    # Start the consumer from the beginning
    start_d = consumer.start(OFFSET_EARLIEST)

    # Send some more messages
    yield self.send_messages(self.partition, range(100, 200))

    # Loop waiting for all the messages to show up
    while len(consumer.processor._messages) < 200:
        # Wait a bit for them to arrive
        yield async_delay()

    # Make sure we got all 200
    self.assertEqual(len(consumer.processor._messages), 200)

    # Send some more messages
    yield self.send_messages(self.partition, range(200, 250))

    # Loop waiting for the new messages
    while len(consumer.processor._messages) < 250:
        # Wait a bit for them to arrive
        yield async_delay()

    # make sure we got them all
    self.assert_message_count(consumer.processor._messages, 250)

    # Clean up
    consumer.stop()
    self.successResultOf(start_d)
def ensure_topic_creation(client, topic_name, fully_replicated=True,
                          timeout=5):
    '''
    With the default Kafka configuration, just querying for the metadata
    for a particular topic will auto-create that topic.

    :param client: `afkak.client.KafkaClient` instance

    :param str topic_name: Topic name

    :param bool fully_replicated:
        If ``True``, check whether all partitions for the topic have been
        assigned brokers. This doesn't ensure that producing to the topic
        will succeed, though: there is a window after the partition is
        assigned before the broker can actually accept writes. In this case
        the broker will respond with a retriable error (see
        `IntegrationMixin.retry_while_broker_errors()`).

        If ``False``, only check that any metadata exists for the topic.

    :param timeout: Number of seconds to wait.
    '''
    start_time = time.time()
    if fully_replicated:
        check_func = client.topic_fully_replicated
    else:
        check_func = client.has_metadata_for_topic
    yield client.load_metadata_for_topics(topic_name)

    def topic_info():
        if topic_name in client.topic_partitions:
            return "Topic {} exists. Partition metadata: {}".format(
                topic_name,
                pformat([client.partition_meta[TopicAndPartition(topic_name, part)]
                         for part in client.topic_partitions[topic_name]]),
            )
        else:
            return "No metadata for topic {} found.".format(topic_name)

    while not check_func(topic_name):
        yield async_delay(clock=client.reactor)
        if time.time() > start_time + timeout:
            raise Exception((
                "Timed out waiting for topic {} creation after {} seconds. {}"
            ).format(topic_name, timeout, topic_info()))
        else:
            log.debug('Still waiting for topic creation: %s.', topic_info())
        yield client.load_metadata_for_topics(topic_name)
    log.info('%s', topic_info())
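# Usage sketch (illustrative, not from the original file): assuming
# ensure_topic_creation() is wrapped with @inlineCallbacks so it returns a
# Deferred, a test fixture might wait for a newly created topic to become
# fully writable before producing to it. The client construction mirrors
# _count_messages() above; the clientId and 'my_topic' name are placeholders.
#
#     client = KafkaClient(self.harness.bootstrap_hosts,
#                          clientId="TopicWait", reactor=self.reactor)
#     try:
#         yield ensure_topic_creation(client, 'my_topic',
#                                     fully_replicated=True, timeout=10)
#     finally:
#         yield client.close()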
def test_producer_batched_gzipped_hashed_partitioner(self):
    start_offset0 = yield self.current_offset(self.topic, 0)
    start_offset1 = yield self.current_offset(self.topic, 1)
    offsets = (start_offset0, start_offset1)

    requests = []
    msgs_by_partition = ([], [])
    keys_by_partition = ([], [])

    partitioner = HashedPartitioner(self.topic, [0, 1])

    producer = Producer(
        self.client, codec=CODEC_GZIP, batch_send=True,
        batch_every_n=100, batch_every_t=None,
        partitioner_class=HashedPartitioner)

    # Send ten groups of messages, each with a different key
    for i in range(10):
        msg_group = []
        key = 'Key: {}'.format(i).encode()
        part = partitioner.partition(key, [0, 1])
        for j in range(10):
            msg = self.msg('Group:{} Msg:{}'.format(i, j))
            msg_group.append(msg)
            msgs_by_partition[part].append(msg)
            keys_by_partition[part].append(key)
        request = producer.send_messages(
            self.topic, key=key, msgs=msg_group)
        requests.append(request)
        yield async_delay(.5)  # Make the NoResult test have teeth...
        if i < 9:
            # This is to ensure we really are batching all the requests
            self.assertNoResult(request)

    # Now ensure we can retrieve the right messages from each partition
    for part in [0, 1]:
        yield self.assert_fetch_offset(
            part, offsets[part], msgs_by_partition[part],
            keys_by_partition[part], fetch_size=20480)

    yield producer.stop()
def retry_while_broker_errors(self, f, *a, **kw):
    """
    Call a function, retrying on retriable broker errors.

    If calling the function fails with one of these exception types it is
    called again after a short delay:

    * `afkak.common.RetriableBrokerResponseError` (or a subclass thereof)
    * `afkak.common.PartitionUnavailableError`

    The net effect is to keep trying until topic auto-creation completes.

    :param f: callable, which may return a `Deferred`
    :param a: arbitrary positional arguments
    :param kw: arbitrary keyword arguments
    """
    while True:
        try:
            returnValue((yield f(*a, **kw)))
            break
        except (RetriableBrokerResponseError, PartitionUnavailableError):
            yield async_delay(0.1, clock=self.reactor)
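# Usage sketch (illustrative, not from the original file): a test method
# could route its first produce against a freshly auto-created topic through
# retry_while_broker_errors() so that retriable errors raised while the
# partitions come up are absorbed rather than failing the test. The
# send_messages()/self.partition attributes mirror the consumer tests above.
#
#     sent = yield self.retry_while_broker_errors(
#         self.send_messages, self.partition, range(0, 10))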
def test_producer_batched_by_time(self):
    start_offset0 = yield self.current_offset(self.topic, 0)
    start_offset1 = yield self.current_offset(self.topic, 1)

    # This needs to be big enough that the operations between starting the
    # producer and the sleep take less time than this... I made it large
    # enough that the test would still pass even with my Macbook's cores
    # all pegged by a load generator.
    batchtime = 5

    try:
        producer = Producer(self.client, batch_send=True,
                            batch_every_n=0, batch_every_t=batchtime)
        startTime = time.time()
        # Send 4 messages and do a fetch
        send1D = producer.send_messages(self.topic, msgs=[
            self.msg("one"), self.msg("two"),
            self.msg("three"), self.msg("four")])
        # Tell assert_fetch_offset() to wait 0.1 secs on the server side
        # before returning no result, so these two calls should take about
        # 0.2 sec
        yield self.assert_fetch_offset(0, start_offset0, [], max_wait=0.1)
        yield self.assert_fetch_offset(1, start_offset1, [], max_wait=0.1)
        # Messages shouldn't have been sent out yet, so we shouldn't have a
        # response from the server yet for the send request
        self.assertNoResult(send1D)
        # Sending 3 more messages should NOT trigger the send, as less than
        # the batch time has elapsed by here, so send2D should still have
        # no result.
        send2D = producer.send_messages(
            self.topic,
            msgs=[self.msg("five"), self.msg("six"), self.msg("seven")])
        # still no messages...
        yield self.assert_fetch_offset(0, start_offset0, [], max_wait=0.1)
        yield self.assert_fetch_offset(1, start_offset1, [], max_wait=0.1)
        # Still no result on send, and send should NOT have gone out.
        self.assertNoResult(send2D)
        # Wait out the timeout. It'd be nicer to be able to just 'advance'
        # the reactor, but since we need the real network we have to really
        # wait...
        yield async_delay(
            batchtime - (time.time() - startTime) + 0.05,
            clock=self.reactor)
        # We need to yield to the reactor to have it process the response
        # from the broker. Both send1D and send2D should then have results.
        resp1 = yield send1D
        resp2 = yield send2D
        # ensure the 2 batches went into the proper partitions...
        self.assert_produce_response(resp1, start_offset0)
        self.assert_produce_response(resp2, start_offset1)
        # Should be able to get messages now
        yield self.assert_fetch_offset(
            0, start_offset0, [self.msg("one"), self.msg("two"),
                               self.msg("three"), self.msg("four")])
        yield self.assert_fetch_offset(
            1, start_offset1,
            [self.msg("five"), self.msg("six"), self.msg("seven")])
    finally:
        yield producer.stop()
def test_consumer_rejoin(self):
    """
    trigger a rejoin via consumer commit failure
    """
    group = 'rejoin_group'
    self.client2 = KafkaClient(self.harness.bootstrap_hosts,
                               clientId=self.topic + '2')
    self.addCleanup(self.client2.close)

    record_stream = DeferredQueue(backlog=1)

    def processor(consumer, records):
        log.debug('processor(%r, %r)', consumer, records)
        record_stream.put(records)

    coord = ConsumerGroup(
        self.client,
        group,
        topics=[self.topic],
        processor=processor,
        session_timeout_ms=6000,
        retry_backoff_ms=100,
        heartbeat_interval_ms=1000,
        fatal_backoff_ms=3000,
        consumer_kwargs=dict(auto_commit_every_ms=1000),
    )
    coord_start_d = coord.start()
    self.addCleanup(coord.stop)

    # FIXME: This doesn't seem to get fired reliably.
    coord_start_d
    # self.addCleanup(lambda: coord_start_d)

    yield wait_for_assignments(self.topic, self.num_partitions, [coord])

    # kill the heartbeat timer and start joining the second consumer
    while True:
        if coord._heartbeat_looper.running:
            coord._heartbeat_looper.stop()
            break
        else:
            yield async_delay()

    coord2 = ConsumerGroup(
        self.client2,
        group,
        topics=[self.topic],
        processor=processor,
        session_timeout_ms=6000,
        retry_backoff_ms=100,
        heartbeat_interval_ms=1000,
        fatal_backoff_ms=3000,
        consumer_kwargs=dict(auto_commit_every_ms=1000),
    )
    coord2_start_d = coord2.start()
    self.addCleanup(coord2.stop)

    # FIXME: This doesn't seem to get fired reliably.
    coord2_start_d
    # self.addCleanup(lambda: coord2_start_d)

    # send some messages and see that they're processed
    # the commit will eventually fail because we're rebalancing
    for part in range(15):
        yield async_delay()
        values = yield self.send_messages(part % self.num_partitions, [part])
        msgs = yield record_stream.get()
        if msgs[0].partition != part:
            # once the commit fails, we will see the msg twice
            break
        self.assertEqual(msgs[0].message.value, values[0])

    yield wait_for_assignments(self.topic, self.num_partitions,
                               [coord, coord2])

    # Once assignments have been received we need to ensure that the record
    # stream is clear of any duplicate messages. We do this by producing
    # a sentinel to each partition and consuming messages from the stream
    # until all the sentinels have appeared at least once. At that point
    # any churn should have cleared up and we can depend on lock-step
    # delivery.
    pending_sentinels = {}
    for part in range(self.num_partitions):
        [value] = yield self.send_messages(part, ['sentinel'])
        pending_sentinels[part] = value
    while pending_sentinels:
        [message] = yield record_stream.get()
        if pending_sentinels.get(message.partition) == message.message.value:
            del pending_sentinels[message.partition]

    # after the cluster has re-formed, send some more messages
    # and check that we get them too (and don't get the old messages again)
    record_stream = DeferredQueue(backlog=1)
    for part in range(self.num_partitions):
        yield async_delay()
        [value] = yield self.send_messages(part, [part])
        log.debug('waiting for messages from partition %d', part)
        [message] = yield record_stream.get()
        self.assertEqual(message.partition, part)
        self.assertEqual(message.message.value, value)
def test_throughput(self):
    # Flag to shutdown
    keep_running = True
    # Count of messages sent
    sent_msgs_count = [0]
    total_messages_size = [0]

    # setup MESSAGE_BLOCK_SIZE x 1024-ish byte messages to send over and over
    constant_messages = [
        self.msg(s)
        for s in [random_string(1024) for x in range(MESSAGE_BLOCK_SIZE)]
    ]
    large_messages = [
        self.msg(s)
        for s in [
            random_string(FETCH_BUFFER_SIZE_BYTES * 3)
            for x in range(MESSAGE_BLOCK_SIZE)
        ]
    ]
    constant_messages_size = len(constant_messages[0]) * MESSAGE_BLOCK_SIZE
    large_messages_size = len(large_messages[0]) * MESSAGE_BLOCK_SIZE

    # Create a producer and send some messages
    producer = Producer(self.client)

    # Create consumers (1/partition)
    consumers = [
        self.consumer(partition=p, fetch_max_wait_time=50)
        for p in range(PARTITION_COUNT)
    ]

    def log_error(failure):
        log.exception("Failure sending messages: %r",
                      failure)  # pragma: no cover

    def sent_msgs(resps):
        log.info("Messages Sent: %r", resps)
        sent_msgs_count[0] += MESSAGE_BLOCK_SIZE
        return resps

    def send_msgs():
        # randomly, 1/20 of the time, send large messages
        if randint(0, 19):
            messages = constant_messages
            large = ''
            total_messages_size[0] += constant_messages_size
        else:
            messages = large_messages
            large = ' large'
            total_messages_size[0] += large_messages_size
        log.info("Sending: %d%s messages", len(messages), large)
        d = producer.send_messages(self.topic, msgs=messages)
        # As soon as we get a response from the broker, count them
        # and if we're still supposed to, send more
        d.addCallback(sent_msgs)
        if keep_running:
            d.addCallback(lambda _: self.reactor.callLater(0, send_msgs))
            # d.addCallback(lambda _: send_msgs())
        d.addErrback(log_error)

    # Start sending messages, MESSAGE_BLOCK_SIZE at a time, 1K or 384K each
    send_msgs()

    # Start the consumers from the beginning
    fetch_start = time.time()
    start_ds = [consumer.start(OFFSET_EARLIEST) for consumer in consumers]

    # Let them all run for awhile...
    log.info("Waiting %d seconds...", PRODUCE_TIME)
    yield async_delay(PRODUCE_TIME)

    # Tell the producer to stop
    keep_running = False

    # Wait up to PRODUCE_TIME for the consumers to catch up
    log.info("Waiting up to %d seconds for consumers to finish consuming...",
             PRODUCE_TIME)
    deadline = time.time() + PRODUCE_TIME * 2
    while time.time() < deadline:
        consumed = sum(
            [len(consumer.processor._messages) for consumer in consumers])
        log.debug("Consumed %d messages.", consumed)
        if sent_msgs_count[0] == consumed:
            break
        yield async_delay(1)

    fetch_time = time.time() - fetch_start
    consumed_bytes = sum(
        [c.processor._messages_bytes[0] for c in consumers])

    result_msg = ("Sent: {} messages ({:,} total bytes) in ~{} seconds"
                  " ({}/sec), Consumed: {} in {:.2f} seconds.").format(
                      sent_msgs_count[0], total_messages_size[0],
                      PRODUCE_TIME, sent_msgs_count[0] / PRODUCE_TIME,
                      consumed, fetch_time)
    # Log the result, and print to stderr to get around nose capture
    log.info(result_msg)
    print("\n\t Performance Data: " + result_msg, file=sys.stderr)

    # And print data as stats
    stat('Production_Time', PRODUCE_TIME)
    stat('Consumption_Time', fetch_time)
    stat('Messages_Produced', sent_msgs_count[0])
    stat('Messages_Consumed', consumed)
    stat('Messages_Bytes_Produced', total_messages_size[0])
    stat('Messages_Bytes_Consumed', consumed_bytes)
    stat('Messages_Produced_Per_Second', sent_msgs_count[0] / PRODUCE_TIME)
    stat('Messages_Consumed_Per_Second', consumed / fetch_time)
    stat('Message_Bytes_Produced_Per_Second',
         total_messages_size[0] / PRODUCE_TIME)
    stat('Message_Bytes_Consumed_Per_Second', consumed_bytes / fetch_time)

    # Clean up
    log.debug('Stopping producer: %r', producer)
    yield producer.stop()
    log.debug('Stopping consumers: %r', consumers)
    for consumer in consumers:
        consumer.stop()
    for start_d in start_ds:
        self.successResultOf(start_d)

    # make sure we got all the messages we sent
    self.assertEqual(
        sent_msgs_count[0],
        sum([len(consumer.processor._messages) for consumer in consumers]))