def test_reads_writes(self):
    """Run the tx-verifier "concurrent-reads-writes" scenario.

    Creates ``topic1`` (one partition, one replica) with rpk, then runs the
    verifier jar through ``/bin/sh -c`` with the scenario name and the
    broker list as arguments.

    Raises:
        DucktapeError: if the verifier exits with a non-zero status. The
            message holds the scenario name followed by the verifier's
            combined stdout/stderr.
    """
    verifier_jar = "/opt/tx-verifier/tx-verifier.jar"
    test = "concurrent-reads-writes"

    self.redpanda.logger.info("creating topics")
    rpk = RpkTool(self.redpanda)
    rpk.create_topic("topic1", partitions=1, replicas=1)

    cmd = f"java -jar {verifier_jar} {test} {self.redpanda.brokers()}"
    try:
        subprocess.check_output(["/bin/sh", "-c", cmd],
                                stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        self.redpanda.logger.info(f"txn test \"{test}\" failed")
        # check_output captures bytes; decode them so the report shows the
        # verifier output as readable text instead of a b'...' repr.
        output = e.output.decode("utf-8", errors="replace")
        raise DucktapeError(
            f"{test}\n{output}\n---------------------------\n") from e
    self.redpanda.logger.info(f"txn test \"{test}\" passed")
def test_produce(self):
    """Run the sarama produce verifier, retrying on leadership errors.

    Creates ``topic1`` and runs the verifier binary up to five times. A
    failed attempt is retried after a 5 second pause only when its output
    contains NOT_LEADER_FOR_PARTITION and attempts remain.

    Raises:
        DucktapeError: if an attempt fails for any other reason, or if the
            last attempt fails.
    """
    verifier_bin = "/opt/redpanda-tests/go/sarama/produce_test/produce_test"

    self.redpanda.logger.info("creating topics")
    rpk = RpkTool(self.redpanda)
    rpk.create_topic("topic1")

    self.redpanda.logger.info("testing sarama produce")
    retries = 5
    for attempt in range(retries):
        cmd = f"{verifier_bin} --brokers {self.redpanda.brokers()}"
        try:
            subprocess.check_output(["/bin/sh", "-c", cmd],
                                    stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            # check_output captures bytes; decode so the log and the
            # substring check below work on the real text rather than on
            # the b'...' repr.
            error = e.output.decode("utf-8", errors="replace")
            self.redpanda.logger.info("sarama produce failed with " + error)
            is_last_attempt = attempt + 1 == retries
            if not is_last_attempt and NOT_LEADER_FOR_PARTITION in error:
                sleep(5)
                continue
            raise DucktapeError("sarama produce failed with " + error) from e
        else:
            self.redpanda.logger.info("sarama produce test passed")
            break
def verify(self, tests):
    """Run each named tx-verifier scenario and report all failures together.

    Creates ``topic1`` and ``topic2``, then runs the verifier jar once per
    entry of ``tests``. A failing scenario does not stop the loop.

    Args:
        tests: iterable of scenario names passed to the verifier jar.

    Raises:
        DucktapeError: after all scenarios have run, if at least one exited
            with a non-zero status. The message lists each failed scenario
            with its combined stdout/stderr.
    """
    verifier_jar = "/opt/tx-verifier/tx-verifier.jar"

    self.redpanda.logger.info("creating topics")
    rpk = RpkTool(self.redpanda)
    rpk.create_topic("topic1")
    rpk.create_topic("topic2")

    failures = []
    for test in tests:
        self.redpanda.logger.info(f"testing txn test \"{test}\"")
        cmd = f"java -jar {verifier_jar} {test} {self.redpanda.brokers()}"
        try:
            subprocess.check_output(["/bin/sh", "-c", cmd],
                                    stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            self.redpanda.logger.info(f"txn test \"{test}\" failed")
            # check_output captures bytes; decode them so the report shows
            # readable text instead of a b'...' repr.
            output = e.output.decode("utf-8", errors="replace")
            failures.append(
                f"{test}\n{output}\n---------------------------\n")
        else:
            self.redpanda.logger.info(f"txn test \"{test}\" passed")

    if failures:
        raise DucktapeError("".join(failures))
def test_tx_init_passes(self):
    """Check that a transactional producer can call init_transactions()."""
    RpkTool(self.redpanda).create_topic("topic1")

    producer_config = {
        "bootstrap.servers": self.redpanda.brokers(),
        "enable.idempotence": True,
        "transactional.id": "tx-id-1",
        "retries": 5,
    }
    tx_producer = Producer(producer_config)
    tx_producer.init_transactions()
def test_idempotent_write_passes(self):
    """Produce one record to ``topic1`` with idempotence enabled and flush."""
    RpkTool(self.redpanda).create_topic("topic1")

    producer_config = {
        "bootstrap.servers": self.redpanda.brokers(),
        "enable.idempotence": True,
        "retries": 5,
    }
    idempotent_producer = Producer(producer_config)

    record_key = "key1".encode('utf-8')
    record_value = "value1".encode('utf-8')
    idempotent_producer.produce("topic1",
                                key=record_key,
                                value=record_value,
                                callback=on_delivery)
    idempotent_producer.flush()
def test_idempotency_compacted_topic(self):
    """Produce one idempotent record to a topic created with compaction."""
    topic_config = {"cleanup.policy": "compact"}
    RpkTool(self.redpanda).create_topic("topic1", config=topic_config)

    producer_config = {
        "bootstrap.servers": self.redpanda.brokers(),
        "enable.idempotence": True,
        "retries": 5,
    }
    idempotent_producer = Producer(producer_config)

    record_key = "key1".encode('utf-8')
    record_value = "value1".encode('utf-8')
    idempotent_producer.produce("topic1",
                                key=record_key,
                                value=record_value,
                                callback=on_delivery)
    idempotent_producer.flush()
class TopicAutocreateTest(RedpandaTest):
    """
    Verify that autocreation works, and that the settings of an autocreated
    topic match those for a topic created by hand with rpk.
    """
    def __init__(self, test_context):
        # Start with autocreation disabled so the test can first check that
        # producing to a missing topic fails.
        super(TopicAutocreateTest, self).__init__(
            test_context=test_context,
            num_brokers=1,
            extra_rp_conf={'auto_create_topics_enabled': False})

        self.kafka_tools = KafkaCliTools(self.redpanda)
        self.rpk = RpkTool(self.redpanda)

    @cluster(num_nodes=1)
    def topic_autocreate_test(self):
        """
        Check that producing to a missing topic fails while autocreation is
        disabled, succeeds once it is enabled, and that the autocreated
        topic's description equals that of a topic created with rpk.
        """
        auto_topic = 'autocreated'
        manual_topic = "manuallycreated"

        # With autocreation disabled, producing to a nonexistent topic should not work.
        try:
            # Use rpk rather than kafka CLI because rpk errors out promptly
            self.rpk.produce(auto_topic, "foo", "bar")
        except Exception:
            # The write failed, and shouldn't have created a topic
            assert auto_topic not in self.kafka_tools.list_topics()
        else:
            assert False, "Producing to a nonexistent topic should fail"

        # Enable autocreation
        self.redpanda.restart_nodes(self.redpanda.nodes,
                                    {'auto_create_topics_enabled': True})

        # Auto create topic
        assert auto_topic not in self.kafka_tools.list_topics()
        self.kafka_tools.produce(auto_topic, 1, 4096)
        assert auto_topic in self.kafka_tools.list_topics()
        auto_topic_spec = self.kafka_tools.describe_topic(auto_topic)
        assert auto_topic_spec.retention_ms is None
        assert auto_topic_spec.retention_bytes is None

        # Create topic by hand, compare its properties to the autocreated one.
        # Describe the manually created topic here: describing the
        # autocreated one again would compare that topic with itself.
        self.rpk.create_topic(manual_topic)
        manual_topic_spec = self.kafka_tools.describe_topic(manual_topic)
        assert manual_topic_spec.retention_ms == auto_topic_spec.retention_ms
        assert manual_topic_spec.retention_bytes == auto_topic_spec.retention_bytes

        # Clear name and compare the rest of the attributes
        manual_topic_spec.name = auto_topic_spec.name = None
        assert manual_topic_spec == auto_topic_spec
def test_tx(self):
    """Run the tx-verifier jar against the cluster.

    Creates ``topic1`` and ``topic2``, then runs the verifier through
    ``/bin/sh -c`` with the broker list as its only argument.

    Raises:
        DucktapeError: if the verifier exits with a non-zero status; the
            message includes its combined stdout/stderr.
    """
    verifier_jar = "/opt/tx-verifier/tx-verifier.jar"

    rpk = RpkTool(self.redpanda)
    rpk.create_topic("topic1")
    rpk.create_topic("topic2")

    # Routine progress message: log at info level, not error.
    self.redpanda.logger.info("starting tx verifier")

    cmd = f"java -jar {verifier_jar} {self.redpanda.brokers()}"
    try:
        subprocess.check_output(["/bin/sh", "-c", cmd],
                                stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # check_output captures bytes; decode them so the report shows
        # readable text instead of a b'...' repr.
        output = e.output.decode("utf-8", errors="replace")
        raise DucktapeError("tx test failed: " + output) from e
def _restore_topic(self, topic_spec, overrides=None):
    """Restore individual topic.

    Creates the topic described by ``topic_spec`` with
    ``redpanda.remote.recovery`` set to ``'true'``, waits 10 seconds, then
    describes the topic and its configs through rpk.

    Args:
        topic_spec: object providing ``name``, ``partition_count`` and
            ``replication_factor``.
        overrides: optional mapping of topic properties merged over the
            default recovery config. ``None`` means no overrides.
    """
    self.logger.info(f"Restore topic called. Topic-manifest: {topic_spec}")
    conf = {
        'redpanda.remote.recovery': 'true',
        #'redpanda.remote.write': 'true',
    }
    # A None default avoids sharing one mutable dict between calls.
    if overrides:
        conf.update(overrides)
    self.logger.info(f"Confg: {conf}")
    topic = topic_spec.name
    npart = topic_spec.partition_count
    nrepl = topic_spec.replication_factor
    rpk = RpkTool(self.redpanda)
    rpk.create_topic(topic, npart, nrepl, conf)
    # Fixed pause between creating the topic and describing it.
    time.sleep(10)
    rpk.describe_topic(topic)
    rpk.describe_topic_configs(topic)
def test_memory_limited(self):
    """
    Check enforcement of the RAM-per-partition threshold
    """
    self.redpanda.set_resource_settings(
        ResourceSettings(memory_mb=1024, num_cpus=1))

    # Use a larger than default memory per partition, so that a 1GB system can be
    # tested without creating 1000 partitions (which overwhelms debug redpanda
    # builds because they're much slower than the real product)
    memory_per_partition = 10 * 1024 * 1024
    self.redpanda.set_extra_rp_conf(
        {'topic_memory_per_partition': memory_per_partition})

    self.redpanda.start()
    rpk = RpkTool(self.redpanda)

    def expect_invalid_partitions(operation):
        # The operation must fail with an RpkException naming
        # INVALID_PARTITIONS; succeeding is a test failure.
        try:
            operation()
        except RpkException as err:
            assert 'INVALID_PARTITIONS' in err.msg
        else:
            assert False

    # 110 partitions at replicas=3 is expected to exceed the memory budget
    expect_invalid_partitions(
        lambda: rpk.create_topic("toobig", partitions=110, replicas=3))

    # Should succeed
    rpk.create_topic("okay", partitions=55, replicas=3)

    # Trying to grow the partition count in violation of the limit should fail
    expect_invalid_partitions(lambda: rpk.add_topic_partitions("okay", 55))

    # Growing the partition count within the limit should succeed
    rpk.add_topic_partitions("okay", 10)
def test_fd_limited(self):
    """
    Check enforcement of the file-descriptors-per-partition limit
    """
    fd_limited = ResourceSettings(nfiles=1000)
    self.redpanda.set_resource_settings(fd_limited)

    # Disable memory limit: on a test node the physical memory can easily
    # be the limiting factor
    self.redpanda.set_extra_rp_conf({'topic_memory_per_partition': None})

    self.redpanda.start()
    rpk = RpkTool(self.redpanda)

    # Default 10 fds per partition, we set ulimit down to 1000, so 100 should be the limit
    rejected = False
    try:
        rpk.create_topic("toobig", partitions=110, replicas=3)
    except RpkException as err:
        assert 'INVALID_PARTITIONS' in err.msg
        rejected = True
    # Creating the oversized topic without an error is a test failure
    assert rejected

    # Should succeed
    rpk.create_topic("okay", partitions=90, replicas=3)
def test_cpu_limited(self):
    """
    Check enforcement of the partitions-per-core
    """
    single_core = ResourceSettings(num_cpus=1)
    self.redpanda.set_resource_settings(single_core)

    limits_disabled = {
        # Disable memory limit: on a test node the physical memory can easily
        # be the limiting factor
        'topic_memory_per_partition': None,
        # Disable FD enforcement: tests running on workstations may have low ulimits
        'topic_fds_per_partition': None,
    }
    self.redpanda.set_extra_rp_conf(limits_disabled)

    self.redpanda.start()
    rpk = RpkTool(self.redpanda)

    # Three nodes, each with 1 core, 7000 partition-replicas
    # per core, so with replicas=3, 7000 partitions should be the limit
    rejected = False
    try:
        rpk.create_topic("toobig", partitions=8000, replicas=3)
    except RpkException as err:
        assert 'INVALID_PARTITIONS' in err.msg
        rejected = True
    # Creating the oversized topic without an error is a test failure
    assert rejected

    try:
        rpk.create_topic("okay", partitions=6000, replicas=3)
    except RpkException as err:
        # Because this many partitions will overwhelm a debug build
        # of redpanda, we tolerate exceptions, as long as the exception
        # isn't about the partition count specifically.
        #
        # It would be better to execute this part of the test conditionally
        # on release builds only.
        assert 'INVALID_PARTITIONS' not in err.msg
class ManyPartitionsTest(PreallocNodesTest):
    """
    Validates basic functionality in the presence of larger numbers
    of partitions than most other tests.
    """
    topics = ()

    def __init__(self, test_ctx, *args, **kwargs):
        self._ctx = test_ctx
        super(ManyPartitionsTest, self).__init__(
            test_ctx,
            *args,
            num_brokers=6,
            node_prealloc_count=1,
            extra_rp_conf={
                # Disable leader balancer initially, to enable us to check for
                # stable leadership during initial elections and post-restart
                # elections.  We will switch it on later, to exercise it during
                # the traffic stress test.
                'enable_leader_balancer': False,
            },
            # Usually tests run with debug or trace logs, but when testing resource
            # limits we want to test in a more production-like configuration.
            log_level='info',
            **kwargs)
        self.rpk = RpkTool(self.redpanda)

    def _all_elections_done(self, topic_names: list[str], p_per_topic: int):
        """
        Return True when, for every topic in `topic_names`, rpk describes
        `p_per_topic` partitions and none of them reports leader -1.
        """
        any_incomplete = False
        for tn in topic_names:
            partitions = list(self.rpk.describe_topic(tn))
            if len(partitions) < p_per_topic:
                self.logger.info(f"describe omits partitions for topic {tn}")
                any_incomplete = True
                continue

            assert len(partitions) == p_per_topic
            for p in partitions:
                if p.leader == -1:
                    self.logger.info(
                        f"partition {tn}/{p.id} has no leader yet")
                    any_incomplete = True

        return not any_incomplete

    def _consume_all(self, topic_names: list[str], msg_count_per_topic: int,
                     timeout_per_topic: int):
        """
        Don't do anything with the messages, just consume them to demonstrate
        that doing so does not exhaust redpanda resources.
        """
        def consumer_saw_msgs(consumer):
            self.logger.info(
                f"Consumer message_count={consumer.message_count} / {msg_count_per_topic}"
            )
            # Tolerate greater-than, because if there were errors during production
            # there can have been retries.
            return consumer.message_count >= msg_count_per_topic

        for tn in topic_names:
            consumer = RpkConsumer(self._ctx,
                                   self.redpanda,
                                   tn,
                                   save_msgs=False,
                                   fetch_max_bytes=BIG_FETCH,
                                   num_msgs=msg_count_per_topic)
            consumer.start()
            wait_until(lambda: consumer_saw_msgs(consumer),
                       timeout_sec=timeout_per_topic,
                       backoff_sec=5)
            consumer.stop()
            consumer.free()

    def setUp(self):
        # defer redpanda startup to the test, it might want to tweak
        # ResourceSettings based on its parameters.
        pass

    @cluster(num_nodes=7, log_allow_list=RESTART_LOG_ALLOW_LIST)
    def test_many_partitions(self):
        """
        Validate that redpanda works with partition counts close to its resource
        limits.

        This test should evolve over time as we improve efficiency and can reliably
        run with higher partition counts.  It should roughly track the values we
        use for topic_memory_per_partition and topic_fds_per_partition.

        * Check topic can be created.
        * Check leadership election succeeds for all partitions.
        * Write in enough data such that an unlimited size fetch
          would exhaust ram (check enforcement of kafka_max_bytes_per_fetch).
        * Consume all the data from the topic
        * Restart nodes several times (check that recovery works, and that the additional
          log segments created by rolling segments on restart do not cause us
          to exhaust resources.
        * Run a general produce+consume workload to check that the system remains in
          a functional state.
        """

        # This test requires dedicated system resources to run reliably.
        #assert self.redpanda.dedicated_nodes

        # Scale tests are not run on debug builds
        assert not self.debug_mode

        replication_factor = 3
        node_count = len(self.redpanda.nodes)

        # If we run on nodes with more memory than our HARD_PARTITION_LIMIT, then
        # artificially throttle the nodes' memory to avoid the test being too easy.
        # We are validating that the system works up to the limit, and that it works
        # up to the limit within the default per-partition memory footprint.
        node_memory = self.redpanda.get_node_memory_mb()

        # HARD_PARTITION_LIMIT is for a 3 node cluster, adjust according to
        # the number of nodes in this cluster.
        partition_limit = HARD_PARTITION_LIMIT * (node_count / 3)

        mb_per_partition = 1

        # How much memory to reserve for internal partitions, such as
        # id_allocator.  This is intentionally higher than needed, to
        # avoid having to update this test each time a new internal topic
        # is added.
        internal_partition_slack = 10

        # Emulate seastar's policy for default reserved memory
        reserved_memory = max(1536, int(0.07 * node_memory) + 1)
        effective_node_memory = node_memory - reserved_memory

        # TODO: calculate an appropriate segment size for the disk space divided
        # by the partition count, then set an appropriate retention.bytes and
        # enable compaction, so that during the final stress period of the test,
        # we are exercising compaction.

        # Clamp memory if nodes have more memory than should be required
        # to exercise the partition limit.
        if effective_node_memory > HARD_PARTITION_LIMIT / mb_per_partition:
            clamp_memory = mb_per_partition * (
                (HARD_PARTITION_LIMIT + internal_partition_slack) +
                reserved_memory)

            # Handy if hacking HARD_PARTITION_LIMIT to something low to run on a workstation
            clamp_memory = max(clamp_memory, 500)

            resource_settings = ResourceSettings(memory_mb=clamp_memory)
            self.redpanda.set_resource_settings(resource_settings)
        elif effective_node_memory < HARD_PARTITION_LIMIT / mb_per_partition:
            raise RuntimeError(
                f"Node memory is too small ({node_memory}MB - {reserved_memory}MB)"
            )

        # Run with one huge topic: this is the more stressful case for Redpanda, compared
        # with multiple modestly-sized topics, so it's what we test to find the system's limits.
        n_topics = 1

        # Partitions per topic
        n_partitions = int(partition_limit / n_topics)

        self.logger.info(
            f"Running partition scale test with {n_partitions} partitions on {n_topics} topics"
        )

        self.redpanda.start()

        self.logger.info("Entering topic creation")
        topic_names = [f"scale_{i:06d}" for i in range(0, n_topics)]
        for tn in topic_names:
            self.logger.info(
                f"Creating topic {tn} with {n_partitions} partitions")
            self.rpk.create_topic(tn,
                                  partitions=n_partitions,
                                  replicas=replication_factor)

        self.logger.info(f"Awaiting elections...")
        wait_until(lambda: self._all_elections_done(topic_names, n_partitions),
                   timeout_sec=60,
                   backoff_sec=5)
        self.logger.info(f"Initial elections done.")

        for node in self.redpanda.nodes:
            files = self.redpanda.lsof_node(node)
            file_count = sum(1 for _ in files)
            self.logger.info(
                f"Open files after initial selection on {node.name}: {file_count}"
            )

        # Assume fetches will be 10MB, the franz-go default
        fetch_mb_per_partition = 10 * 1024 * 1024

        # * Need enough data that if a consumer tried to fetch it all at once
        # in a single request, it would run out of memory.  OR the amount of
        # data that would fill a 10MB max_bytes per partition in a fetch, whichever
        # is lower (avoid writing excessive data for tests with fewer partitions).
        # * Then apply a factor of two to make sure we have enough data to drive writes
        # to disk during consumption, not just enough data to hold it all in the batch
        # cache.
        write_bytes_per_topic = min(
            int((self.redpanda.get_node_memory_mb() * 1024 * 1024) /
                n_topics), fetch_mb_per_partition * n_partitions) * 2

        if self.scale.release:
            # Release tests can be much longer running: 10x the amount of
            # data we fire through the system
            write_bytes_per_topic *= 10

        msg_size = 128 * 1024
        msg_count_per_topic = int((write_bytes_per_topic / msg_size))

        # Approx time to write or read all messages, for timeouts
        # Pessimistic bandwidth guess, accounting for the sub-disk bandwidth
        # that a single-threaded consumer may see
        expect_bandwidth = 50 * 1024 * 1024

        expect_transmit_time = int(write_bytes_per_topic / expect_bandwidth)
        expect_transmit_time = max(expect_transmit_time, 30)

        self.logger.info("Entering initial produce")
        for tn in topic_names:
            t1 = time.time()
            producer = FranzGoVerifiableProducer(
                self.test_context,
                self.redpanda,
                tn,
                msg_size,
                msg_count_per_topic,
                custom_node=self.preallocated_nodes)
            producer.start()
            producer.wait(timeout_sec=expect_transmit_time)
            self.free_preallocated_nodes()
            duration = time.time() - t1
            self.logger.info(
                f"Wrote {write_bytes_per_topic} bytes to {tn} in {duration}s, bandwidth {(write_bytes_per_topic / duration)/(1024 * 1024)}MB/s"
            )

        def count_open_files(node):
            # Takes the node as an argument: a lambda closing over the loop
            # variable could run after the loop has moved on and count files
            # on the wrong node.
            return sum(1 for _ in self.redpanda.lsof_node(node))

        def get_fd_counts():
            # Query all nodes concurrently; returns {node name: open file count}
            counts = {}
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=node_count) as executor:
                futs = {}
                for node in self.redpanda.nodes:
                    futs[node.name] = executor.submit(count_open_files, node)

                for node_name, fut in futs.items():
                    file_count = fut.result()
                    counts[node_name] = file_count

            return counts

        for node_name, file_count in get_fd_counts().items():
            self.logger.info(
                f"Open files before restarts on {node_name}: {file_count}")

        # Over large partition counts, the startup time is linear with the
        # amount of data we played in, because no one partition gets far
        # enough to snapshot.
        expect_start_time = expect_transmit_time

        # Measure the impact of restarts on resource utilization on an idle system:
        # at time of writing we know that the used FD count will go up substantially
        # on each restart (https://github.com/redpanda-data/redpanda/issues/4057)
        restart_count = 2

        self.logger.info("Entering restart stress test")
        for i in range(1, restart_count + 1):
            self.logger.info(f"Cluster restart {i}/{restart_count}...")

            # Normal restarts are rolling restarts, but because replay takes substantial time,
            # on an idle system it is helpful to do a concurrent global restart rather than
            # waiting for each node one by one.
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=node_count) as executor:
                futs = []
                for node in self.redpanda.nodes:
                    futs.append(
                        executor.submit(self.redpanda.restart_nodes,
                                        nodes=[node],
                                        start_timeout=expect_start_time))

                for f in futs:
                    # Raise on error
                    f.result()
            self.logger.info(
                f"Restart {i}/{restart_count} complete. Waiting for elections..."
            )

            wait_until(
                lambda: self._all_elections_done(topic_names, n_partitions),
                timeout_sec=60,
                backoff_sec=5)
            self.logger.info(f"Post-restart elections done.")

            for node_name, file_count in get_fd_counts().items():
                self.logger.info(
                    f"Open files after {i} restarts on {node_name}: {file_count}"
                )

        # With increased overhead from all those segment rolls during restart,
        # check that consume still works.
        self._consume_all(topic_names, msg_count_per_topic,
                          expect_transmit_time)

        # Now that we've tested basic ability to form consensus and survive some
        # restarts, move on to a more general stress test.

        self.logger.info("Entering traffic stress test")
        target_topic = topic_names[0]

        stress_msg_size = 32768
        stress_data_size = 1024 * 1024 * 1024 * 100
        stress_msg_count = int(stress_data_size / stress_msg_size)
        fast_producer = FranzGoVerifiableProducer(
            self.test_context,
            self.redpanda,
            target_topic,
            stress_msg_size,
            stress_msg_count,
            custom_node=self.preallocated_nodes)
        fast_producer.start()

        # Don't start consumers until the producer has written out its first
        # checkpoint with valid ranges.
        wait_until(lambda: fast_producer.produce_status.acked > 0,
                   timeout_sec=30,
                   backoff_sec=1.0)

        rand_consumer = FranzGoVerifiableRandomConsumer(
            self.test_context,
            self.redpanda,
            target_topic,
            0,
            100,
            10,
            nodes=self.preallocated_nodes)
        rand_consumer.start(clean=False)
        rand_consumer.shutdown()
        rand_consumer.wait()

        fast_producer.wait()

        seq_consumer = FranzGoVerifiableSeqConsumer(self.test_context,
                                                    self.redpanda,
                                                    target_topic, 0,
                                                    self.preallocated_nodes)
        seq_consumer.start(clean=False)
        seq_consumer.shutdown()
        seq_consumer.wait()
        assert seq_consumer.consumer_status.invalid_reads == 0
        assert seq_consumer.consumer_status.valid_reads == stress_msg_count + msg_count_per_topic

        self.logger.info("Entering leader balancer stress test")

        # Enable the leader balancer and check that the system remains stable
        # under load.  We do not leave the leader balancer on for most of the test, because
        # it makes reads _much_ slower, because the consumer keeps stalling and waiting for
        # elections: at any moment in a 10k partition topic, it's highly likely at least
        # one partition is offline for a leadership migration.
        self.redpanda.set_cluster_config({'enable_leader_balancer': True},
                                         expect_restart=False)
        lb_stress_period = 120
        lb_stress_produce_bytes = expect_bandwidth * lb_stress_period
        lb_stress_message_count = int(lb_stress_produce_bytes /
                                      stress_msg_size)
        fast_producer = FranzGoVerifiableProducer(
            self.test_context,
            self.redpanda,
            target_topic,
            stress_msg_size,
            lb_stress_message_count,
            custom_node=self.preallocated_nodes)
        fast_producer.start()
        rand_consumer.start()
        time.sleep(lb_stress_period
                   )  # Let the system receive traffic for a set time period
        rand_consumer.shutdown()
        rand_consumer.wait()
        fast_producer.wait()
class RpkToolTest(RedpandaTest):
    """Exercise topic creation, produce and consume through the rpk tool."""
    def __init__(self, ctx):
        super(RpkToolTest, self).__init__(test_context=ctx)
        self._ctx = ctx
        self._rpk = RpkTool(self.redpanda)

    def test_create_topic(self):
        """A topic created with rpk shows up in rpk's topic list."""
        self._rpk.create_topic("topic")

        wait_until(lambda: "topic" in self._rpk.list_topics(),
                   timeout_sec=10,
                   backoff_sec=1,
                   err_msg="Topic never appeared.")

    def test_produce(self):
        """One produced record is consumed with its key, value and header."""
        topic = 'topic'
        message = 'message'
        key = 'key'
        h_key = 'h_key'
        h_value = 'h_value'
        headers = [h_key + ':' + h_value]

        self._rpk.create_topic(topic)
        self._rpk.produce(topic, key, message, headers)

        c = RpkConsumer(self._ctx, self.redpanda, topic)
        c.start()

        def cond():
            return len(c.messages) == 1 \
                and c.messages[0]['message'] == message \
                and c.messages[0]['key'] == key \
                and c.messages[0]['headers'] == [
                    {'key': h_key, 'value': h_value},
                ]

        wait_until(cond,
                   timeout_sec=30,
                   backoff_sec=2,
                   err_msg="Message didn't appear.")

    def test_consume_as_group(self):
        """A consumer started with a group receives the produced record."""
        topic = 'topic_group'
        message = 'message'
        key = 'key'
        h_key = 'h_key'
        h_value = 'h_value'
        headers = [h_key + ':' + h_value]

        self._rpk.create_topic(topic)

        c = RpkConsumer(self._ctx, self.redpanda, topic, group='group')
        c.start()

        def cond():
            if c.error:
                raise c.error
            # Produce on every poll; only the first consumed record is checked.
            self._rpk.produce(topic, key, message, headers)
            return c.messages \
                and c.messages[0]['message'] == message \
                and c.messages[0]['key'] == key \
                and c.messages[0]['headers'] == [
                    {'key': h_key, 'value': h_value},
                ]

        wait_until(cond,
                   timeout_sec=30,
                   backoff_sec=8,
                   err_msg="Message didn't appear.")

    def test_consume_newest(self):
        """A consumer started at offset 'newest' receives a later record."""
        topic = 'topic_newest'
        message = 'message'
        key = 'key'
        h_key = 'h_key'
        h_value = 'h_value'
        headers = [h_key + ':' + h_value]

        self._rpk.create_topic(topic)

        # Gotta sleep to make sure the topic is replicated and the
        # consumer doesn't fail.
        time.sleep(5)

        c = RpkConsumer(self._ctx, self.redpanda, topic, offset='newest')
        c.start()

        def cond():
            if c.error:
                raise c.error
            # Produce on every poll; only the first consumed record is checked.
            self._rpk.produce(topic, key, message, headers)
            return c.messages \
                and c.messages[0]['message'] == message \
                and c.messages[0]['key'] == key \
                and c.messages[0]['headers'] == [
                    {'key': h_key, 'value': h_value},
                ]

        wait_until(cond,
                   timeout_sec=30,
                   backoff_sec=8,
                   err_msg="Message didn't appear.")

    def test_consume_oldest(self):
        """All records produced before the consumer starts are consumed."""
        topic = 'topic'

        n = random.randint(10, 100)
        msgs = {}
        for i in range(n):
            msgs['key-' + str(i)] = 'message-' + str(i)

        # Create the topic explicitly, like the other tests in this class,
        # rather than producing to a topic that was never created.
        self._rpk.create_topic(topic)

        # Produce messages
        for k in msgs:
            self._rpk.produce(topic, k, msgs[k])

        c = RpkConsumer(self._ctx, self.redpanda, topic)
        c.start()

        def cond():
            # Consume from the beginning
            if len(c.messages) != len(msgs):
                return False

            for m in c.messages:
                key = m['key']
                if key is None:
                    return False

                if m['message'] != msgs[key]:
                    return False

            return True

        wait_until(cond,
                   timeout_sec=30,
                   backoff_sec=8,
                   err_msg="Message didn't appear.")

    def test_consume_from_partition(self):
        """Records produced to one partition are consumed from it."""
        topic = 'topic_partition'

        n_parts = random.randint(3, 100)
        self._rpk.create_topic(topic, partitions=n_parts)

        n = random.randint(10, 30)
        msgs = {}
        for i in range(n):
            msgs['key-' + str(i)] = 'message-' + str(i)

        # randint is inclusive at both ends and partition ids run from
        # 0 to n_parts - 1, so the upper bound must be n_parts - 1.
        part = random.randint(0, n_parts - 1)
        # Produce messages to a random partition
        for k in msgs:
            self._rpk.produce(topic, k, msgs[k], partition=part)

        # Consume from the beginning
        c = RpkConsumer(self._ctx,
                        self.redpanda,
                        topic,
                        offset='oldest',
                        partitions=[part])
        c.start()

        def cond():
            if len(c.messages) != len(msgs):
                return False

            for m in c.messages:
                key = m['key']
                if key is None:
                    return False

                if m['message'] != msgs[key]:
                    return False

            return True

        wait_until(cond,
                   timeout_sec=10,
                   backoff_sec=1,
                   err_msg="Message didn't appear.")
def test_recreated_topic_metadata_are_valid(self, replication_factor):
    """
    Test recreated topic metadata are valid across all the nodes

    Creates a topic, produces to it, transfers leadership of every partition
    ten times, deletes and recreates the topic, then waits until every node
    reports the same leader for each partition.

    Args:
        replication_factor: replica count used both when the topic is first
            created and when it is recreated.
    """
    topic = 'tp-test'
    partition_count = 5
    rpk = RpkTool(self.redpanda)
    kcat = KafkaCat(self.redpanda)
    admin = Admin(self.redpanda)

    # create topic with the requested replication factor
    rpk.create_topic(topic=topic,
                     partitions=partition_count,
                     replicas=replication_factor)

    def wait_for_leader(partition, expected_leader):
        leader, _ = kcat.get_partition_leader(topic, partition)
        return leader == expected_leader

    def transfer_all_leaders():
        # Move leadership of every partition to a randomly chosen replica
        # other than the current leader, and wait for the change to show up.
        partitions = rpk.describe_topic(topic)
        for p in partitions:
            replicas = set(p.replicas)
            replicas.remove(p.leader)
            target = random.choice(list(replicas))
            admin.partition_transfer_leadership("kafka", topic, p.id, target)
            wait_until(lambda: wait_for_leader(p.id, target),
                       timeout_sec=30,
                       backoff_sec=1)

    # produce some data to the topic
    msg_cnt = 100
    producer = RpkProducer(self.test_context,
                           self.redpanda,
                           topic,
                           16384,
                           msg_cnt,
                           acks=-1)
    producer.start()
    producer.wait()
    producer.free()

    # transfer leadership to grow the term
    for _ in range(10):
        transfer_all_leaders()

    # recreate the topic with the same replication factor as before, not a
    # hard-coded 3
    rpk.delete_topic(topic)
    rpk.create_topic(topic=topic,
                     partitions=partition_count,
                     replicas=replication_factor)

    def metadata_consistent():
        # validate leadership information on each node
        for p in range(0, partition_count):
            leaders = set()
            for n in self.redpanda.nodes:
                admin_partition = admin.get_partitions(topic=topic,
                                                       partition=p,
                                                       namespace="kafka",
                                                       node=n)
                self.logger.info(
                    f"node: {n.account.hostname} partition: {admin_partition}"
                )
                leaders.add(admin_partition['leader_id'])

            self.logger.info(f"{topic}/{p} leaders: {leaders}")
            # every node must report the same leader for this partition
            if len(leaders) != 1:
                return False
        return True

    wait_until(metadata_consistent, 45, backoff_sec=2)
class RpkToolTest(RedpandaTest):
    """Exercise topic creation, produce and consume through the rpk tool."""
    def __init__(self, ctx):
        super(RpkToolTest, self).__init__(test_context=ctx)
        self._ctx = ctx
        self._rpk = RpkTool(self.redpanda)

    @cluster(num_nodes=3)
    def test_create_topic(self):
        """A topic created with rpk shows up in rpk's topic list."""
        self._rpk.create_topic("topic")

        def topic_listed():
            return "topic" in self._rpk.list_topics()

        wait_until(topic_listed,
                   timeout_sec=10,
                   backoff_sec=1,
                   err_msg="Topic never appeared.")

    @cluster(num_nodes=4)
    def test_produce(self):
        """One produced record is consumed with its key, value and header."""
        topic = 'topic'
        message = 'message'
        key = 'key'
        h_key = 'h_key'
        h_value = 'h_value'
        headers = [f"{h_key}:{h_value}"]

        self._rpk.create_topic(topic)
        self._rpk.produce(topic, key, message, headers)

        consumer = RpkConsumer(self._ctx, self.redpanda, topic)
        consumer.start()

        def single_message_matches():
            return (consumer.messages is not None
                    and len(consumer.messages) == 1
                    and consumer.messages[0]['value'] == message
                    and consumer.messages[0]['key'] == key
                    and consumer.messages[0]['headers'] == [{
                        'key': h_key,
                        'value': h_value
                    }])

        wait_until(single_message_matches,
                   timeout_sec=120,
                   backoff_sec=30,
                   err_msg="Message didn't appear.")

    @cluster(num_nodes=4)
    def test_consume_as_group(self):
        """A consumer started with a group receives the produced record."""
        topic = 'topic_group'
        message = 'message'
        key = 'key'
        h_key = 'h_key'
        h_value = 'h_value'
        headers = [f"{h_key}:{h_value}"]

        self._rpk.create_topic(topic)

        consumer = RpkConsumer(self._ctx,
                               self.redpanda,
                               topic,
                               group='group')
        consumer.start()

        def first_message_matches():
            if consumer.error:
                raise consumer.error

            # Produce on every poll; only the first consumed record is checked.
            self._rpk.produce(topic, key, message, headers)
            return (consumer.messages
                    and consumer.messages[0]['value'] == message
                    and consumer.messages[0]['key'] == key
                    and consumer.messages[0]['headers'] == [{
                        'key': h_key,
                        'value': h_value
                    }])

        wait_until(first_message_matches,
                   timeout_sec=120,
                   backoff_sec=15,
                   err_msg="Message didn't appear.")

    @cluster(num_nodes=4)
    def test_consume_newest(self):
        """A consumer started at offset 'newest' receives a later record."""
        topic = 'topic_newest'
        message = 'newest message'
        key = 'key'
        h_key = 'h_key'
        h_value = 'h_value'
        headers = [f"{h_key}:{h_value}"]

        self._rpk.create_topic(topic)

        consumer = RpkConsumer(self._ctx,
                               self.redpanda,
                               topic,
                               offset='newest')
        consumer.start()

        def first_message_matches():
            if consumer.error:
                raise consumer.error

            # Produce on every poll; only the first consumed record is checked.
            self._rpk.produce(topic, key, message, headers)
            return (consumer.messages
                    and consumer.messages[0]['value'] == message
                    and consumer.messages[0]['key'] == key
                    and consumer.messages[0]['headers'] == [{
                        'key': h_key,
                        'value': h_value
                    }])

        wait_until(first_message_matches,
                   timeout_sec=150,
                   backoff_sec=30,
                   err_msg="Message didn't appear.")

    @cluster(num_nodes=4)
    def test_consume_oldest(self):
        """All records produced before the consumer starts are consumed."""
        topic = 'topic'

        count = random.randint(10, 100)
        expected = {f"key-{i}": f"message-{i}" for i in range(count)}

        self._rpk.create_topic(topic)

        # Produce messages
        for record_key, record_value in expected.items():
            self._rpk.produce(topic, record_key, record_value)

        consumer = RpkConsumer(self._ctx, self.redpanda, topic)
        consumer.start()

        def all_consumed():
            # Consume from the beginning
            if len(consumer.messages) != len(expected):
                return False

            return all(m['key'] is not None and m['value'] == expected[m['key']]
                       for m in consumer.messages)

        wait_until(all_consumed,
                   timeout_sec=60,
                   backoff_sec=20,
                   err_msg="Message didn't appear.")

    @cluster(num_nodes=4)
    def test_consume_from_partition(self):
        """Records produced to one random partition are consumed from it."""
        topic = 'topic_partition'

        n_parts = random.randint(3, 100)
        self._rpk.create_topic(topic, partitions=n_parts)

        count = random.randint(10, 30)
        expected = {f"key-{i}": f"message-{i}" for i in range(count)}

        part = random.randint(0, n_parts - 1)
        # Produce messages to a random partition
        for record_key, record_value in expected.items():
            self._rpk.produce(topic, record_key, record_value, partition=part)

        # Consume from the beginning
        consumer = RpkConsumer(self._ctx,
                               self.redpanda,
                               topic,
                               offset='oldest',
                               partitions=[part])
        consumer.start()

        def all_consumed():
            if len(consumer.messages) != len(expected):
                return False

            return all(m['key'] is not None and m['value'] == expected[m['key']]
                       for m in consumer.messages)

        # timeout loop, but reset the timeout if we appear to be making progress
        attempts_left = 10
        seen_so_far = len(consumer.messages)
        while attempts_left > 0:
            self.redpanda.logger.debug(
                f"Message count {len(consumer.messages)} retries {attempts_left}"
            )
            if all_consumed():
                return
            if len(consumer.messages) > seen_so_far:
                seen_so_far = len(consumer.messages)
                attempts_left = 10
            time.sleep(1)
            attempts_left -= 1

        raise ducktape.errors.TimeoutError("Message didn't appear")
def test_deletion_stops_move(self):
    """
    Delete a topic whose partition is being moved, then create the topic
    again and check its status: the old move operation should not influence
    the newly created topic.
    """
    self.start_redpanda(num_nodes=3)

    # create a single topic with one partition and replication factor of 1
    topic = 'test-topic'
    partition = 0
    num_records = 1000
    rpk = RpkTool(self.redpanda)
    rpk.create_topic(topic, 1, 1)

    self.logger.info(f"Producing to {topic}")
    producer = KafProducer(self.test_context, self.redpanda, topic,
                           num_records)
    producer.start()
    self.logger.info(
        f"Finished producing to {topic}, waiting for producer...")
    producer.wait()
    producer.free()
    self.logger.info("Producer stop complete.")

    admin = Admin(self.redpanda)

    # get current assignments
    assignments = self._get_assignments(admin, topic, partition)
    assert len(assignments) == 1
    self.logger.info(f"assignments for {topic}-{partition}: {assignments}")

    brokers = admin.get_brokers()
    self.logger.info(f"available brokers: {brokers}")

    # pick a broker that does not currently host the partition
    current_node_id = assignments[0]['node_id']
    candidates = [b for b in brokers if b['node_id'] != current_node_id]
    replacement = random.choice(candidates)
    target_assignment = [{'node_id': replacement['node_id'], 'core': 0}]
    self.logger.info(
        f"target assignments for {topic}-{partition}: {target_assignment}")

    # shutdown target node to make sure that move will never complete
    node = self.redpanda.get_node(replacement['node_id'])
    self.redpanda.stop_node(node)
    admin.set_partition_replicas(topic, partition, target_assignment)

    def get_status():
        partition_info = admin.get_partitions(topic, partition)
        self.logger.info(
            f"current assignments for {topic}-{partition}: {partition_info}"
        )
        return partition_info["status"]

    def move_in_progress():
        return get_status() == 'in_progress'

    def move_done():
        return get_status() == 'done'

    # check that the status is in progress
    wait_until(move_in_progress, 10, 1)

    # delete the topic
    rpk.delete_topic(topic)

    # start the node back up
    self.redpanda.start_node(node)

    # create topic again
    rpk.create_topic(topic, 1, 1)
    wait_until(move_done, 10, 1)