def __init__(self, test_context):
    """:type test_context: ducktape.tests.test.TestContext"""
    super(ConsumeBenchTest, self).__init__(test_context)
    self.zk = ZookeeperService(test_context, num_nodes=3) if quorum.for_test(test_context) == quorum.zk else None
    self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
    self.producer_workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
    self.consumer_workload_service = ConsumeBenchWorkloadService(test_context, self.kafka)
    self.consumer_workload_service_2 = ConsumeBenchWorkloadService(test_context, self.kafka)
    self.active_topics = {"consume_bench_topic[0-5]": {"numPartitions": 5, "replicationFactor": 3}}
    self.trogdor = TrogdorService(context=self.test_context,
                                  client_services=[self.kafka, self.producer_workload_service,
                                                   self.consumer_workload_service,
                                                   self.consumer_workload_service_2])
def __init__(self, test_context):
    """:type test_context: ducktape.tests.test.TestContext"""
    super(RoundTripFaultTest, self).__init__(test_context)
    self.zk = ZookeeperService(test_context, num_nodes=3)
    self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
    self.workload_service = RoundTripWorkloadService(test_context, self.kafka)
    self.trogdor = TrogdorService(context=self.test_context,
                                  client_services=[self.zk, self.kafka, self.workload_service])
    topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
    RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1
    active_topics = {topic_name: {"partitionAssignments": {"0": [0, 1, 2]}}}
    self.round_trip_spec = RoundTripWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                 self.workload_service.client_node,
                                                 self.workload_service.bootstrap_servers,
                                                 target_messages_per_sec=10000, max_messages=100000,
                                                 active_topics=active_topics)
def set_up_trogdor(self, num_agent_nodes):
    self.agent_nodes = self.test_context.cluster.alloc(ClusterSpec.simple_linux(num_agent_nodes))
    self.trogdor = TrogdorService(context=self.test_context, agent_nodes=self.agent_nodes)
    for agent_node in self.agent_nodes:
        agent_node.account.logger = self.trogdor.logger
    self.trogdor.start()
def __init__(self, test_context):
    """:type test_context: ducktape.tests.test.TestContext"""
    super(ProduceBenchTest, self).__init__(test_context)
    self.zk = ZookeeperService(test_context, num_nodes=3)
    self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
    self.workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
    self.trogdor = TrogdorService(context=self.test_context,
                                  client_services=[self.kafka, self.workload_service])
class ProduceBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ProduceBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.workload_service])
        self.active_topics = {"produce_bench_topic[0-1]": {"numPartitions": 1, "replicationFactor": 3}}
        self.inactive_topics = {"produce_bench_topic[2-9]": {"numPartitions": 1, "replicationFactor": 3}}

    def setUp(self):
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def test_produce_bench(self):
        spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                        self.workload_service.producer_node,
                                        self.workload_service.bootstrap_servers,
                                        target_messages_per_sec=1000, max_messages=100000,
                                        producer_conf={}, admin_client_conf={}, common_client_conf={},
                                        inactive_topics=self.inactive_topics,
                                        active_topics=self.active_topics)
        workload1 = self.trogdor.create_task("workload1", spec)
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_produce_bench_transactions(self):
        spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                        self.workload_service.producer_node,
                                        self.workload_service.bootstrap_servers,
                                        target_messages_per_sec=1000, max_messages=100000,
                                        producer_conf={}, admin_client_conf={}, common_client_conf={},
                                        inactive_topics=self.inactive_topics,
                                        active_topics=self.active_topics,
                                        transaction_generator={
                                            # 10 transactions with 10k messages
                                            "type": "uniform",
                                            "messagesPerTransaction": "10000"
                                        })
        workload1 = self.trogdor.create_task("workload1", spec)
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))
def __init__(self, test_context):
    """:type test_context: ducktape.tests.test.TestContext"""
    super(ProduceBenchTest, self).__init__(test_context)
    self.zk = ZookeeperService(test_context, num_nodes=3)
    self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
    self.workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
    self.trogdor = TrogdorService(context=self.test_context,
                                  client_services=[self.kafka, self.workload_service])
def __init__(self, test_context):
    """:type test_context: ducktape.tests.test.TestContext"""
    super(RoundTripFaultTest, self).__init__(test_context)
    self.zk = ZookeeperService(test_context, num_nodes=3)
    self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
    self.workload_service = RoundTripWorkloadService(test_context, self.kafka)
    self.trogdor = TrogdorService(context=self.test_context,
                                  client_services=[self.zk, self.kafka, self.workload_service])
    self.round_trip_spec = RoundTripWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                 self.workload_service.client_node,
                                                 self.workload_service.bootstrap_servers,
                                                 target_messages_per_sec=10000,
                                                 partition_assignments={0: [0, 1, 2]},
                                                 max_messages=100000)
def __init__(self, test_context):
    """:type test_context: ducktape.tests.test.TestContext"""
    super(ProduceBenchTest, self).__init__(test_context)
    self.zk = ZookeeperService(test_context, num_nodes=3)
    self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
    self.workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
    self.trogdor = TrogdorService(context=self.test_context,
                                  client_services=[self.kafka, self.workload_service])
    self.active_topics = {"produce_bench_topic[0-1]": {"numPartitions": 1, "replicationFactor": 3}}
    self.inactive_topics = {"produce_bench_topic[2-9]": {"numPartitions": 1, "replicationFactor": 3}}
def test_produce_consume(self, topic_count, partition_count, replication_factor):
    # Create the topics up front and record how long bulk topic creation takes.
    topics_create_start_time = time.time()
    for i in range(topic_count):
        topic = "replicas_produce_consume_%d" % i
        print("Creating topic %s" % topic)  # Force some stdout for Jenkins
        topic_cfg = {
            "topic": topic,
            "partitions": partition_count,
            "replication-factor": replication_factor,
            "configs": {"min.insync.replicas": 2}
        }
        self.kafka.create_topic(topic_cfg)
    topics_create_end_time = time.time()
    self.logger.info("Time to create topics: %d" % (topics_create_end_time - topics_create_start_time))

    producer_workload_service = ProduceBenchWorkloadService(self.test_context, self.kafka)
    consumer_workload_service = ConsumeBenchWorkloadService(self.test_context, self.kafka)
    trogdor = TrogdorService(context=self.test_context,
                             client_services=[self.kafka, producer_workload_service,
                                              consumer_workload_service])
    trogdor.start()

    # Produce to the topics first, then consume everything back.
    produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                            producer_workload_service.producer_node,
                                            producer_workload_service.bootstrap_servers,
                                            target_messages_per_sec=10000, max_messages=3400000,
                                            producer_conf={}, admin_client_conf={}, common_client_conf={},
                                            inactive_topics={},
                                            active_topics={"replicas_produce_consume_[0-2]": {
                                                "numPartitions": partition_count,
                                                "replicationFactor": replication_factor
                                            }})
    produce_workload = trogdor.create_task("replicas-produce-workload", produce_spec)
    produce_workload.wait_for_done(timeout_sec=600)
    self.logger.info("Completed produce bench")

    consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                            consumer_workload_service.consumer_node,
                                            consumer_workload_service.bootstrap_servers,
                                            target_messages_per_sec=10000, max_messages=3400000,
                                            consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                            active_topics=["replicas_produce_consume_[0-2]"])
    consume_workload = trogdor.create_task("replicas-consume-workload", consume_spec)
    consume_workload.wait_for_done(timeout_sec=600)
    self.logger.info("Completed consume bench")
    trogdor.stop()
def __init__(self, test_context):
    """:type test_context: ducktape.tests.test.TestContext"""
    super(ProduceBenchTest, self).__init__(test_context)
    self.redpanda = RedpandaService(test_context, num_nodes=3)
    self.workload_service = ProduceBenchWorkloadService(test_context, self.redpanda)
    self.trogdor = TrogdorService(context=self.test_context,
                                  client_services=[self.redpanda, self.workload_service])
    self.active_topics = {"produce_bench_topic[0-1]": {"numPartitions": 1, "replicationFactor": 3}}
    self.inactive_topics = {"produce_bench_topic[2-9]": {"numPartitions": 1, "replicationFactor": 3}}
class ProduceBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ProduceBenchTest, self).__init__(test_context)
        self.redpanda = RedpandaService(test_context, num_nodes=3)
        self.workload_service = ProduceBenchWorkloadService(test_context, self.redpanda)
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.redpanda, self.workload_service])
        self.active_topics = {"produce_bench_topic[0-1]": {"numPartitions": 1, "replicationFactor": 3}}
        self.inactive_topics = {"produce_bench_topic[2-9]": {"numPartitions": 1, "replicationFactor": 3}}

    def setUp(self):
        self.trogdor.start()
        self.redpanda.start()

    def teardown(self):
        self.trogdor.stop()
        self.redpanda.stop()

    @cluster(num_nodes=3)
    def test_produce_bench(self):
        spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                        self.workload_service.producer_node,
                                        self.workload_service.bootstrap_servers,
                                        target_messages_per_sec=1000, max_messages=100000,
                                        producer_conf={}, admin_client_conf={}, common_client_conf={},
                                        inactive_topics=self.inactive_topics,
                                        active_topics=self.active_topics)
        workload1 = self.trogdor.create_task("workload1", spec)
        # The Trogdor service logs every request at INFO level, which is too verbose.
        # We explicitly change the level to WARNING and set it back after the
        # wait_for_done call returns.
        self.trogdor.logger.setLevel('WARNING')
        workload1.wait_for_done(timeout_sec=360)
        # set it back to INFO
        self.trogdor.logger.setLevel('INFO')
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))
def __init__(self, test_context):
    """:type test_context: ducktape.tests.test.TestContext"""
    super(RoundTripFaultTest, self).__init__(test_context)
    self.zk = ZookeeperService(test_context, num_nodes=3) if quorum.for_test(test_context) == quorum.zk else None
    self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
    self.workload_service = RoundTripWorkloadService(test_context, self.kafka)
    if quorum.for_test(test_context) == quorum.zk:
        trogdor_client_services = [self.zk, self.kafka, self.workload_service]
    elif quorum.for_test(test_context) == quorum.remote_kraft:
        trogdor_client_services = [self.kafka.controller_quorum, self.kafka, self.workload_service]
    else:
        # co-located case, which we currently don't test but handle here for completeness in case we do test it
        trogdor_client_services = [self.kafka, self.workload_service]
    self.trogdor = TrogdorService(context=self.test_context,
                                  client_services=trogdor_client_services)
    topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
    RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1
    # note that the broker.id values will be 1..num_nodes
    active_topics = {topic_name: {"partitionAssignments": {"0": [1, 2, 3]}}}
    self.round_trip_spec = RoundTripWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                 self.workload_service.client_node,
                                                 self.workload_service.bootstrap_servers,
                                                 target_messages_per_sec=10000, max_messages=100000,
                                                 active_topics=active_topics)
def __init__(self, test_context):
    """:type test_context: ducktape.tests.test.TestContext"""
    super(RoundTripFaultTest, self).__init__(test_context)
    self.zk = ZookeeperService(test_context, num_nodes=3)
    self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
    self.workload_service = RoundTripWorkloadService(test_context, self.kafka)
    self.trogdor = TrogdorService(context=self.test_context,
                                  client_services=[self.zk, self.kafka, self.workload_service])
    topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
    RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1
    active_topics = {topic_name: {"partitionAssignments": {"0": [0, 1, 2]}}}
    self.round_trip_spec = RoundTripWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                 self.workload_service.client_node,
                                                 self.workload_service.bootstrap_servers,
                                                 target_messages_per_sec=10000, max_messages=100000,
                                                 active_topics=active_topics)
class ProduceBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ProduceBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.workload_service])

    def setUp(self):
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def test_produce_bench(self):
        active_topics = {"produce_bench_topic[0-1]": {"numPartitions": 1, "replicationFactor": 3}}
        inactive_topics = {"produce_bench_topic[2-9]": {"numPartitions": 1, "replicationFactor": 3}}
        spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                        self.workload_service.producer_node,
                                        self.workload_service.bootstrap_servers,
                                        target_messages_per_sec=1000, max_messages=100000,
                                        producer_conf={}, admin_client_conf={}, common_client_conf={},
                                        inactive_topics=inactive_topics,
                                        active_topics=active_topics)
        workload1 = self.trogdor.create_task("workload1", spec)
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))
class RoundTripFaultTest(Test):
    topic_name_index = 0

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(RoundTripFaultTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3) if quorum.for_test(test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
        self.workload_service = RoundTripWorkloadService(test_context, self.kafka)
        if quorum.for_test(test_context) == quorum.zk:
            trogdor_client_services = [self.zk, self.kafka, self.workload_service]
        elif quorum.for_test(test_context) == quorum.remote_kraft:
            trogdor_client_services = [self.kafka.controller_quorum, self.kafka, self.workload_service]
        else:
            # co-located case, which we currently don't test but handle here for completeness in case we do test it
            trogdor_client_services = [self.kafka, self.workload_service]
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=trogdor_client_services)
        topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
        RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1
        active_topics = {topic_name: {"partitionAssignments": {"0": [0, 1, 2]}}}
        self.round_trip_spec = RoundTripWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                     self.workload_service.client_node,
                                                     self.workload_service.bootstrap_servers,
                                                     target_messages_per_sec=10000, max_messages=100000,
                                                     active_topics=active_topics)

    def setUp(self):
        if self.zk:
            self.zk.start()
        self.kafka.start()
        self.trogdor.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        if self.zk:
            self.zk.stop()

    def remote_quorum_nodes(self):
        if quorum.for_test(self.test_context) == quorum.zk:
            return self.zk.nodes
        elif quorum.for_test(self.test_context) == quorum.remote_kraft:
            return self.kafka.controller_quorum.nodes
        else:
            # co-located case, which we currently don't test but handle here for completeness in case we do test it
            return []

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_round_trip_workload(self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        workload1.wait_for_done(timeout_sec=600)

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_round_trip_workload_with_broker_partition(self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.kafka.nodes[0]]
        part2 = self.kafka.nodes[1:] + [self.workload_service.nodes[0]] + self.remote_quorum_nodes()
        partition1_spec = NetworkPartitionFaultSpec(0, TaskSpec.MAX_DURATION_MS, [part1, part2])
        partition1 = self.trogdor.create_task("partition1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        partition1.stop()
        partition1.wait_for_done()

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_produce_consume_with_broker_pause(self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        stop1_spec = ProcessStopFaultSpec(0, TaskSpec.MAX_DURATION_MS, [self.kafka.nodes[0]],
                                          self.kafka.java_class_name())
        stop1 = self.trogdor.create_task("stop1", stop1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
        self.kafka.stop_node(self.kafka.nodes[0], False)

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_produce_consume_with_client_partition(self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.workload_service.nodes[0]]
        part2 = self.kafka.nodes + self.remote_quorum_nodes()
        partition1_spec = NetworkPartitionFaultSpec(0, 60000, [part1, part2])
        stop1 = self.trogdor.create_task("stop1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_produce_consume_with_latency(self, metadata_quorum=quorum.zk):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        spec = DegradedNetworkFaultSpec(0, 60000)
        for node in self.kafka.nodes + self.remote_quorum_nodes():
            spec.add_node_spec(node.name, "eth0", latencyMs=100, rateLimitKbit=3000)
        slow1 = self.trogdor.create_task("slow1", spec)
        workload1.wait_for_done(timeout_sec=600)
        slow1.stop()
        slow1.wait_for_done()
class RoundTripFaultTest(Test):
    topic_name_index = 0

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(RoundTripFaultTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
        self.workload_service = RoundTripWorkloadService(test_context, self.kafka)
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.zk, self.kafka, self.workload_service])
        topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
        RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1
        active_topics = {topic_name: {"partitionAssignments": {"0": [0, 1, 2]}}}
        self.round_trip_spec = RoundTripWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                     self.workload_service.client_node,
                                                     self.workload_service.bootstrap_servers,
                                                     target_messages_per_sec=10000, max_messages=100000,
                                                     active_topics=active_topics)

    def setUp(self):
        self.zk.start()
        self.kafka.start()
        self.trogdor.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def test_round_trip_workload(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        workload1.wait_for_done(timeout_sec=600)

    def test_round_trip_workload_with_broker_partition(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.kafka.nodes[0]]
        part2 = self.kafka.nodes[1:] + [self.workload_service.nodes[0]] + self.zk.nodes
        partition1_spec = NetworkPartitionFaultSpec(0, TaskSpec.MAX_DURATION_MS, [part1, part2])
        partition1 = self.trogdor.create_task("partition1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        partition1.stop()
        partition1.wait_for_done()

    def test_produce_consume_with_broker_pause(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        stop1_spec = ProcessStopFaultSpec(0, TaskSpec.MAX_DURATION_MS, [self.kafka.nodes[0]],
                                          self.kafka.java_class_name())
        stop1 = self.trogdor.create_task("stop1", stop1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
        self.kafka.stop_node(self.kafka.nodes[0], False)

    def test_produce_consume_with_client_partition(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.workload_service.nodes[0]]
        part2 = self.kafka.nodes + self.zk.nodes
        partition1_spec = NetworkPartitionFaultSpec(0, 60000, [part1, part2])
        stop1 = self.trogdor.create_task("stop1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
def set_up_trogdor(self, num_agent_nodes):
    self.agent_nodes = self.test_context.cluster.alloc(ClusterSpec.simple_linux(num_agent_nodes))
    self.trogdor = TrogdorService(context=self.test_context, agent_nodes=self.agent_nodes)
    for agent_node in self.agent_nodes:
        agent_node.account.logger = self.trogdor.logger
    self.trogdor.start()
class TrogdorTest(Test):
    """
    Tests the Trogdor fault injection daemon in isolation.
    """
    def __init__(self, test_context):
        super(TrogdorTest, self).__init__(test_context)

    def set_up_trogdor(self, num_agent_nodes):
        self.agent_nodes = self.test_context.cluster.alloc(ClusterSpec.simple_linux(num_agent_nodes))
        self.trogdor = TrogdorService(context=self.test_context, agent_nodes=self.agent_nodes)
        for agent_node in self.agent_nodes:
            agent_node.account.logger = self.trogdor.logger
        self.trogdor.start()

    def setUp(self):
        self.trogdor = None
        self.agent_nodes = None

    def tearDown(self):
        if self.trogdor is not None:
            self.trogdor.stop()
            self.trogdor = None
        if self.agent_nodes is not None:
            self.test_context.cluster.free(self.agent_nodes)
            self.agent_nodes = None

    @cluster(num_nodes=4)
    def test_trogdor_service(self):
        """
        Test that we can bring up Trogdor and create a no-op fault.
        """
        self.set_up_trogdor(3)
        spec = NoOpTaskSpec(0, TaskSpec.MAX_DURATION_MS)
        self.trogdor.create_task("myfault", spec)

        def check_for_myfault():
            faults = self.trogdor.tasks()["tasks"]
            self.logger.info("tasks = %s" % faults)
            return "myfault" in faults
        # Poll until the task shows up in the coordinator's task list.
        wait_until(lambda: check_for_myfault(),
                   timeout_sec=10, backoff_sec=.2,
                   err_msg="Failed to read back myfault.")
        self.trogdor.stop_task("myfault")

    @cluster(num_nodes=4)
    def test_network_partition_fault(self):
        """
        Test that the network partition fault results in a true network partition between nodes.
        """
        self.set_up_trogdor(3)
        spec = NetworkPartitionFaultSpec(0, TaskSpec.MAX_DURATION_MS,
                                         [[self.agent_nodes[0]], self.agent_nodes[1:]])
        partitions = spec.message["partitions"]
        assert 2 == len(partitions)
        assert [self.agent_nodes[0].name] == partitions[0]
        assert [self.agent_nodes[1].name, self.agent_nodes[2].name] == partitions[1]
        self.trogdor.create_task("partition0", spec)

        def verify_nodes_partitioned():
            if node_is_reachable(self.agent_nodes[0], self.agent_nodes[1]):
                return False
            if node_is_reachable(self.agent_nodes[1], self.agent_nodes[0]):
                return False
            if node_is_reachable(self.agent_nodes[2], self.agent_nodes[0]):
                return False
            return True
        # Poll until node 0 is cut off from nodes 1 and 2 in both directions.
        wait_until(lambda: verify_nodes_partitioned(),
                   timeout_sec=10, backoff_sec=.2,
                   err_msg="Failed to verify that the nodes were partitioned.")
        if not node_is_reachable(self.agent_nodes[0], self.agent_nodes[0]):
            raise RuntimeError("Node 0 must be reachable from itself.")
        if not node_is_reachable(self.agent_nodes[1], self.agent_nodes[2]):
            raise RuntimeError("Node 2 must be reachable from node 1.")
        if not node_is_reachable(self.agent_nodes[2], self.agent_nodes[1]):
            raise RuntimeError("Node 1 must be reachable from node 2.")
class ProduceBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ProduceBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3) if quorum.for_test(test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.workload_service])
        self.active_topics = {"produce_bench_topic[0-1]": {"numPartitions": 1, "replicationFactor": 3}}
        self.inactive_topics = {"produce_bench_topic[2-9]": {"numPartitions": 1, "replicationFactor": 3}}

    def setUp(self):
        self.trogdor.start()
        if self.zk:
            self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        if self.zk:
            self.zk.stop()

    @cluster(num_nodes=8)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_produce_bench(self, metadata_quorum=quorum.zk):
        spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                        self.workload_service.producer_node,
                                        self.workload_service.bootstrap_servers,
                                        target_messages_per_sec=1000, max_messages=100000,
                                        producer_conf={}, admin_client_conf={}, common_client_conf={},
                                        inactive_topics=self.inactive_topics,
                                        active_topics=self.active_topics)
        workload1 = self.trogdor.create_task("workload1", spec)
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=8)
    def test_produce_bench_transactions(self, metadata_quorum=quorum.zk):
        spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                        self.workload_service.producer_node,
                                        self.workload_service.bootstrap_servers,
                                        target_messages_per_sec=1000, max_messages=100000,
                                        producer_conf={}, admin_client_conf={}, common_client_conf={},
                                        inactive_topics=self.inactive_topics,
                                        active_topics=self.active_topics,
                                        transaction_generator={
                                            # 10 transactions with 10k messages
                                            "type": "uniform",
                                            "messagesPerTransaction": "10000"
                                        })
        workload1 = self.trogdor.create_task("workload1", spec)
        workload1.wait_for_done(timeout_sec=360)
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))
class RoundTripFaultTest(Test):
    topic_name_index = 0

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(RoundTripFaultTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk)
        self.workload_service = RoundTripWorkloadService(test_context, self.kafka)
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.zk, self.kafka, self.workload_service])
        topic_name = "round_trip_topic%d" % RoundTripFaultTest.topic_name_index
        RoundTripFaultTest.topic_name_index = RoundTripFaultTest.topic_name_index + 1
        active_topics = {topic_name: {"partitionAssignments": {"0": [0, 1, 2]}}}
        self.round_trip_spec = RoundTripWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                     self.workload_service.client_node,
                                                     self.workload_service.bootstrap_servers,
                                                     target_messages_per_sec=10000, max_messages=100000,
                                                     active_topics=active_topics)

    def setUp(self):
        self.zk.start()
        self.kafka.start()
        self.trogdor.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def test_round_trip_workload(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        workload1.wait_for_done(timeout_sec=600)

    def test_round_trip_workload_with_broker_partition(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.kafka.nodes[0]]
        part2 = self.kafka.nodes[1:] + [self.workload_service.nodes[0]] + self.zk.nodes
        partition1_spec = NetworkPartitionFaultSpec(0, TaskSpec.MAX_DURATION_MS, [part1, part2])
        partition1 = self.trogdor.create_task("partition1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        partition1.stop()
        partition1.wait_for_done()

    def test_produce_consume_with_broker_pause(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        stop1_spec = ProcessStopFaultSpec(0, TaskSpec.MAX_DURATION_MS, [self.kafka.nodes[0]],
                                          self.kafka.java_class_name())
        stop1 = self.trogdor.create_task("stop1", stop1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
        self.kafka.stop_node(self.kafka.nodes[0], False)

    def test_produce_consume_with_client_partition(self):
        workload1 = self.trogdor.create_task("workload1", self.round_trip_spec)
        time.sleep(2)
        part1 = [self.workload_service.nodes[0]]
        part2 = self.kafka.nodes + self.zk.nodes
        partition1_spec = NetworkPartitionFaultSpec(0, 60000, [part1, part2])
        stop1 = self.trogdor.create_task("stop1", partition1_spec)
        workload1.wait_for_done(timeout_sec=600)
        stop1.stop()
        stop1.wait_for_done()
def test_replication_with_replica_failure(self, metadata_quorum=quorum.zk):
    """
    This test verifies that replication shrinks the ISR when a replica is not fetching anymore.
    It also verifies that replication provides simple durability guarantees by checking that
    data acked by brokers is still available for consumption.

    Setup: 1 zk/KRaft controller, 3 kafka nodes, 1 topic with partitions=1,
    replication-factor=3, and min.insync.replicas=2

        - Produce messages in the background
        - Consume messages in the background
        - Partition a follower
        - Validate that the ISR was shrunk
        - Stop producing and finish consuming
        - Validate that every acked message was consumed
    """
    self.create_zookeeper_if_necessary()
    if self.zk:
        self.zk.start()

    self.create_kafka(num_nodes=3,
                      server_prop_overrides=[["replica.lag.time.max.ms", "10000"]],
                      controller_num_nodes_override=1)
    self.kafka.start()

    self.trogdor = TrogdorService(context=self.test_context, client_services=[self.kafka])
    self.trogdor.start()

    # If ZK is used, the partition leader is put on the controller node
    # to avoid partitioning the controller later on in the test.
    if self.zk:
        controller = self.kafka.controller()
        assignment = [self.kafka.idx(controller)] + \
                     [self.kafka.idx(node) for node in self.kafka.nodes if node != controller]
    else:
        assignment = [self.kafka.idx(node) for node in self.kafka.nodes]

    self.topic = "test_topic"
    self.kafka.create_topic({
        "topic": self.topic,
        "replica-assignment": ":".join(map(str, assignment)),
        "configs": {"min.insync.replicas": 2}
    })
    self.logger.info("Created topic %s with assignment %s",
                     self.topic, ", ".join(map(str, assignment)))

    self.create_producer()
    self.producer.start()

    self.create_consumer()
    self.consumer.start()

    self.await_startup()

    leader = self.kafka.leader(self.topic, partition=0)
    replicas = self.kafka.replicas(self.topic, partition=0)

    # One of the followers is picked to be partitioned.
    follower_to_partition = [replica for replica in replicas if replica != leader][0]
    self.logger.info("Partitioning follower %s (%s) from the other brokers",
                     self.kafka.idx(follower_to_partition), follower_to_partition.name)
    partition_spec = NetworkPartitionFaultSpec(0, 5 * 60 * 1000,
                                               [[follower_to_partition],
                                                [node for node in self.kafka.nodes
                                                 if node != follower_to_partition]])
    partition = self.trogdor.create_task("partition", partition_spec)

    def current_isr():
        try:
            # Due to the network partition, the kafka-topics command could fail if it tries
            # to connect to the partitioned broker. Therefore we catch the error here and retry.
            return set(self.kafka.isr_idx_list(self.topic, partition=0, node=leader,
                                               offline_nodes=[follower_to_partition]))
        except RemoteCommandError as e:
            return set()

    # Verify that the ISR is shrunk.
    expected_isr = {self.kafka.idx(replica) for replica in replicas if replica != follower_to_partition}
    wait_until(lambda: current_isr() == expected_isr,
               timeout_sec=120, backoff_sec=1, err_msg="ISR should have been shrunk.")

    # Wait until the network partition is removed.
    partition.stop()
    partition.wait_for_done(timeout_sec=300)

    # Verify that the ISR is expanded.
    expected_isr = {self.kafka.idx(replica) for replica in replicas}
    wait_until(lambda: current_isr() == expected_isr,
               timeout_sec=120, backoff_sec=1, err_msg="ISR should have been expanded.")

    self.run_validation(producer_timeout_sec=120, min_records=25000)
class ConsumeBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ConsumeBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.producer_workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service_2 = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.active_topics = {"consume_bench_topic[0-5]": {"numPartitions": 5, "replicationFactor": 3}}
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.producer_workload_service,
                                                       self.consumer_workload_service,
                                                       self.consumer_workload_service_2])

    def setUp(self):
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def produce_messages(self, topics, max_messages=10000):
        produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.producer_workload_service.producer_node,
                                                self.producer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=max_messages,
                                                producer_conf={}, admin_client_conf={}, common_client_conf={},
                                                inactive_topics={}, active_topics=topics)
        produce_workload = self.trogdor.create_task("produce_workload", produce_spec)
        produce_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Produce workload finished")

    @parametrize(topics=["consume_bench_topic[0-5]"])        # topic subscription
    @parametrize(topics=["consume_bench_topic[0-5]:[0-4]"])  # manual topic assignment
    def test_consume_bench(self, topics):
        """
        Runs a ConsumeBench workload to consume messages
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=10000,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                active_topics=topics)
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_consume_bench_single_partition(self):
        """
        Run a ConsumeBench against a single partition
        """
        active_topics = {"consume_bench_topic": {"numPartitions": 2, "replicationFactor": 3}}
        self.produce_messages(active_topics, 5000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=2500,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                active_topics=["consume_bench_topic:1"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_consume_group_bench(self):
        """
        Runs two ConsumeBench workloads in the same consumer group to read messages from topics
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2000,  # both should read at least 2k messages
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                consumer_group="testGroup",
                                                active_topics=["consume_bench_topic[0-5]"])
        consume_workload_1 = self.trogdor.create_task("consume_workload_1", consume_spec)
        consume_workload_2 = self.trogdor.create_task("consume_workload_2", consume_spec)
        consume_workload_1.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload 1 finished")
        consume_workload_2.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload 2 finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))
def __init__(self, test_context):
    super(NetworkDegradeTest, self).__init__(test_context)
    self.zk = ZookeeperService(test_context, num_nodes=3)
    self.trogdor = TrogdorService(context=self.test_context, client_services=[self.zk])
class NetworkDegradeTest(Test):
    """
    These tests ensure that the network degrade Trogdor specs (which use "tc") are working
    as expected in whatever environment the system tests may be running in. The linux tools
    "ping" and "iperf" are used for validation and need to be available along with "tc"
    in the test environment.
    """
    def __init__(self, test_context):
        super(NetworkDegradeTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.trogdor = TrogdorService(context=self.test_context, client_services=[self.zk])

    def setUp(self):
        self.zk.start()
        self.trogdor.start()

    def teardown(self):
        self.trogdor.stop()
        self.zk.stop()

    @cluster(num_nodes=5)
    @parametrize(task_name="latency-100", device_name="eth0", latency_ms=50, rate_limit_kbit=0)
    @parametrize(task_name="latency-100-rate-1000", device_name="eth0", latency_ms=50, rate_limit_kbit=1000)
    def test_latency(self, task_name, device_name, latency_ms, rate_limit_kbit):
        spec = DegradedNetworkFaultSpec(0, 10000)
        for node in self.zk.nodes:
            spec.add_node_spec(node.name, device_name, latency_ms, rate_limit_kbit)

        latency = self.trogdor.create_task(task_name, spec)

        zk0 = self.zk.nodes[0]
        zk1 = self.zk.nodes[1]

        # Capture the ping times from the ping stdout
        # 64 bytes from ducker01 (172.24.0.2): icmp_seq=1 ttl=64 time=0.325 ms
        r = re.compile(r".*time=(?P<time>[\d.]+)\sms.*")

        times = []
        for line in zk0.account.ssh_capture("ping -i 1 -c 20 %s" % zk1.account.hostname):
            self.logger.debug("Ping output: %s" % line)
            m = r.match(line)
            if m is not None and m.group("time"):
                times.append(float(m.group("time")))
                self.logger.info("Parsed ping time of %d" % float(m.group("time")))
        self.logger.debug("Captured ping times: %s" % times)

        # We expect to see some low ping times (before and after the task runs) as well as
        # high ping times (during the task). The high threshold is twice the configured
        # latency since both links apply the rule, scaled by 80% for a little variance buffer.
        high_time_ms = 0.8 * 2 * latency_ms
        low_time_ms = 10
        slow_times = [t for t in times if t > high_time_ms]
        fast_times = [t for t in times if t < low_time_ms]

        latency.stop()
        latency.wait_for_done()

        # We captured 20 ping times. Assert that at least 5 were "fast" and 5 were "slow"
        assert len(slow_times) > 5, "Expected to see more slow ping times (higher than %d)" % high_time_ms
        assert len(fast_times) > 5, "Expected to see more fast ping times (lower than %d)" % low_time_ms

    @cluster(num_nodes=5)
    @parametrize(task_name="rate-1000", device_name="eth0", latency_ms=0, rate_limit_kbit=1000000)
    @parametrize(task_name="rate-1000-latency-50", device_name="eth0", latency_ms=50, rate_limit_kbit=1000000)
    def test_rate(self, task_name, device_name, latency_ms, rate_limit_kbit):
        zk0 = self.zk.nodes[0]
        zk1 = self.zk.nodes[1]

        spec = DegradedNetworkFaultSpec(0, 60000)
        spec.add_node_spec(zk0.name, device_name, latency_ms, rate_limit_kbit)

        # start the task and wait
        rate_limit = self.trogdor.create_task(task_name, spec)
        wait_until(lambda: rate_limit.running(), timeout_sec=10,
                   err_msg="%s failed to start within 10 seconds." % rate_limit)

        # Run iperf server on zk1, iperf client on zk0
        iperf_server = zk1.account.ssh_capture("iperf -s")

        # Capture the measured kbps between the two nodes.
        # [ 3] 0.0- 1.0 sec 2952576 KBytes 24187503 Kbits/sec
        r = re.compile(r"^.*\s(?P<rate>[\d.]+)\sKbits/sec$")

        measured_rates = []
        for line in zk0.account.ssh_capture("iperf -i 1 -t 20 -f k -c %s" % zk1.account.hostname):
            self.logger.info("iperf output %s" % line)
            m = r.match(line)
            if m is not None:
                measured_rate = float(m.group("rate"))
                measured_rates.append(measured_rate)
                self.logger.info("Parsed rate of %d kbit/s from iperf" % measured_rate)

        # kill iperf server and consume the stdout to ensure clean exit
        zk1.account.kill_process("iperf")
        for _ in iperf_server:
            continue

        rate_limit.stop()
        rate_limit.wait_for_done()

        self.logger.info("Measured rates: %s" % measured_rates)

        # We expect to see measured rates within an order of magnitude of our target rate
        low_kbps = rate_limit_kbit // 10
        high_kbps = rate_limit_kbit * 10
        acceptable_rates = [r for r in measured_rates if low_kbps < r < high_kbps]

        msg = "Expected most of the measured rates to be within an order of magnitude of target %d." % rate_limit_kbit
        msg += " This means `tc` did not limit the bandwidth as expected."
        assert len(acceptable_rates) > 5, msg
class ConsumeBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ConsumeBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3) if quorum.for_test(test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.producer_workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service_2 = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.active_topics = {"consume_bench_topic[0-5]": {"numPartitions": 5, "replicationFactor": 3}}
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.producer_workload_service,
                                                       self.consumer_workload_service,
                                                       self.consumer_workload_service_2])

    def setUp(self):
        self.trogdor.start()
        if self.zk:
            self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        if self.zk:
            self.zk.stop()

    def produce_messages(self, topics, max_messages=10000):
        produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.producer_workload_service.producer_node,
                                                self.producer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=max_messages,
                                                producer_conf={}, admin_client_conf={}, common_client_conf={},
                                                inactive_topics={}, active_topics=topics)
        produce_workload = self.trogdor.create_task("produce_workload", produce_spec)
        produce_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Produce workload finished")

    @cluster(num_nodes=10)
    @matrix(topics=[["consume_bench_topic[0-5]"]], metadata_quorum=quorum.all_non_upgrade)        # topic subscription
    @matrix(topics=[["consume_bench_topic[0-5]:[0-4]"]], metadata_quorum=quorum.all_non_upgrade)  # manual topic assignment
    def test_consume_bench(self, topics, metadata_quorum=quorum.zk):
        """
        Runs a ConsumeBench workload to consume messages
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=10000,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                active_topics=topics)
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=10)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_single_partition(self, metadata_quorum=quorum.zk):
        """
        Run a ConsumeBench against a single partition
        """
        active_topics = {"consume_bench_topic": {"numPartitions": 2, "replicationFactor": 3}}
        self.produce_messages(active_topics, 5000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=2500,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                active_topics=["consume_bench_topic:1"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=10)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_multiple_consumers_random_group_topics(self, metadata_quorum=quorum.zk):
        """
        Runs multiple consumers to read messages from topics.
        Since a consumerGroup isn't specified, each consumer should read from all topics independently
        """
        self.produce_messages(self.active_topics, max_messages=5000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=5000,  # all should read exactly 5k messages
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                threads_per_worker=5,
                                                active_topics=["consume_bench_topic[0-5]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=10)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_two_consumers_specified_group_topics(self, metadata_quorum=quorum.zk):
        """
        Runs two consumers in the same consumer group to read messages from topics.
        Since a consumerGroup is specified, each consumer should dynamically get assigned a partition from the group
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2000,  # both should read at least 2k messages
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                threads_per_worker=2,
                                                consumer_group="testGroup",
                                                active_topics=["consume_bench_topic[0-5]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=10)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_multiple_consumers_random_group_partitions(self, metadata_quorum=quorum.zk):
        """
        Runs multiple consumers to read messages from specific partitions.
        Since a consumerGroup isn't specified, each consumer will get assigned a random group
        and consume from all partitions
        """
        self.produce_messages(self.active_topics, max_messages=20000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=2000,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                threads_per_worker=4,
                                                active_topics=["consume_bench_topic1:[0-4]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    @cluster(num_nodes=10)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_multiple_consumers_specified_group_partitions_should_raise(self, metadata_quorum=quorum.zk):
        """
        Runs multiple consumers in the same group to read messages from specific partitions.
        It is an invalid configuration to provide both a consumer group and specific partitions.
        """
        expected_error_msg = 'explicit partition assignment'
        self.produce_messages(self.active_topics, max_messages=20000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=2000,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                threads_per_worker=4,
                                                consumer_group="fail_group",
                                                active_topics=["consume_bench_topic1:[0-4]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        try:
            consume_workload.wait_for_done(timeout_sec=360)
            raise Exception("Should have raised an exception due to an invalid configuration")
        except RuntimeError as e:
            if expected_error_msg not in str(e):
                raise RuntimeError("Unexpected Exception - " + str(e))
            self.logger.info(e)
class ConsumeBenchTest(Test):
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ConsumeBenchTest, self).__init__(test_context)
        self.zk = ZookeeperService(test_context, num_nodes=3)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk)
        self.producer_workload_service = ProduceBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.consumer_workload_service_2 = ConsumeBenchWorkloadService(test_context, self.kafka)
        self.active_topics = {"consume_bench_topic[0-5]": {"numPartitions": 5, "replicationFactor": 3}}
        self.trogdor = TrogdorService(context=self.test_context,
                                      client_services=[self.kafka, self.producer_workload_service,
                                                       self.consumer_workload_service,
                                                       self.consumer_workload_service_2])

    def setUp(self):
        self.trogdor.start()
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        self.trogdor.stop()
        self.kafka.stop()
        self.zk.stop()

    def produce_messages(self, topics, max_messages=10000):
        produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.producer_workload_service.producer_node,
                                                self.producer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=max_messages,
                                                producer_conf={}, admin_client_conf={}, common_client_conf={},
                                                inactive_topics={}, active_topics=topics)
        produce_workload = self.trogdor.create_task("produce_workload", produce_spec)
        produce_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Produce workload finished")

    @parametrize(topics=["consume_bench_topic[0-5]"])        # topic subscription
    @parametrize(topics=["consume_bench_topic[0-5]:[0-4]"])  # manual topic assignment
    def test_consume_bench(self, topics):
        """
        Runs a ConsumeBench workload to consume messages
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=10000,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                active_topics=topics)
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_single_partition(self):
        """
        Run a ConsumeBench against a single partition
        """
        active_topics = {"consume_bench_topic": {"numPartitions": 2, "replicationFactor": 3}}
        self.produce_messages(active_topics, 5000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=2500,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                active_topics=["consume_bench_topic:1"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=180)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_multiple_consumers_random_group_topics(self):
        """
        Runs multiple consumers to read messages from topics.
        Since a consumerGroup isn't specified, each consumer should read from all topics independently
        """
        self.produce_messages(self.active_topics, max_messages=5000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=5000,  # all should read exactly 5k messages
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                threads_per_worker=5,
                                                active_topics=["consume_bench_topic[0-5]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_two_consumers_specified_group_topics(self):
        """
        Runs two consumers in the same consumer group to read messages from topics.
        Since a consumerGroup is specified, each consumer should dynamically get assigned a partition from the group
        """
        self.produce_messages(self.active_topics)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000,
                                                max_messages=2000,  # both should read at least 2k messages
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                threads_per_worker=2,
                                                consumer_group="testGroup",
                                                active_topics=["consume_bench_topic[0-5]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_multiple_consumers_random_group_partitions(self):
        """
        Runs multiple consumers to read messages from specific partitions.
        Since a consumerGroup isn't specified, each consumer will get assigned a random group
        and consume from all partitions
        """
        self.produce_messages(self.active_topics, max_messages=20000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=2000,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                threads_per_worker=4,
                                                active_topics=["consume_bench_topic1:[0-4]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=360)
        self.logger.debug("Consume workload finished")
        tasks = self.trogdor.tasks()
        self.logger.info("TASKS: %s\n" % json.dumps(tasks, sort_keys=True, indent=2))

    def test_multiple_consumers_specified_group_partitions_should_raise(self):
        """
        Runs multiple consumers in the same group to read messages from specific partitions.
        It is an invalid configuration to provide both a consumer group and specific partitions.
        """
        expected_error_msg = 'explicit partition assignment'
        self.produce_messages(self.active_topics, max_messages=20000)
        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                self.consumer_workload_service.consumer_node,
                                                self.consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=1000, max_messages=2000,
                                                consumer_conf={}, admin_client_conf={}, common_client_conf={},
                                                threads_per_worker=4,
                                                consumer_group="fail_group",
                                                active_topics=["consume_bench_topic1:[0-4]"])
        consume_workload = self.trogdor.create_task("consume_workload", consume_spec)
        try:
            consume_workload.wait_for_done(timeout_sec=360)
            raise Exception("Should have raised an exception due to an invalid configuration")
        except RuntimeError as e:
            if expected_error_msg not in str(e):
                raise RuntimeError("Unexpected Exception - " + str(e))
            self.logger.info(e)