def move_start_offset(self):
    """Advance the start offset of ``self.topic``.

    Really old messages (``create_time=1000``) are written to the topic and
    then given time to be cleaned up, which moves the start offset forward.
    """
    ack_timeout_sec = 30
    old_message_producer = VerifiableProducer(
        self.test_context,
        1,
        self.kafka,
        self.topic,
        throughput=-1,
        enable_idempotence=True,
        create_time=1000,
    )
    old_message_producer.start()

    def first_ack_received():
        return old_message_producer.num_acked > 0

    wait_until(
        first_ack_received,
        timeout_sec=ack_timeout_sec,
        err_msg="Failed to get an acknowledgement for %ds" % ack_timeout_sec,
    )

    # Keep producing for 8 seconds so the topic is seeded with messages that
    # will be deleted. The 8 seconds matters: given the configured log roll
    # time and retention check interval, 2 segments should get deleted
    # within this period.
    time.sleep(8)
    old_message_producer.stop()

    seeded_summary = "Seeded topic with %d messages which will be deleted" % old_message_producer.num_acked
    self.logger.info(seeded_summary)

    # The configured check interval is 5 seconds, so wait another 6 seconds
    # to guarantee at least one more cleaning pass that removes the last
    # segment. An alternative to fixed sleeps is to poll each partition until
    # the log start offset matches the end offset, which is more robust.
    time.sleep(6)
class GetOffsetShellTest(Test):
    """
    Tests GetOffsetShell tool
    """

    def __init__(self, test_context):
        super(GetOffsetShellTest, self).__init__(test_context)
        self.num_zk = 1
        self.num_brokers = 1
        self.messages_received_count = 0

        topic_settings = {'partitions': NUM_PARTITIONS, 'replication-factor': REPLICATION_FACTOR}
        self.topics = {TOPIC: topic_settings}

        self.zk = ZookeeperService(test_context, self.num_zk)

    def setUp(self):
        """Start ZooKeeper; Kafka itself is started by each test via start_kafka()."""
        self.zk.start()

    def start_kafka(self, security_protocol, interbroker_security_protocol):
        """Create and start the Kafka service with the given security protocols."""
        self.kafka = KafkaService(
            self.test_context,
            self.num_brokers,
            self.zk,
            security_protocol=security_protocol,
            interbroker_security_protocol=interbroker_security_protocol,
            topics=self.topics,
        )
        self.kafka.start()

    def start_producer(self):
        """Produce MAX_MESSAGES messages to TOPIC and wait until they are acked."""
        # This will produce to kafka cluster
        self.producer = VerifiableProducer(
            self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            topic=TOPIC,
            throughput=1000,
            max_messages=MAX_MESSAGES,
        )
        self.producer.start()

        # Ack count observed right after start; wait for MAX_MESSAGES more.
        acked_target = self.producer.num_acked + MAX_MESSAGES

        def all_messages_acked():
            return self.producer.num_acked >= acked_target

        wait_until(
            all_messages_acked,
            timeout_sec=10,
            err_msg="Timeout awaiting messages to be produced and acked",
        )

    def start_consumer(self):
        """Start a console consumer reading from TOPIC."""
        self.consumer = ConsoleConsumer(
            self.test_context,
            num_nodes=self.num_brokers,
            kafka=self.kafka,
            topic=TOPIC,
            consumer_timeout_ms=1000,
        )
        self.consumer.start()

    @cluster(num_nodes=4)
    def test_get_offset_shell(self, security_protocol='PLAINTEXT'):
        """
        Tests if GetOffsetShell is getting offsets correctly
        :return: None
        """
        self.start_kafka(security_protocol, security_protocol)
        self.start_producer()

        # Assert that offset fetched without any consumers consuming is 0
        # NOTE(review): this only checks that the tool output is truthy; the
        # formatted string is the assertion *message*, not a value compared
        # against the output. Confirm whether a containment check was intended.
        initial_output = self.kafka.get_offset_shell(TOPIC, None, 1000, 1, -1)
        assert initial_output, "%s:%s:%s" % (TOPIC, NUM_PARTITIONS - 1, 0)

        self.start_consumer()
        consumer_node = self.consumer.nodes[0]

        def consumer_is_alive():
            return self.consumer.alive(consumer_node)

        wait_until(
            consumer_is_alive,
            timeout_sec=20,
            backoff_sec=.2,
            err_msg="Consumer was too slow to start",
        )

        # Assert that offset is correctly indicated by GetOffsetShell tool
        expected_entry = "%s:%s:%s" % (TOPIC, NUM_PARTITIONS - 1, MAX_MESSAGES)

        def expected_offset_reported():
            return expected_entry in self.kafka.get_offset_shell(TOPIC, None, 1000, 1, -1)

        wait_until(
            expected_offset_reported,
            timeout_sec=10,
            err_msg="Timed out waiting to reach expected offset.",
        )
def move_start_offset(self):
    """Advance the start offset of ``self.topic``.

    Really old messages (``create_time=1000``) are written to the topic and
    then given time to be cleaned up, which moves the start offset forward.
    """
    ack_timeout_sec = 30
    old_message_producer = VerifiableProducer(
        self.test_context,
        1,
        self.kafka,
        self.topic,
        throughput=-1,
        enable_idempotence=True,
        create_time=1000,
    )
    old_message_producer.start()

    def first_ack_received():
        return old_message_producer.num_acked > 0

    wait_until(
        first_ack_received,
        timeout_sec=ack_timeout_sec,
        err_msg="Failed to get an acknowledgement for %ds" % ack_timeout_sec,
    )

    # Keep producing for 8 seconds so the topic is seeded with messages that
    # will be deleted. The 8 seconds matters: given the configured log roll
    # time and retention check interval, 2 segments should get deleted
    # within this period.
    time.sleep(8)
    old_message_producer.stop()

    seeded_summary = "Seeded topic with %d messages which will be deleted" % old_message_producer.num_acked
    self.logger.info(seeded_summary)

    # The configured check interval is 5 seconds, so wait another 6 seconds
    # to guarantee at least one more cleaning pass that removes the last
    # segment. An alternative to fixed sleeps is to poll each partition until
    # the log start offset matches the end offset, which is more robust.
    time.sleep(6)
class TestVerifiableProducer(Test):
    """Sanity checks on verifiable producer service class."""

    def __init__(self, test_context):
        super(TestVerifiableProducer, self).__init__(test_context)

        self.topic = "topic"
        self.num_messages = 1000

        self.zk = ZookeeperService(test_context, num_nodes=1)

        topic_config = {"partitions": 1, "replication-factor": 1}
        self.kafka = KafkaService(
            test_context,
            num_nodes=1,
            zk=self.zk,
            topics={self.topic: topic_config},
        )

        # This will produce to source kafka cluster
        self.producer = VerifiableProducer(
            test_context,
            num_nodes=1,
            kafka=self.kafka,
            topic=self.topic,
            max_messages=self.num_messages,
            throughput=1000,
        )

    def setUp(self):
        """Bring up ZooKeeper and then Kafka before each test."""
        self.zk.start()
        self.kafka.start()

    @cluster(num_nodes=3)
    @parametrize(producer_version=str(LATEST_0_8_2))
    @parametrize(producer_version=str(LATEST_0_9))
    @parametrize(producer_version=str(DEV_BRANCH))
    def test_simple_run(self, producer_version=DEV_BRANCH):
        """
        Test that we can start VerifiableProducer on the current branch snapshot version or against the 0.8.2 jar, and
        verify that we can produce a small number of messages.
        """
        producer_node = self.producer.nodes[0]
        producer_node.version = KafkaVersion(producer_version)
        self.producer.start()

        def producer_has_started():
            return self.producer.num_acked > 5

        wait_until(
            producer_has_started,
            timeout_sec=5,
            err_msg="Producer failed to start in a reasonable amount of time.",
        )

        # using version.vstring (distutils.version.LooseVersion) is a tricky way of ensuring
        # that this check works with DEV_BRANCH
        # When running VerifiableProducer 0.8.X, both the current branch version and 0.8.X should show up because of the
        # way verifiable producer pulls in some development directories into its classpath
        expected_versions = [producer_node.version.vstring]
        if producer_node.version <= LATEST_0_8_2:
            expected_versions.append(DEV_BRANCH.vstring)
        assert is_version(producer_node, expected_versions)

        self.producer.wait()
        acked_count = self.producer.num_acked
        assert acked_count == self.num_messages, "num_produced: %d, num_messages: %d" % (
            acked_count, self.num_messages)
class TestVerifiableProducer(Test):
    """Sanity checks on verifiable producer service class."""

    def __init__(self, test_context):
        super(TestVerifiableProducer, self).__init__(test_context)

        self.topic = "topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk,
                                  topics={self.topic: {"partitions": 1, "replication-factor": 1}})

        self.num_messages = 1000
        # This will produce to source kafka cluster.
        # Floor division keeps the throughput an integer on both Python 2 and
        # Python 3 (plain `/` yields a float, 200.0, under Python 3).
        self.producer = VerifiableProducer(test_context, num_nodes=1, kafka=self.kafka, topic=self.topic,
                                           max_messages=self.num_messages, throughput=self.num_messages // 5)

    def setUp(self):
        """Bring up ZooKeeper and then Kafka before each test."""
        self.zk.start()
        self.kafka.start()

    @cluster(num_nodes=3)
    @parametrize(producer_version=str(LATEST_0_8_2))
    @parametrize(producer_version=str(LATEST_0_9))
    @parametrize(producer_version=str(LATEST_0_10_0))
    @parametrize(producer_version=str(LATEST_0_10_1))
    @parametrize(producer_version=str(DEV_BRANCH))
    def test_simple_run(self, producer_version=DEV_BRANCH):
        """
        Test that we can start VerifiableProducer on the current branch snapshot version or against the 0.8.2 jar, and
        verify that we can produce a small number of messages.
        """
        node = self.producer.nodes[0]
        node.version = KafkaVersion(producer_version)
        self.producer.start()
        wait_until(lambda: self.producer.num_acked > 5, timeout_sec=5,
                   err_msg="Producer failed to start in a reasonable amount of time.")

        # using version.vstring (distutils.version.LooseVersion) is a tricky way of ensuring
        # that this check works with DEV_BRANCH
        # When running VerifiableProducer 0.8.X, both the current branch version and 0.8.X should show up because of the
        # way verifiable producer pulls in some development directories into its classpath
        #
        # If the test fails here because 'ps .. | grep' couldn't find the process it means
        # the login and grep that is_version() performs is slower than
        # the time it takes the producer to produce its messages.
        # Easy fix is to decrease throughput= above, the good fix is to make the producer
        # not terminate until explicitly killed in this case.
        if node.version <= LATEST_0_8_2:
            assert is_version(node, [node.version.vstring, DEV_BRANCH.vstring], logger=self.logger)
        else:
            assert is_version(node, [node.version.vstring], logger=self.logger)

        self.producer.wait()
        num_produced = self.producer.num_acked
        assert num_produced == self.num_messages, "num_produced: %d, num_messages: %d" % (num_produced, self.num_messages)
class SimpleConsumerShellTest(Test):
    """
    Tests SimpleConsumerShell tool
    """

    def __init__(self, test_context):
        super(SimpleConsumerShellTest, self).__init__(test_context)
        self.num_zk = 1
        self.num_brokers = 1
        self.messages_received_count = 0

        topic_settings = {"partitions": NUM_PARTITIONS, "replication-factor": REPLICATION_FACTOR}
        self.topics = {TOPIC: topic_settings}

        self.zk = ZookeeperService(test_context, self.num_zk)

    def setUp(self):
        """Start ZooKeeper; Kafka itself is started by the test via start_kafka()."""
        self.zk.start()

    def start_kafka(self):
        """Create and start the Kafka service with the configured topics."""
        self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk, topics=self.topics)
        self.kafka.start()

    def run_producer(self):
        """Produce MAX_MESSAGES messages to TOPIC and wait until all are acked."""
        # This will produce to kafka cluster
        self.producer = VerifiableProducer(
            self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            topic=TOPIC,
            throughput=1000,
            max_messages=MAX_MESSAGES,
        )
        self.producer.start()

        def every_message_acked():
            return self.producer.num_acked == MAX_MESSAGES

        wait_until(
            every_message_acked,
            timeout_sec=10,
            err_msg="Timeout awaiting messages to be produced and acked",
        )

    def start_simple_consumer_shell(self):
        """Create and start the SimpleConsumerShell service against TOPIC."""
        self.simple_consumer_shell = SimpleConsumerShell(self.test_context, 1, self.kafka, TOPIC)
        self.simple_consumer_shell.start()

    def test_simple_consumer_shell(self):
        """
        Tests if SimpleConsumerShell is fetching expected records
        :return: None
        """
        self.start_kafka()
        self.run_producer()
        self.start_simple_consumer_shell()

        # Assert that SimpleConsumerShell is fetching expected number of messages
        expected_newlines = MAX_MESSAGES + 1

        def all_lines_received():
            shell_output = self.simple_consumer_shell.get_output()
            return shell_output.count("\n") == expected_newlines

        wait_until(
            all_lines_received,
            timeout_sec=10,
            err_msg="Timed out waiting to receive expected number of messages.",
        )
def seed_messages(self, topic, num_seed_messages):
    """Produce ``num_seed_messages`` messages to ``topic`` and wait for the acks.

    :param topic: topic the seed producer writes to.
    :param num_seed_messages: number of messages to produce; also the number
        of acks waited for.
    :return: the seed producer's ``acked`` attribute.
    """
    seed_timeout_sec = 10000
    seed_producer = VerifiableProducer(context=self.test_context,
                                       num_nodes=1,
                                       kafka=self.kafka,
                                       topic=topic,
                                       message_validator=is_int,
                                       max_messages=num_seed_messages,
                                       enable_idempotence=True)
    seed_producer.start()
    # Report the count that was actually requested. The message is formatted
    # eagerly, so reading an unrelated instance attribute here could report
    # the wrong number or raise AttributeError before any waiting happens.
    wait_until(lambda: seed_producer.num_acked >= num_seed_messages,
               timeout_sec=seed_timeout_sec,
               err_msg="Producer failed to produce messages %d in %ds." %
               (num_seed_messages, seed_timeout_sec))
    return seed_producer.acked
def seed_messages(self, topic, num_seed_messages):
    """Produce ``num_seed_messages`` messages to ``topic`` and wait for the acks.

    :param topic: topic the seed producer writes to.
    :param num_seed_messages: number of messages to produce; also the number
        of acks waited for.
    :return: the seed producer's ``acked`` attribute.
    """
    seed_timeout_sec = 10000
    seed_producer = VerifiableProducer(context=self.test_context,
                                       num_nodes=1,
                                       kafka=self.kafka,
                                       topic=topic,
                                       message_validator=is_int,
                                       max_messages=num_seed_messages,
                                       enable_idempotence=True)
    seed_producer.start()
    # Report the count that was actually requested. The message is formatted
    # eagerly, so reading an unrelated instance attribute here could report
    # the wrong number or raise AttributeError before any waiting happens.
    wait_until(lambda: seed_producer.num_acked >= num_seed_messages,
               timeout_sec=seed_timeout_sec,
               err_msg="Producer failed to produce messages %d in %ds." %
               (num_seed_messages, seed_timeout_sec))
    return seed_producer.acked
class TestVerifiableProducer(Test):
    """Sanity checks on verifiable producer service class."""

    def __init__(self, test_context):
        super(TestVerifiableProducer, self).__init__(test_context)

        self.topic = "topic"
        self.num_messages = 1000

        self.zk = ZookeeperService(test_context, num_nodes=1)

        topic_config = {"partitions": 1, "replication-factor": 1}
        self.kafka = KafkaService(
            test_context,
            num_nodes=1,
            zk=self.zk,
            topics={self.topic: topic_config},
        )

        # This will produce to source kafka cluster
        self.producer = VerifiableProducer(
            test_context,
            num_nodes=1,
            kafka=self.kafka,
            topic=self.topic,
            max_messages=self.num_messages,
            throughput=1000,
        )

    def setUp(self):
        """Bring up ZooKeeper and then Kafka before each test."""
        self.zk.start()
        self.kafka.start()

    @parametrize(producer_version=str(LATEST_0_8_2))
    @parametrize(producer_version=str(LATEST_0_9))
    @parametrize(producer_version=str(TRUNK))
    def test_simple_run(self, producer_version=TRUNK):
        """
        Test that we can start VerifiableProducer on trunk or against the 0.8.2 jar, and
        verify that we can produce a small number of messages.
        """
        producer_node = self.producer.nodes[0]
        producer_node.version = KafkaVersion(producer_version)
        self.producer.start()

        def producer_has_started():
            return self.producer.num_acked > 5

        wait_until(
            producer_has_started,
            timeout_sec=5,
            err_msg="Producer failed to start in a reasonable amount of time.",
        )

        # using version.vstring (distutils.version.LooseVersion) is a tricky way of ensuring
        # that this check works with TRUNK
        # When running VerifiableProducer 0.8.X, both trunk version and 0.8.X should show up because of the way
        # verifiable producer pulls in some trunk directories into its classpath
        expected_versions = [producer_node.version.vstring]
        if producer_node.version <= LATEST_0_8_2:
            expected_versions.append(TRUNK.vstring)
        assert is_version(producer_node, expected_versions)

        self.producer.wait()
        acked_count = self.producer.num_acked
        assert acked_count == self.num_messages, "num_produced: %d, num_messages: %d" % (acked_count, self.num_messages)
class TestVerifiableProducer(Test):
    """Sanity checks on verifiable producer service class."""

    def __init__(self, test_context):
        super(TestVerifiableProducer, self).__init__(test_context)

        self.topic = "topic"
        # ZooKeeper is only created when the test runs with the ZooKeeper quorum.
        self.zk = ZookeeperService(test_context, num_nodes=1) if quorum.for_test(test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk,
                                  topics={self.topic: {"partitions": 1, "replication-factor": 1}})

        self.num_messages = 1000
        # This will produce to source kafka cluster
        self.producer = VerifiableProducer(test_context, num_nodes=1, kafka=self.kafka, topic=self.topic,
                                           max_messages=self.num_messages, throughput=self.num_messages // 10)

    def setUp(self):
        """Start ZooKeeper when present; Kafka is started by each test after it is configured."""
        if self.zk:
            self.zk.start()

    def _produce_and_verify(self, producer_version):
        """Run the producer at ``producer_version`` and check its version and ack count.

        Shared tail of every test in this class. The name deliberately does
        not start with ``test`` so it is not mistaken for a test case
        (NOTE(review): confirm against the test runner's discovery pattern).

        :param producer_version: version string handed to ``KafkaVersion``.
        """
        node = self.producer.nodes[0]
        node.version = KafkaVersion(producer_version)
        self.producer.start()
        wait_until(lambda: self.producer.num_acked > 5, timeout_sec=15,
                   err_msg="Producer failed to start in a reasonable amount of time.")

        # using version.vstring (distutils.version.LooseVersion) is a tricky way of ensuring
        # that this check works with DEV_BRANCH
        # When running VerifiableProducer 0.8.X, both the current branch version and 0.8.X should show up because of the
        # way verifiable producer pulls in some development directories into its classpath
        #
        # If the test fails here because 'ps .. | grep' couldn't find the process it means
        # the login and grep that is_version() performs is slower than
        # the time it takes the producer to produce its messages.
        # Easy fix is to decrease throughput= above, the good fix is to make the producer
        # not terminate until explicitly killed in this case.
        if node.version <= LATEST_0_8_2:
            assert is_version(node, [node.version.vstring, DEV_BRANCH.vstring], logger=self.logger)
        else:
            assert is_version(node, [node.version.vstring], logger=self.logger)

        self.producer.wait()
        num_produced = self.producer.num_acked
        assert num_produced == self.num_messages, "num_produced: %d, num_messages: %d" % (num_produced, self.num_messages)

    @cluster(num_nodes=3)
    @parametrize(producer_version=str(LATEST_0_8_2))
    @parametrize(producer_version=str(LATEST_0_9))
    @parametrize(producer_version=str(LATEST_0_10_0))
    @parametrize(producer_version=str(LATEST_0_10_1))
    @matrix(producer_version=[str(DEV_BRANCH)], security_protocol=['PLAINTEXT', 'SSL'], metadata_quorum=quorum.all)
    @cluster(num_nodes=4)
    @matrix(producer_version=[str(DEV_BRANCH)], security_protocol=['SASL_SSL'], sasl_mechanism=['PLAIN', 'GSSAPI'],
            metadata_quorum=quorum.all)
    def test_simple_run(self, producer_version, security_protocol = 'PLAINTEXT', sasl_mechanism='PLAIN',
                        metadata_quorum=quorum.zk):
        """
        Test that we can start VerifiableProducer on the current branch snapshot version or against the 0.8.2 jar, and
        verify that we can produce a small number of messages.
        """
        self.kafka.security_protocol = security_protocol
        self.kafka.client_sasl_mechanism = sasl_mechanism
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.interbroker_sasl_mechanism = sasl_mechanism
        if self.kafka.quorum_info.using_kraft:
            # Use the same protocol and mechanism on every controller path.
            controller_quorum = self.kafka.controller_quorum
            controller_quorum.controller_security_protocol = security_protocol
            controller_quorum.controller_sasl_mechanism = sasl_mechanism
            controller_quorum.intercontroller_security_protocol = security_protocol
            controller_quorum.intercontroller_sasl_mechanism = sasl_mechanism
        self.kafka.start()

        self._produce_and_verify(producer_version)

    @cluster(num_nodes=4)
    @matrix(inter_broker_security_protocol=['PLAINTEXT', 'SSL'], metadata_quorum=[quorum.remote_kraft])
    @matrix(inter_broker_security_protocol=['SASL_SSL'], inter_broker_sasl_mechanism=['PLAIN', 'GSSAPI'],
            metadata_quorum=[quorum.remote_kraft])
    def test_multiple_kraft_security_protocols(
            self, inter_broker_security_protocol, inter_broker_sasl_mechanism='GSSAPI', metadata_quorum=quorum.remote_kraft):
        """
        Test for remote KRaft cases that we can start VerifiableProducer on the current branch snapshot version, and
        verify that we can produce a small number of messages. The inter-controller and broker-to-controller
        security protocols are defined to be different (which differs from test_simple_run, where they are the same).
        """
        self.kafka.security_protocol = self.kafka.interbroker_security_protocol = inter_broker_security_protocol
        self.kafka.client_sasl_mechanism = self.kafka.interbroker_sasl_mechanism = inter_broker_sasl_mechanism
        controller_quorum = self.kafka.controller_quorum
        # The controller paths use whichever SASL mechanism the brokers do not.
        sasl_mechanism = 'PLAIN' if inter_broker_sasl_mechanism == 'GSSAPI' else 'GSSAPI'
        # Give the three paths (inter-broker, broker-to-controller,
        # inter-controller) three different security protocols.
        if inter_broker_security_protocol == 'PLAINTEXT':
            controller_security_protocol = 'SSL'
            intercontroller_security_protocol = 'SASL_SSL'
        elif inter_broker_security_protocol == 'SSL':
            controller_security_protocol = 'SASL_SSL'
            intercontroller_security_protocol = 'PLAINTEXT'
        else:  # inter_broker_security_protocol == 'SASL_SSL'
            controller_security_protocol = 'PLAINTEXT'
            intercontroller_security_protocol = 'SSL'
        controller_quorum.controller_security_protocol = controller_security_protocol
        controller_quorum.controller_sasl_mechanism = sasl_mechanism
        controller_quorum.intercontroller_security_protocol = intercontroller_security_protocol
        controller_quorum.intercontroller_sasl_mechanism = sasl_mechanism
        self.kafka.start()

        self._produce_and_verify(str(DEV_BRANCH))

    @cluster(num_nodes=4)
    @parametrize(metadata_quorum=quorum.remote_kraft)
    def test_multiple_kraft_sasl_mechanisms(self, metadata_quorum):
        """
        Test for remote KRaft cases that we can start VerifiableProducer on the current branch snapshot version, and
        verify that we can produce a small number of messages. The inter-controller and broker-to-controller
        security protocols are both SASL_PLAINTEXT but the SASL mechanisms are different (we set GSSAPI for the
        inter-controller mechanism and PLAIN for the broker-to-controller mechanism).
        This test differs from the other tests in this class -- those use the same SASL mechanism for both paths.
        """
        self.kafka.security_protocol = self.kafka.interbroker_security_protocol = 'PLAINTEXT'
        controller_quorum = self.kafka.controller_quorum
        controller_quorum.controller_security_protocol = 'SASL_PLAINTEXT'
        controller_quorum.controller_sasl_mechanism = 'PLAIN'
        controller_quorum.intercontroller_security_protocol = 'SASL_PLAINTEXT'
        controller_quorum.intercontroller_sasl_mechanism = 'GSSAPI'
        self.kafka.start()

        self._produce_and_verify(str(DEV_BRANCH))
class TestMirrorMakerService(Test):
    """Sanity checks on mirror maker service class."""

    def __init__(self, test_context):
        super(TestMirrorMakerService, self).__init__(test_context)

        self.topic = "topic"
        self.source_zk = ZookeeperService(test_context, num_nodes=1)
        self.target_zk = ZookeeperService(test_context, num_nodes=1)

        self.source_kafka = KafkaService(
            test_context,
            num_nodes=1,
            zk=self.source_zk,
            topics={self.topic: {
                "partitions": 1,
                "replication-factor": 1
            }})
        self.target_kafka = KafkaService(
            test_context,
            num_nodes=1,
            zk=self.target_zk,
            topics={self.topic: {
                "partitions": 1,
                "replication-factor": 1
            }})

        self.num_messages = 1000
        # This will produce to source kafka cluster
        self.producer = VerifiableProducer(test_context,
                                           num_nodes=1,
                                           kafka=self.source_kafka,
                                           topic=self.topic,
                                           max_messages=self.num_messages,
                                           throughput=1000)

        # Use a regex whitelist to check that the start command is well-formed in this case
        self.mirror_maker = MirrorMaker(test_context,
                                        num_nodes=1,
                                        source=self.source_kafka,
                                        target=self.target_kafka,
                                        whitelist=".*",
                                        consumer_timeout_ms=2000)

        # This will consume from target kafka cluster
        self.consumer = ConsoleConsumer(test_context,
                                        num_nodes=1,
                                        kafka=self.target_kafka,
                                        topic=self.topic,
                                        consumer_timeout_ms=1000)

    def setUp(self):
        """Start both clusters: ZooKeeper first, then Kafka, for source and target."""
        # Source cluster
        self.source_zk.start()
        self.source_kafka.start()

        # Target cluster
        self.target_zk.start()
        self.target_kafka.start()

    def test_end_to_end(self):
        """
        Test end-to-end behavior under non-failure conditions.

        Setup: two single node Kafka clusters, each connected to its own single node zookeeper cluster.
        One is source, and the other is target. Single-node mirror maker mirrors from source to target.

        - Start mirror maker.
        - Produce a small number of messages to the source cluster.
        - Consume messages from target.
        - Verify that number of consumed messages matches the number produced.
        """
        self.mirror_maker.start()
        # Check that consumer_timeout_ms setting made it to config file.
        # A raw string keeps the regex backslashes literal without relying on
        # Python passing the unrecognized escape "\." through unchanged (which
        # triggers an invalid-escape warning on newer interpreters).
        self.mirror_maker.nodes[0].account.ssh(
            r'grep "consumer\.timeout\.ms" %s' % MirrorMaker.CONSUMER_CONFIG,
            allow_fail=False)

        self.producer.start()
        self.producer.wait(10)
        self.consumer.start()
        self.consumer.wait(10)

        num_consumed = len(self.consumer.messages_consumed[1])
        num_produced = self.producer.num_acked
        assert num_produced == self.num_messages, "num_produced: %d, num_messages: %d" % (
            num_produced, self.num_messages)
        assert num_produced == num_consumed, "num_produced: %d, num_consumed: %d" % (
            num_produced, num_consumed)

        self.mirror_maker.stop()
class GetOffsetShellTest(Test):
    """
    Tests GetOffsetShell tool
    """

    def __init__(self, test_context):
        super(GetOffsetShellTest, self).__init__(test_context)
        self.num_zk = 1
        self.num_brokers = 1
        self.messages_received_count = 0
        self.topics = {
            TOPIC_TEST_NAME: {
                'partitions': NUM_PARTITIONS,
                'replication-factor': REPLICATION_FACTOR
            },
            TOPIC_TEST_PATTERN1: {
                'partitions': 1,
                'replication-factor': REPLICATION_FACTOR
            },
            TOPIC_TEST_PATTERN2: {
                'partitions': 1,
                'replication-factor': REPLICATION_FACTOR
            },
            TOPIC_TEST_PARTITIONS: {
                'partitions': 2,
                'replication-factor': REPLICATION_FACTOR
            },
            TOPIC_TEST_INTERNAL_FILTER: {
                'partitions': 1,
                'replication-factor': REPLICATION_FACTOR
            },
            TOPIC_TEST_TOPIC_PARTITIONS1: {
                'partitions': 2,
                'replication-factor': REPLICATION_FACTOR
            },
            TOPIC_TEST_TOPIC_PARTITIONS2: {
                'partitions': 2,
                'replication-factor': REPLICATION_FACTOR
            }
        }

        # ZooKeeper is only created when the test runs with the ZooKeeper quorum.
        self.zk = ZookeeperService(test_context, self.num_zk) if quorum.for_test(
            test_context) == quorum.zk else None

    def setUp(self):
        """Start ZooKeeper when present; Kafka is started by each test via start_kafka()."""
        if self.zk:
            self.zk.start()

    def start_kafka(self, security_protocol, interbroker_security_protocol):
        """Create and start the Kafka service with the given security protocols."""
        self.kafka = KafkaService(
            self.test_context,
            self.num_brokers,
            self.zk,
            security_protocol=security_protocol,
            interbroker_security_protocol=interbroker_security_protocol,
            topics=self.topics)
        self.kafka.start()

    def start_producer(self, topic):
        """Produce MAX_MESSAGES messages to ``topic`` and wait until they are acked."""
        # This will produce to kafka cluster
        self.producer = VerifiableProducer(self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=topic,
                                           throughput=1000,
                                           max_messages=MAX_MESSAGES,
                                           repeating_keys=MAX_MESSAGES)
        self.producer.start()
        current_acked = self.producer.num_acked
        wait_until(
            lambda: self.producer.num_acked >= current_acked + MAX_MESSAGES,
            timeout_sec=10,
            err_msg="Timeout awaiting messages to be produced and acked")

    def start_consumer(self, topic):
        """Start a console consumer reading from ``topic``."""
        self.consumer = ConsoleConsumer(self.test_context,
                                        num_nodes=self.num_brokers,
                                        kafka=self.kafka,
                                        topic=topic,
                                        consumer_timeout_ms=1000)
        self.consumer.start()

    def check_message_count_sum_equals(self, message_count, **kwargs):
        """Return True when the summed offsets reported by the tool equal ``message_count``.

        ``kwargs`` are forwarded unchanged to ``extract_message_count_sum``.
        """
        return self.extract_message_count_sum(**kwargs) == message_count

    def extract_message_count_sum(self, **kwargs):
        """Run GetOffsetShell with ``kwargs`` and add up the offsets it prints.

        The output is split into lines; for each non-empty line the text after
        the last ``:`` is parsed as an integer and the values are summed.

        :return: the integer sum (0 when the output has no non-empty lines).
        :raises ValueError: if a line's last ``:``-separated field is not an integer.
        """
        output = self.kafka.get_offset_shell(**kwargs)
        return sum(int(line.split(":")[-1]) for line in output.split("\n") if line)

    @cluster(num_nodes=3)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_get_offset_shell_topic_name(self,
                                         security_protocol='PLAINTEXT',
                                         metadata_quorum=quorum.zk):
        """
        Tests if GetOffsetShell handles --topic argument with a simple name correctly
        :return: None
        """
        self.start_kafka(security_protocol, security_protocol)
        self.start_producer(TOPIC_TEST_NAME)

        # Assert that offset is correctly indicated by GetOffsetShell tool
        wait_until(lambda: self.check_message_count_sum_equals(
            MAX_MESSAGES, topic=TOPIC_TEST_NAME),
                   timeout_sec=10,
                   err_msg="Timed out waiting to reach expected offset.")

    @cluster(num_nodes=4)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_get_offset_shell_topic_pattern(self,
                                            security_protocol='PLAINTEXT',
                                            metadata_quorum=quorum.zk):
        """
        Tests if GetOffsetShell handles --topic argument with a pattern correctly
        :return: None
        """
        self.start_kafka(security_protocol, security_protocol)
        self.start_producer(TOPIC_TEST_PATTERN1)
        self.start_producer(TOPIC_TEST_PATTERN2)

        # Assert that offset is correctly indicated by GetOffsetShell tool
        wait_until(lambda: self.check_message_count_sum_equals(
            2 * MAX_MESSAGES, topic=TOPIC_TEST_PATTERN_PATTERN),
                   timeout_sec=10,
                   err_msg="Timed out waiting to reach expected offset.")

    @cluster(num_nodes=3)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_get_offset_shell_partitions(self,
                                         security_protocol='PLAINTEXT',
                                         metadata_quorum=quorum.zk):
        """
        Tests if GetOffsetShell handles --partitions argument correctly
        :return: None
        """
        self.start_kafka(security_protocol, security_protocol)
        self.start_producer(TOPIC_TEST_PARTITIONS)

        def fetch_and_sum_partitions_separately():
            partition_count0 = self.extract_message_count_sum(
                topic=TOPIC_TEST_PARTITIONS, partitions="0")
            partition_count1 = self.extract_message_count_sum(
                topic=TOPIC_TEST_PARTITIONS, partitions="1")
            return partition_count0 + partition_count1 == MAX_MESSAGES

        # Assert that offset is correctly indicated when fetching partitions one by one
        wait_until(fetch_and_sum_partitions_separately,
                   timeout_sec=10,
                   err_msg="Timed out waiting to reach expected offset.")

        # Assert that offset is correctly indicated when fetching partitions together
        wait_until(lambda: self.check_message_count_sum_equals(
            MAX_MESSAGES, topic=TOPIC_TEST_PARTITIONS),
                   timeout_sec=10,
                   err_msg="Timed out waiting to reach expected offset.")

    @cluster(num_nodes=4)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_get_offset_shell_topic_partitions(self,
                                               security_protocol='PLAINTEXT',
                                               metadata_quorum=quorum.zk):
        """
        Tests if GetOffsetShell handles --topic-partitions argument correctly
        :return: None
        """
        self.start_kafka(security_protocol, security_protocol)
        self.start_producer(TOPIC_TEST_TOPIC_PARTITIONS1)
        self.start_producer(TOPIC_TEST_TOPIC_PARTITIONS2)

        # Assert that a single topic pattern matches all 4 partitions
        wait_until(lambda: self.check_message_count_sum_equals(
            2 * MAX_MESSAGES,
            topic_partitions=TOPIC_TEST_TOPIC_PARTITIONS_PATTERN),
                   timeout_sec=10,
                   err_msg="Timed out waiting to reach expected offset.")

        # Assert that a topic pattern with partition range matches all 4 partitions
        wait_until(lambda: self.check_message_count_sum_equals(
            2 * MAX_MESSAGES,
            topic_partitions=TOPIC_TEST_TOPIC_PARTITIONS_PATTERN + ":0-2"),
                   timeout_sec=10,
                   err_msg="Timed out waiting to reach expected offset.")

        # Assert that 2 separate topic patterns match all 4 partitions
        wait_until(lambda: self.check_message_count_sum_equals(
            2 * MAX_MESSAGES,
            topic_partitions=TOPIC_TEST_TOPIC_PARTITIONS1 + "," +
            TOPIC_TEST_TOPIC_PARTITIONS2),
                   timeout_sec=10,
                   err_msg="Timed out waiting to reach expected offset.")

        # Assert that 4 separate topic-partition patterns match all 4 partitions
        wait_until(lambda: self.check_message_count_sum_equals(
            2 * MAX_MESSAGES,
            topic_partitions=TOPIC_TEST_TOPIC_PARTITIONS1 + ":0," +
            TOPIC_TEST_TOPIC_PARTITIONS1 + ":1," +
            TOPIC_TEST_TOPIC_PARTITIONS2 + ":0," +
            TOPIC_TEST_TOPIC_PARTITIONS2 + ":1"),
                   timeout_sec=10,
                   err_msg="Timed out waiting to reach expected offset.")

        # Assert that only partitions #0 are matched with topic pattern and fix partition number
        filtered_partitions = self.kafka.get_offset_shell(
            topic_partitions=TOPIC_TEST_TOPIC_PARTITIONS_PATTERN + ":0")
        assert 1 == filtered_partitions.count(
            "%s:%s" % (TOPIC_TEST_TOPIC_PARTITIONS1, 0))
        assert 0 == filtered_partitions.count(
            "%s:%s" % (TOPIC_TEST_TOPIC_PARTITIONS1, 1))
        assert 1 == filtered_partitions.count(
            "%s:%s" % (TOPIC_TEST_TOPIC_PARTITIONS2, 0))
        assert 0 == filtered_partitions.count(
            "%s:%s" % (TOPIC_TEST_TOPIC_PARTITIONS2, 1))

        # Assert that only partitions #1 are matched with topic pattern and partition lower bound
        filtered_partitions = self.kafka.get_offset_shell(
            topic_partitions=TOPIC_TEST_TOPIC_PARTITIONS_PATTERN + ":1-")
        assert 1 == filtered_partitions.count(
            "%s:%s" % (TOPIC_TEST_TOPIC_PARTITIONS1, 1))
        assert 0 == filtered_partitions.count(
            "%s:%s" % (TOPIC_TEST_TOPIC_PARTITIONS1, 0))
        assert 1 == filtered_partitions.count(
            "%s:%s" % (TOPIC_TEST_TOPIC_PARTITIONS2, 1))
        assert 0 == filtered_partitions.count(
            "%s:%s" % (TOPIC_TEST_TOPIC_PARTITIONS2, 0))

        # Assert that only partitions #0 are matched with topic pattern and partition upper bound
        filtered_partitions = self.kafka.get_offset_shell(
            topic_partitions=TOPIC_TEST_TOPIC_PARTITIONS_PATTERN + ":-1")
        assert 1 == filtered_partitions.count(
            "%s:%s" % (TOPIC_TEST_TOPIC_PARTITIONS1, 0))
        assert 0 == filtered_partitions.count(
            "%s:%s" % (TOPIC_TEST_TOPIC_PARTITIONS1, 1))
        assert 1 == filtered_partitions.count(
            "%s:%s" % (TOPIC_TEST_TOPIC_PARTITIONS2, 0))
        assert 0 == filtered_partitions.count(
            "%s:%s" % (TOPIC_TEST_TOPIC_PARTITIONS2, 1))

    @cluster(num_nodes=4)
    @matrix(metadata_quorum=quorum.all_non_upgrade)
    def test_get_offset_shell_internal_filter(self,
                                              security_protocol='PLAINTEXT',
                                              metadata_quorum=quorum.zk):
        """
        Tests if GetOffsetShell handles --exclude-internal-topics flag correctly
        :return: None
        """
        self.start_kafka(security_protocol, security_protocol)
        self.start_producer(TOPIC_TEST_INTERNAL_FILTER)

        # Create consumer and poll messages to create consumer offset record
        self.start_consumer(TOPIC_TEST_INTERNAL_FILTER)
        node = self.consumer.nodes[0]
        wait_until(lambda: self.consumer.alive(node),
                   timeout_sec=20,
                   backoff_sec=.2,
                   err_msg="Consumer was too slow to start")

        # Assert that the offsets reported for the (single-partition) test topic
        # add up to the number of produced messages
        wait_until(lambda: self.check_message_count_sum_equals(
            MAX_MESSAGES, topic_partitions=TOPIC_TEST_INTERNAL_FILTER),
                   timeout_sec=10,
                   err_msg="Timed out waiting to reach expected offset.")

        # No filters
        # Assert that without exclusion, we can find both the test topic and the __consumer_offsets internal topic
        offset_output = self.kafka.get_offset_shell()
        assert "__consumer_offsets" in offset_output
        assert TOPIC_TEST_INTERNAL_FILTER in offset_output

        # Assert that with exclusion, we can find the test topic but not the __consumer_offsets internal topic
        offset_output = self.kafka.get_offset_shell(
            exclude_internal_topics=True)
        assert "__consumer_offsets" not in offset_output
        assert TOPIC_TEST_INTERNAL_FILTER in offset_output

        # Topic filter
        # Assert that without exclusion, we can find both the test topic and the __consumer_offsets internal topic
        offset_output = self.kafka.get_offset_shell(topic=".*consumer_offsets")
        assert "__consumer_offsets" in offset_output
        assert TOPIC_TEST_INTERNAL_FILTER in offset_output

        # Assert that with exclusion, we can find the test topic but not the __consumer_offsets internal topic
        offset_output = self.kafka.get_offset_shell(
            topic=".*consumer_offsets", exclude_internal_topics=True)
        assert "__consumer_offsets" not in offset_output
        assert TOPIC_TEST_INTERNAL_FILTER in offset_output

        # Topic-partition filter
        # Assert that without exclusion, we can find both the test topic and the __consumer_offsets internal topic
        offset_output = self.kafka.get_offset_shell(
            topic_partitions=".*consumer_offsets:0")
        assert "__consumer_offsets" in offset_output
        assert TOPIC_TEST_INTERNAL_FILTER in offset_output

        # Assert that with exclusion, we can find the test topic but not the __consumer_offsets internal topic
        offset_output = self.kafka.get_offset_shell(
            topic_partitions=".*consumer_offsets:0",
            exclude_internal_topics=True)
        assert "__consumer_offsets" not in offset_output
        assert TOPIC_TEST_INTERNAL_FILTER in offset_output
class DelegationTokenTest(Test):
    """
    Walks a delegation token through its lifecycle on a one-broker cluster:
    generate, renew, produce and consume with it, then expire it.
    """

    def __init__(self, test_context):
        super(DelegationTokenTest, self).__init__(test_context)

        self.test_context = test_context
        self.topic = "topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(
            self.test_context, num_nodes=1, zk=self.zk, zk_chroot="/kafka",
            topics={self.topic: {"partitions": 1, "replication-factor": 1}},
            server_prop_overides=[
                # 604800000 ms = 7 days, 86400000 ms = 1 day
                [config_property.DELEGATION_TOKEN_MAX_LIFETIME_MS, "604800000"],
                [config_property.DELEGATION_TOKEN_EXPIRY_TIME_MS, "86400000"],
                [config_property.DELEGATION_TOKEN_SECRET_KEY, "test12345"],
                [config_property.SASL_ENABLED_MECHANISMS, "GSSAPI,SCRAM-SHA-256"]
            ])

        # Location of the JAAS config on the client nodes; the content stays
        # empty until generate_delegation_token() has produced a token.
        self.jaas_deleg_conf_path = "/tmp/jaas_deleg.conf"
        self.jaas_deleg_conf = ""

        # Client properties handed to producer and consumer. Each property
        # must sit on its own line: on a single line everything after the
        # first '=' would be read as the value of security.protocol.
        self.client_properties_content = (
            "\n"
            "security.protocol=SASL_PLAINTEXT\n"
            "sasl.mechanism=SCRAM-SHA-256\n"
            "sasl.kerberos.service.name=kafka\n"
            "client.id=console-consumer\n"
        )
        self.client_kafka_opts = ' -Djava.security.auth.login.config=' + self.jaas_deleg_conf_path

        self.producer = VerifiableProducer(self.test_context, num_nodes=1, kafka=self.kafka, topic=self.topic,
                                           max_messages=1, throughput=1,
                                           kafka_opts_override=self.client_kafka_opts,
                                           client_prop_file_override=self.client_properties_content)

        self.consumer = ConsoleConsumer(self.test_context, num_nodes=1, kafka=self.kafka, topic=self.topic,
                                        kafka_opts_override=self.client_kafka_opts,
                                        client_prop_file_override=self.client_properties_content)

        self.kafka.security_protocol = 'SASL_PLAINTEXT'
        self.kafka.client_sasl_mechanism = 'GSSAPI,SCRAM-SHA-256'
        self.kafka.interbroker_sasl_mechanism = 'GSSAPI'

    def setUp(self):
        """Start ZooKeeper; the broker itself is started by the test method."""
        self.zk.start()

    def tearDown(self):
        """Delete the JAAS config file from the producer and the consumer node."""
        self.producer.nodes[0].account.remove(self.jaas_deleg_conf_path)
        self.consumer.nodes[0].account.remove(self.jaas_deleg_conf_path)

    def generate_delegation_token(self):
        """Request a token and keep the JAAS configuration built from it."""
        self.logger.debug("Request delegation token")
        self.delegation_tokens.generate_delegation_token()
        self.jaas_deleg_conf = self.delegation_tokens.create_jaas_conf_with_delegation_token()

    def expire_delegation_token(self):
        """Expire the current token, identified by its HMAC."""
        self.kafka.client_sasl_mechanism = 'GSSAPI,SCRAM-SHA-256'
        token_hmac = self.delegation_tokens.token_hmac()
        self.delegation_tokens.expire_delegation_token(token_hmac)

    def produce_with_delegation_token(self):
        """Write the JAAS config to the producer node and start the producer."""
        # Forget acks from any earlier run so num_acked reflects this run only
        self.producer.acked_values = []
        self.producer.nodes[0].account.create_file(self.jaas_deleg_conf_path, self.jaas_deleg_conf)
        self.logger.debug(self.jaas_deleg_conf)
        self.producer.start()

    def consume_with_delegation_token(self):
        """Write the JAAS config to the consumer node, then run the consumer to completion."""
        self.logger.debug("Consume messages with delegation token")
        self.consumer.nodes[0].account.create_file(self.jaas_deleg_conf_path, self.jaas_deleg_conf)
        self.logger.debug(self.jaas_deleg_conf)
        self.consumer.consumer_timeout_ms = 5000
        self.consumer.start()
        self.consumer.wait()

    def get_datetime_ms(self, input_date):
        """
        Convert a "%Y-%m-%dT%H:%M" timestamp string to integer milliseconds
        since the epoch. time.mktime() treats the parsed value as local time.
        """
        parsed = datetime.strptime(input_date, "%Y-%m-%dT%H:%M")
        return int(time.mktime(parsed.timetuple()) * 1000)

    def renew_delegation_token(self):
        """Renew the token with an expiry one second later than its current one."""
        dt = self.delegation_tokens.parse_delegation_token_out()
        orig_expiry_date_ms = self.get_datetime_ms(dt["expirydate"])
        new_expirydate_ms = orig_expiry_date_ms + 1000
        self.delegation_tokens.renew_delegation_token(dt["hmac"], new_expirydate_ms)

    @cluster(num_nodes=5)
    def test_delegation_token_lifecycle(self):
        """
        Generate and renew a token, verify one message can be produced and
        consumed with it, then expire it and verify nothing is acked.
        """
        self.kafka.start()
        self.delegation_tokens = DelegationTokens(self.kafka, self.test_context)

        self.generate_delegation_token()
        self.renew_delegation_token()

        self.produce_with_delegation_token()
        wait_until(lambda: self.producer.num_acked > 0, timeout_sec=30,
                   err_msg="Expected producer to still be producing.")
        assert 1 == self.producer.num_acked, "number of acked messages: %d" % self.producer.num_acked

        self.consume_with_delegation_token()
        num_consumed = len(self.consumer.messages_consumed[1])
        assert 1 == num_consumed, "number of consumed messages: %d" % num_consumed

        self.expire_delegation_token()

        # NOTE(review): this assertion runs immediately after start() without
        # waiting, so it may pass before the producer had a chance to be
        # acked -- confirm whether a wait is intended here.
        self.produce_with_delegation_token()
        assert 0 == self.producer.num_acked, "number of acked messages: %d" % self.producer.num_acked
class StreamsOptimizedTest(Test):
    """
    Test doing upgrades of a Kafka Streams application
    that is un-optimized initially then optimized
    """

    input_topic = 'inputTopic'
    aggregation_topic = 'aggregationTopic'
    reduce_topic = 'reduceTopic'
    join_topic = 'joinTopic'
    # Raw string so the backslashes reach the log monitor unchanged
    operation_pattern = r'AGGREGATED\|REDUCED\|JOINED'
    stopped_message = 'OPTIMIZE_TEST Streams Stopped'

    def __init__(self, test_context):
        super(StreamsOptimizedTest, self).__init__(test_context)
        self.topics = {
            self.input_topic: {'partitions': 6},
            self.aggregation_topic: {'partitions': 6},
            self.reduce_topic: {'partitions': 6},
            self.join_topic: {'partitions': 6}
        }

        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zookeeper, topics=self.topics)

        self.producer = VerifiableProducer(self.test_context,
                                           1,
                                           self.kafka,
                                           self.input_topic,
                                           throughput=1000,
                                           acks=1)

    def test_upgrade_optimized_topology(self):
        """
        Run three processors un-optimized (expecting 4 repartition topics),
        stop them, then restart them with OPTIMIZED_CONFIG = 'all'
        (expecting 1 repartition topic) and verify processing both times.
        """
        self.zookeeper.start()
        self.kafka.start()

        processor1 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)
        processor2 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)
        processor3 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)

        processors = [processor1, processor2, processor3]

        # produce records continually during the test
        self.producer.start()

        # start all processors unoptimized
        for processor in processors:
            self.set_topics(processor)
            processor.CLEAN_NODE_ENABLED = False
            self.verify_running_repartition_topic_count(processor, 4)

        self.verify_processing(processors, verify_individual_operations=False)

        stop_processors(processors, self.stopped_message)

        # start again with topology optimized
        for processor in processors:
            processor.OPTIMIZED_CONFIG = 'all'
            self.verify_running_repartition_topic_count(processor, 1)

        self.verify_processing(processors, verify_individual_operations=True)

        stop_processors(processors, self.stopped_message)

        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    @staticmethod
    def verify_running_repartition_topic_count(processor, repartition_topic_count):
        """
        Start the processor and wait (up to 120s) for its stdout to report the
        transition to RUNNING with the expected repartition topic count.
        """
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.start()
            monitor.wait_until('REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' % repartition_topic_count,
                               timeout_sec=120,
                               err_msg="Never saw 'REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' message "
                                       % repartition_topic_count + str(processor.node.account))

    def verify_processing(self, processors, verify_individual_operations):
        """
        Check processing output on every processor, except those whose log
        shows the all-source-task assignment.

        :param processors: the streams services to check
        :param verify_individual_operations: when True wait for each operation
            separately, otherwise wait for the combined alternation pattern
        """
        for processor in processors:
            if not self.all_source_subtopology_tasks(processor):
                if verify_individual_operations:
                    for operation in self.operation_pattern.split(r'\|'):
                        self.do_verify(processor, operation)
                else:
                    self.do_verify(processor, self.operation_pattern)
            else:
                self.logger.info("Skipping processor %s with all source tasks" % processor.node.account)

    def do_verify(self, processor, pattern):
        """Wait (up to 60s) for the pattern to show up in the processor's stdout."""
        self.logger.info("Verifying %s processing pattern in STDOUT_FILE" % pattern)
        with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            monitor.wait_until(pattern,
                               timeout_sec=60,
                               err_msg="Never saw processing of %s " % pattern + str(processor.node.account))

    def all_source_subtopology_tasks(self, processor):
        """
        Return True if the processor's log has a 'current active tasks' line
        listing exactly four tasks that all start with '0_'. The log is
        checked up to five times, one second apart.
        """
        # Raw string: in a normal literal '\1' is the octal escape for chr(1),
        # so sed would never receive the back-reference to the captured list.
        command = r"sed -n 's/.*current active tasks: \[\(\(0_[0-9], \)\{3\}0_[0-9]\)\].*/\1/p' %s" \
                  % processor.LOG_FILE
        retries = 0
        while retries < 5:
            found = list(processor.node.account.ssh_capture(command, allow_fail=True))
            self.logger.info("Returned %s from assigned task check" % found)
            if len(found) > 0:
                return True
            retries += 1
            time.sleep(1)

        return False

    def set_topics(self, processor):
        """Point the processor at this test's topics."""
        processor.INPUT_TOPIC = self.input_topic
        processor.AGGREGATION_TOPIC = self.aggregation_topic
        processor.REDUCE_TOPIC = self.reduce_topic
        processor.JOIN_TOPIC = self.join_topic
class ReplicaVerificationToolTest(Test):
    """
    Tests ReplicaVerificationTool
    """

    def __init__(self, test_context):
        super(ReplicaVerificationToolTest, self).__init__(test_context)

        # Cluster sizing
        self.num_zk = 1
        self.num_brokers = 2
        self.messages_received_count = 0

        # A single partition replicated on both brokers
        topic_settings = {'partitions': 1, 'replication-factor': 2}
        self.topics = {TOPIC: topic_settings}

        self.zk = ZookeeperService(test_context, self.num_zk)

        # Created on demand by the start_* helpers
        self.kafka = None
        self.producer = None
        self.replica_verifier = None

    def setUp(self):
        """Only ZooKeeper is started up front."""
        self.zk.start()

    def start_kafka(self, security_protocol, interbroker_security_protocol):
        """Create and start the brokers with the given security protocols."""
        broker_options = dict(
            security_protocol=security_protocol,
            interbroker_security_protocol=interbroker_security_protocol,
            topics=self.topics)
        self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk, **broker_options)
        self.kafka.start()

    def start_replica_verification_tool(self, security_protocol):
        """Create and start a one-node ReplicaVerificationTool for TOPIC."""
        self.replica_verifier = ReplicaVerificationTool(
            self.test_context, 1, self.kafka, TOPIC,
            report_interval_ms=REPORT_INTERVAL_MS,
            security_protocol=security_protocol)
        self.replica_verifier.start()

    def start_producer(self, max_messages, acks, timeout):
        """
        Start a fresh producer and block until max_messages are acked.
        With acks == 0 the wait condition is satisfied immediately.
        """
        # This will produce to kafka cluster
        current_acked = 0
        self.producer = VerifiableProducer(
            self.test_context, num_nodes=1, kafka=self.kafka, topic=TOPIC,
            throughput=1000, acks=acks, max_messages=max_messages)
        self.producer.start()

        def all_messages_acked():
            return acks == 0 or self.producer.num_acked >= current_acked + max_messages

        wait_until(all_messages_acked, timeout_sec=timeout,
                   err_msg="Timeout awaiting messages to be produced and acked")

    def stop_producer(self):
        """Stop the current producer."""
        self.producer.stop()

    @cluster(num_nodes=6)
    def test_replica_lags(self, security_protocol='PLAINTEXT'):
        """
        Tests ReplicaVerificationTool
        :return: None
        """
        self.start_kafka(security_protocol, security_protocol)
        self.start_replica_verification_tool(security_protocol)

        def partition_lag():
            return self.replica_verifier.get_lag_for_partition(TOPIC, 0)

        # Verify that there is no lag in replicas and is correctly reported by ReplicaVerificationTool
        self.start_producer(max_messages=10, acks=-1, timeout=15)
        wait_until(lambda: partition_lag() == 0, timeout_sec=10,
                   err_msg="Timed out waiting to reach zero replica lags.")
        self.stop_producer()

        # Verify that there is lag in replicas and is correctly reported by ReplicaVerificationTool
        self.start_producer(max_messages=10000, acks=0, timeout=5)
        wait_until(lambda: partition_lag() > 0, timeout_sec=10,
                   err_msg="Timed out waiting to reach non-zero number of replica lags.")
class StreamsStaticMembershipTest(Test):
    """
    Tests using static membership when broker points to minimum supported
    version (2.3) or higher.
    """

    input_topic = 'inputTopic'
    pattern = 'PROCESSED'
    running_message = 'REBALANCING -> RUNNING'
    stopped_message = 'Static membership test closed'

    def __init__(self, test_context):
        super(StreamsStaticMembershipTest, self).__init__(test_context)

        self.topics = {self.input_topic: {'partitions': 18}}
        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zookeeper, topics=self.topics)
        self.producer = VerifiableProducer(self.test_context, 1, self.kafka, self.input_topic,
                                           throughput=1000, acks=1)

    def test_rolling_bounces_will_not_trigger_rebalance_under_static_membership(self):
        """
        Bounce three static members one by one, several times over, and check
        that the generations logged for the bounces are all the same value.
        """
        self.zookeeper.start()
        self.kafka.start()

        num_threads = 3
        processors = [
            StaticMemberTestService(self.test_context, self.kafka, member_name, num_threads)
            for member_name in ("consumer-A", "consumer-B", "consumer-C")
        ]

        self.producer.start()

        for processor in processors:
            processor.CLEAN_NODE_ENABLED = False
            self.set_topics(processor)
            verify_running(processor, self.running_message)

        self.verify_processing(processors)

        # do several rolling bounces
        num_bounces = 3
        for _ in range(num_bounces):
            for processor in processors:
                verify_stopped(processor, self.stopped_message)
                verify_running(processor, self.running_message)

        # Only the most recent (bounces x threads) generation entries of each
        # processor are compared; the first one seen becomes the reference.
        expected_entries = num_bounces * num_threads
        stable_generation = -1
        for processor in processors:
            generations = extract_generation_from_logs(processor)
            assert expected_entries <= len(generations), \
                "Smaller than minimum expected %d generation messages, actual %d" % (expected_entries, len(generations))

            for raw_generation in generations[-expected_entries:]:
                current = int(raw_generation)
                if stable_generation == -1:
                    stable_generation = current
                assert stable_generation == current, \
                    "Stream rolling bounce have caused unexpected generation bump %d" % current

        self.verify_processing(processors)

        stop_processors(processors, self.stopped_message)

        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    def verify_processing(self, processors):
        """Wait (up to 60s each) for every processor's stdout to show the processing pattern."""
        for processor in processors:
            account = processor.node.account
            with account.monitor_log(processor.STDOUT_FILE) as monitor:
                failure_text = "Never saw processing of %s " % self.pattern + str(processor.node.account)
                monitor.wait_until(self.pattern, timeout_sec=60, err_msg=failure_text)

    def set_topics(self, processor):
        """Point the processor at this test's input topic."""
        processor.INPUT_TOPIC = self.input_topic
class StreamsOptimizedTest(Test):
    """
    Test doing upgrades of a Kafka Streams application
    that is un-optimized initially then optimized
    """

    input_topic = 'inputTopic'
    aggregation_topic = 'aggregationTopic'
    reduce_topic = 'reduceTopic'
    join_topic = 'joinTopic'
    # Raw string: the backslashes belong to the grep alternation pattern and
    # are not valid Python escape sequences.
    operation_pattern = r'AGGREGATED\|REDUCED\|JOINED'
    stopped_message = 'OPTIMIZE_TEST Streams Stopped'

    def __init__(self, test_context):
        super(StreamsOptimizedTest, self).__init__(test_context)
        self.topics = {
            self.input_topic: {'partitions': 6},
            self.aggregation_topic: {'partitions': 6},
            self.reduce_topic: {'partitions': 6},
            self.join_topic: {'partitions': 6}
        }

        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zookeeper, topics=self.topics)

        self.producer = VerifiableProducer(self.test_context,
                                           1,
                                           self.kafka,
                                           self.input_topic,
                                           throughput=1000,
                                           acks=1)

    def test_upgrade_optimized_topology(self):
        """
        Run three processors un-optimized (expecting 4 repartition topics),
        reset the application and rotate the old output files aside, then run
        them again with OPTIMIZED_CONFIG = 'all' (expecting 1 repartition topic).
        """
        self.zookeeper.start()
        self.kafka.start()

        processor1 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)
        processor2 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)
        processor3 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)

        processors = [processor1, processor2, processor3]

        self.logger.info("produce records continually during the test")
        self.producer.start()

        self.logger.info("start all processors unoptimized")
        for processor in processors:
            self.set_topics(processor)
            processor.CLEAN_NODE_ENABLED = False
            self.verify_running_repartition_topic_count(processor, 4)

        self.logger.info("verify unoptimized")
        self.verify_processing(processors, verify_individual_operations=False)

        self.logger.info("stop unoptimized")
        stop_processors(processors, self.stopped_message)

        self.logger.info("reset")
        self.reset_application()
        for processor in processors:
            # Move the first run's files out of the way so the checks on the
            # second run only see output written after the restart.
            for path in (processor.LOG_FILE, processor.STDOUT_FILE,
                         processor.STDERR_FILE, processor.CONFIG_FILE):
                processor.node.account.ssh("mv " + path + " " + path + ".1", allow_fail=False)

        self.logger.info("start again with topology optimized")
        for processor in processors:
            processor.OPTIMIZED_CONFIG = 'all'
            self.verify_running_repartition_topic_count(processor, 1)

        self.logger.info("verify optimized")
        self.verify_processing(processors, verify_individual_operations=True)

        self.logger.info("stop optimized")
        stop_processors(processors, self.stopped_message)

        self.logger.info("teardown")
        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    def reset_application(self):
        """Run the streams resetter for application id 'StreamsOptimizedTest' on the input topic."""
        resetter = StreamsResetter(self.test_context, self.kafka, topic=self.input_topic,
                                   applicationId='StreamsOptimizedTest')
        resetter.start()
        # resetter is not long-term running but it would be better to check the pid by stopping it
        resetter.stop()

    @staticmethod
    def verify_running_repartition_topic_count(processor, repartition_topic_count):
        """
        Start the processor and wait (up to 120s) for its stdout to report the
        transition to RUNNING with the expected repartition topic count.
        """
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.start()
            monitor.wait_until('REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' % repartition_topic_count,
                               timeout_sec=120,
                               err_msg="Never saw 'REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' message "
                                       % repartition_topic_count + str(processor.node.account))

    def verify_processing(self, processors, verify_individual_operations):
        """
        Check processing output on every processor.

        :param processors: the streams services to check
        :param verify_individual_operations: when True look for each operation
            separately, otherwise look for the combined alternation pattern
        """
        # This test previously had logic to account for skewed assignments, in which not all processors may
        # receive active assignments. I don't think this will happen anymore, but keep an eye out if we see
        # test failures here. If that does resurface, note that the prior implementation was not correct.
        # A better approach would be to make sure we see processing of each partition across the whole cluster
        # instead of just expecting to see each node perform some processing.
        for processor in processors:
            if verify_individual_operations:
                for operation in self.operation_pattern.split(r'\|'):
                    self.do_verify(processor, operation)
            else:
                self.do_verify(processor, self.operation_pattern)

    def do_verify(self, processor, pattern):
        """
        Poll the processor's stdout file with grep until the pattern is found,
        failing with a descriptive message after 60 seconds.
        """
        self.logger.info("Verifying %s processing pattern in STDOUT_FILE" % pattern)
        self.logger.info(list(processor.node.account.ssh_capture("ls -lh %s" % (processor.STDOUT_FILE),
                                                                 allow_fail=True)))
        # grep exits with 0 once at least one matching line exists
        wait_until(
            lambda: processor.node.account.ssh("grep --max-count 1 '%s' %s" % (pattern, processor.STDOUT_FILE),
                                               allow_fail=True) == 0,
            timeout_sec=60,
            err_msg="Never saw processing of %s " % pattern + str(processor.node.account)
        )

    def set_topics(self, processor):
        """Point the processor at this test's topics."""
        processor.INPUT_TOPIC = self.input_topic
        processor.AGGREGATION_TOPIC = self.aggregation_topic
        processor.REDUCE_TOPIC = self.reduce_topic
        processor.JOIN_TOPIC = self.join_topic
class StreamsNamedRepartitionTopicTest(Test):
    """
    Tests using a named repartition topic by starting
    application then doing a rolling upgrade with added
    operations and the application still runs
    """

    input_topic = 'inputTopic'
    aggregation_topic = 'aggregationTopic'
    pattern = 'AGGREGATED'

    def __init__(self, test_context):
        super(StreamsNamedRepartitionTopicTest, self).__init__(test_context)
        self.topics = {
            self.input_topic: {'partitions': 6},
            self.aggregation_topic: {'partitions': 6}
        }

        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zookeeper, topics=self.topics)

        self.producer = VerifiableProducer(self.test_context,
                                           1,
                                           self.kafka,
                                           self.input_topic,
                                           throughput=1000,
                                           acks=1)

    def test_upgrade_topology_with_named_repartition_topic(self):
        """
        Start three processors, then restart them one at a time with
        ADD_ADDITIONAL_OPS = 'true' and verify processing before and after.
        """
        self.zookeeper.start()
        self.kafka.start()

        processor1 = StreamsNamedRepartitionTopicService(self.test_context, self.kafka)
        processor2 = StreamsNamedRepartitionTopicService(self.test_context, self.kafka)
        processor3 = StreamsNamedRepartitionTopicService(self.test_context, self.kafka)

        processors = [processor1, processor2, processor3]

        self.producer.start()

        for processor in processors:
            processor.CLEAN_NODE_ENABLED = False
            self.set_topics(processor)
            self.verify_running(processor, 'REBALANCING -> RUNNING')

        self.verify_processing(processors)

        # do rolling upgrade
        for processor in processors:
            self.verify_stopped(processor)
            # will tell app to add operations before repartition topic
            processor.ADD_ADDITIONAL_OPS = 'true'
            self.verify_running(processor, 'UPDATED Topology')

        self.verify_processing(processors)

        self.stop_processors(processors)

        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    @staticmethod
    def verify_running(processor, message):
        """Start the processor and wait (up to 60s) for the message in its stdout."""
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.start()
            monitor.wait_until(message,
                               timeout_sec=60,
                               err_msg="Never saw '%s' message " % message + str(processor.node.account))

    @staticmethod
    def verify_stopped(processor):
        """Stop the processor and wait (up to 60s) for its shutdown message in stdout."""
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.stop()
            # Message phrased like the other waits in this class so a timeout
            # reads as a sentence and the account is separated by a space.
            monitor.wait_until('NAMED_REPARTITION_TEST Streams Stopped',
                               timeout_sec=60,
                               err_msg="Never saw 'NAMED_REPARTITION_TEST Streams Stopped' message "
                                       + str(processor.node.account))

    def verify_processing(self, processors):
        """Wait (up to 60s each) for every processor's stdout to show the processing pattern."""
        for processor in processors:
            with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor:
                monitor.wait_until(self.pattern,
                                   timeout_sec=60,
                                   err_msg="Never saw processing of %s " % self.pattern + str(processor.node.account))

    def stop_processors(self, processors):
        """Stop every processor in turn, confirming each shutdown."""
        for processor in processors:
            self.verify_stopped(processor)

    def set_topics(self, processor):
        """Point the processor at this test's topics."""
        processor.INPUT_TOPIC = self.input_topic
        processor.AGGREGATION_TOPIC = self.aggregation_topic
class ReplicaVerificationToolTest(Test):
    """
    Tests ReplicaVerificationTool
    """

    def __init__(self, test_context):
        super(ReplicaVerificationToolTest, self).__init__(test_context)

        # Cluster sizing
        self.num_zk = 1
        self.num_brokers = 2
        self.messages_received_count = 0

        # A single partition replicated on both brokers
        topic_settings = {'partitions': 1, 'replication-factor': 2}
        self.topics = {TOPIC: topic_settings}

        self.zk = ZookeeperService(test_context, self.num_zk)

        # Created on demand by the start_* helpers
        self.kafka = None
        self.producer = None
        self.replica_verifier = None

    def setUp(self):
        """Only ZooKeeper is started up front."""
        self.zk.start()

    def start_kafka(self, security_protocol, interbroker_security_protocol):
        """Create and start the brokers with the given security protocols."""
        broker_options = dict(
            security_protocol=security_protocol,
            interbroker_security_protocol=interbroker_security_protocol,
            topics=self.topics)
        self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk, **broker_options)
        self.kafka.start()

    def start_replica_verification_tool(self, security_protocol):
        """Create and start a one-node ReplicaVerificationTool for TOPIC."""
        self.replica_verifier = ReplicaVerificationTool(
            self.test_context, 1, self.kafka, TOPIC,
            report_interval_ms=REPORT_INTERVAL_MS,
            security_protocol=security_protocol)
        self.replica_verifier.start()

    def start_producer(self, max_messages, acks, timeout):
        """
        Start a fresh producer and block until max_messages more than its
        initial ack count are acked. With acks == 0 the wait condition is
        satisfied immediately.
        """
        # This will produce to kafka cluster
        self.producer = VerifiableProducer(
            self.test_context, num_nodes=1, kafka=self.kafka, topic=TOPIC,
            throughput=1000, acks=acks, max_messages=max_messages)

        # Baseline taken before the producer starts
        current_acked = self.producer.num_acked
        self.logger.info("current_acked = %s" % current_acked)
        self.producer.start()

        def all_messages_acked():
            return acks == 0 or self.producer.num_acked >= current_acked + max_messages

        wait_until(all_messages_acked, timeout_sec=timeout,
                   err_msg="Timeout awaiting messages to be produced and acked")

    def stop_producer(self):
        """Stop the current producer."""
        self.producer.stop()

    def test_replica_lags(self, security_protocol='PLAINTEXT'):
        """
        Tests ReplicaVerificationTool
        :return: None
        """
        self.start_kafka(security_protocol, security_protocol)
        self.start_replica_verification_tool(security_protocol)

        def partition_lag():
            return self.replica_verifier.get_lag_for_partition(TOPIC, 0)

        # Verify that there is no lag in replicas and is correctly reported by ReplicaVerificationTool
        self.start_producer(max_messages=10, acks=-1, timeout=15)
        wait_until(lambda: partition_lag() == 0, timeout_sec=10,
                   err_msg="Timed out waiting to reach zero replica lags.")
        self.stop_producer()

        # Verify that there is lag in replicas and is correctly reported by ReplicaVerificationTool
        self.start_producer(max_messages=1000, acks=0, timeout=5)
        wait_until(lambda: partition_lag() > 0, timeout_sec=10,
                   err_msg="Timed out waiting to reach non-zero number of replica lags.")
class ConsoleConsumerTest(Test):
    """Sanity checks on console consumer service class."""

    def __init__(self, test_context):
        super(ConsoleConsumerTest, self).__init__(test_context)

        self.topic = "topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)

        topic_settings = {"partitions": 1, "replication-factor": 1}
        self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, zk_chroot="/kafka",
                                  topics={self.topic: topic_settings})
        self.consumer = ConsoleConsumer(self.test_context, num_nodes=1, kafka=self.kafka, topic=self.topic)

    def setUp(self):
        """Only ZooKeeper is started up front; each test starts the broker itself."""
        self.zk.start()

    @cluster(num_nodes=3)
    @matrix(security_protocol=['PLAINTEXT', 'SSL'])
    @cluster(num_nodes=4)
    @matrix(security_protocol=['SASL_SSL'], sasl_mechanism=['PLAIN', 'SCRAM-SHA-256', 'SCRAM-SHA-512'])
    @matrix(security_protocol=['SASL_PLAINTEXT', 'SASL_SSL'])
    def test_lifecycle(self, security_protocol, sasl_mechanism='GSSAPI'):
        """Check that console consumer starts/stops properly, and that we are capturing log output."""
        # The same SASL mechanism is applied to client and inter-broker traffic
        broker = self.kafka
        broker.security_protocol = security_protocol
        broker.client_sasl_mechanism = sasl_mechanism
        broker.interbroker_sasl_mechanism = sasl_mechanism
        broker.start()

        self.consumer.security_protocol = security_protocol

        t0 = time.time()
        self.consumer.start()
        node = self.consumer.nodes[0]

        def consumer_is_alive():
            return self.consumer.alive(node)

        wait_until(consumer_is_alive, timeout_sec=20, backoff_sec=.2,
                   err_msg="Consumer was too slow to start")
        elapsed = time.time() - t0
        self.logger.info("consumer started in %s seconds " % str(elapsed))

        # Verify that log output is happening
        def log_file_present():
            return file_exists(node, ConsoleConsumer.LOG_FILE)

        def log_file_has_lines():
            return line_count(node, ConsoleConsumer.LOG_FILE) > 0

        wait_until(log_file_present, timeout_sec=10,
                   err_msg="Timed out waiting for consumer log file to exist.")
        wait_until(log_file_has_lines, timeout_sec=1, backoff_sec=.25,
                   err_msg="Timed out waiting for log entries to start.")

        # Verify no consumed messages
        assert line_count(node, ConsoleConsumer.STDOUT_CAPTURE) == 0

        self.consumer.stop_node(node)

    @cluster(num_nodes=4)
    def test_version(self):
        """Check that console consumer v0.8.2.X successfully starts and consumes messages."""
        self.kafka.start()

        # Produce a fixed batch and let the producer run to completion
        num_messages = 1000
        self.producer = VerifiableProducer(self.test_context, num_nodes=1, kafka=self.kafka, topic=self.topic,
                                           max_messages=num_messages, throughput=1000)
        self.producer.start()
        self.producer.wait()

        # Switch the consumer node to the old version and the old consumer
        consumer = self.consumer
        consumer.nodes[0].version = LATEST_0_8_2
        consumer.new_consumer = False
        consumer.consumer_timeout_ms = 1000
        consumer.start()
        consumer.wait()

        num_produced = self.producer.num_acked
        num_consumed = len(self.consumer.messages_consumed[1])
        assert num_produced == num_consumed, "num_produced: %d, num_consumed: %d" % (num_produced, num_consumed)
class StreamsOptimizedTest(Test):
    """
    Test doing upgrades of a Kafka Streams application
    that is un-optimized initially then optimized
    """

    input_topic = 'inputTopic'
    aggregation_topic = 'aggregationTopic'
    reduce_topic = 'reduceTopic'
    join_topic = 'joinTopic'
    # Raw string so the backslashes reach the log monitor unchanged
    operation_pattern = r'AGGREGATED\|REDUCED\|JOINED'

    def __init__(self, test_context):
        super(StreamsOptimizedTest, self).__init__(test_context)
        self.topics = {
            self.input_topic: {'partitions': 6},
            self.aggregation_topic: {'partitions': 6},
            self.reduce_topic: {'partitions': 6},
            self.join_topic: {'partitions': 6}
        }

        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zookeeper, topics=self.topics)

        self.producer = VerifiableProducer(self.test_context,
                                           1,
                                           self.kafka,
                                           self.input_topic,
                                           throughput=1000,
                                           acks=1)

    def test_upgrade_optimized_topology(self):
        """
        Run three processors un-optimized (expecting 4 repartition topics),
        stop them, then restart them with OPTIMIZED_CONFIG = 'all'
        (expecting 1 repartition topic) and verify processing both times.
        """
        self.zookeeper.start()
        self.kafka.start()

        processor1 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)
        processor2 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)
        processor3 = StreamsOptimizedUpgradeTestService(self.test_context, self.kafka)

        processors = [processor1, processor2, processor3]

        # produce records continually during the test
        self.producer.start()

        # start all processors unoptimized
        for processor in processors:
            self.set_topics(processor)
            processor.CLEAN_NODE_ENABLED = False
            self.verify_running_repartition_topic_count(processor, 4)

        self.verify_processing(processors, verify_individual_operations=False)

        self.stop_processors(processors)

        # start again with topology optimized
        for processor in processors:
            processor.OPTIMIZED_CONFIG = 'all'
            self.verify_running_repartition_topic_count(processor, 1)

        self.verify_processing(processors, verify_individual_operations=True)

        self.stop_processors(processors)

        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    @staticmethod
    def verify_running_repartition_topic_count(processor, repartition_topic_count):
        """
        Start the processor and wait (up to 120s) for its stdout to report the
        transition to RUNNING with the expected repartition topic count.
        """
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.start()
            monitor.wait_until('REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' % repartition_topic_count,
                               timeout_sec=120,
                               err_msg="Never saw 'REBALANCING -> RUNNING with REPARTITION TOPIC COUNT=%s' message "
                                       % repartition_topic_count + str(processor.node.account))

    @staticmethod
    def verify_stopped(processor):
        """Stop the processor and wait (up to 60s) for its shutdown message in stdout."""
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.stop()
            # Message phrased like the other waits in this class so a timeout
            # reads as a sentence and the account is separated by a space.
            monitor.wait_until('OPTIMIZE_TEST Streams Stopped',
                               timeout_sec=60,
                               err_msg="Never saw 'OPTIMIZE_TEST Streams Stopped' message "
                                       + str(processor.node.account))

    def verify_processing(self, processors, verify_individual_operations):
        """
        Check processing output on every processor, except those whose log
        shows the all-source-task assignment.

        :param processors: the streams services to check
        :param verify_individual_operations: when True wait for each operation
            separately, otherwise wait for the combined alternation pattern
        """
        for processor in processors:
            if not self.all_source_subtopology_tasks(processor):
                if verify_individual_operations:
                    for operation in self.operation_pattern.split(r'\|'):
                        self.do_verify(processor, operation)
                else:
                    self.do_verify(processor, self.operation_pattern)
            else:
                self.logger.info("Skipping processor %s with all source tasks" % processor.node.account)

    def do_verify(self, processor, pattern):
        """Wait (up to 60s) for the pattern to show up in the processor's stdout."""
        self.logger.info("Verifying %s processing pattern in STDOUT_FILE" % pattern)
        with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            monitor.wait_until(pattern,
                               timeout_sec=60,
                               err_msg="Never saw processing of %s " % pattern + str(processor.node.account))

    def all_source_subtopology_tasks(self, processor):
        """
        Return True if the processor's log has a 'current active tasks' line
        listing exactly four tasks that all start with '0_'. The log is
        checked up to five times, one second apart.
        """
        # Raw string: in a normal literal '\1' is the octal escape for chr(1),
        # so sed would never receive the back-reference to the captured list.
        command = r"sed -n 's/.*current active tasks: \[\(\(0_[0-9], \)\{3\}0_[0-9]\)\].*/\1/p' %s" \
                  % processor.LOG_FILE
        retries = 0
        while retries < 5:
            found = list(processor.node.account.ssh_capture(command, allow_fail=True))
            self.logger.info("Returned %s from assigned task check" % found)
            if len(found) > 0:
                return True
            retries += 1
            time.sleep(1)

        return False

    def stop_processors(self, processors):
        """Stop every processor in turn, confirming each shutdown."""
        for processor in processors:
            self.verify_stopped(processor)

    def set_topics(self, processor):
        """Point the processor at this test's topics."""
        processor.INPUT_TOPIC = self.input_topic
        processor.AGGREGATION_TOPIC = self.aggregation_topic
        processor.REDUCE_TOPIC = self.reduce_topic
        processor.JOIN_TOPIC = self.join_topic
class TestBounce(Test):
    """Sanity checks on verifiable producer service class with cluster roll."""

    def __init__(self, test_context):
        super(TestBounce, self).__init__(test_context)

        # The quorum size comes from the injected test arguments when present.
        quorum_size_arg_name = 'quorum_size'
        default_quorum_size = 1
        if test_context.injected_args:
            quorum_size = test_context.injected_args.get(quorum_size_arg_name, default_quorum_size)
        else:
            quorum_size = default_quorum_size
        if quorum_size < 1:
            raise Exception("Illegal %s value provided for the test: %s" % (quorum_size_arg_name, quorum_size))

        self.topic = "topic"

        # ZooKeeper is only created for the ZooKeeper-based quorum.
        if quorum.for_test(test_context) == quorum.zk:
            self.zk = ZookeeperService(test_context, num_nodes=quorum_size)
        else:
            self.zk = None

        # Co-located KRaft gets one broker per quorum member, otherwise one.
        if quorum.for_test(test_context) == quorum.colocated_kraft:
            num_kafka_nodes = quorum_size
        else:
            num_kafka_nodes = 1

        topic_settings = {"partitions": 1, "replication-factor": 1}
        self.kafka = KafkaService(test_context,
                                  num_nodes=num_kafka_nodes,
                                  zk=self.zk,
                                  topics={self.topic: topic_settings},
                                  controller_num_nodes_override=quorum_size)
        self.num_messages = 1000

    def create_producer(self):
        """Create (but do not start) a producer bounded to ``num_messages``."""
        # This will produce to source kafka cluster
        self.producer = VerifiableProducer(self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=self.topic,
                                           max_messages=self.num_messages,
                                           throughput=self.num_messages // 10)

    def setUp(self):
        if self.zk:
            self.zk.start()

    # ZooKeeper and KRaft, quorum size = 1
    @cluster(num_nodes=4)
    @matrix(metadata_quorum=quorum.all, quorum_size=[1])
    # Remote and Co-located KRaft, quorum size = 3
    @cluster(num_nodes=6)
    @matrix(metadata_quorum=quorum.all_kraft, quorum_size=[3])
    def test_simple_run(self, metadata_quorum, quorum_size):
        """
        Start VerifiableProducer on the current branch snapshot version and
        check that the full batch of messages is acked, once before and once
        after a roll of the cluster.
        """
        self.kafka.start()

        for first_time in (True, False):
            self.create_producer()
            self.producer.start()
            wait_until(lambda: self.producer.num_acked > 5,
                       timeout_sec=15,
                       err_msg="Producer failed to start in a reasonable amount of time.")

            self.producer.wait()
            num_produced = self.producer.num_acked
            assert num_produced == self.num_messages, "num_produced: %d, num_messages: %d" % (num_produced, self.num_messages)

            if not first_time:
                continue

            # Roll the cluster between the two producer runs.
            self.producer.stop()
            if self.kafka.quorum_info.using_kraft and self.kafka.remote_controller_quorum:
                self.kafka.remote_controller_quorum.restart_cluster()
            self.kafka.restart_cluster()
class ConsoleConsumerTest(Test):
    """Sanity checks on console consumer service class."""

    def __init__(self, test_context):
        super(ConsoleConsumerTest, self).__init__(test_context)

        self.topic = "topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)

        topic_settings = {"partitions": 1, "replication-factor": 1}
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={self.topic: topic_settings})
        self.consumer = ConsoleConsumer(self.test_context,
                                        num_nodes=1,
                                        kafka=self.kafka,
                                        topic=self.topic,
                                        new_consumer=False)

    def setUp(self):
        self.zk.start()

    @cluster(num_nodes=3)
    @parametrize(security_protocol='PLAINTEXT', new_consumer=False)
    @matrix(security_protocol=['PLAINTEXT', 'SSL'])
    @cluster(num_nodes=4)
    @matrix(security_protocol=['SASL_SSL'], sasl_mechanism=['PLAIN', 'SCRAM-SHA-256', 'SCRAM-SHA-512'])
    @matrix(security_protocol=['SASL_PLAINTEXT', 'SASL_SSL'])
    def test_lifecycle(self, security_protocol, new_consumer=True, sasl_mechanism='GSSAPI'):
        """Check that console consumer starts/stops properly, and that we are capturing log output."""
        # Configure the brokers for the requested security settings.
        broker = self.kafka
        broker.security_protocol = security_protocol
        broker.client_sasl_mechanism = sasl_mechanism
        broker.interbroker_sasl_mechanism = sasl_mechanism
        broker.start()

        self.consumer.security_protocol = security_protocol
        self.consumer.new_consumer = new_consumer

        t0 = time.time()
        self.consumer.start()
        node = self.consumer.nodes[0]

        wait_until(lambda: self.consumer.alive(node),
                   timeout_sec=10,
                   backoff_sec=.2,
                   err_msg="Consumer was too slow to start")
        self.logger.info("consumer started in %s seconds " % str(time.time() - t0))

        # Verify that log output is happening
        wait_until(lambda: file_exists(node, ConsoleConsumer.LOG_FILE),
                   timeout_sec=10,
                   err_msg="Timed out waiting for consumer log file to exist.")
        wait_until(lambda: line_count(node, ConsoleConsumer.LOG_FILE) > 0,
                   timeout_sec=1,
                   backoff_sec=.25,
                   err_msg="Timed out waiting for log entries to start.")

        # Verify no consumed messages
        assert line_count(node, ConsoleConsumer.STDOUT_CAPTURE) == 0

        self.consumer.stop_node(node)

    @cluster(num_nodes=4)
    def test_version(self):
        """Check that console consumer v0.8.2.X successfully starts and consumes messages."""
        self.kafka.start()

        # Produce a fixed batch and wait for the producer to finish.
        num_messages = 1000
        self.producer = VerifiableProducer(self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=self.topic,
                                           max_messages=num_messages,
                                           throughput=1000)
        self.producer.start()
        self.producer.wait()

        # Run the consumer node at the 0.8.2 version with a short idle timeout.
        self.consumer.nodes[0].version = LATEST_0_8_2
        self.consumer.consumer_timeout_ms = 1000
        self.consumer.start()
        self.consumer.wait()

        num_consumed = len(self.consumer.messages_consumed[1])
        num_produced = self.producer.num_acked
        assert num_produced == num_consumed, "num_produced: %d, num_consumed: %d" % (num_produced, num_consumed)
class SimpleConsumerShellTest(Test):
    """
    Tests SimpleConsumerShell tool
    """

    def __init__(self, test_context):
        super(SimpleConsumerShellTest, self).__init__(test_context)

        self.num_zk = 1
        self.num_brokers = 1
        self.messages_received_count = 0

        topic_settings = {
            'partitions': NUM_PARTITIONS,
            'replication-factor': REPLICATION_FACTOR,
        }
        self.topics = {TOPIC: topic_settings}

        self.zk = ZookeeperService(test_context, self.num_zk)

    def setUp(self):
        self.zk.start()

    def start_kafka(self):
        """Create the Kafka service with the test topics and start it."""
        self.kafka = KafkaService(self.test_context,
                                  self.num_brokers,
                                  self.zk,
                                  topics=self.topics)
        self.kafka.start()

    def run_producer(self):
        """Start a producer and wait until exactly MAX_MESSAGES are acked."""
        # This will produce to kafka cluster
        self.producer = VerifiableProducer(self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=TOPIC,
                                           throughput=1000,
                                           max_messages=MAX_MESSAGES)
        self.producer.start()

        def everything_acked():
            return self.producer.num_acked == MAX_MESSAGES

        wait_until(everything_acked,
                   timeout_sec=10,
                   err_msg="Timeout awaiting messages to be produced and acked")

    def start_simple_consumer_shell(self):
        """Create and start the SimpleConsumerShell service on one node."""
        self.simple_consumer_shell = SimpleConsumerShell(self.test_context, 1, self.kafka, TOPIC)
        self.simple_consumer_shell.start()

    @cluster(num_nodes=4)
    def test_simple_consumer_shell(self):
        """
        Tests if SimpleConsumerShell is fetching expected records
        :return: None
        """
        self.start_kafka()
        self.run_producer()
        self.start_simple_consumer_shell()

        # Assert that SimpleConsumerShell is fetching expected number of messages
        def expected_line_count_seen():
            newline_count = self.simple_consumer_shell.get_output().count("\n")
            return newline_count == (MAX_MESSAGES + 1)

        wait_until(expected_line_count_seen,
                   timeout_sec=10,
                   err_msg="Timed out waiting to receive expected number of messages.")
class TestBounce(Test):
    """Sanity checks on verifiable producer service class with cluster roll."""

    def __init__(self, test_context):
        super(TestBounce, self).__init__(test_context)

        self.topic = "topic"

        # ZooKeeper is only created for the ZooKeeper-based quorum.
        if quorum.for_test(test_context) == quorum.zk:
            self.zk = ZookeeperService(test_context, num_nodes=1)
        else:
            self.zk = None

        # A remote Raft quorum runs three controllers, any other setup one.
        if quorum.for_test(test_context) == quorum.remote_raft:
            controller_count = 3
        else:
            controller_count = 1

        topic_settings = {"partitions": 1, "replication-factor": 1}
        self.kafka = KafkaService(test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={self.topic: topic_settings},
                                  controller_num_nodes_override=controller_count)
        self.num_messages = 1000

    def create_producer(self):
        """Create (but do not start) a producer bounded to ``num_messages``."""
        # This will produce to source kafka cluster
        self.producer = VerifiableProducer(self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=self.topic,
                                           max_messages=self.num_messages,
                                           throughput=self.num_messages // 10)

    def setUp(self):
        if self.zk:
            self.zk.start()

    @cluster(num_nodes=6)
    @parametrize(metadata_quorum=quorum.remote_raft)
    @cluster(num_nodes=4)
    @parametrize(metadata_quorum=quorum.colocated_raft)
    @cluster(num_nodes=4)
    @parametrize(metadata_quorum=quorum.zk)
    def test_simple_run(self, metadata_quorum):
        """
        Start VerifiableProducer on the current branch snapshot version and
        check that the full batch of messages is acked, once before and once
        after a roll of the cluster.
        """
        self.kafka.start()

        for first_time in (True, False):
            self.create_producer()
            self.producer.start()
            wait_until(lambda: self.producer.num_acked > 5,
                       timeout_sec=15,
                       err_msg="Producer failed to start in a reasonable amount of time.")

            self.producer.wait()
            num_produced = self.producer.num_acked
            assert num_produced == self.num_messages, "num_produced: %d, num_messages: %d" % (num_produced, self.num_messages)

            if not first_time:
                continue

            # Roll the cluster between the two producer runs.
            self.producer.stop()
            if self.kafka.quorum_info.using_raft and self.kafka.remote_controller_quorum:
                self.kafka.remote_controller_quorum.restart_cluster()
            self.kafka.restart_cluster()
class ReplicationTest(Test):
    """Replication tests.

    These tests verify that replication provides simple durability guarantees
    by checking that data acked by brokers is still available for consumption
    in the face of various failure scenarios.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ReplicationTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context, num_nodes=3, zk=self.zk,
                                  topics={self.topic: {
                                      "partitions": 3,
                                      "replication-factor": 3,
                                      "min.insync.replicas": 2}})
        self.producer_throughput = 10000
        self.num_producers = 1
        self.num_consumers = 1

    def setUp(self):
        self.zk.start()
        self.kafka.start()

    def min_cluster_size(self):
        """Override this since we're adding services outside of the constructor"""
        return super(ReplicationTest, self).min_cluster_size() + self.num_producers + self.num_consumers

    def run_with_failure(self, failure):
        """This is the top-level test template.

        The steps are:
            Produce messages in the background while driving some failure condition
            When done driving failures, immediately stop producing
            Consume all messages
            Validate that messages acked by brokers were consumed

        Note that consuming is a bit tricky, at least with console consumer.
        The goal is to consume all messages (foreach partition) in the topic.
        Waiting for the last message may cause the consumer to stop too soon,
        since console consumer is consuming multiple partitions from a single
        thread and therefore we lose ordering guarantees.

        Waiting on a count of consumed messages can be unreliable: if we stop
        consuming when num_consumed == num_acked, we might exit early if some
        messages are duplicated (though not an issue here since producer
        retries==0).

        Therefore rely here on the consumer.timeout.ms setting, which times
        out on the interval between successively consumed messages. Since we
        run the producer to completion before running the consumer, this is a
        reliable indicator that nothing is left to consume.

        :param failure: zero-argument callable that drives the failure scenario
        """
        self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic,
                                           throughput=self.producer_throughput)
        self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic,
                                        consumer_timeout_ms=3000)

        # Produce in a background thread while driving broker failures
        self.producer.start()
        # NOTE(review): this check only works if wait_until returns a bool
        # rather than raising on timeout - confirm against the ducktape
        # version in use.
        if not wait_until(lambda: self.producer.num_acked > 5, timeout_sec=5):
            raise RuntimeError("Producer failed to start in a reasonable amount of time.")
        failure()
        self.producer.stop()

        self.acked = self.producer.acked
        self.not_acked = self.producer.not_acked
        self.logger.info("num not acked: %d" % self.producer.num_not_acked)
        self.logger.info("num acked:     %d" % self.producer.num_acked)

        # Consume all messages
        self.consumer.start()
        self.consumer.wait()
        self.consumed = self.consumer.messages_consumed[1]
        self.logger.info("num consumed:  %d" % len(self.consumed))

        # Check produced vs consumed
        success, msg = self.validate()

        if not success:
            self.mark_for_collect(self.producer)

        assert success, msg

    def clean_shutdown(self):
        """Discover leader node for our topic and shut it down cleanly."""
        self.kafka.signal_leader(self.topic, partition=0, sig=signal.SIGTERM)

    def hard_shutdown(self):
        """Discover leader node for our topic and shut it down with a hard kill."""
        self.kafka.signal_leader(self.topic, partition=0, sig=signal.SIGKILL)

    def clean_bounce(self):
        """Chase the leader of one partition and restart it cleanly."""
        for _ in range(5):
            prev_leader_node = self.kafka.leader(topic=self.topic, partition=0)
            self.kafka.restart_node(prev_leader_node, wait_sec=5, clean_shutdown=True)

    def hard_bounce(self):
        """Chase the leader of one partition and restart it with a hard (unclean) shutdown."""
        for _ in range(5):
            prev_leader_node = self.kafka.leader(topic=self.topic, partition=0)
            self.kafka.restart_node(prev_leader_node, wait_sec=5, clean_shutdown=False)

            # Wait long enough for previous leader to probably be awake again
            time.sleep(6)

    def validate(self):
        """Check that produced messages were consumed.

        Uses ``self.acked`` and ``self.consumed`` as captured by
        ``run_with_failure``. Duplicates among the consumed messages are
        reported but tolerated; an acked message missing from the consumed
        ones is a failure, and the Kafka logs are then marked for collection.

        :return: ``(success, msg)`` - a bool and a human-readable report
        """
        success = True
        msg = ""

        # Build each set once; both are needed for more than one check.
        consumed_set = set(self.consumed)
        acked_set = set(self.acked)

        if len(consumed_set) != len(self.consumed):
            # There are duplicates. This is ok, so report it but don't fail the test
            msg += "There are duplicate messages in the log\n"

        # Every acked message must appear in the logs. I.e. consumed messages
        # must be a superset of acked messages.
        acked_minus_consumed = acked_set - consumed_set
        if acked_minus_consumed:
            success = False
            msg += "At least one acked message did not appear in the consumed messages. acked_minus_consumed: " + str(acked_minus_consumed)

        if not success:
            # Collect all the data logs if there was a failure
            self.mark_for_collect(self.kafka)

        return success, msg

    def test_clean_shutdown(self):
        self.run_with_failure(self.clean_shutdown)

    def test_hard_shutdown(self):
        self.run_with_failure(self.hard_shutdown)

    def test_clean_bounce(self):
        self.run_with_failure(self.clean_bounce)

    def test_hard_bounce(self):
        self.run_with_failure(self.hard_bounce)
class StreamsCooperativeRebalanceUpgradeTest(Test):
    """
    Test of a rolling upgrade from eager rebalance
    to cooperative rebalance
    """

    source_topic = "source"
    sink_topic = "sink"
    task_delimiter = "#"
    report_interval = "1000"
    processing_message = "Processed [0-9]* records so far"
    stopped_message = "COOPERATIVE-REBALANCE-TEST-CLIENT-CLOSED"
    running_state_msg = "STREAMS in a RUNNING State"
    cooperative_turned_off_msg = "Eager rebalancing enabled now for upgrade from %s"
    cooperative_enabled_msg = "Cooperative rebalancing enabled now"
    first_bounce_phase = "first_bounce_phase-"
    second_bounce_phase = "second_bounce_phase-"

    # !!CAUTION!!: THIS LIST OF VERSIONS IS FIXED, NO VERSIONS MUST BE ADDED
    streams_eager_rebalance_upgrade_versions = [str(LATEST_0_10_0), str(LATEST_0_10_1), str(LATEST_0_10_2),
                                                str(LATEST_0_11_0), str(LATEST_1_0), str(LATEST_1_1),
                                                str(LATEST_2_0), str(LATEST_2_1), str(LATEST_2_2),
                                                str(LATEST_2_3)]

    def __init__(self, test_context):
        super(StreamsCooperativeRebalanceUpgradeTest, self).__init__(test_context)
        self.topics = {
            self.source_topic: {'partitions': 9},
            self.sink_topic: {'partitions': 9},
        }

        self.zookeeper = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zookeeper,
                                  topics=self.topics)
        self.producer = VerifiableProducer(self.test_context,
                                           1,
                                           self.kafka,
                                           self.source_topic,
                                           throughput=1000,
                                           acks=1)

    @matrix(upgrade_from_version=streams_eager_rebalance_upgrade_versions)
    def test_upgrade_to_cooperative_rebalance(self, upgrade_from_version):
        """Run two rolling bounces, then check active tasks do not overlap.

        All three clients start on ``upgrade_from_version``. The first bounce
        passes the upgrade-from version, the second one omits it.
        """
        self.zookeeper.start()
        self.kafka.start()

        processors = [CooperativeRebalanceUpgradeService(self.test_context, self.kafka)
                      for _ in range(3)]

        # produce records continually during the test
        self.producer.start()

        # start all processors without upgrade_from config; normal operations mode
        self.logger.info("Starting all streams clients in normal running mode")
        for processor in processors:
            processor.set_version(upgrade_from_version)
            self.set_props(processor)
            processor.CLEAN_NODE_ENABLED = False
            # can't use state as older version don't have state listener
            # so just verify up and running
            verify_running(processor, self.processing_message)

        # all running rebalancing has ceased
        for processor in processors:
            self.verify_processing(processor, self.processing_message)

        # first rolling bounce with "upgrade.from" config set
        previous_phase = ""
        self.maybe_upgrade_rolling_bounce_and_verify(processors,
                                                     previous_phase,
                                                     self.first_bounce_phase,
                                                     upgrade_from_version)

        # All nodes processing, rebalancing has ceased
        first_phase_pattern = self.first_bounce_phase + self.processing_message
        for processor in processors:
            self.verify_processing(processor, first_phase_pattern)

        # second rolling bounce without "upgrade.from" config
        self.maybe_upgrade_rolling_bounce_and_verify(processors,
                                                     self.first_bounce_phase,
                                                     self.second_bounce_phase)

        # All nodes processing, rebalancing has ceased
        second_phase_pattern = self.second_bounce_phase + self.processing_message
        for processor in processors:
            self.verify_processing(processor, second_phase_pattern)

        # now verify tasks are unique
        for processor in processors:
            self.get_tasks_for_processor(processor)
            self.logger.info("Active tasks %s" % processor.active_tasks)

        # Compare every pair of processors: (1, 2), (1, 3), then (2, 3).
        for index, first in enumerate(processors):
            for second in processors[index + 1:]:
                overlapping_tasks = first.active_tasks.intersection(second.active_tasks)
                assert len(overlapping_tasks) == 0, \
                    "Final task assignments are not unique %s %s" % (first.active_tasks, second.active_tasks)

        # test done close all down
        stop_processors(processors, self.second_bounce_phase + self.stopped_message)

        self.producer.stop()
        self.kafka.stop()
        self.zookeeper.stop()

    def maybe_upgrade_rolling_bounce_and_verify(self,
                                                processors,
                                                previous_phase,
                                                current_phase,
                                                upgrade_from_version=None):
        """Bounce each processor in turn and verify it comes back processing.

        With ``upgrade_from_version`` set, the processor is restarted with the
        upgrade-from property and is expected to log the eager-rebalancing
        message; without it, the cooperative-rebalancing message.
        """
        for processor in processors:
            # stop the processor in prep for setting "upgrade.from" or removing "upgrade.from"
            verify_stopped(processor, previous_phase + self.stopped_message)
            # upgrade to version with cooperative rebalance
            processor.set_version("")
            processor.set_upgrade_phase(current_phase)

            if upgrade_from_version is None:
                upgrade_version = None
                rebalance_mode_msg = self.cooperative_enabled_msg
            else:
                # need to remove minor version numbers for check of valid upgrade from numbers
                last_dot = upgrade_from_version.rfind('.')
                upgrade_version = upgrade_from_version[:last_dot]
                rebalance_mode_msg = self.cooperative_turned_off_msg % upgrade_version

            self.set_props(processor, upgrade_version)
            node = processor.node
            with node.account.monitor_log(processor.STDOUT_FILE) as stdout_monitor:
                with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                    processor.start()
                    # verify correct rebalance mode either turned off for upgrade or enabled after upgrade
                    log_monitor.wait_until(rebalance_mode_msg,
                                           timeout_sec=60,
                                           err_msg="Never saw '%s' message " % rebalance_mode_msg + str(processor.node.account))

                # verify rebalanced into a running state
                rebalance_msg = current_phase + self.running_state_msg
                stdout_monitor.wait_until(rebalance_msg,
                                          timeout_sec=60,
                                          err_msg="Never saw '%s' message " % rebalance_msg + str(processor.node.account))

                # verify processing
                verify_processing_msg = current_phase + self.processing_message
                stdout_monitor.wait_until(verify_processing_msg,
                                          timeout_sec=60,
                                          err_msg="Never saw '%s' message " % verify_processing_msg + str(processor.node.account))

    def verify_processing(self, processor, pattern):
        """Wait up to 60 seconds for ``pattern`` in the processor's stdout file."""
        self.logger.info("Verifying %s processing pattern in STDOUT_FILE" % pattern)
        with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            monitor.wait_until(pattern,
                               timeout_sec=60,
                               err_msg="Never saw processing of %s " % pattern + str(processor.node.account))

    def get_tasks_for_processor(self, processor):
        """Read the last TASK-ASSIGNMENTS line from stdout and hand it to the processor.

        Tries up to five times, one second apart; returns without setting
        anything if no such line was found.
        """
        for _ in range(5):
            found_tasks = list(processor.node.account.ssh_capture("grep TASK-ASSIGNMENTS %s | tail -n 1" % processor.STDOUT_FILE, allow_fail=True))
            self.logger.info("Returned %s from assigned task check" % found_tasks)
            if len(found_tasks) > 0:
                task_string = str(found_tasks[0]).strip()
                self.logger.info("Converted %s from assigned task check" % task_string)
                processor.set_tasks(task_string)
                return
            time.sleep(1)
        return

    def set_props(self, processor, upgrade_from=None):
        """Copy topic names, report interval and upgrade-from setting onto ``processor``."""
        processor.SOURCE_TOPIC = self.source_topic
        processor.SINK_TOPIC = self.sink_topic
        processor.REPORT_INTERVAL = self.report_interval
        processor.UPGRADE_FROM = upgrade_from
class ClientCompatibilityTest(Test):
    """Compatibility checks of trunk clients against a 0.8.2 broker cluster."""

    def __init__(self, test_context):
        super(ClientCompatibilityTest, self).__init__(test_context=test_context)

    def setUp(self):
        """Start ZooKeeper and a three-node 0.8.2 Kafka cluster with the test topic."""
        self.topic = "test_topic"
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  version=LATEST_0_8_2,
                                  topics={
                                      self.topic: {
                                          "partitions": 3,
                                          "replication-factor": 3,
                                          'configs': {"min.insync.replicas": 2}
                                      }
                                  })
        self.zk.start()
        self.kafka.start()

        # Producer and consumer
        self.producer_throughput = 10000
        self.num_producers = 1
        self.num_consumers = 1

    def test_producer_back_compatibility(self):
        """Run 0.9.X java producer against 0.8.X brokers.

        This test documents the fact that java producer v0.9.0.0 and later
        won't run against 0.8.X brokers: the broker responds to a V1 produce
        request with a V0 produce response; the client then tries to parse
        this V0 produce response as a V1 produce response, resulting in a
        BufferUnderflowException.

        The test fails if the producer runs to completion without raising, or
        if the expected error is missing from the producer log.
        """
        self.producer = VerifiableProducer(self.test_context,
                                           self.num_producers,
                                           self.kafka,
                                           self.topic,
                                           max_messages=100,
                                           throughput=self.producer_throughput,
                                           version=TRUNK)

        node = self.producer.nodes[0]
        try:
            self.producer.start()
            self.producer.wait()
        except Exception:
            # Expected
            pass
        else:
            # Raised from the else clause so that the handler above, which
            # tolerates the expected producer failure, cannot swallow it.
            raise Exception("0.9.X java producer should not run successfully against 0.8.X broker")
        finally:
            self.producer.kill_node(node, clean_shutdown=False)

        self.logger.info("Grepping producer log for expected error type")
        # Raw string: the pattern contains backslash escapes meant for egrep.
        expected_error = r'"org\.apache\.kafka\.common\.protocol\.types\.SchemaException.*throttle_time_ms.*: java\.nio\.BufferUnderflowException"'
        node.account.ssh("egrep -m 1 %s %s" % (expected_error, self.producer.LOG_FILE), allow_fail=False)

    def test_consumer_back_compatibility(self):
        """Run a trunk (0.9.X) consumer and an 0.8.X consumer against 0.8.X brokers.

        The 0.8.X consumer is expected to consume every produced message. The
        trunk consumer is expected to consume nothing and to log a
        BufferUnderflowException. This error is the same as when an 0.9.X
        producer is run against an 0.8.X broker: the broker responds to a V1
        fetch request with a V0 fetch response; the client then tries to parse
        this V0 fetch response as a V1 fetch response.
        """
        num_messages = 10
        self.producer = VerifiableProducer(self.test_context,
                                           self.num_producers,
                                           self.kafka,
                                           self.topic,
                                           max_messages=num_messages,
                                           throughput=self.producer_throughput,
                                           version=LATEST_0_8_2)

        self.consumer = ConsoleConsumer(self.test_context,
                                        self.num_consumers,
                                        self.kafka,
                                        self.topic,
                                        group_id="consumer-09X",
                                        consumer_timeout_ms=10000,
                                        message_validator=is_int,
                                        version=TRUNK)

        self.old_consumer = ConsoleConsumer(self.test_context,
                                            self.num_consumers,
                                            self.kafka,
                                            self.topic,
                                            group_id="consumer-08X",
                                            consumer_timeout_ms=10000,
                                            message_validator=is_int,
                                            version=LATEST_0_8_2)

        self.producer.run()
        self.consumer.run()
        self.old_consumer.run()

        consumed = len(self.consumer.messages_consumed[1])
        old_consumed = len(self.old_consumer.messages_consumed[1])
        assert old_consumed == num_messages, "Expected 0.8.X scala consumer to consume %d, but only got %d" % (num_messages, old_consumed)
        assert consumed == 0, "Expected 0.9.X scala consumer to fail to consume any messages, but got %d" % consumed

        self.logger.info("Grepping consumer log for expected error type")
        node = self.consumer.nodes[0]
        # Raw string: the pattern contains backslash escapes meant for egrep.
        node.account.ssh("egrep -m 1 %s %s" % (r'"java\.nio\.BufferUnderflowException"', self.consumer.LOG_FILE), allow_fail=False)
class DelegationTokenTest(Test):
    """Exercises generating, renewing, using and expiring a delegation token."""

    def __init__(self, test_context):
        super(DelegationTokenTest, self).__init__(test_context)

        self.test_context = test_context
        self.topic = "topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)

        token_overrides = [
            [config_property.DELEGATION_TOKEN_MAX_LIFETIME_MS, "604800000"],
            [config_property.DELEGATION_TOKEN_EXPIRY_TIME_MS, "86400000"],
            [config_property.DELEGATION_TOKEN_MASTER_KEY, "test12345"],
            [config_property.SASL_ENABLED_MECHANISMS, "GSSAPI,SCRAM-SHA-256"],
        ]
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  zk_chroot="/kafka",
                                  topics={self.topic: {"partitions": 1, "replication-factor": 1}},
                                  server_prop_overides=token_overrides)

        self.jaas_deleg_conf_path = "/tmp/jaas_deleg.conf"
        self.jaas_deleg_conf = ""
        self.client_properties_content = """
security.protocol=SASL_PLAINTEXT
sasl.mechanism=SCRAM-SHA-256
sasl.kerberos.service.name=kafka
client.id=console-consumer
"""
        self.client_kafka_opts = ' -Djava.security.auth.login.config=' + self.jaas_deleg_conf_path

        # Both clients share the JAAS option and the client properties above.
        self.producer = VerifiableProducer(self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=self.topic,
                                           max_messages=1,
                                           throughput=1,
                                           kafka_opts_override=self.client_kafka_opts,
                                           client_prop_file_override=self.client_properties_content)

        self.consumer = ConsoleConsumer(self.test_context,
                                        num_nodes=1,
                                        kafka=self.kafka,
                                        topic=self.topic,
                                        kafka_opts_override=self.client_kafka_opts,
                                        client_prop_file_override=self.client_properties_content)

        self.kafka.security_protocol = 'SASL_PLAINTEXT'
        self.kafka.client_sasl_mechanism = 'GSSAPI,SCRAM-SHA-256'
        self.kafka.interbroker_sasl_mechanism = 'GSSAPI'

    def setUp(self):
        self.zk.start()

    def tearDown(self):
        """Remove the JAAS config file from the producer and consumer nodes."""
        for client in (self.producer, self.consumer):
            client.nodes[0].account.remove(self.jaas_deleg_conf_path)

    def generate_delegation_token(self):
        """Request a token and build the JAAS config that carries it."""
        self.logger.debug("Request delegation token")
        self.delegation_tokens.generate_delegation_token()
        self.jaas_deleg_conf = self.delegation_tokens.create_jaas_conf_with_delegation_token()

    def expire_delegation_token(self):
        """Expire the current token, identified by its HMAC."""
        self.kafka.client_sasl_mechanism = 'GSSAPI,SCRAM-SHA-256'
        token_hmac = self.delegation_tokens.token_hmac()
        self.delegation_tokens.expire_delegation_token(token_hmac)

    def produce_with_delegation_token(self):
        """Write the JAAS config to the producer node and start the producer."""
        self.producer.acked_values = []
        producer_account = self.producer.nodes[0].account
        producer_account.create_file(self.jaas_deleg_conf_path, self.jaas_deleg_conf)
        self.logger.debug(self.jaas_deleg_conf)
        self.producer.start()

    def consume_with_delegation_token(self):
        """Write the JAAS config to the consumer node and run the consumer to completion."""
        self.logger.debug("Consume messages with delegation token")

        consumer_account = self.consumer.nodes[0].account
        consumer_account.create_file(self.jaas_deleg_conf_path, self.jaas_deleg_conf)
        self.logger.debug(self.jaas_deleg_conf)
        self.consumer.consumer_timeout_ms = 5000

        self.consumer.start()
        self.consumer.wait()

    def get_datetime_ms(self, input_date):
        """Convert a '%Y-%m-%dT%H:%M' timestamp string to integer milliseconds via ``time.mktime``."""
        parsed = datetime.strptime(input_date, "%Y-%m-%dT%H:%M")
        seconds = time.mktime(parsed.timetuple())
        return int(seconds * 1000)

    def renew_delegation_token(self):
        """Renew the token, passing its parsed expiry date plus 1000 ms."""
        dt = self.delegation_tokens.parse_delegation_token_out()
        orig_expiry_date_ms = self.get_datetime_ms(dt["expirydate"])
        new_expirydate_ms = orig_expiry_date_ms + 1000

        self.delegation_tokens.renew_delegation_token(dt["hmac"], new_expirydate_ms)

    def test_delegation_token_lifecycle(self):
        """Produce and consume one message with a renewed token, then expire it and expect no acks."""
        self.kafka.start()
        self.delegation_tokens = DelegationTokens(self.kafka, self.test_context)

        self.generate_delegation_token()
        self.renew_delegation_token()

        # With a valid token exactly one message is produced and consumed.
        self.produce_with_delegation_token()
        wait_until(lambda: self.producer.num_acked > 0,
                   timeout_sec=30,
                   err_msg="Expected producer to still be producing.")
        assert 1 == self.producer.num_acked, "number of acked messages: %d" % self.producer.num_acked

        self.consume_with_delegation_token()
        num_consumed = len(self.consumer.messages_consumed[1])
        assert 1 == num_consumed, "number of consumed messages: %d" % num_consumed

        # After expiry the restarted producer must not have any acks.
        self.expire_delegation_token()

        self.produce_with_delegation_token()
        assert 0 == self.producer.num_acked, "number of acked messages: %d" % self.producer.num_acked
class ClientCompatibilityTest(Test):
    """Compatibility checks of trunk clients against a 0.8.2 broker cluster."""

    def __init__(self, test_context):
        super(ClientCompatibilityTest, self).__init__(test_context=test_context)

    def setUp(self):
        """Start ZooKeeper and a three-node 0.8.2 Kafka cluster with the test topic."""
        self.topic = "test_topic"
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=LATEST_0_8_2, topics={self.topic: {
                                                                    "partitions": 3,
                                                                    "replication-factor": 3,
                                                                    'configs': {"min.insync.replicas": 2}}})
        self.zk.start()
        self.kafka.start()

        # Producer and consumer
        self.producer_throughput = 10000
        self.num_producers = 1
        self.num_consumers = 1

    def test_producer_back_compatibility(self):
        """Run 0.9.X java producer against 0.8.X brokers.

        This test documents the fact that java producer v0.9.0.0 and later
        won't run against 0.8.X brokers: the broker responds to a V1 produce
        request with a V0 produce response; the client then tries to parse
        this V0 produce response as a V1 produce response, resulting in a
        BufferUnderflowException.

        The test fails if the producer runs to completion without raising, or
        if the expected error is missing from the producer log.
        """
        self.producer = VerifiableProducer(
            self.test_context, self.num_producers, self.kafka, self.topic, max_messages=100,
            throughput=self.producer_throughput, version=TRUNK)

        node = self.producer.nodes[0]
        try:
            self.producer.start()
            self.producer.wait()
        except Exception:
            # Expected
            pass
        else:
            # Raised from the else clause so that the handler above, which
            # tolerates the expected producer failure, cannot swallow it.
            raise Exception("0.9.X java producer should not run successfully against 0.8.X broker")
        finally:
            self.producer.kill_node(node, clean_shutdown=False)

        self.logger.info("Grepping producer log for expected error type")
        # Raw string: the pattern contains backslash escapes meant for egrep.
        node.account.ssh("egrep -m 1 %s %s" % (r'"org\.apache\.kafka\.common\.protocol\.types\.SchemaException.*throttle_time_ms.*: java\.nio\.BufferUnderflowException"', self.producer.LOG_FILE), allow_fail=False)

    def test_consumer_back_compatibility(self):
        """Run a trunk (0.9.X) consumer and an 0.8.X consumer against 0.8.X brokers.

        The 0.8.X consumer is expected to consume every produced message. The
        trunk consumer is expected to consume nothing and to log a
        BufferUnderflowException. This error is the same as when an 0.9.X
        producer is run against an 0.8.X broker: the broker responds to a V1
        fetch request with a V0 fetch response; the client then tries to parse
        this V0 fetch response as a V1 fetch response.
        """
        num_messages = 10
        self.producer = VerifiableProducer(
            self.test_context, self.num_producers, self.kafka, self.topic, max_messages=num_messages,
            throughput=self.producer_throughput, version=LATEST_0_8_2)

        self.consumer = ConsoleConsumer(
            self.test_context, self.num_consumers, self.kafka, self.topic, group_id="consumer-09X",
            consumer_timeout_ms=10000, message_validator=is_int, version=TRUNK)

        self.old_consumer = ConsoleConsumer(
            self.test_context, self.num_consumers, self.kafka, self.topic, group_id="consumer-08X",
            consumer_timeout_ms=10000, message_validator=is_int, version=LATEST_0_8_2)

        self.producer.run()
        self.consumer.run()
        self.old_consumer.run()

        consumed = len(self.consumer.messages_consumed[1])
        old_consumed = len(self.old_consumer.messages_consumed[1])
        assert old_consumed == num_messages, "Expected 0.8.X scala consumer to consume %d, but only got %d" % (num_messages, old_consumed)
        assert consumed == 0, "Expected 0.9.X scala consumer to fail to consume any messages, but got %d" % consumed

        self.logger.info("Grepping consumer log for expected error type")
        node = self.consumer.nodes[0]
        # Raw string: the pattern contains backslash escapes meant for egrep.
        node.account.ssh("egrep -m 1 %s %s" % (r'"java\.nio\.BufferUnderflowException"', self.consumer.LOG_FILE), allow_fail=False)