class StreamsSmokeTest(KafkaTest):
    """
    Simple test of Kafka Streams.
    """

    def __init__(self, test_context):
        super(StreamsSmokeTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics={
            'echo': {'partitions': 5, 'replication-factor': 1},
            'data': {'partitions': 5, 'replication-factor': 1},
            'min': {'partitions': 5, 'replication-factor': 1},
            'max': {'partitions': 5, 'replication-factor': 1},
            'sum': {'partitions': 5, 'replication-factor': 1},
            'dif': {'partitions': 5, 'replication-factor': 1},
            'cnt': {'partitions': 5, 'replication-factor': 1},
            'avg': {'partitions': 5, 'replication-factor': 1},
            'wcnt': {'partitions': 5, 'replication-factor': 1},
            'tagg': {'partitions': 5, 'replication-factor': 1}
        })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor2 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor3 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor4 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

    @cluster(num_nodes=9)
    def test_streams(self):
        """
        Start a few smoke test clients, then repeatedly start a new one and cleanly stop a running
        one a few times. Ensure that all results (stats on values computed by Kafka Streams) are correct.
        """
        self.driver.start()

        self.processor1.start()
        self.processor2.start()

        time.sleep(15)

        self.processor3.start()
        self.processor1.stop()

        time.sleep(15)

        self.processor4.start()

        self.driver.wait()
        self.driver.stop()

        self.processor2.stop()
        self.processor3.stop()
        self.processor4.stop()

        node = self.driver.node
        node.account.ssh("grep SUCCESS %s" % self.driver.STDOUT_FILE, allow_fail=False)
class StreamsSmokeTest(KafkaTest):
    """
    Simple test of Kafka Streams.
    """

    def __init__(self, test_context):
        super(StreamsSmokeTest, self).__init__(test_context, num_zk=1, num_brokers=2, topics={
            'echo': {'partitions': 5, 'replication-factor': 1},
            'data': {'partitions': 5, 'replication-factor': 1},
            'min': {'partitions': 5, 'replication-factor': 1},
            'max': {'partitions': 5, 'replication-factor': 1},
            'sum': {'partitions': 5, 'replication-factor': 1},
            'dif': {'partitions': 5, 'replication-factor': 1},
            'cnt': {'partitions': 5, 'replication-factor': 1},
            'avg': {'partitions': 5, 'replication-factor': 1},
            'wcnt': {'partitions': 5, 'replication-factor': 1},
            'tagg': {'partitions': 5, 'replication-factor': 1}
        })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor2 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor3 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor4 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

    @cluster(num_nodes=8)
    def test_streams(self):
        """
        Start a few smoke test clients, then repeatedly start a new one and cleanly stop a running
        one a few times. Ensure that all results (stats on values computed by Kafka Streams) are correct.
        """
        self.driver.start()

        self.processor1.start()
        self.processor2.start()

        time.sleep(15)

        self.processor3.start()
        self.processor1.stop()

        time.sleep(15)

        self.processor4.start()

        self.driver.wait()
        self.driver.stop()

        self.processor2.stop()
        self.processor3.stop()
        self.processor4.stop()

        node = self.driver.node
        node.account.ssh("grep SUCCESS %s" % self.driver.STDOUT_FILE, allow_fail=False)
class StreamsBounceTest(KafkaTest):
    """
    Simple test of Kafka Streams.
    """

    def __init__(self, test_context):
        super(StreamsBounceTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics={
            'echo': {'partitions': 5, 'replication-factor': 2},
            'data': {'partitions': 5, 'replication-factor': 2},
            'min': {'partitions': 5, 'replication-factor': 2},
            'max': {'partitions': 5, 'replication-factor': 2},
            'sum': {'partitions': 5, 'replication-factor': 2},
            'dif': {'partitions': 5, 'replication-factor': 2},
            'cnt': {'partitions': 5, 'replication-factor': 2},
            'avg': {'partitions': 5, 'replication-factor': 2},
            'wcnt': {'partitions': 5, 'replication-factor': 2},
            'tagg': {'partitions': 5, 'replication-factor': 2}
        })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

    @cluster(num_nodes=6)
    def test_bounce(self):
        """
        Start a smoke test client, then abort (kill -9) and restart it a few times.
        Ensure that all records are delivered.
        """
        self.driver.start()

        self.processor1.start()

        time.sleep(15)

        self.processor1.abortThenRestart()

        time.sleep(15)

        # enable this after we add change log partition replicas
        # self.kafka.signal_leader("data")
        # time.sleep(15)

        self.processor1.abortThenRestart()

        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False)
class StreamsBrokerBounceTest(Test):
    """
    Simple test of Kafka Streams with brokers failing.
    """

    def __init__(self, test_context):
        super(StreamsBrokerBounceTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 3
        self.topics = {
            'echo': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2}},
            'data': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2}},
            'min': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2}},
            'max': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2}},
            'sum': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2}},
            'dif': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2}},
            'cnt': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2}},
            'avg': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2}},
            'wcnt': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2}},
            'tagg': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2}},
            '__consumer_offsets': {'partitions': 50, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2}}
        }

    def fail_broker_type(self, failure_mode, broker_type):
        # Pick a random topic and bounce its leader
        topic_index = randint(0, len(self.topics.keys()) - 1)
        topic = list(self.topics.keys())[topic_index]
        failures[failure_mode](self, topic, broker_type)

    def fail_many_brokers(self, failure_mode, num_failures):
        sig = signal.SIGTERM
        if (failure_mode == "clean_shutdown"):
            sig = signal.SIGTERM
        else:
            sig = signal.SIGKILL

        for num in range(0, num_failures - 1):
            signal_node(self, self.kafka.nodes[num], sig)

    def setup_system(self, start_processor=True):
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()
        self.kafka = KafkaService(self.test_context, num_nodes=self.replication,
                                  zk=self.zk, topics=self.topics)
        self.kafka.start()

        # Start test harness
        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()

        if (start_processor):
            self.processor1.start()

    def collect_results(self, sleep_time_secs):
        data = {}
        # End test
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node

        # Success is declared if streams does not crash when sleep time > 0.
        # It should give an exception when sleep time is 0 since we kill the brokers immediately
        # and the topic manager cannot create internal topics with the desired replication factor.
        if (sleep_time_secs == 0):
            output_streams = self.processor1.node.account.ssh_capture(
                "grep SMOKE-TEST-CLIENT-EXCEPTION %s" % self.processor1.STDOUT_FILE, allow_fail=False)
        else:
            output_streams = self.processor1.node.account.ssh_capture(
                "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False)

        for line in output_streams:
            data["Client closed"] = line

        # Currently it is hard to guarantee anything about Kafka since we don't have exactly once.
        # With exactly once in place, success will be defined as ALL-RECORDS-DELIVERED and SUCCESS
        output = node.account.ssh_capture(
            "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED|PROCESSED-LESS-THAN-GENERATED' %s" % self.driver.STDOUT_FILE,
            allow_fail=False)
        for line in output:
            data["Records Delivered"] = line

        output = node.account.ssh_capture("grep -E 'SUCCESS|FAILURE' %s" % self.driver.STDOUT_FILE, allow_fail=False)
        for line in output:
            data["Logic Success/Failure"] = line

        return data

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            broker_type=["leader", "controller"],
            sleep_time_secs=[120])
    def test_broker_type_bounce(self, failure_mode, broker_type, sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker and ensure data is still received.
        Record if records are delivered.
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        return self.collect_results(sleep_time_secs)

    @ignore
    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown"], broker_type=["controller"], sleep_time_secs=[0])
    def test_broker_type_bounce_at_start(self, failure_mode, broker_type, sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker immediately before Streams starts.
        Streams should throw an exception since it cannot create topics with the desired replication factor of 3.
        """
        self.setup_system(start_processor=False)

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        self.processor1.start()

        return self.collect_results(sleep_time_secs)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            num_failures=[2])
    def test_many_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received.
        Record if records are delivered.
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_bounce", "hard_bounce"], num_failures=[3])
    def test_all_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received.
        Record if records are delivered.
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)
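# Note on the class above: fail_broker_type() and fail_many_brokers() dispatch through a
# module-level "failures" mapping and a signal_node() helper that are defined elsewhere in the
# original test module (outside this excerpt). A minimal sketch of what that dispatch table
# could look like follows; the helper bodies and the use of kafka.signal_node() here are
# illustrative assumptions, not the original definitions.

import signal

def signal_node(test, node, sig):
    # Deliver the given signal to the broker process running on 'node'.
    test.kafka.signal_node(node, sig)

def clean_shutdown(test, topic, broker_type):
    # Cleanly stop one broker (SIGTERM); "bounce" variants would restart it afterwards.
    signal_node(test, test.kafka.nodes[0], signal.SIGTERM)

def hard_shutdown(test, topic, broker_type):
    # Kill one broker outright (SIGKILL) instead of shutting it down cleanly.
    signal_node(test, test.kafka.nodes[0], signal.SIGKILL)

# Maps the failure_mode strings used by the @matrix parameters to the helpers above.
failures = {
    "clean_shutdown": clean_shutdown,
    "hard_shutdown": hard_shutdown,
}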
class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combinations).
    If metadata was changed, upgrade is more difficult.
    Metadata version was bumped in 0.10.1.0
    """

    def __init__(self, test_context):
        super(StreamsUpgradeTest, self).__init__(test_context)
        self.topics = {
            'echo': {'partitions': 5},
            'data': {'partitions': 5},
        }
        self.leader = None
        self.leader_counter = {}

    def perform_broker_upgrade(self, to_version):
        self.logger.info("First pass bounce - rolling broker upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = KafkaVersion(to_version)
            self.kafka.start_node(node)

    @ignore
    @cluster(num_nodes=6)
    @matrix(from_version=broker_upgrade_versions, to_version=broker_upgrade_versions)
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client, then perform rolling upgrades on the broker.
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.partitions = 1
        self.isr = 2
        self.topics = {
            'echo': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'data': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'min': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'max': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'sum': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'dif': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'cnt': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'avg': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'wcnt': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'tagg': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}}
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk,
                                  version=KafkaVersion(from_version), topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()

        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)

        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh(
            "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED' %s" % self.driver.STDOUT_FILE,
            allow_fail=False)
        self.processor1.node.account.ssh_capture(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
            allow_fail=False)

    @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions)
    def test_simple_upgrade_downgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_version>
        """

        if from_version == to_version:
            return

        self.zk =
ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # upgrade one-by-one via rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_stop_start_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() @matrix(from_version=metadata_1_versions, to_version=backward_compatible_metadata_2_versions) @matrix(from_version=metadata_1_versions, to_version=metadata_3_or_higher_versions) @matrix(from_version=metadata_2_versions, to_version=metadata_3_or_higher_versions) def test_metadata_upgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with version <from_version> and upgrades one-by-one to <to_version> """ self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # first rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_stop_start_bounce(p, from_version[:-2], to_version, counter) counter = counter + 1 # second rolling bounce random.shuffle(self.processors) for p in self.processors: self.do_stop_start_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() def test_version_probing_upgrade(self): """ Starts 3 KafkaStreams instances, and upgrades one-by-one to "future version" """ self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = 
StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with("") # run with TRUNK self.processors = [self.processor1, self.processor2, self.processor3] self.old_processors = [ self.processor1, self.processor2, self.processor3 ] self.upgraded_processors = [] for p in self.processors: self.leader_counter[p] = 2 self.update_leader() for p in self.processors: self.leader_counter[p] = 0 self.leader_counter[self.leader] = 3 counter = 1 current_generation = 3 random.seed() random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False current_generation = self.do_rolling_bounce( p, counter, current_generation) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() def update_leader(self): self.leader = None retries = 10 while retries > 0: for p in self.processors: found = list( p.node.account.ssh_capture( "grep \"Finished assignment for group\" %s" % p.LOG_FILE, allow_fail=True)) if len(found) == self.leader_counter[p] + 1: if self.leader is not None: raise Exception("Could not uniquely identify leader") self.leader = p self.leader_counter[p] = self.leader_counter[p] + 1 if self.leader is None: retries = retries - 1 time.sleep(5) else: break if self.leader is None: raise Exception("Could not identify leader") def start_all_nodes_with(self, version): # start first with <version> self.prepare_for(self.processor1, version) node1 = self.processor1.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor: with node1.account.monitor_log( self.processor1.LOG_FILE) as log_monitor: self.processor1.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node1.account)) monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) # start second with <version> self.prepare_for(self.processor2, version) node2 = self.processor2.node with node1.account.monitor_log( self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log( self.processor2.STDOUT_FILE) as second_monitor: with node2.account.monitor_log( self.processor2.LOG_FILE) as log_monitor: self.processor2.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node2.account)) first_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node2.account)) # start third with <version> self.prepare_for(self.processor3, version) node3 = self.processor3.node with node1.account.monitor_log( self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log( self.processor2.STDOUT_FILE) as second_monitor: with 
node3.account.monitor_log( self.processor3.STDOUT_FILE) as third_monitor: with node3.account.monitor_log( self.processor3.LOG_FILE) as log_monitor: self.processor3.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node3.account)) first_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node2.account)) third_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node3.account)) @staticmethod def prepare_for(processor, version): processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False) if version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(version) def do_stop_start_bounce(self, processor, upgrade_from, new_version, counter): first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node # stop processor and wait for rebalance of others with first_other_node.account.monitor_log( first_other_processor.STDOUT_FILE) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.STDOUT_FILE ) as second_other_monitor: processor.stop() first_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) second_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) if upgrade_from is None: # upgrade disabled -- second round of rolling bounces roll_counter = ".1-" # second round of rolling bounces else: roll_counter = ".0-" # first round of rolling boundes node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + roll_counter + str(counter), allow_fail=False) if new_version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(new_version) processor.set_upgrade_from(upgrade_from) grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" " with node.account.monitor_log(processor.STDOUT_FILE) as monitor: with node.account.monitor_log(processor.LOG_FILE) as log_monitor: with first_other_node.account.monitor_log( first_other_processor.STDOUT_FILE ) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.STDOUT_FILE ) as second_other_monitor: processor.start() log_monitor.wait_until( "Kafka version : " + new_version, timeout_sec=60, err_msg="Could not detect Kafka Streams 
version " + new_version + " " + str(node.account)) first_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) found = list( first_other_node.account.ssh_capture( grep_metadata_error + first_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'unable to decode subscription data: version=2'" ) second_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) found = list( second_other_node.account.ssh_capture( grep_metadata_error + second_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'unable to decode subscription data: version=2'" ) monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node.account)) def do_rolling_bounce(self, processor, counter, current_generation): first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node with first_other_node.account.monitor_log( first_other_processor.LOG_FILE) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.LOG_FILE) as second_other_monitor: # stop processor processor.stop() node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + "." + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + "." + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + "." + str(counter), allow_fail=False) self.leader_counter[processor] = 0 with node.account.monitor_log( processor.LOG_FILE) as log_monitor: processor.set_upgrade_to("future_version") processor.start() self.old_processors.remove(processor) self.upgraded_processors.append(processor) log_monitor.wait_until( "Kafka version : " + str(DEV_VERSION), timeout_sec=60, err_msg="Could not detect Kafka Streams version " + str(DEV_VERSION) + " in " + str(node.account)) log_monitor.offset = 5 log_monitor.wait_until( "partition\.assignment\.strategy = \[org\.apache\.kafka\.streams\.tests\.StreamsUpgradeTest$FutureStreamsPartitionAssignor\]", timeout_sec=60, err_msg= "Could not detect FutureStreamsPartitionAssignor in " + str(node.account)) if processor == self.leader: self.update_leader() else: self.leader_counter[ self.leader] = self.leader_counter[self.leader] + 1 if processor == self.leader: leader_monitor = log_monitor elif first_other_processor == self.leader: leader_monitor = first_other_monitor elif second_other_processor == self.leader: leader_monitor = second_other_monitor else: raise Exception("Could not identify leader.") monitors = {} monitors[processor] = log_monitor monitors[first_other_processor] = first_other_monitor monitors[second_other_processor] = second_other_monitor leader_monitor.wait_until( "Received a future (version probing) subscription (version: 5). 
Sending empty assignment back (with supported version 4).", timeout_sec=60, err_msg= "Could not detect 'version probing' attempt at leader " + str(self.leader.node.account)) if len(self.old_processors) > 0: log_monitor.wait_until( "Sent a version 5 subscription and got version 4 assignment back (successful version probing). Downgrading subscription metadata to received version and trigger new rebalance.", timeout_sec=60, err_msg= "Could not detect 'successful version probing' at upgrading node " + str(node.account)) else: log_monitor.wait_until( "Sent a version 5 subscription and got version 4 assignment back (successful version probing). Setting subscription metadata to leaders supported version 5 and trigger new rebalance.", timeout_sec=60, err_msg= "Could not detect 'successful version probing with upgraded leader' at upgrading node " + str(node.account)) first_other_monitor.wait_until( "Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.", timeout_sec=60, err_msg= "Never saw output 'Upgrade metadata to version 4' on" + str(first_other_node.account)) second_other_monitor.wait_until( "Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.", timeout_sec=60, err_msg= "Never saw output 'Upgrade metadata to version 4' on" + str(second_other_node.account)) log_monitor.wait_until( "Version probing detected. Triggering new rebalance.", timeout_sec=60, err_msg= "Could not detect 'Triggering new rebalance' at upgrading node " + str(node.account)) # version probing should trigger second rebalance # now we check that after consecutive rebalances we have synchronized generation generation_synchronized = False retries = 0 while retries < 10: processor_found = self.extract_generation_from_logs( processor) first_other_processor_found = self.extract_generation_from_logs( first_other_processor) second_other_processor_found = self.extract_generation_from_logs( second_other_processor) if len(processor_found) > 0 and len( first_other_processor_found) > 0 and len( second_other_processor_found) > 0: self.logger.info("processor: " + str(processor_found)) self.logger.info("first other processor: " + str(first_other_processor_found)) self.logger.info("second other processor: " + str(second_other_processor_found)) processor_generation = self.extract_highest_generation( processor_found) first_other_processor_generation = self.extract_highest_generation( first_other_processor_found) second_other_processor_generation = self.extract_highest_generation( second_other_processor_found) if processor_generation == first_other_processor_generation and processor_generation == second_other_processor_generation: current_generation = processor_generation generation_synchronized = True break time.sleep(5) retries = retries + 1 if generation_synchronized == False: raise Exception( "Never saw all three processors have the synchronized generation number" ) if processor == self.leader: self.update_leader() else: self.leader_counter[ self.leader] = self.leader_counter[self.leader] + 1 if self.leader in self.old_processors or len( self.old_processors) > 0: self.verify_metadata_no_upgraded_yet() return current_generation def extract_generation_from_logs(self, processor): return list( processor.node.account.ssh_capture( "grep \"Successfully joined group with generation\" %s| awk \'{for(i=1;i<=NF;i++) {if ($i == \"generation\") beginning=i+1; if($i== 
\"(org.apache.kafka.clients.consumer.internals.AbstractCoordinator)\") ending=i }; for (j=beginning;j<ending;j++) printf $j; printf \"\\n\"}\'" % processor.LOG_FILE, allow_fail=True)) def extract_highest_generation(self, found_generations): return int(found_generations[-1]) def verify_metadata_no_upgraded_yet(self): for p in self.processors: found = list( p.node.account.ssh_capture( "grep \"Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.\" " + p.LOG_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'group member upgraded to metadata 4 too early'" )
class StreamsUpgradeTest(Test): """ Test upgrading Kafka Streams (all version combination) If metadata was changes, upgrade is more difficult Metadata version was bumped in 0.10.1.0 """ def __init__(self, test_context): super(StreamsUpgradeTest, self).__init__(test_context) self.topics = { 'echo': { 'partitions': 5 }, 'data': { 'partitions': 5 }, } self.leader = None def perform_broker_upgrade(self, to_version): self.logger.info("First pass bounce - rolling broker upgrade") for node in self.kafka.nodes: self.kafka.stop_node(node) node.version = KafkaVersion(to_version) self.kafka.start_node(node) @cluster(num_nodes=6) @matrix(from_version=broker_upgrade_versions, to_version=broker_upgrade_versions) def test_upgrade_downgrade_brokers(self, from_version, to_version): """ Start a smoke test client then perform rolling upgrades on the broker. """ if from_version == to_version: return self.replication = 3 self.partitions = 1 self.isr = 2 self.topics = { 'echo': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'data': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'min': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'max': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'sum': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'dif': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'cnt': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'avg': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'wcnt': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } }, 'tagg': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": self.isr } } } # Setup phase self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() # number of nodes needs to be >= 3 for the smoke test self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=KafkaVersion(from_version), topics=self.topics) self.kafka.start() # allow some time for topics to be created time.sleep(10) self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.processor1 = StreamsSmokeTestJobRunnerService( self.test_context, self.kafka) self.driver.start() self.processor1.start() time.sleep(15) self.perform_broker_upgrade(to_version) time.sleep(15) self.driver.wait() self.driver.stop() self.processor1.stop() node = self.driver.node node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False) self.processor1.node.account.ssh_capture( "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False) @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions) def test_simple_upgrade_downgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_version> """ if from_version == to_version: return self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() 
self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # upgrade one-by-one via rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_rolling_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() #@matrix(from_version=metadata_1_versions, to_version=backward_compatible_metadata_2_versions) @matrix(from_version=metadata_1_versions, to_version=metadata_3_versions) @matrix(from_version=metadata_2_versions, to_version=metadata_3_versions) def test_metadata_upgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with version <from_version> and upgrades one-by-one to <to_version> """ self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService( self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # first rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_rolling_bounce(p, from_version[:-2], to_version, counter) counter = counter + 1 # second rolling bounce random.shuffle(self.processors) for p in self.processors: self.do_rolling_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() def start_all_nodes_with(self, version): # start first with <version> self.prepare_for(self.processor1, version) node1 = self.processor1.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor: with node1.account.monitor_log( self.processor1.LOG_FILE) as log_monitor: self.processor1.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node1.account)) monitor.wait_until( "processed 100 records from topic", timeout_sec=60, 
err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) # start second with <version> self.prepare_for(self.processor2, version) node2 = self.processor2.node with node1.account.monitor_log( self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log( self.processor2.STDOUT_FILE) as second_monitor: with node2.account.monitor_log( self.processor2.LOG_FILE) as log_monitor: self.processor2.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node2.account)) first_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node2.account)) # start third with <version> self.prepare_for(self.processor3, version) node3 = self.processor3.node with node1.account.monitor_log( self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log( self.processor2.STDOUT_FILE) as second_monitor: with node3.account.monitor_log( self.processor3.STDOUT_FILE) as third_monitor: with node3.account.monitor_log( self.processor3.LOG_FILE) as log_monitor: self.processor3.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node3.account)) first_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node2.account)) third_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node3.account)) @staticmethod def prepare_for(processor, version): processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False) if version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(version) def do_rolling_bounce(self, processor, upgrade_from, new_version, counter): first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node # stop processor and wait for rebalance of others with first_other_node.account.monitor_log( first_other_processor.STDOUT_FILE) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.STDOUT_FILE ) as second_other_monitor: processor.stop() first_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) second_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) if upgrade_from is None: # upgrade disabled -- second round of rolling bounces roll_counter = ".1-" # second round of rolling bounces else: 
roll_counter = ".0-" # first round of rolling boundes node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + roll_counter + str(counter), allow_fail=False) if new_version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(new_version) processor.set_upgrade_from(upgrade_from) grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" " with node.account.monitor_log(processor.STDOUT_FILE) as monitor: with node.account.monitor_log(processor.LOG_FILE) as log_monitor: with first_other_node.account.monitor_log( first_other_processor.STDOUT_FILE ) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.STDOUT_FILE ) as second_other_monitor: processor.start() log_monitor.wait_until( "Kafka version : " + new_version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + new_version + " " + str(node.account)) first_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) found = list( first_other_node.account.ssh_capture( grep_metadata_error + first_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'unable to decode subscription data: version=2'" ) second_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) found = list( second_other_node.account.ssh_capture( grep_metadata_error + second_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'unable to decode subscription data: version=2'" ) monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node.account))
class StreamsUpgradeTest(KafkaTest): """ Test upgrading Kafka Streams (all version combination) If metadata was changes, upgrade is more difficult Metadata version was bumped in 0.10.1.0 """ def __init__(self, test_context): super(StreamsUpgradeTest, self).__init__(test_context, num_zk=1, num_brokers=1, topics={ 'echo': { 'partitions': 5 }, 'data': { 'partitions': 5 } }) self.driver = StreamsSmokeTestDriverService(test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService( test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService( test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService( test_context, self.kafka) @parametrize(old_version=str(LATEST_0_10_1), new_version=str(LATEST_0_10_2)) @parametrize(old_version=str(LATEST_0_10_1), new_version=str(DEV_VERSION)) @parametrize(old_version=str(LATEST_0_10_2), new_version=str(DEV_VERSION)) def test_simple_upgrade(self, old_version, new_version): """ Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_verion> """ self.driver.start() self.start_all_nodes_with(old_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_rolling_bounce(p, "", new_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() #@parametrize(new_version=str(LATEST_0_10_1)) we cannot run this test until Kafka 0.10.1.2 is released #@parametrize(new_version=str(LATEST_0_10_2)) we cannot run this test until Kafka 0.10.2.2 is released @parametrize(new_version=str(DEV_VERSION)) def test_metadata_upgrade(self, new_version): """ Starts 3 KafkaStreams instances with version 0.10.0, and upgrades one-by-one to <new_version> """ self.driver.start() self.start_all_nodes_with(str(LATEST_0_10_0)) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # first rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_rolling_bounce(p, "0.10.0", new_version, counter) counter = counter + 1 # second rolling bounce random.shuffle(self.processors) for p in self.processors: self.do_rolling_bounce(p, "", new_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until( "UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() def start_all_nodes_with(self, version): # start first with <version> self.prepare_for(self.processor1, version) node1 = self.processor1.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor: with node1.account.monitor_log( self.processor1.LOG_FILE) as log_monitor: self.processor1.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node1.account)) monitor.wait_until( "processed 100 records from 
topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) # start second with <version> self.prepare_for(self.processor2, version) node2 = self.processor2.node with node1.account.monitor_log( self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log( self.processor2.STDOUT_FILE) as second_monitor: with node2.account.monitor_log( self.processor2.LOG_FILE) as log_monitor: self.processor2.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node2.account)) first_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node2.account)) # start third with <version> self.prepare_for(self.processor3, version) node3 = self.processor3.node with node1.account.monitor_log( self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log( self.processor2.STDOUT_FILE) as second_monitor: with node3.account.monitor_log( self.processor3.STDOUT_FILE) as third_monitor: with node3.account.monitor_log( self.processor3.LOG_FILE) as log_monitor: self.processor3.start() log_monitor.wait_until( "Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node3.account)) first_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node2.account)) third_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node3.account)) @staticmethod def prepare_for(processor, version): processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False) processor.set_version(version) def do_rolling_bounce(self, processor, upgrade_from, new_version, counter): first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node # stop processor and wait for rebalance of others with first_other_node.account.monitor_log( first_other_processor.STDOUT_FILE) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.STDOUT_FILE ) as second_other_monitor: processor.stop() first_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) second_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) if upgrade_from == "": # upgrade disabled -- second round of rolling bounces roll_counter = ".1-" # second round of rolling bounces else: roll_counter = ".0-" # first round of rolling boundes 
node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + roll_counter + str(counter), allow_fail=False) if new_version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(new_version) processor.set_upgrade_from(upgrade_from) grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" " with node.account.monitor_log(processor.STDOUT_FILE) as monitor: with node.account.monitor_log(processor.LOG_FILE) as log_monitor: with first_other_node.account.monitor_log( first_other_processor.STDOUT_FILE ) as first_other_monitor: with second_other_node.account.monitor_log( second_other_processor.STDOUT_FILE ) as second_other_monitor: processor.start() log_monitor.wait_until( "Kafka version : " + new_version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + new_version + " " + str(node.account)) first_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) found = list( first_other_node.account.ssh_capture( grep_metadata_error + first_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'unable to decode subscription data: version=2'" ) second_other_monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) found = list( second_other_node.account.ssh_capture( grep_metadata_error + second_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception( "Kafka Streams failed with 'unable to decode subscription data: version=2'" ) monitor.wait_until( "processed 100 records from topic", timeout_sec=60, err_msg= "Never saw output 'processed 100 records from topic' on" + str(node.account))
class StreamsUpgradeTest(Test):
    """
    Tests rolling upgrades and downgrades of the Kafka Streams library.
    """

    def __init__(self, test_context):
        super(StreamsUpgradeTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 1
        self.isr = 2
        self.topics = {
            'echo': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'data': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'min': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'max': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'sum': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'dif': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'cnt': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'avg': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'wcnt': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}},
            'tagg': {'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}}
        }

    def perform_streams_upgrade(self, to_version):
        self.logger.info("First pass bounce - rolling streams upgrade")

        # get the node running the streams app
        node = self.processor1.node
        self.processor1.stop()

        # change its version. This will automatically make it pick up a different
        # JAR when it starts again
        node.version = KafkaVersion(to_version)

        self.processor1.start()

    def perform_broker_upgrade(self, to_version):
        self.logger.info("First pass bounce - rolling broker upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = KafkaVersion(to_version)
            self.kafka.start_node(node)

    @cluster(num_nodes=6)
    @parametrize(from_version=str(LATEST_0_10_1), to_version=str(DEV_BRANCH))
    @parametrize(from_version=str(LATEST_0_10_2), to_version=str(DEV_BRANCH))
    @parametrize(from_version=str(LATEST_0_10_1), to_version=str(LATEST_0_11_0))
    @parametrize(from_version=str(LATEST_0_10_2), to_version=str(LATEST_0_11_0))
    @parametrize(from_version=str(LATEST_0_11_0), to_version=str(LATEST_0_10_2))
    @parametrize(from_version=str(DEV_BRANCH), to_version=str(LATEST_0_10_2))
    def test_upgrade_downgrade_streams(self, from_version, to_version):
        """
        Start a smoke test client, then abort (kill -9) and restart it a few times.
        Ensure that all records are delivered.

        Note that, just like tests/core/upgrade_test.py, a prerequisite for this test to succeed
        is the inclusion of all parametrized versions of kafka in kafka/vagrant/base.sh
        (search for get_kafka()). For streams in particular, that means that someone has manually
        copied the kafka-stream-$version-test.jar into the right S3 bucket as shown in base.sh.
        """

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk,
                                  version=KafkaVersion(from_version), topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()

        time.sleep(15)

        self.perform_streams_upgrade(to_version)

        time.sleep(15)

        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False)
        self.processor1.node.account.ssh_capture(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False)

    @cluster(num_nodes=6)
    @parametrize(from_version=str(LATEST_0_10_2), to_version=str(DEV_BRANCH))
    def test_upgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client, then perform rolling upgrades on the broker.
        """

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk,
                                  version=KafkaVersion(from_version), topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()

        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)

        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False)
        self.processor1.node.account.ssh_capture(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False)
class StreamsUpgradeTest(Test): """ Test upgrading Kafka Streams (all version combination) If metadata was changes, upgrade is more difficult Metadata version was bumped in 0.10.1.0 """ def __init__(self, test_context): super(StreamsUpgradeTest, self).__init__(test_context) self.topics = { 'echo' : { 'partitions': 5 }, 'data' : { 'partitions': 5 }, } self.leader = None def perform_broker_upgrade(self, to_version): self.logger.info("First pass bounce - rolling broker upgrade") for node in self.kafka.nodes: self.kafka.stop_node(node) node.version = KafkaVersion(to_version) self.kafka.start_node(node) @ignore @cluster(num_nodes=6) @matrix(from_version=broker_upgrade_versions, to_version=broker_upgrade_versions) def test_upgrade_downgrade_brokers(self, from_version, to_version): """ Start a smoke test client then perform rolling upgrades on the broker. """ if from_version == to_version: return self.replication = 3 self.partitions = 1 self.isr = 2 self.topics = { 'echo' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}}, 'data' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'min' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'max' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'sum' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'dif' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'cnt' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'avg' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'wcnt' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'tagg' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} } } # Setup phase self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() # number of nodes needs to be >= 3 for the smoke test self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=KafkaVersion(from_version), topics=self.topics) self.kafka.start() # allow some time for topics to be created time.sleep(10) self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka) self.driver.start() self.processor1.start() time.sleep(15) self.perform_broker_upgrade(to_version) time.sleep(15) self.driver.wait() self.driver.stop() self.processor1.stop() node = self.driver.node node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False) self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False) @ignore @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions) def test_simple_upgrade_downgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_version> """ if from_version == to_version: return self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() 
self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # upgrade one-by-one via rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_rolling_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() #@matrix(from_version=metadata_1_versions, to_version=backward_compatible_metadata_2_versions) @ignore @matrix(from_version=metadata_1_versions, to_version=metadata_3_versions) @matrix(from_version=metadata_2_versions, to_version=metadata_3_versions) def test_metadata_upgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with version <from_version> and upgrades one-by-one to <to_version> """ self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # first rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_rolling_bounce(p, from_version[:-2], to_version, counter) counter = counter + 1 # second rolling bounce random.shuffle(self.processors) for p in self.processors: self.do_rolling_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() def start_all_nodes_with(self, version): # start first with <version> self.prepare_for(self.processor1, version) node1 = self.processor1.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor: with node1.account.monitor_log(self.processor1.LOG_FILE) as log_monitor: self.processor1.start() log_monitor.wait_until("Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node1.account)) monitor.wait_until("processed 100 records from topic", timeout_sec=60, 
err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account)) # start second with <version> self.prepare_for(self.processor2, version) node2 = self.processor2.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor: with node2.account.monitor_log(self.processor2.LOG_FILE) as log_monitor: self.processor2.start() log_monitor.wait_until("Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node2.account)) first_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node2.account)) # start third with <version> self.prepare_for(self.processor3, version) node3 = self.processor3.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor: with node3.account.monitor_log(self.processor3.STDOUT_FILE) as third_monitor: with node3.account.monitor_log(self.processor3.LOG_FILE) as log_monitor: self.processor3.start() log_monitor.wait_until("Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node3.account)) first_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node2.account)) third_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node3.account)) @staticmethod def prepare_for(processor, version): processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False) if version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(version) def do_rolling_bounce(self, processor, upgrade_from, new_version, counter): first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node # stop processor and wait for rebalance of others with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor: with second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor: processor.stop() first_other_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) second_other_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) if upgrade_from is None: # upgrade disabled -- second round of rolling bounces roll_counter = ".1-" # second round of rolling bounces else: roll_counter = ".0-" # 
first round of rolling boundes node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + roll_counter + str(counter), allow_fail=False) if new_version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(new_version) processor.set_upgrade_from(upgrade_from) grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" " with node.account.monitor_log(processor.STDOUT_FILE) as monitor: with node.account.monitor_log(processor.LOG_FILE) as log_monitor: with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor: with second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor: processor.start() log_monitor.wait_until("Kafka version : " + new_version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + new_version + " " + str(node.account)) first_other_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) found = list(first_other_node.account.ssh_capture(grep_metadata_error + first_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'") second_other_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) found = list(second_other_node.account.ssh_capture(grep_metadata_error + second_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'") monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node.account))
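# Illustrative sketch of the log-rolling convention used by do_rolling_bounce
# above: before a processor is restarted, its stdout/stderr/log files are
# renamed with a ".<round>-<counter>" suffix (".0-" for the first rolling
# bounce, ".1-" for the second) so each round keeps a separate copy for later
# inspection. The base file names below are assumptions for illustration only.
def rolled_file_name(base_file, second_round, counter):
    roll_counter = ".1-" if second_round else ".0-"
    return base_file + roll_counter + str(counter)

assert rolled_file_name("streams.stdout", second_round=False, counter=2) == "streams.stdout.0-2"
assert rolled_file_name("streams.log", second_round=True, counter=3) == "streams.log.1-3"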
class StreamsUpgradeTest(Test): """ Test upgrading Kafka Streams (all version combination) If metadata was changes, upgrade is more difficult Metadata version was bumped in 0.10.1.0 """ def __init__(self, test_context): super(StreamsUpgradeTest, self).__init__(test_context) self.topics = { 'echo' : { 'partitions': 5 }, 'data' : { 'partitions': 5 }, } self.leader = None self.leader_counter = {} def perform_broker_upgrade(self, to_version): self.logger.info("First pass bounce - rolling broker upgrade") for node in self.kafka.nodes: self.kafka.stop_node(node) node.version = KafkaVersion(to_version) self.kafka.start_node(node) @ignore @cluster(num_nodes=6) @matrix(from_version=broker_upgrade_versions, to_version=broker_upgrade_versions) def test_upgrade_downgrade_brokers(self, from_version, to_version): """ Start a smoke test client then perform rolling upgrades on the broker. """ if from_version == to_version: return self.replication = 3 self.partitions = 1 self.isr = 2 self.topics = { 'echo' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr}}, 'data' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'min' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'max' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'sum' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'dif' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'cnt' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'avg' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'wcnt' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} }, 'tagg' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": self.isr} } } # Setup phase self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() # number of nodes needs to be >= 3 for the smoke test self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=KafkaVersion(from_version), topics=self.topics) self.kafka.start() # allow some time for topics to be created time.sleep(10) self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka) self.driver.start() self.processor1.start() time.sleep(15) self.perform_broker_upgrade(to_version) time.sleep(15) self.driver.wait() self.driver.stop() self.processor1.stop() node = self.driver.node node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False) self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False) @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions) def test_simple_upgrade_downgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_version> """ if from_version == to_version: return self.zk = ZookeeperService(self.test_context, num_nodes=1) 
self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # upgrade one-by-one via rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_stop_start_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() @matrix(from_version=metadata_1_versions, to_version=backward_compatible_metadata_2_versions) @matrix(from_version=metadata_1_versions, to_version=metadata_3_or_higher_versions) @matrix(from_version=metadata_2_versions, to_version=metadata_3_or_higher_versions) def test_metadata_upgrade(self, from_version, to_version): """ Starts 3 KafkaStreams instances with version <from_version> and upgrades one-by-one to <to_version> """ self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with(from_version) self.processors = [self.processor1, self.processor2, self.processor3] counter = 1 random.seed() # first rolling bounce random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False self.do_stop_start_bounce(p, from_version[:-2], to_version, counter) counter = counter + 1 # second rolling bounce random.shuffle(self.processors) for p in self.processors: self.do_stop_start_bounce(p, None, to_version, counter) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() def test_version_probing_upgrade(self): """ Starts 3 KafkaStreams instances, and upgrades one-by-one to "future version" """ self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics) self.kafka.start() self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.driver.disable_auto_terminate() self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) 
self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka) self.driver.start() self.start_all_nodes_with("") # run with TRUNK self.processors = [self.processor1, self.processor2, self.processor3] self.old_processors = [self.processor1, self.processor2, self.processor3] self.upgraded_processors = [] for p in self.processors: self.leader_counter[p] = 2 self.update_leader() for p in self.processors: self.leader_counter[p] = 0 self.leader_counter[self.leader] = 3 counter = 1 current_generation = 3 random.seed() random.shuffle(self.processors) for p in self.processors: p.CLEAN_NODE_ENABLED = False current_generation = self.do_rolling_bounce(p, counter, current_generation) counter = counter + 1 # shutdown self.driver.stop() self.driver.wait() random.shuffle(self.processors) for p in self.processors: node = p.node with node.account.monitor_log(p.STDOUT_FILE) as monitor: p.stop() monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED", timeout_sec=60, err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account)) self.driver.stop() def update_leader(self): self.leader = None retries = 10 while retries > 0: for p in self.processors: found = list(p.node.account.ssh_capture("grep \"Finished assignment for group\" %s" % p.LOG_FILE, allow_fail=True)) if len(found) == self.leader_counter[p] + 1: if self.leader is not None: raise Exception("Could not uniquely identify leader") self.leader = p self.leader_counter[p] = self.leader_counter[p] + 1 if self.leader is None: retries = retries - 1 time.sleep(5) else: break if self.leader is None: raise Exception("Could not identify leader") def start_all_nodes_with(self, version): # start first with <version> self.prepare_for(self.processor1, version) node1 = self.processor1.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor: with node1.account.monitor_log(self.processor1.LOG_FILE) as log_monitor: self.processor1.start() log_monitor.wait_until("Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node1.account)) monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account)) # start second with <version> self.prepare_for(self.processor2, version) node2 = self.processor2.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor: with node2.account.monitor_log(self.processor2.LOG_FILE) as log_monitor: self.processor2.start() log_monitor.wait_until("Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node2.account)) first_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node2.account)) # start third with <version> self.prepare_for(self.processor3, version) node3 = self.processor3.node with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor: with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor: with node3.account.monitor_log(self.processor3.STDOUT_FILE) as third_monitor: with 
node3.account.monitor_log(self.processor3.LOG_FILE) as log_monitor: self.processor3.start() log_monitor.wait_until("Kafka version : " + version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + version + " " + str(node3.account)) first_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account)) second_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node2.account)) third_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node3.account)) @staticmethod def prepare_for(processor, version): processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False) if version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(version) def do_stop_start_bounce(self, processor, upgrade_from, new_version, counter): first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node # stop processor and wait for rebalance of others with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor: with second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor: processor.stop() first_other_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) second_other_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) if upgrade_from is None: # upgrade disabled -- second round of rolling bounces roll_counter = ".1-" # second round of rolling bounces else: roll_counter = ".0-" # first round of rolling boundes node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + roll_counter + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + roll_counter + str(counter), allow_fail=False) if new_version == str(DEV_VERSION): processor.set_version("") # set to TRUNK else: processor.set_version(new_version) processor.set_upgrade_from(upgrade_from) grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" " with node.account.monitor_log(processor.STDOUT_FILE) as monitor: with node.account.monitor_log(processor.LOG_FILE) as log_monitor: with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor: with second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor: processor.start() log_monitor.wait_until("Kafka version : " + new_version, timeout_sec=60, err_msg="Could not detect Kafka Streams version " + new_version + " " + str(node.account)) first_other_monitor.wait_until("processed 100 records 
from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(first_other_node.account)) found = list(first_other_node.account.ssh_capture(grep_metadata_error + first_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'") second_other_monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(second_other_node.account)) found = list(second_other_node.account.ssh_capture(grep_metadata_error + second_other_processor.STDERR_FILE, allow_fail=True)) if len(found) > 0: raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'") monitor.wait_until("processed 100 records from topic", timeout_sec=60, err_msg="Never saw output 'processed 100 records from topic' on" + str(node.account)) def do_rolling_bounce(self, processor, counter, current_generation): first_other_processor = None second_other_processor = None for p in self.processors: if p != processor: if first_other_processor is None: first_other_processor = p else: second_other_processor = p node = processor.node first_other_node = first_other_processor.node second_other_node = second_other_processor.node with first_other_node.account.monitor_log(first_other_processor.LOG_FILE) as first_other_monitor: with second_other_node.account.monitor_log(second_other_processor.LOG_FILE) as second_other_monitor: # stop processor processor.stop() node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False) node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + "." + str(counter), allow_fail=False) node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + "." + str(counter), allow_fail=False) node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + "." 
+ str(counter), allow_fail=False) self.leader_counter[processor] = 0 with node.account.monitor_log(processor.LOG_FILE) as log_monitor: processor.set_upgrade_to("future_version") processor.start() self.old_processors.remove(processor) self.upgraded_processors.append(processor) current_generation = current_generation + 1 log_monitor.wait_until("Kafka version : " + str(DEV_VERSION), timeout_sec=60, err_msg="Could not detect Kafka Streams version " + str(DEV_VERSION) + " in " + str(node.account)) log_monitor.offset = 5 log_monitor.wait_until("partition\.assignment\.strategy = \[org\.apache\.kafka\.streams\.tests\.StreamsUpgradeTest$FutureStreamsPartitionAssignor\]", timeout_sec=60, err_msg="Could not detect FutureStreamsPartitionAssignor in " + str(node.account)) log_monitor.wait_until("Successfully joined group with generation " + str(current_generation), timeout_sec=60, err_msg="Never saw output 'Successfully joined group with generation " + str(current_generation) + "' on" + str(node.account)) first_other_monitor.wait_until("Successfully joined group with generation " + str(current_generation), timeout_sec=60, err_msg="Never saw output 'Successfully joined group with generation " + str(current_generation) + "' on" + str(first_other_node.account)) second_other_monitor.wait_until("Successfully joined group with generation " + str(current_generation), timeout_sec=60, err_msg="Never saw output 'Successfully joined group with generation " + str(current_generation) + "' on" + str(second_other_node.account)) if processor == self.leader: self.update_leader() else: self.leader_counter[self.leader] = self.leader_counter[self.leader] + 1 if processor == self.leader: leader_monitor = log_monitor elif first_other_processor == self.leader: leader_monitor = first_other_monitor elif second_other_processor == self.leader: leader_monitor = second_other_monitor else: raise Exception("Could not identify leader.") monitors = {} monitors[processor] = log_monitor monitors[first_other_processor] = first_other_monitor monitors[second_other_processor] = second_other_monitor leader_monitor.wait_until("Received a future (version probing) subscription (version: 5). Sending empty assignment back (with supported version 4).", timeout_sec=60, err_msg="Could not detect 'version probing' attempt at leader " + str(self.leader.node.account)) if len(self.old_processors) > 0: log_monitor.wait_until("Sent a version 5 subscription and got version 4 assignment back (successful version probing). Downgrading subscription metadata to received version and trigger new rebalance.", timeout_sec=60, err_msg="Could not detect 'successful version probing' at upgrading node " + str(node.account)) else: log_monitor.wait_until("Sent a version 5 subscription and got version 4 assignment back (successful version probing). Setting subscription metadata to leaders supported version 5 and trigger new rebalance.", timeout_sec=60, err_msg="Could not detect 'successful version probing with upgraded leader' at upgrading node " + str(node.account)) first_other_monitor.wait_until("Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.", timeout_sec=60, err_msg="Never saw output 'Upgrade metadata to version 4' on" + str(first_other_node.account)) second_other_monitor.wait_until("Sent a version 4 subscription and group leader.s latest supported version is 5. 
Upgrading subscription metadata version to 5 for next rebalance.", timeout_sec=60, err_msg="Never saw output 'Upgrade metadata to version 4' on" + str(second_other_node.account)) log_monitor.wait_until("Version probing detected. Triggering new rebalance.", timeout_sec=60, err_msg="Could not detect 'Triggering new rebalance' at upgrading node " + str(node.account)) # version probing should trigger second rebalance current_generation = current_generation + 1 for p in self.processors: monitors[p].wait_until("Successfully joined group with generation " + str(current_generation), timeout_sec=60, err_msg="Never saw output 'Successfully joined group with generation " + str(current_generation) + "' on" + str(p.node.account)) if processor == self.leader: self.update_leader() else: self.leader_counter[self.leader] = self.leader_counter[self.leader] + 1 if self.leader in self.old_processors or len(self.old_processors) > 0: self.verify_metadata_no_upgraded_yet() return current_generation def verify_metadata_no_upgraded_yet(self): for p in self.processors: found = list(p.node.account.ssh_capture("grep \"Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.\" " + p.LOG_FILE, allow_fail=True)) if len(found) > 0: raise Exception("Kafka Streams failed with 'group member upgraded to metadata 4 too early'")
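# Illustrative sketch of the generation bookkeeping that
# test_version_probing_upgrade above relies on: the group reaches generation 3
# while the three processors start up, and every rolling bounce then consumes
# two generations -- one when the bounced member rejoins, and a second one
# because version probing immediately triggers another rebalance. The default
# numbers below simply mirror the values used in the test.
def expected_final_generation(start_generation=3, bounced_members=3, rebalances_per_bounce=2):
    return start_generation + bounced_members * rebalances_per_bounce

assert expected_final_generation() == 9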
class StreamsSmokeTest(KafkaTest): """ Simple test of Kafka Streams. """ def __init__(self, test_context): super(StreamsSmokeTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics={ 'echo' : { 'partitions': 5, 'replication-factor': 1 }, 'data' : { 'partitions': 5, 'replication-factor': 1 }, 'min' : { 'partitions': 5, 'replication-factor': 1 }, 'min-suppressed' : { 'partitions': 5, 'replication-factor': 1 }, 'min-raw' : { 'partitions': 5, 'replication-factor': 1 }, 'max' : { 'partitions': 5, 'replication-factor': 1 }, 'sum' : { 'partitions': 5, 'replication-factor': 1 }, 'sws-raw' : { 'partitions': 5, 'replication-factor': 1 }, 'sws-suppressed' : { 'partitions': 5, 'replication-factor': 1 }, 'dif' : { 'partitions': 5, 'replication-factor': 1 }, 'cnt' : { 'partitions': 5, 'replication-factor': 1 }, 'avg' : { 'partitions': 5, 'replication-factor': 1 }, 'wcnt' : { 'partitions': 5, 'replication-factor': 1 }, 'tagg' : { 'partitions': 5, 'replication-factor': 1 } }) self.test_context = test_context self.driver = StreamsSmokeTestDriverService(test_context, self.kafka) @cluster(num_nodes=8) @matrix(processing_guarantee=['at_least_once'], crash=[True, False], metadata_quorum=quorum.all_non_upgrade) @matrix(processing_guarantee=['exactly_once', 'exactly_once_v2'], crash=[True, False]) def test_streams(self, processing_guarantee, crash, metadata_quorum=quorum.zk): processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka, processing_guarantee) processor2 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka, processing_guarantee) processor3 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka, processing_guarantee) with processor1.node.account.monitor_log(processor1.STDOUT_FILE) as monitor1: processor1.start() monitor1.wait_until('REBALANCING -> RUNNING', timeout_sec=60, err_msg="Never saw 'REBALANCING -> RUNNING' message " + str(processor1.node.account) ) self.driver.start() monitor1.wait_until('processed', timeout_sec=30, err_msg="Didn't see any processing messages " + str(processor1.node.account) ) # make sure we're not already done processing (which would invalidate the test) self.driver.node.account.ssh("! grep 'Result Verification' %s" % self.driver.STDOUT_FILE, allow_fail=False) processor1.stop_nodes(not crash) with processor2.node.account.monitor_log(processor2.STDOUT_FILE) as monitor2: processor2.start() monitor2.wait_until('REBALANCING -> RUNNING', timeout_sec=120, err_msg="Never saw 'REBALANCING -> RUNNING' message " + str(processor2.node.account) ) monitor2.wait_until('processed', timeout_sec=30, err_msg="Didn't see any processing messages " + str(processor2.node.account) ) # make sure we're not already done processing (which would invalidate the test) self.driver.node.account.ssh("! grep 'Result Verification' %s" % self.driver.STDOUT_FILE, allow_fail=False) processor2.stop_nodes(not crash) with processor3.node.account.monitor_log(processor3.STDOUT_FILE) as monitor3: processor3.start() monitor3.wait_until('REBALANCING -> RUNNING', timeout_sec=120, err_msg="Never saw 'REBALANCING -> RUNNING' message " + str(processor3.node.account) ) # there should still be some data left for this processor to work on. 
monitor3.wait_until('processed', timeout_sec=30, err_msg="Didn't see any processing messages " + str(processor3.node.account) ) self.driver.wait() self.driver.stop() processor3.stop() if crash and processing_guarantee == 'at_least_once': self.driver.node.account.ssh("grep -E 'SUCCESS|PROCESSED-MORE-THAN-GENERATED' %s" % self.driver.STDOUT_FILE, allow_fail=False) else: self.driver.node.account.ssh("grep SUCCESS %s" % self.driver.STDOUT_FILE, allow_fail=False)
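# Illustrative sketch of the verification rule applied at the end of
# test_streams above: a hard crash under at_least_once may replay records, so
# the driver is allowed to report PROCESSED-MORE-THAN-GENERATED; for clean
# shutdowns and for exactly_once / exactly_once_v2 a plain SUCCESS is required.
def expected_success_pattern(processing_guarantee, crash):
    if crash and processing_guarantee == 'at_least_once':
        return "grep -E 'SUCCESS|PROCESSED-MORE-THAN-GENERATED'"
    return "grep SUCCESS"

assert expected_success_pattern('exactly_once', crash=True) == "grep SUCCESS"
assert 'PROCESSED-MORE-THAN-GENERATED' in expected_success_pattern('at_least_once', crash=True)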
class StreamsBrokerBounceTest(Test): """ Simple test of Kafka Streams with brokers failing """ def __init__(self, test_context): super(StreamsBrokerBounceTest, self).__init__(test_context) self.replication = 3 self.partitions = 3 self.topics = { 'echo' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2}}, 'data' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2} }, 'min' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2} }, 'max' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2} }, 'sum' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2} }, 'dif' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2} }, 'cnt' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2} }, 'avg' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2} }, 'wcnt' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2} }, 'tagg' : { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2} }, '__consumer_offsets' : { 'partitions': 50, 'replication-factor': self.replication, 'configs': {"min.insync.replicas": 2} } } def fail_broker_type(self, failure_mode, broker_type): # Pick a random topic and bounce it's leader topic_index = randint(0, len(self.topics.keys()) - 1) topic = self.topics.keys()[topic_index] failures[failure_mode](self, topic, broker_type) def fail_many_brokers(self, failure_mode, num_failures): sig = signal.SIGTERM if (failure_mode == "clean_shutdown"): sig = signal.SIGTERM else: sig = signal.SIGKILL for num in range(0, num_failures - 1): signal_node(self, self.kafka.nodes[num], sig) def confirm_topics_on_all_brokers(self, expected_topic_set): for node in self.kafka.nodes: match_count = 0 # need to iterate over topic_list_generator as kafka.list_topics() # returns a python generator so values are fetched lazily # so we can't just compare directly we must iterate over what's returned topic_list_generator = self.kafka.list_topics(node=node) for topic in topic_list_generator: if topic in expected_topic_set: match_count += 1 if len(expected_topic_set) != match_count: return False return True def setup_system(self, start_processor=True): # Setup phase self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=self.replication, zk=self.zk, topics=self.topics) self.kafka.start() # allow some time for topics to be created wait_until(lambda: self.confirm_topics_on_all_brokers(set(self.topics.keys())), timeout_sec=60, err_msg="Broker did not create all topics in 60 seconds ") # Start test harness self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka) self.driver.start() if (start_processor): self.processor1.start() def collect_results(self, sleep_time_secs): data = {} # End test self.driver.wait() self.driver.stop() self.processor1.stop() node = self.driver.node # Success is declared if streams does not crash when sleep time > 0 # It should give an exception when 
sleep time is 0 since we kill the brokers immediately # and the topic manager cannot create internal topics with the desired replication factor if (sleep_time_secs == 0): output_streams = self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-EXCEPTION %s" % self.processor1.STDOUT_FILE, allow_fail=False) else: output_streams = self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False) for line in output_streams: data["Client closed"] = line # Currently it is hard to guarantee anything about Kafka since we don't have exactly once. # With exactly once in place, success will be defined as ALL-RECORDS-DELIVERED and SUCCESS output = node.account.ssh_capture("grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED|PROCESSED-LESS-THAN-GENERATED' %s" % self.driver.STDOUT_FILE, allow_fail=False) for line in output: data["Records Delivered"] = line output = node.account.ssh_capture("grep -E 'SUCCESS|FAILURE' %s" % self.driver.STDOUT_FILE, allow_fail=False) for line in output: data["Logic Success/Failure"] = line return data @cluster(num_nodes=7) @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"], broker_type=["leader", "controller"], sleep_time_secs=[120]) def test_broker_type_bounce(self, failure_mode, broker_type, sleep_time_secs): """ Start a smoke test client, then kill one particular broker and ensure data is still received. Record if records are delivered. """ self.setup_system() # Sleep to allow test to run for a bit time.sleep(sleep_time_secs) # Fail brokers self.fail_broker_type(failure_mode, broker_type) return self.collect_results(sleep_time_secs) @ignore @cluster(num_nodes=7) @matrix(failure_mode=["clean_shutdown"], broker_type=["controller"], sleep_time_secs=[0]) def test_broker_type_bounce_at_start(self, failure_mode, broker_type, sleep_time_secs): """ Start a smoke test client, then kill one particular broker immediately before streams starts. Streams should throw an exception since it cannot create topics with the desired replication factor of 3 """ self.setup_system(start_processor=False) # Sleep to allow test to run for a bit time.sleep(sleep_time_secs) # Fail brokers self.fail_broker_type(failure_mode, broker_type) self.processor1.start() return self.collect_results(sleep_time_secs) @cluster(num_nodes=7) @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"], num_failures=[2]) def test_many_brokers_bounce(self, failure_mode, num_failures): """ Start a smoke test client, then kill a few brokers and ensure data is still received. Record if records are delivered """ self.setup_system() # Sleep to allow test to run for a bit time.sleep(120) # Fail brokers self.fail_many_brokers(failure_mode, num_failures) return self.collect_results(120) @cluster(num_nodes=7) @matrix(failure_mode=["clean_bounce", "hard_bounce"], num_failures=[3]) def test_all_brokers_bounce(self, failure_mode, num_failures): """ Start a smoke test client, then kill a few brokers and ensure data is still received. Record if records are delivered """ self.setup_system() # Sleep to allow test to run for a bit time.sleep(120) # Fail brokers self.fail_many_brokers(failure_mode, num_failures) return self.collect_results(120)
class StreamsSmokeTest(KafkaTest): """ Simple test of Kafka Streams. """ def __init__(self, test_context): super(StreamsSmokeTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics={ 'echo' : { 'partitions': 5, 'replication-factor': 1 }, 'data' : { 'partitions': 5, 'replication-factor': 1 }, 'min' : { 'partitions': 5, 'replication-factor': 1 }, 'min-suppressed' : { 'partitions': 5, 'replication-factor': 1 }, 'min-raw' : { 'partitions': 5, 'replication-factor': 1 }, 'max' : { 'partitions': 5, 'replication-factor': 1 }, 'sum' : { 'partitions': 5, 'replication-factor': 1 }, 'sws-raw' : { 'partitions': 5, 'replication-factor': 1 }, 'sws-suppressed' : { 'partitions': 5, 'replication-factor': 1 }, 'dif' : { 'partitions': 5, 'replication-factor': 1 }, 'cnt' : { 'partitions': 5, 'replication-factor': 1 }, 'avg' : { 'partitions': 5, 'replication-factor': 1 }, 'wcnt' : { 'partitions': 5, 'replication-factor': 1 }, 'tagg' : { 'partitions': 5, 'replication-factor': 1 } }) self.test_context = test_context self.driver = StreamsSmokeTestDriverService(test_context, self.kafka) @cluster(num_nodes=8) @matrix(eos=[True, False], crash=[True, False]) def test_streams(self, eos, crash): # if eos: processor1 = StreamsSmokeTestEOSJobRunnerService(self.test_context, self.kafka) processor2 = StreamsSmokeTestEOSJobRunnerService(self.test_context, self.kafka) processor3 = StreamsSmokeTestEOSJobRunnerService(self.test_context, self.kafka) else: processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka) processor2 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka) processor3 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka) with processor1.node.account.monitor_log(processor1.STDOUT_FILE) as monitor1: processor1.start() monitor1.wait_until('REBALANCING -> RUNNING', timeout_sec=60, err_msg="Never saw 'REBALANCING -> RUNNING' message " + str(processor1.node.account) ) self.driver.start() monitor1.wait_until('processed', timeout_sec=30, err_msg="Didn't see any processing messages " + str(processor1.node.account) ) # make sure we're not already done processing (which would invalidate the test) self.driver.node.account.ssh("! grep 'Result Verification' %s" % self.driver.STDOUT_FILE, allow_fail=False) processor1.stop_nodes(not crash) with processor2.node.account.monitor_log(processor2.STDOUT_FILE) as monitor2: processor2.start() monitor2.wait_until('REBALANCING -> RUNNING', timeout_sec=120, err_msg="Never saw 'REBALANCING -> RUNNING' message " + str(processor2.node.account) ) monitor2.wait_until('processed', timeout_sec=30, err_msg="Didn't see any processing messages " + str(processor2.node.account) ) # make sure we're not already done processing (which would invalidate the test) self.driver.node.account.ssh("! grep 'Result Verification' %s" % self.driver.STDOUT_FILE, allow_fail=False) processor2.stop_nodes(not crash) with processor3.node.account.monitor_log(processor3.STDOUT_FILE) as monitor3: processor3.start() monitor3.wait_until('REBALANCING -> RUNNING', timeout_sec=120, err_msg="Never saw 'REBALANCING -> RUNNING' message " + str(processor3.node.account) ) # there should still be some data left for this processor to work on. 
monitor3.wait_until('processed', timeout_sec=30, err_msg="Didn't see any processing messages " + str(processor3.node.account) ) self.driver.wait() self.driver.stop() processor3.stop() if crash and not eos: self.driver.node.account.ssh("grep -E 'SUCCESS|PROCESSED-MORE-THAN-GENERATED' %s" % self.driver.STDOUT_FILE, allow_fail=False) else: self.driver.node.account.ssh("grep SUCCESS %s" % self.driver.STDOUT_FILE, allow_fail=False)
class StreamsBrokerBounceTest(Test): """ Simple test of Kafka Streams with brokers failing """ def __init__(self, test_context): super(StreamsBrokerBounceTest, self).__init__(test_context) self.replication = 3 self.partitions = 3 self.topics = { 'echo': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": 2 } }, 'data': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": 2 } }, 'min': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": 2 } }, 'max': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": 2 } }, 'sum': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": 2 } }, 'dif': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": 2 } }, 'cnt': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": 2 } }, 'avg': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": 2 } }, 'wcnt': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": 2 } }, 'tagg': { 'partitions': self.partitions, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": 2 } }, '__consumer_offsets': { 'partitions': 50, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": 2 } } } def fail_broker_type(self, failure_mode, broker_type): # Pick a random topic and bounce it's leader topic_index = randint(0, len(self.topics.keys()) - 1) topic = self.topics.keys()[topic_index] failures[failure_mode](self, topic, broker_type) def fail_many_brokers(self, failure_mode, num_failures): sig = signal.SIGTERM if (failure_mode == "clean_shutdown"): sig = signal.SIGTERM else: sig = signal.SIGKILL for num in range(0, num_failures - 1): signal_node(self, self.kafka.nodes[num], sig) def confirm_topics_on_all_brokers(self, expected_topic_set): for node in self.kafka.nodes: match_count = 0 # need to iterate over topic_list_generator as kafka.list_topics() # returns a python generator so values are fetched lazily # so we can't just compare directly we must iterate over what's returned topic_list_generator = self.kafka.list_topics(node=node) for topic in topic_list_generator: if topic in expected_topic_set: match_count += 1 if len(expected_topic_set) != match_count: return False return True def setup_system(self, start_processor=True, num_threads=3): # Setup phase self.zk = ZookeeperService(self.test_context, num_nodes=1) self.zk.start() self.kafka = KafkaService(self.test_context, num_nodes=self.replication, zk=self.zk, topics=self.topics) self.kafka.start() # allow some time for topics to be created wait_until(lambda: self.confirm_topics_on_all_brokers( set(self.topics.keys())), timeout_sec=60, err_msg="Broker did not create all topics in 60 seconds ") # Start test harness self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka) self.processor1 = StreamsSmokeTestJobRunnerService( self.test_context, self.kafka, "at_least_once", num_threads) self.driver.start() if (start_processor): self.processor1.start() def collect_results(self, sleep_time_secs): data = {} # End test self.driver.wait() self.driver.stop() self.processor1.stop() node = self.driver.node # Success is declared if streams does not 
crash when sleep time > 0 # It should give an exception when sleep time is 0 since we kill the brokers immediately # and the topic manager cannot create internal topics with the desired replication factor if (sleep_time_secs == 0): output_streams = self.processor1.node.account.ssh_capture( "grep SMOKE-TEST-CLIENT-EXCEPTION %s" % self.processor1.STDOUT_FILE, allow_fail=False) else: output_streams = self.processor1.node.account.ssh_capture( "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False) for line in output_streams: data["Client closed"] = line # Currently it is hard to guarantee anything about Kafka since we don't have exactly once. # With exactly once in place, success will be defined as ALL-RECORDS-DELIVERED and SUCCESS output = node.account.ssh_capture( "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED|PROCESSED-LESS-THAN-GENERATED' %s" % self.driver.STDOUT_FILE, allow_fail=False) for line in output: data["Records Delivered"] = line output = node.account.ssh_capture("grep -E 'SUCCESS|FAILURE' %s" % self.driver.STDOUT_FILE, allow_fail=False) for line in output: data["Logic Success/Failure"] = line return data @cluster(num_nodes=7) @matrix(failure_mode=[ "clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce" ], broker_type=["leader", "controller"], num_threads=[1, 3], sleep_time_secs=[120]) def test_broker_type_bounce(self, failure_mode, broker_type, sleep_time_secs, num_threads): """ Start a smoke test client, then kill one particular broker and ensure data is still received. Record if records are delivered. We also add a single-threaded stream client to make sure all partitions can be reassigned in the next generation, so as to verify that the partitions-lost handling is correctly triggered. """ self.setup_system(num_threads=num_threads) # Sleep to allow test to run for a bit time.sleep(sleep_time_secs) # Fail brokers self.fail_broker_type(failure_mode, broker_type) return self.collect_results(sleep_time_secs) @ignore @cluster(num_nodes=7) @matrix(failure_mode=["clean_shutdown"], broker_type=["controller"], sleep_time_secs=[0]) def test_broker_type_bounce_at_start(self, failure_mode, broker_type, sleep_time_secs): """ Start a smoke test client, then kill one particular broker immediately before streams starts. Streams should throw an exception since it cannot create topics with the desired replication factor of 3 """ self.setup_system(start_processor=False) # Sleep to allow test to run for a bit time.sleep(sleep_time_secs) # Fail brokers self.fail_broker_type(failure_mode, broker_type) self.processor1.start() return self.collect_results(sleep_time_secs) @cluster(num_nodes=7) @matrix(failure_mode=[ "clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce" ], num_failures=[2]) def test_many_brokers_bounce(self, failure_mode, num_failures): """ Start a smoke test client, then kill a few brokers and ensure data is still received. Record if records are delivered """ self.setup_system() # Sleep to allow test to run for a bit time.sleep(120) # Fail brokers self.fail_many_brokers(failure_mode, num_failures) return self.collect_results(120) @cluster(num_nodes=7) @matrix(failure_mode=["clean_bounce", "hard_bounce"], num_failures=[3]) def test_all_brokers_bounce(self, failure_mode, num_failures): """ Start a smoke test client, then kill a few brokers and ensure data is still received. Record if records are delivered """ # Set min.insync.replicas to 1 because in the last stage of the test there is only one broker left. 
# Otherwise the last offset commit will never succeed and will time out, potentially taking longer than the # duration passed to the close method of the Kafka Streams client. self.topics['__consumer_offsets'] = { 'partitions': 50, 'replication-factor': self.replication, 'configs': { "min.insync.replicas": 1 } } self.setup_system() # Sleep to allow test to run for a bit time.sleep(120) # Fail brokers self.fail_many_brokers(failure_mode, num_failures) return self.collect_results(120)
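# Illustrative sketch of why test_all_brokers_bounce above relaxes
# min.insync.replicas for __consumer_offsets: towards the end of the bounce
# only a single broker may still be alive, and an offset commit cannot satisfy
# an in-sync-replica requirement that exceeds the number of live brokers, so
# the final commit would otherwise stall until it times out.
def commit_can_succeed(live_brokers, min_insync_replicas):
    return live_brokers >= min_insync_replicas

assert not commit_can_succeed(live_brokers=1, min_insync_replicas=2)
assert commit_can_succeed(live_brokers=1, min_insync_replicas=1)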