def test_rolling_bounces_will_not_trigger_rebalance_under_static_membership(self):
    """Rolling-bounce every static member several times and assert the consumer
    group generation stays constant (static membership must not trigger rebalances).

    Flow: start services and three static-member processors, verify processing,
    perform rolling bounces, then check that the trailing generation log entries
    of every processor all report one single, stable generation number.
    """
    self.zookeeper.start()
    self.kafka.start()

    numThreads = 3
    processor1 = StaticMemberTestService(self.test_context, self.kafka, "consumer-A", numThreads)
    processor2 = StaticMemberTestService(self.test_context, self.kafka, "consumer-B", numThreads)
    processor3 = StaticMemberTestService(self.test_context, self.kafka, "consumer-C", numThreads)

    processors = [processor1, processor2, processor3]

    self.producer.start()

    for processor in processors:
        # keep state on the node across bounces so restarts rejoin with the same member id
        processor.CLEAN_NODE_ENABLED = False
        self.set_topics(processor)
        verify_running(processor, self.running_message)

    self.verify_processing(processors)

    # do several rolling bounces
    num_bounces = 3
    for _ in range(num_bounces):
        for processor in processors:
            verify_stopped(processor, self.stopped_message)
            verify_running(processor, self.running_message)

    stable_generation = -1  # sentinel: first generation seen becomes the expected one
    for processor in processors:
        generations = extract_generation_from_logs(processor)

        # each bounce produces one generation message per stream thread
        num_bounce_generations = num_bounces * numThreads
        assert num_bounce_generations <= len(generations), \
            "Smaller than minimum expected %d generation messages, actual %d" % (num_bounce_generations, len(generations))

        # only the trailing entries belong to the bounce phase; all must agree
        for generation in generations[-num_bounce_generations:]:
            generation = int(generation)
            if stable_generation == -1:
                stable_generation = generation
            assert stable_generation == generation, \
                "Stream rolling bounce have caused unexpected generation bump %d" % generation

    self.verify_processing(processors)

    stop_processors(processors, self.stopped_message)

    self.producer.stop()
    self.kafka.stop()
    self.zookeeper.stop()
def do_rolling_bounce(self, processor, counter, current_generation):
    """Bounce one processor onto the future version and verify the version-probing
    handshake (subscription version 5 vs. assignment version 4) across the group.

    :param processor: the streams processor to stop, upgrade, and restart
    :param counter: bounce round number, used to archive the previous run's output files
    :param current_generation: last known synchronized group generation
    :return: the new synchronized generation after the probing-triggered rebalances
    :raises Exception: if no leader can be identified or the three processors never
        converge on the same generation number
    """
    # Identify the two peers that stay up during this bounce.
    first_other_processor = None
    second_other_processor = None
    for p in self.processors:
        if p != processor:
            if first_other_processor is None:
                first_other_processor = p
            else:
                second_other_processor = p

    node = processor.node
    first_other_node = first_other_processor.node
    second_other_node = second_other_processor.node

    # Monitor the peers' logs from before the bounce so no message is missed.
    with first_other_node.account.monitor_log(first_other_processor.LOG_FILE) as first_other_monitor:
        with second_other_node.account.monitor_log(second_other_processor.LOG_FILE) as second_other_monitor:
            # stop processor; grep confirms it shut down cleanly
            processor.stop()
            node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE,
                                     allow_fail=False)

            # Archive the previous run's output so the monitor below only sees the new process.
            node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + "." + str(counter),
                             allow_fail=False)
            node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + "." + str(counter),
                             allow_fail=False)
            node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + "." + str(counter),
                             allow_fail=False)
            self.leader_counter[processor] = 0

            with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                processor.set_upgrade_to("future_version")
                processor.start()
                self.old_processors.remove(processor)
                self.upgraded_processors.append(processor)

                # checking for the dev version which should be the only SNAPSHOT
                log_monitor.wait_until("Kafka version.*" + self.base_version_number + ".*SNAPSHOT",
                                       timeout_sec=60,
                                       err_msg="Could not detect Kafka Streams version " + str(DEV_VERSION) + " in " + str(node.account))
                log_monitor.offset = 5
                # NOTE(review): the '$' before FutureStreamsPartitionAssignor is unescaped in this
                # regex (it separates the inner class name in the JVM); pattern kept as-is — confirm
                # it matches before tightening.
                log_monitor.wait_until(r"partition\.assignment\.strategy = \[org\.apache\.kafka\.streams\.tests\.StreamsUpgradeTest$FutureStreamsPartitionAssignor\]",
                                       timeout_sec=60,
                                       err_msg="Could not detect FutureStreamsPartitionAssignor in " + str(node.account))

                # Leader bookkeeping for the first (probing) rebalance.
                if processor == self.leader:
                    self.update_leader()
                else:
                    self.leader_counter[self.leader] = self.leader_counter[self.leader] + 1

                if processor == self.leader:
                    leader_monitor = log_monitor
                elif first_other_processor == self.leader:
                    leader_monitor = first_other_monitor
                elif second_other_processor == self.leader:
                    leader_monitor = second_other_monitor
                else:
                    raise Exception("Could not identify leader.")

                monitors = {}
                monitors[processor] = log_monitor
                monitors[first_other_processor] = first_other_monitor
                monitors[second_other_processor] = second_other_monitor

                # The leader must reject the future subscription with an empty assignment.
                leader_monitor.wait_until("Received a future (version probing) subscription (version: 5). Sending empty assignment back (with supported version 4).",
                                          timeout_sec=60,
                                          err_msg="Could not detect 'version probing' attempt at leader " + str(self.leader.node.account))

                if len(self.old_processors) > 0:
                    # Old members remain: the upgraded node must downgrade its metadata.
                    log_monitor.wait_until("Sent a version 5 subscription and got version 4 assignment back (successful version probing). Downgrading subscription metadata to received version and trigger new rebalance.",
                                           timeout_sec=60,
                                           err_msg="Could not detect 'successful version probing' at upgrading node " + str(node.account))
                else:
                    # Everyone is upgraded: the group can move to the leader's supported version.
                    log_monitor.wait_until("Sent a version 5 subscription and got version 4 assignment back (successful version probing). Setting subscription metadata to leaders supported version 5 and trigger new rebalance.",
                                           timeout_sec=60,
                                           err_msg="Could not detect 'successful version probing with upgraded leader' at upgrading node " + str(node.account))

                    first_other_monitor.wait_until("Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.",
                                                   timeout_sec=60,
                                                   err_msg="Never saw output 'Upgrade metadata to version 4' on" + str(first_other_node.account))
                    second_other_monitor.wait_until("Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.",
                                                    timeout_sec=60,
                                                    err_msg="Never saw output 'Upgrade metadata to version 4' on" + str(second_other_node.account))

                log_monitor.wait_until("Version probing detected. Triggering new rebalance.",
                                       timeout_sec=60,
                                       err_msg="Could not detect 'Triggering new rebalance' at upgrading node " + str(node.account))

                # version probing should trigger second rebalance
                # now we check that after consecutive rebalances we have synchronized generation
                generation_synchronized = False
                retries = 0
                while retries < 10:
                    processor_found = extract_generation_from_logs(processor)
                    first_other_processor_found = extract_generation_from_logs(first_other_processor)
                    second_other_processor_found = extract_generation_from_logs(second_other_processor)

                    if len(processor_found) > 0 and len(first_other_processor_found) > 0 and len(second_other_processor_found) > 0:
                        self.logger.info("processor: " + str(processor_found))
                        self.logger.info("first other processor: " + str(first_other_processor_found))
                        self.logger.info("second other processor: " + str(second_other_processor_found))

                        processor_generation = self.extract_highest_generation(processor_found)
                        first_other_processor_generation = self.extract_highest_generation(first_other_processor_found)
                        second_other_processor_generation = self.extract_highest_generation(second_other_processor_found)

                        if processor_generation == first_other_processor_generation and processor_generation == second_other_processor_generation:
                            current_generation = processor_generation
                            generation_synchronized = True
                            break

                    time.sleep(5)
                    retries += 1

                if not generation_synchronized:
                    raise Exception("Never saw all three processors have the synchronized generation number")

                # Leader bookkeeping for the second (probing-triggered) rebalance.
                if processor == self.leader:
                    self.update_leader()
                else:
                    self.leader_counter[self.leader] = self.leader_counter[self.leader] + 1

                if self.leader in self.old_processors or len(self.old_processors) > 0:
                    self.verify_metadata_no_upgraded_yet()

    return current_generation
def do_rolling_bounce(self, processor, counter, current_generation):
    """Bounce one processor onto the future version and verify the version-probing
    handshake (subscription version 10 vs. assignment version 9) plus the scheduled
    follow-up rebalance across the group.

    :param processor: the streams processor to stop, upgrade, and restart
    :param counter: bounce round number, used to archive the previous run's output files
    :param current_generation: last known synchronized group generation
    :return: the new synchronized generation after the probing-triggered rebalances
    :raises Exception: if the three processors never converge on the same generation
    """
    # Identify the two peers that stay up during this bounce.
    first_other_processor = None
    second_other_processor = None
    for p in self.processors:
        if p != processor:
            if first_other_processor is None:
                first_other_processor = p
            else:
                second_other_processor = p

    node = processor.node
    first_other_node = first_other_processor.node
    second_other_node = second_other_processor.node

    # Monitor the peers' logs from before the bounce so no message is missed.
    with first_other_node.account.monitor_log(first_other_processor.LOG_FILE) as first_other_monitor:
        with second_other_node.account.monitor_log(second_other_processor.LOG_FILE) as second_other_monitor:
            # stop processor; grep confirms it shut down cleanly
            processor.stop()
            node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE,
                                     allow_fail=False)

            # Archive the previous run's output so the monitor below only sees the new process.
            node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + "." + str(counter),
                             allow_fail=False)
            node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + "." + str(counter),
                             allow_fail=False)
            node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + "." + str(counter),
                             allow_fail=False)

            with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                processor.set_upgrade_to("future_version")
                processor.start()
                self.old_processors.remove(processor)
                self.upgraded_processors.append(processor)

                # checking for the dev version
                log_monitor.wait_until("Kafka version.*" + self.base_version_number + ".*",
                                       timeout_sec=60,
                                       err_msg="Could not detect Kafka Streams version " + str(DEV_VERSION) + " in " + str(node.account))
                log_monitor.offset = 5
                # NOTE(review): the '$' before FutureStreamsPartitionAssignor is unescaped in this
                # regex (it separates the inner class name in the JVM); pattern kept as-is — confirm
                # it matches before tightening.
                log_monitor.wait_until(r"partition\.assignment\.strategy = \[org\.apache\.kafka\.streams\.tests\.StreamsUpgradeTest$FutureStreamsPartitionAssignor\]",
                                       timeout_sec=60,
                                       err_msg="Could not detect FutureStreamsPartitionAssignor in " + str(node.account))

                monitors = {}
                monitors[processor] = log_monitor
                monitors[first_other_processor] = first_other_monitor
                monitors[second_other_processor] = second_other_monitor

                end_of_upgrade_message = "Sent a version 9 subscription and group.s latest commonly supported version is 10 (successful version probing and end of rolling upgrade). Upgrading subscription metadata version to 10 for next rebalance."
                end_of_upgrade_error_message = "Could not detect 'successful version probing and end of rolling upgrade' at upgraded node "
                followup_rebalance_message = "Triggering the followup rebalance scheduled for 0 ms."
                followup_rebalance_error_message = "Could not detect 'Triggering followup rebalance' at node "
                if len(self.old_processors) > 0:
                    # Old members remain: the upgraded node must downgrade its metadata and rebalance again.
                    log_monitor.wait_until("Sent a version 10 subscription and got version 9 assignment back (successful version probing). Downgrade subscription metadata to commonly supported version 9 and trigger new rebalance.",
                                           timeout_sec=60,
                                           err_msg="Could not detect 'successful version probing' at upgrading node " + str(node.account))
                    log_monitor.wait_until(followup_rebalance_message,
                                           timeout_sec=60,
                                           err_msg=followup_rebalance_error_message + str(node.account))
                else:
                    # Last member bounced: the peers detect the end of the rolling upgrade.
                    first_other_monitor.wait_until(end_of_upgrade_message,
                                                   timeout_sec=60,
                                                   err_msg=end_of_upgrade_error_message + str(first_other_node.account))
                    first_other_monitor.wait_until(followup_rebalance_message,
                                                   timeout_sec=60,
                                                   err_msg=followup_rebalance_error_message + str(node.account))
                    second_other_monitor.wait_until(end_of_upgrade_message,
                                                    timeout_sec=60,
                                                    err_msg=end_of_upgrade_error_message + str(second_other_node.account))
                    second_other_monitor.wait_until(followup_rebalance_message,
                                                    timeout_sec=60,
                                                    err_msg=followup_rebalance_error_message + str(node.account))

                # version probing should trigger second rebalance
                # now we check that after consecutive rebalances we have synchronized generation
                generation_synchronized = False
                retries = 0
                while retries < 10:
                    processor_found = extract_generation_from_logs(processor)
                    first_other_processor_found = extract_generation_from_logs(first_other_processor)
                    second_other_processor_found = extract_generation_from_logs(second_other_processor)

                    if len(processor_found) > 0 and len(first_other_processor_found) > 0 and len(second_other_processor_found) > 0:
                        self.logger.info("processor: " + str(processor_found))
                        self.logger.info("first other processor: " + str(first_other_processor_found))
                        self.logger.info("second other processor: " + str(second_other_processor_found))

                        processor_generation = self.extract_highest_generation(processor_found)
                        first_other_processor_generation = self.extract_highest_generation(first_other_processor_found)
                        second_other_processor_generation = self.extract_highest_generation(second_other_processor_found)

                        if processor_generation == first_other_processor_generation and processor_generation == second_other_processor_generation:
                            current_generation = processor_generation
                            generation_synchronized = True
                            break

                    time.sleep(5)
                    retries += 1

                if not generation_synchronized:
                    raise Exception("Never saw all three processors have the synchronized generation number")

                if len(self.old_processors) > 0:
                    self.verify_metadata_no_upgraded_yet(end_of_upgrade_message)

    return current_generation