Example #1
0
    def test_upgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client, then perform a rolling upgrade of the brokers.

        The test passes when the driver's stdout file contains
        ALL-RECORDS-DELIVERED and the processor's stdout file contains
        SMOKE-TEST-CLIENT-CLOSED.

        :param from_version: version string the brokers are initially started with
        :param to_version: version string handed to perform_broker_upgrade()
        """
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zk, version=KafkaVersion(from_version), topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        # Both checks use the blocking ssh(). ssh_capture() returns a lazily
        # consumed output iterator and only evaluates the exit status once the
        # output has been read, so discarding its result would let a failed
        # grep go unnoticed.
        self.driver.node.account.ssh(
            "grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False)
        self.processor1.node.account.ssh(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False)
    def setup_system(self, start_processor=True):
        """
        Bring up ZooKeeper and Kafka, wait until the topics exist on all brokers,
        then create the smoke test driver and processor.

        The driver is always started.

        :param start_processor: when truthy, processor1 is started as well;
                                otherwise it is only constructed
        """
        context = self.test_context

        # Setup phase
        self.zk = ZookeeperService(context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(context, num_nodes=self.replication, zk=self.zk, topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        def all_topics_created():
            return self.confirm_topics_on_all_brokers(set(self.topics.keys()))

        wait_until(all_topics_created,
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        # Start test harness
        self.driver = StreamsSmokeTestDriverService(context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(context, self.kafka)

        self.driver.start()
        if start_processor:
            self.processor1.start()
Example #3
0
    def test_upgrade_downgrade_streams(self, from_version, to_version):
        """
        Start a smoke test client at from_version, switch it to to_version via
        perform_streams_upgrade(), and ensure that all records are delivered.

        Nothing is run when from_version equals to_version.

        Note that, just like tests/core/upgrade_test.py, a prerequisite for this test to succeed
        is the inclusion of all parametrized versions of kafka in kafka/vagrant/base.sh
        (search for get_kafka()). For streams in particular, that means that someone has manually
        copied the kafka-stream-$version-test.jar to the right S3 bucket as shown in base.sh.

        :param from_version: version string the brokers, driver and processor start with
        :param to_version: version string handed to perform_streams_upgrade()
        """
        if from_version == to_version:
            # Nothing to upgrade or downgrade.
            return

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        # Pin each client node to from_version before it is started.
        self.driver = StreamsSmokeTestDriverService(
            self.test_context, self.kafka)
        self.driver.node.version = KafkaVersion(from_version)
        self.driver.start()

        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor1.node.version = KafkaVersion(from_version)
        self.processor1.start()

        time.sleep(15)

        self.perform_streams_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        # Both checks use the blocking ssh(). ssh_capture() returns a lazily
        # consumed output iterator and only evaluates the exit status once the
        # output has been read, so discarding its result would let a failed
        # grep go unnoticed.
        self.driver.node.account.ssh(
            "grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE,
            allow_fail=False)
        self.processor1.node.account.ssh(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
            allow_fail=False)
Example #4
0
    def __init__(self, test_context):
        """
        Configure the test with 1 ZooKeeper node, 2 brokers and the smoke test
        topics, then create the driver and a single processor service.

        :param test_context: passed on to the parent class and to both services
        """
        topic_names = ('echo', 'data', 'min', 'max', 'sum',
                       'dif', 'cnt', 'avg', 'wcnt', 'tagg')

        # Every topic uses the same settings; each one gets its own dict.
        topics = {}
        for topic in topic_names:
            topics[topic] = {'partitions': 5, 'replication-factor': 2}

        super(StreamsBounceTest, self).__init__(test_context,
                                                num_zk=1,
                                                num_brokers=2,
                                                topics=topics)

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
    def setup_system(self):
        """
        Start ZooKeeper and Kafka, then create and start the smoke test driver
        and a single processor.
        """
        context = self.test_context

        # Setup phase
        self.zk = ZookeeperService(context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(context,
                                  num_nodes=self.replication,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        # Start test harness: both services are created first, then the driver
        # is started before the processor.
        self.driver = StreamsSmokeTestDriverService(context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(context, self.kafka)

        for service in (self.driver, self.processor1):
            service.start()
Example #6
0
class StreamsBounceTest(KafkaTest):
    """
    Simple test of Kafka Streams: the processor is aborted and restarted
    while the smoke test driver is running.
    """

    def __init__(self, test_context):
        """
        Configure 1 ZooKeeper node, 3 brokers and the smoke test topics, then
        create the driver and a single processor service.

        :param test_context: passed on to the parent class and to both services
        """
        topic_names = ('echo', 'data', 'min', 'max', 'sum',
                       'dif', 'cnt', 'avg', 'wcnt', 'tagg')

        # Every topic uses the same settings; each one gets its own dict.
        topics = {}
        for topic in topic_names:
            topics[topic] = {'partitions': 5, 'replication-factor': 2}

        super(StreamsBounceTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics=topics)

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

    @cluster(num_nodes=6)
    def test_bounce(self):
        """
        Start a smoke test client, then abort (kill -9, per the original
        author's note) and restart it twice.
        Ensure that all records are delivered.
        """
        self.driver.start()
        self.processor1.start()

        # First bounce, 15 seconds after start-up.
        time.sleep(15)
        self.processor1.abortThenRestart()

        # Second bounce, 15 seconds after the first one.
        time.sleep(15)

        # enable this after we add change log partition replicas
        #self.kafka.signal_leader("data")

        #time.sleep(15);

        self.processor1.abortThenRestart()

        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        # grep exits non-zero when the marker is missing, which fails the test.
        delivered_check = "grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE
        self.driver.node.account.ssh(delivered_check, allow_fail=False)
class StreamsBounceTest(KafkaTest):
    """
    Simple test of Kafka Streams: the processor is aborted and restarted
    while the smoke test driver is running.
    """

    def __init__(self, test_context):
        """
        Configure 1 ZooKeeper node, 3 brokers and the smoke test topics, then
        create the driver and a single processor service.

        :param test_context: passed on to the parent class and to both services
        """
        shared_names = ('echo', 'data', 'min', 'max', 'sum',
                        'dif', 'cnt', 'avg', 'wcnt', 'tagg')

        # Every topic uses the same settings; each one gets its own dict.
        topic_settings = {}
        for name in shared_names:
            topic_settings[name] = {'partitions': 5, 'replication-factor': 2}

        super(StreamsBounceTest, self).__init__(
            test_context, num_zk=1, num_brokers=3, topics=topic_settings)

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

    @cluster(num_nodes=6)
    def test_bounce(self):
        """
        Start a smoke test client, then abort (kill -9, per the original
        author's note) and restart it twice.
        Ensure that all records are delivered.
        """
        self.driver.start()
        self.processor1.start()

        # Let the client run for a while before the first abort.
        time.sleep(15)
        self.processor1.abortThenRestart()

        # Let it recover before the second abort.
        time.sleep(15)

        # enable this after we add change log partition replicas
        #self.kafka.signal_leader("data")

        #time.sleep(15);

        self.processor1.abortThenRestart()

        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        # grep exits non-zero when the marker is missing, which fails the test.
        driver_node = self.driver.node
        driver_node.account.ssh(
            "grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE,
            allow_fail=False)
Example #8
0
    def __init__(self, test_context):
        """
        Configure 1 ZooKeeper node, 3 brokers and the smoke test topics, then
        create the driver and four processor services.

        :param test_context: passed on to the parent class and to every service
        """
        topic_names = ('echo', 'data', 'min', 'max', 'sum',
                       'dif', 'cnt', 'avg', 'wcnt', 'tagg')

        # Every topic uses the same settings; each one gets its own dict.
        topics = {}
        for topic in topic_names:
            topics[topic] = {'partitions': 5, 'replication-factor': 1}

        super(StreamsSmokeTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics=topics)

        kafka = self.kafka
        self.driver = StreamsSmokeTestDriverService(test_context, kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, kafka)
        self.processor2 = StreamsSmokeTestJobRunnerService(test_context, kafka)
        self.processor3 = StreamsSmokeTestJobRunnerService(test_context, kafka)
        self.processor4 = StreamsSmokeTestJobRunnerService(test_context, kafka)
    def setup_system(self, start_processor=True):
        """
        Start ZooKeeper and Kafka, then create the smoke test driver and processor.

        The driver is always started.

        :param start_processor: when truthy, processor1 is started as well;
                                otherwise it is only constructed
        """
        context = self.test_context

        # Setup phase
        self.zk = ZookeeperService(context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(context,
                                  num_nodes=self.replication,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        # Start test harness
        self.driver = StreamsSmokeTestDriverService(context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(context, self.kafka)

        self.driver.start()
        if start_processor:
            self.processor1.start()
Example #10
0
    def setup_system(self, start_processor=True):
        """
        Start ZooKeeper and Kafka, wait up to 60 seconds for the topics to be
        confirmed on all brokers, then create the smoke test driver and processor.

        The driver is always started.

        :param start_processor: when truthy, processor1 is started as well;
                                otherwise it is only constructed
        """
        context = self.test_context

        # Setup phase
        self.zk = ZookeeperService(context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(context,
                                  num_nodes=self.replication,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        def topics_confirmed():
            expected = set(self.topics.keys())
            return self.confirm_topics_on_all_brokers(expected)

        wait_until(topics_confirmed,
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        # Start test harness
        self.driver = StreamsSmokeTestDriverService(context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(context, self.kafka)

        self.driver.start()
        if start_processor:
            self.processor1.start()
Example #11
0
class StreamsSmokeTest(KafkaTest):
    """
    Simple test of Kafka Streams: processors are started and cleanly stopped
    while the smoke test driver is running.
    """

    def __init__(self, test_context):
        """
        Configure 1 ZooKeeper node, 3 brokers and the smoke test topics, then
        create the driver and four processor services.

        :param test_context: passed on to the parent class and to every service
        """
        topic_names = ('echo', 'data', 'min', 'max', 'sum',
                       'dif', 'cnt', 'avg', 'wcnt', 'tagg')

        # Every topic uses the same settings; each one gets its own dict.
        topics = {}
        for topic in topic_names:
            topics[topic] = {'partitions': 5, 'replication-factor': 1}

        super(StreamsSmokeTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics=topics)

        kafka = self.kafka
        self.driver = StreamsSmokeTestDriverService(test_context, kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, kafka)
        self.processor2 = StreamsSmokeTestJobRunnerService(test_context, kafka)
        self.processor3 = StreamsSmokeTestJobRunnerService(test_context, kafka)
        self.processor4 = StreamsSmokeTestJobRunnerService(test_context, kafka)

    @cluster(num_nodes=9)
    def test_streams(self):
        """
        Start two smoke test clients, then add a third while stopping the first,
        then add a fourth, with 15 second pauses in between.
        Ensure that all results (stats on values computed by Kafka Streams) are correct.
        """
        self.driver.start()

        # Phase 1: two processors running.
        self.processor1.start()
        self.processor2.start()
        time.sleep(15)

        # Phase 2: a third joins and the first is stopped.
        self.processor3.start()
        self.processor1.stop()
        time.sleep(15)

        # Phase 3: a fourth joins.
        self.processor4.start()

        self.driver.wait()
        self.driver.stop()

        for processor in (self.processor2, self.processor3, self.processor4):
            processor.stop()

        # grep exits non-zero when the marker is missing, which fails the test.
        success_check = "grep SUCCESS %s" % self.driver.STDOUT_FILE
        self.driver.node.account.ssh(success_check, allow_fail=False)
class StreamsSmokeTest(KafkaTest):
    """
    Simple test of Kafka Streams: processors are started and cleanly stopped
    while the smoke test driver is running.
    """

    def __init__(self, test_context):
        """
        Configure 1 ZooKeeper node, 2 brokers and the smoke test topics, then
        create the driver and four processor services.

        :param test_context: passed on to the parent class and to every service
        """
        topic_names = ('echo', 'data', 'min', 'max', 'sum',
                       'dif', 'cnt', 'avg', 'wcnt', 'tagg')

        # Every topic uses the same settings; each one gets its own dict.
        topics = {}
        for topic in topic_names:
            topics[topic] = {'partitions': 5, 'replication-factor': 1}

        super(StreamsSmokeTest, self).__init__(test_context, num_zk=1, num_brokers=2, topics=topics)

        kafka = self.kafka
        self.driver = StreamsSmokeTestDriverService(test_context, kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, kafka)
        self.processor2 = StreamsSmokeTestJobRunnerService(test_context, kafka)
        self.processor3 = StreamsSmokeTestJobRunnerService(test_context, kafka)
        self.processor4 = StreamsSmokeTestJobRunnerService(test_context, kafka)

    @cluster(num_nodes=8)
    def test_streams(self):
        """
        Start two smoke test clients, then add a third while stopping the first,
        then add a fourth, with 15 second pauses in between.
        Ensure that all results (stats on values computed by Kafka Streams) are correct.
        """
        self.driver.start()

        # Phase 1: two processors running.
        self.processor1.start()
        self.processor2.start()
        time.sleep(15)

        # Phase 2: a third joins and the first is stopped.
        self.processor3.start()
        self.processor1.stop()
        time.sleep(15)

        # Phase 3: a fourth joins.
        self.processor4.start()

        self.driver.wait()
        self.driver.stop()

        for processor in (self.processor2, self.processor3, self.processor4):
            processor.stop()

        # grep exits non-zero when the marker is missing, which fails the test.
        success_check = "grep SUCCESS %s" % self.driver.STDOUT_FILE
        self.driver.node.account.ssh(success_check, allow_fail=False)
class StreamsUpgradeTest(Test):
    """
    Tests rolling upgrades and downgrades of the Kafka Streams library, and
    rolling broker upgrades underneath a running Streams smoke test.
    """
    def __init__(self, test_context):
        """
        Record the replication settings and build the topic definitions.

        :param test_context: passed on to the parent class
        """
        super(StreamsUpgradeTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 1
        self.isr = 2

        # Every topic shares the same settings; each entry gets its own dicts.
        topic_names = ('echo', 'data', 'min', 'max', 'sum',
                       'dif', 'cnt', 'avg', 'wcnt', 'tagg')
        self.topics = {
            name: {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            }
            for name in topic_names
        }

    def perform_streams_upgrade(self, to_version):
        """
        Stop processor1, switch its node to to_version and start it again.

        :param to_version: version string the processor node is switched to
        """
        self.logger.info("First pass bounce - rolling streams upgrade")

        # get the node running the streams app
        node = self.processor1.node
        self.processor1.stop()

        # change its version. This will automatically make it pick up a different
        # JAR when it starts again
        node.version = KafkaVersion(to_version)
        self.processor1.start()

    def perform_broker_upgrade(self, to_version):
        """
        Stop each broker in turn, switch its node to to_version and restart it.

        :param to_version: version string every broker node is switched to
        """
        self.logger.info("First pass bounce - rolling broker upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = KafkaVersion(to_version)
            self.kafka.start_node(node)

    def _start_cluster_and_clients(self, from_version):
        """Start ZooKeeper, three brokers at from_version, the driver and processor1."""
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()

    def _stop_clients_and_verify(self):
        """Wait for the driver, stop both clients and grep their stdout for the success markers."""
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        # Both checks use the blocking ssh(). ssh_capture() returns a lazily
        # consumed output iterator and only evaluates the exit status once the
        # output has been read, so discarding its result would let a failed
        # grep go unnoticed.
        self.driver.node.account.ssh(
            "grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE,
            allow_fail=False)
        self.processor1.node.account.ssh(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
            allow_fail=False)

    @cluster(num_nodes=6)
    @parametrize(from_version=str(LATEST_0_10_1), to_version=str(DEV_BRANCH))
    @parametrize(from_version=str(LATEST_0_10_2), to_version=str(DEV_BRANCH))
    @parametrize(from_version=str(LATEST_0_10_1),
                 to_version=str(LATEST_0_11_0))
    @parametrize(from_version=str(LATEST_0_10_2),
                 to_version=str(LATEST_0_11_0))
    @parametrize(from_version=str(LATEST_0_11_0),
                 to_version=str(LATEST_0_10_2))
    @parametrize(from_version=str(DEV_BRANCH), to_version=str(LATEST_0_10_2))
    def test_upgrade_downgrade_streams(self, from_version, to_version):
        """
        Start a smoke test client, switch it to to_version via
        perform_streams_upgrade(), and ensure that all records are delivered.

        Note that, just like tests/core/upgrade_test.py, a prerequisite for this test to succeed
        is the inclusion of all parametrized versions of kafka in kafka/vagrant/base.sh
        (search for get_kafka()). For streams in particular, that means that someone has manually
        copied the kafka-stream-$version-test.jar to the right S3 bucket as shown in base.sh.

        :param from_version: version string the brokers are started with
        :param to_version: version string the processor node is switched to
        """
        # NOTE(review): only the brokers are pinned to from_version here; the
        # driver and processor nodes keep whatever version they default to
        # until perform_streams_upgrade() runs - confirm this is intended.
        self._start_cluster_and_clients(from_version)
        time.sleep(15)

        self.perform_streams_upgrade(to_version)

        time.sleep(15)
        self._stop_clients_and_verify()

    @cluster(num_nodes=6)
    @parametrize(from_version=str(LATEST_0_10_2), to_version=str(DEV_BRANCH))
    def test_upgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client, then perform a rolling upgrade of the brokers.

        :param from_version: version string the brokers are started with
        :param to_version: version string every broker node is switched to
        """
        self._start_cluster_and_clients(from_version)
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self._stop_clients_and_verify()
Example #14
0
class StreamsBrokerBounceTest(Test):
    """
    Simple test of Kafka Streams with brokers failing
    """

    def __init__(self, test_context):
        """
        Record the replication factor, the partition count and the topic table
        that the broker bounce tests hand to KafkaService.
        """
        super(StreamsBrokerBounceTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 3

        # All smoke test topics share one layout; each gets its own dict.
        smoke_test_topics = ('echo', 'data', 'min', 'max', 'sum', 'dif',
                             'cnt', 'avg', 'wcnt', 'tagg')
        self.topics = {
            name: {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {"min.insync.replicas": 2},
            }
            for name in smoke_test_topics
        }
        # The offsets topic is declared explicitly, with 50 partitions.
        self.topics['__consumer_offsets'] = {
            'partitions': 50,
            'replication-factor': self.replication,
            'configs': {"min.insync.replicas": 2},
        }

    def fail_broker_type(self, failure_mode, broker_type):
        """
        Pick a random topic and run the selected failure handler against it.

        :param failure_mode: key used to look up the handler in ``failures``
        :param broker_type: passed through unchanged to that handler
        """
        # dict.keys() is a view on Python 3 and cannot be indexed, so take a
        # list snapshot of the topic names before picking one at random.
        topic_names = list(self.topics)
        topic = topic_names[randint(0, len(topic_names) - 1)]
        failures[failure_mode](self, topic, broker_type)

    def fail_many_brokers(self, failure_mode, num_failures):
        """
        Signal the leading brokers in ``self.kafka.nodes``.

        "clean_shutdown" sends SIGTERM; every other failure mode sends SIGKILL.
        """
        sig = signal.SIGTERM if failure_mode == "clean_shutdown" else signal.SIGKILL

        # NOTE(review): the bound is num_failures - 1, so one broker fewer than
        # num_failures is signalled - confirm this is intended.
        for broker_index in range(num_failures - 1):
            signal_node(self, self.kafka.nodes[broker_index], sig)

    def confirm_topics_on_all_brokers(self, expected_topic_set):
        """
        Return True when, on every broker, the number of listed topics that
        belong to ``expected_topic_set`` equals the size of that set.
        """
        wanted = len(expected_topic_set)
        for broker in self.kafka.nodes:
            # list_topics() is consumed by iteration rather than compared
            # directly, because its values are produced lazily.
            found = sum(1 for name in self.kafka.list_topics(node=broker)
                        if name in expected_topic_set)
            if found != wanted:
                return False

        return True

        
    def setup_system(self, start_processor=True):
        """
        Start ZooKeeper and the brokers, wait (up to 60 seconds) until every
        broker lists all configured topics, then start the smoke test driver.

        :param start_processor: when true, the processor is started as well
        """
        # ZooKeeper first, then one broker per replica
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=self.replication,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        # Topic creation takes a moment; poll until all brokers report them
        expected_topics = set(self.topics.keys())
        wait_until(lambda: self.confirm_topics_on_all_brokers(expected_topics),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        # Test harness: the driver always runs, the processor only on request
        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()

        if start_processor:
            self.processor1.start()

    def collect_results(self, sleep_time_secs):
        """
        Stop the driver and the processor, then grep their stdout files for
        outcome markers.

        :param sleep_time_secs: 0 selects the SMOKE-TEST-CLIENT-EXCEPTION marker
                                for the processor, any other value selects
                                SMOKE-TEST-CLIENT-CLOSED
        :return: dict with the keys "Client closed", "Records Delivered" and
                 "Logic Success/Failure", each holding the last line its grep
                 printed
        """
        # End test
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        driver_account = self.driver.node.account

        # Success is declared if streams does not crash when sleep time > 0
        # It should give an exception when sleep time is 0 since we kill the brokers immediately
        # and the topic manager cannot create internal topics with the desired replication factor
        if sleep_time_secs == 0:
            client_grep = "grep SMOKE-TEST-CLIENT-EXCEPTION %s"
        else:
            client_grep = "grep SMOKE-TEST-CLIENT-CLOSED %s"

        # Currently it is hard to guarantee anything about Kafka since we don't have exactly once.
        # With exactly once in place, success will be defined as ALL-RECORDS-DELIVERED and SUCCESS
        checks = (
            ("Client closed", self.processor1.node.account,
             client_grep % self.processor1.STDOUT_FILE),
            ("Records Delivered", driver_account,
             "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED|PROCESSED-LESS-THAN-GENERATED' %s" % self.driver.STDOUT_FILE),
            ("Logic Success/Failure", driver_account,
             "grep -E 'SUCCESS|FAILURE' %s" % self.driver.STDOUT_FILE),
        )

        results = {}
        for label, account, command in checks:
            # each later line overwrites the earlier one, so the last line wins
            for line in account.ssh_capture(command, allow_fail=False):
                results[label] = line

        return results

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            broker_type=["leader", "controller"],
            sleep_time_secs=[120])
    def test_broker_type_bounce(self, failure_mode, broker_type, sleep_time_secs):
        """
        Start a smoke test client, then fail one particular broker and record
        whether records are still delivered.

        :return: the dict produced by ``collect_results``
        """
        self.setup_system()

        # Give the smoke test time to run before injecting the failure
        time.sleep(sleep_time_secs)

        self.fail_broker_type(failure_mode, broker_type)

        results = self.collect_results(sleep_time_secs)
        return results

    @ignore
    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown"],
            broker_type=["controller"],
            sleep_time_secs=[0])
    def test_broker_type_bounce_at_start(self, failure_mode, broker_type, sleep_time_secs):
        """
        Start the smoke test driver, fail one particular broker, and only then
        start the streams processor.
        Streams should throw an exception since it cannot create topics with
        the desired replication factor of 3.

        :return: the dict produced by ``collect_results``
        """
        # The processor is deliberately held back until after the failure
        self.setup_system(start_processor=False)

        time.sleep(sleep_time_secs)

        self.fail_broker_type(failure_mode, broker_type)

        self.processor1.start()

        results = self.collect_results(sleep_time_secs)
        return results

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            num_failures=[2])
    def test_many_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, let it run for two minutes, then fail
        several brokers and record whether records are still delivered.

        :return: the dict produced by ``collect_results``
        """
        run_time_secs = 120

        self.setup_system()

        # Give the smoke test time to run before injecting the failures
        time.sleep(run_time_secs)

        self.fail_many_brokers(failure_mode, num_failures)

        results = self.collect_results(run_time_secs)
        return results

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_bounce", "hard_bounce"],
            num_failures=[3])
    def test_all_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, let it run for two minutes, then fail
        brokers via ``fail_many_brokers`` and record whether records are still
        delivered.

        :return: the dict produced by ``collect_results``
        """
        run_time_secs = 120

        self.setup_system()

        # Give the smoke test time to run before injecting the failures
        time.sleep(run_time_secs)

        self.fail_many_brokers(failure_mode, num_failures)

        results = self.collect_results(run_time_secs)
        return results
    def __init__(self, test_context):
        """
        Declare the smoke test topics (5 partitions, replication factor 1),
        create the driver and three processors, and derive the downgrade
        version sequence from ``self.streams_upgrade_versions``.
        """
        topic_names = ('echo', 'data', 'min', 'max', 'sum', 'dif', 'cnt',
                       'avg', 'wcnt', 'tagg')
        super(StreamsMultipleRollingUpgradeTest, self).__init__(
            test_context,
            topics={
                name: {'partitions': 5, 'replication-factor': 1}
                for name in topic_names
            })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor_1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor_2 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor_3 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

        # already on trunk version at end of upgrades so get rid of it:
        # drop the last upgrade version and walk the rest in reverse order
        self.streams_downgrade_versions = list(
            reversed(self.streams_upgrade_versions[:-1]))

        self.processors = [self.processor_1, self.processor_2, self.processor_3]

        self.started = False
Example #16
0
class StreamsBrokerBounceTest(Test):
    """
    Simple test of Kafka Streams with brokers failing
    """
    def __init__(self, test_context):
        """
        Record the replication factor, the partition count and the topic table
        that the broker bounce tests hand to KafkaService.
        """
        super(StreamsBrokerBounceTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 3

        # All smoke test topics share one layout; each gets its own dict.
        smoke_test_topics = ('echo', 'data', 'min', 'max', 'sum', 'dif',
                             'cnt', 'avg', 'wcnt', 'tagg')
        self.topics = {
            name: {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {"min.insync.replicas": 2},
            }
            for name in smoke_test_topics
        }
        # The offsets topic is declared explicitly, with 50 partitions.
        self.topics['__consumer_offsets'] = {
            'partitions': 50,
            'replication-factor': self.replication,
            'configs': {"min.insync.replicas": 2},
        }

    def fail_broker_type(self, failure_mode, broker_type):
        """
        Pick a random topic and run the selected failure handler against it.

        :param failure_mode: key used to look up the handler in ``failures``
        :param broker_type: passed through unchanged to that handler
        """
        # dict.keys() is a view on Python 3 and cannot be indexed, so take a
        # list snapshot of the topic names before picking one at random.
        topic_names = list(self.topics)
        topic = topic_names[randint(0, len(topic_names) - 1)]
        failures[failure_mode](self, topic, broker_type)

    def fail_many_brokers(self, failure_mode, num_failures):
        """
        Signal the leading brokers in ``self.kafka.nodes``.

        "clean_shutdown" sends SIGTERM; every other failure mode sends SIGKILL.
        """
        sig = signal.SIGTERM if failure_mode == "clean_shutdown" else signal.SIGKILL

        # NOTE(review): the bound is num_failures - 1, so one broker fewer than
        # num_failures is signalled - confirm this is intended.
        for broker_index in range(num_failures - 1):
            signal_node(self, self.kafka.nodes[broker_index], sig)

    def confirm_topics_on_all_brokers(self, expected_topic_set):
        """
        Return True when, on every broker, the number of listed topics that
        belong to ``expected_topic_set`` equals the size of that set.
        """
        wanted = len(expected_topic_set)
        for broker in self.kafka.nodes:
            # list_topics() is consumed by iteration rather than compared
            # directly, because its values are produced lazily.
            found = sum(1 for name in self.kafka.list_topics(node=broker)
                        if name in expected_topic_set)
            if found != wanted:
                return False

        return True

    def setup_system(self, start_processor=True, num_threads=3):
        """
        Start ZooKeeper and the brokers, wait (up to 60 seconds) until every
        broker lists all configured topics, then start the smoke test driver.

        :param start_processor: when true, the processor is started as well
        :param num_threads: forwarded to StreamsSmokeTestJobRunnerService
        """
        # ZooKeeper first, then one broker per replica
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=self.replication,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        # Topic creation takes a moment; poll until all brokers report them
        expected_topics = set(self.topics.keys())
        wait_until(lambda: self.confirm_topics_on_all_brokers(expected_topics),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        # Test harness: the driver always runs, the processor only on request
        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka, "at_least_once", num_threads)

        self.driver.start()

        if start_processor:
            self.processor1.start()

    def collect_results(self, sleep_time_secs):
        """
        Stop the driver and the processor, then grep their stdout files for
        outcome markers.

        :param sleep_time_secs: 0 selects the SMOKE-TEST-CLIENT-EXCEPTION marker
                                for the processor, any other value selects
                                SMOKE-TEST-CLIENT-CLOSED
        :return: dict with the keys "Client closed", "Records Delivered" and
                 "Logic Success/Failure", each holding the last line its grep
                 printed
        """
        # End test
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        driver_account = self.driver.node.account

        # Success is declared if streams does not crash when sleep time > 0
        # It should give an exception when sleep time is 0 since we kill the brokers immediately
        # and the topic manager cannot create internal topics with the desired replication factor
        if sleep_time_secs == 0:
            client_grep = "grep SMOKE-TEST-CLIENT-EXCEPTION %s"
        else:
            client_grep = "grep SMOKE-TEST-CLIENT-CLOSED %s"

        # Currently it is hard to guarantee anything about Kafka since we don't have exactly once.
        # With exactly once in place, success will be defined as ALL-RECORDS-DELIVERED and SUCCESS
        checks = (
            ("Client closed", self.processor1.node.account,
             client_grep % self.processor1.STDOUT_FILE),
            ("Records Delivered", driver_account,
             "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED|PROCESSED-LESS-THAN-GENERATED' %s"
             % self.driver.STDOUT_FILE),
            ("Logic Success/Failure", driver_account,
             "grep -E 'SUCCESS|FAILURE' %s" % self.driver.STDOUT_FILE),
        )

        results = {}
        for label, account, command in checks:
            # each later line overwrites the earlier one, so the last line wins
            for line in account.ssh_capture(command, allow_fail=False):
                results[label] = line

        return results

    @cluster(num_nodes=7)
    @matrix(failure_mode=[
        "clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"
    ],
            broker_type=["leader", "controller"],
            num_threads=[1, 3],
            sleep_time_secs=[120])
    def test_broker_type_bounce(self, failure_mode, broker_type,
                                sleep_time_secs, num_threads):
        """
        Start a smoke test client, then fail one particular broker and record
        whether records are still delivered.
        A single-threaded streams client is also covered so that all partitions
        get reassigned in the next generation, which verifies that partition
        loss is triggered correctly.

        :return: the dict produced by ``collect_results``
        """
        self.setup_system(num_threads=num_threads)

        # Give the smoke test time to run before injecting the failure
        time.sleep(sleep_time_secs)

        self.fail_broker_type(failure_mode, broker_type)

        results = self.collect_results(sleep_time_secs)
        return results

    @ignore
    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown"],
            broker_type=["controller"],
            sleep_time_secs=[0])
    def test_broker_type_bounce_at_start(self, failure_mode, broker_type,
                                         sleep_time_secs):
        """
        Start the smoke test driver, fail one particular broker, and only then
        start the streams processor.
        Streams should throw an exception since it cannot create topics with
        the desired replication factor of 3.

        :return: the dict produced by ``collect_results``
        """
        # The processor is deliberately held back until after the failure
        self.setup_system(start_processor=False)

        time.sleep(sleep_time_secs)

        self.fail_broker_type(failure_mode, broker_type)

        self.processor1.start()

        results = self.collect_results(sleep_time_secs)
        return results

    @cluster(num_nodes=7)
    @matrix(failure_mode=[
        "clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"
    ],
            num_failures=[2])
    def test_many_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, let it run for two minutes, then fail
        several brokers and record whether records are still delivered.

        :return: the dict produced by ``collect_results``
        """
        run_time_secs = 120

        self.setup_system()

        # Give the smoke test time to run before injecting the failures
        time.sleep(run_time_secs)

        self.fail_many_brokers(failure_mode, num_failures)

        results = self.collect_results(run_time_secs)
        return results

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_bounce", "hard_bounce"], num_failures=[3])
    def test_all_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, let it run for two minutes, then fail
        brokers via ``fail_many_brokers`` and record whether records are still
        delivered.

        :return: the dict produced by ``collect_results``
        """
        run_time_secs = 120

        # Set min.insync.replicas to 1 because in the last stage of the test there is only one broker left.
        # Otherwise the last offset commit would never succeed; it would time out and could take longer than
        # the duration passed to the close method of the Kafka Streams client.
        offsets_topic = {
            'partitions': 50,
            'replication-factor': self.replication,
            'configs': {"min.insync.replicas": 1},
        }
        self.topics['__consumer_offsets'] = offsets_topic

        self.setup_system()

        # Give the smoke test time to run before injecting the failures
        time.sleep(run_time_secs)

        self.fail_many_brokers(failure_mode, num_failures)

        results = self.collect_results(run_time_secs)
        return results
Example #17
0
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client, then perform a rolling upgrade or downgrade
        of the brokers.

        :param from_version: Kafka version the three brokers are started with
        :param to_version: version handed to ``perform_broker_upgrade``

        Returns immediately when both versions are equal. Otherwise the test
        ends by grepping the driver's stdout for ALL-RECORDS-DELIVERED or
        PROCESSED-MORE-THAN-GENERATED and the processor's stdout for
        SMOKE-TEST-CLIENT-CLOSED; both greps run with ``allow_fail=False``.
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.partitions = 1
        self.isr = 2
        # All smoke test topics share one layout; each gets its own dict.
        topic_names = ('echo', 'data', 'min', 'max', 'sum', 'dif', 'cnt',
                       'avg', 'wcnt', 'tagg')
        self.topics = {
            name: {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            }
            for name in topic_names
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
        # let the client make some progress before the brokers are bounced
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh(
            "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED' %s"
            % self.driver.STDOUT_FILE,
            allow_fail=False)
        # Use the blocking ssh() here: ssh_capture() hands back an iterator and
        # only reports a non-zero exit status once that iterator is consumed,
        # so calling it without iterating can never fail the test.
        self.processor1.node.account.ssh(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
            allow_fail=False)
Example #18
0
    def test_streams(self, processing_guarantee, crash, metadata_quorum=quorum.zk):
        """
        Hand the smoke test workload across three processors that are started
        one after another.

        The first two processors are each stopped with ``stop_nodes(not crash)``
        once they have reported progress; the third runs until the driver
        finishes. The driver output must then contain SUCCESS, or, for a
        crashing at_least_once run, SUCCESS or PROCESSED-MORE-THAN-GENERATED.

        :param processing_guarantee: forwarded to each processor service
        :param crash: selects the stop mode and the accepted driver verdict
        :param metadata_quorum: not referenced in this body
        """
        first, second, third = [
            StreamsSmokeTestJobRunnerService(self.test_context, self.kafka, processing_guarantee)
            for _ in range(3)
        ]

        running_state = 'REBALANCING -> RUNNING'
        never_running = "Never saw 'REBALANCING -> RUNNING' message "
        progress = 'processed'
        no_progress = "Didn't see any processing messages "
        not_finished_check = "! grep 'Result Verification' %s"
        clean_shutdown = not crash

        with first.node.account.monitor_log(first.STDOUT_FILE) as first_monitor:
            first.start()
            first_monitor.wait_until(running_state,
                                     timeout_sec=60,
                                     err_msg=never_running + str(first.node.account))

            self.driver.start()

            first_monitor.wait_until(progress,
                                     timeout_sec=30,
                                     err_msg=no_progress + str(first.node.account))

            # make sure we're not already done processing (which would invalidate the test)
            self.driver.node.account.ssh(not_finished_check % self.driver.STDOUT_FILE, allow_fail=False)

            first.stop_nodes(clean_shutdown)

        with second.node.account.monitor_log(second.STDOUT_FILE) as second_monitor:
            second.start()
            second_monitor.wait_until(running_state,
                                      timeout_sec=120,
                                      err_msg=never_running + str(second.node.account))
            second_monitor.wait_until(progress,
                                      timeout_sec=30,
                                      err_msg=no_progress + str(second.node.account))

        # make sure we're not already done processing (which would invalidate the test)
        self.driver.node.account.ssh(not_finished_check % self.driver.STDOUT_FILE, allow_fail=False)

        second.stop_nodes(clean_shutdown)

        with third.node.account.monitor_log(third.STDOUT_FILE) as third_monitor:
            third.start()
            third_monitor.wait_until(running_state,
                                     timeout_sec=120,
                                     err_msg=never_running + str(third.node.account))
            # there should still be some data left for this processor to work on.
            third_monitor.wait_until(progress,
                                     timeout_sec=30,
                                     err_msg=no_progress + str(third.node.account))

        self.driver.wait()
        self.driver.stop()

        third.stop()

        # Pick the accepted verdict, then run a single grep for it
        if crash and processing_guarantee == 'at_least_once':
            verdict_grep = "grep -E 'SUCCESS|PROCESSED-MORE-THAN-GENERATED' %s"
        else:
            verdict_grep = "grep SUCCESS %s"
        self.driver.node.account.ssh(verdict_grep % self.driver.STDOUT_FILE, allow_fail=False)
Example #19
0
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client, then perform a rolling upgrade or downgrade
        of the brokers.

        :param from_version: Kafka version the brokers are started with
        :param to_version: version handed to ``perform_broker_upgrade``

        Returns immediately when both versions are equal. Otherwise the test
        waits for the processor to report progress before and after the
        rolling bounce, waits for the driver's delivery verdict, and finally
        greps the processor's stdout for SMOKE-TEST-CLIENT-CLOSED.
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.num_kafka_nodes = 3
        self.partitions = 1
        self.isr = 2
        # All smoke test topics share one layout; each gets its own dict.
        topic_names = ('echo', 'data', 'min', 'max', 'sum', 'dif', 'cnt',
                       'avg', 'wcnt', 'tagg')
        self.topics = {
            name: {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            }
            for name in topic_names
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=self.num_kafka_nodes,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        wait_until(lambda: self.confirm_topics_on_all_brokers(
            set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)

        processor = StreamsSmokeTestJobRunnerService(self.test_context,
                                                     self.kafka)

        with self.driver.node.account.monitor_log(
                self.driver.STDOUT_FILE) as driver_monitor:
            self.driver.start()

            # wait for the processor to report progress before the bounce
            with processor.node.account.monitor_log(
                    processor.STDOUT_FILE) as monitor:
                processor.start()
                monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(processor.node))

            connected_message = "Discovered group coordinator"
            with processor.node.account.monitor_log(
                    processor.LOG_FILE) as log_monitor:
                with processor.node.account.monitor_log(
                        processor.STDOUT_FILE) as stdout_monitor:
                    self.perform_broker_upgrade(to_version)

                    # after the bounce the client must find its coordinator
                    # again and then report further progress
                    log_monitor.wait_until(
                        connected_message,
                        timeout_sec=120,
                        err_msg=("Never saw output '%s' on " %
                                 connected_message) +
                        str(processor.node.account))

                    stdout_monitor.wait_until(
                        self.processed_msg,
                        timeout_sec=60,
                        err_msg="Never saw output '%s' on " %
                        self.processed_msg + str(processor.node.account))

            # SmokeTestDriver allows up to 6 minutes to consume all
            # records for the verification step so this timeout is set to
            # 6 minutes (360 seconds) for consuming of verification records
            # and a very conservative additional 2 minutes (120 seconds) to process
            # the records in the verification step
            driver_monitor.wait_until(
                # raw string: "\|" is not a valid escape in a plain literal
                r'ALL-RECORDS-DELIVERED\|PROCESSED-MORE-THAN-GENERATED',
                timeout_sec=480,
                err_msg="Never saw output '%s' on " %
                'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED' +
                str(self.driver.node.account))

        self.driver.stop()
        processor.stop()
        # Use the blocking ssh() here: ssh_capture() hands back an iterator and
        # only reports a non-zero exit status once that iterator is consumed,
        # so calling it without iterating can never fail the test.
        processor.node.account.ssh("grep SMOKE-TEST-CLIENT-CLOSED %s" %
                                   processor.STDOUT_FILE,
                                   allow_fail=False)
class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combinations).
    If the metadata was changed, the upgrade is more difficult.
    The metadata version was bumped in 0.10.1.0 and
    subsequently bumped in 2.0.0.
    """
    def __init__(self, test_context):
        """
        Initialise the test and declare the topics it works with.

        :param test_context: context object handed through to the base class.
        """
        super(StreamsUpgradeTest, self).__init__(test_context)
        # Both topics share the same five-partition layout.
        self.topics = {
            topic_name: {'partitions': 5}
            for topic_name in ('echo', 'data')
        }

    # Regular expression for the progress line a processor prints; it is handed
    # to the log monitors and to the `grep -E` run by verify_from_file().
    processed_msg = "processed [0-9]* records"
    # DEV_VERSION rendered as a string and cut at the first "-"; used by
    # get_version_string() to build the pattern for SNAPSHOT versions.
    base_version_number = str(DEV_VERSION).split("-")[0]

    def perform_broker_upgrade(self, to_version):
        """
        Roll every broker in ``self.kafka`` over to ``to_version``.

        Brokers are handled one at a time: the node is stopped, its ``version``
        attribute is replaced with ``KafkaVersion(to_version)`` and the node is
        started again before the next one is touched.

        :param to_version: version string passed to ``KafkaVersion``.
        """
        self.logger.info("First pass bounce - rolling broker upgrade")
        for broker in self.kafka.nodes:
            self.kafka.stop_node(broker)
            broker.version = KafkaVersion(to_version)
            self.kafka.start_node(broker)

    @cluster(num_nodes=6)
    @matrix(from_version=smoke_test_versions,
            to_version=dev_version,
            bounce_type=["full"])
    def test_app_upgrade(self, from_version, to_version, bounce_type):
        """
        Run three smoke-test Streams instances on <from_version> and move them to <to_version>.

        With bounce_type "rolling" the instances are bounced one after another
        in random order; with "full" all of them are stopped before being
        restarted on the new version. Any other bounce_type raises an
        Exception. Nothing is done when both versions are equal.
        """
        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # Every topic uses the same single-replica, five-partition layout.
        topic_names = ('echo', 'data', 'min', 'min-suppressed', 'min-raw',
                       'max', 'sum', 'sws-raw', 'sws-suppressed', 'dif',
                       'cnt', 'avg', 'wcnt', 'tagg')
        topic_specs = {
            topic_name: {
                'partitions': 5,
                'replication-factor': 1
            }
            for topic_name in topic_names
        }
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=topic_specs)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()

        def new_processor():
            return StreamsSmokeTestJobRunnerService(
                self.test_context,
                self.kafka,
                processing_guarantee="at_least_once",
                replication_factor=1)

        self.processor1 = new_processor()
        self.processor2 = new_processor()
        self.processor3 = new_processor()

        for fresh in (self.processor1, self.processor2, self.processor3):
            self.purge_state_dir(fresh)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        if bounce_type == "full":
            self.restart_all_nodes_with(to_version)
        elif bounce_type == "rolling":
            random.seed()
            # upgrade one-by-one via rolling bounce, in random order
            random.shuffle(self.processors)
            for counter, p in enumerate(self.processors, 1):
                p.CLEAN_NODE_ENABLED = False
                self.do_stop_start_bounce(p, None, to_version, counter)
        else:
            raise Exception("Unrecognized bounce_type: " + str(bounce_type))

        # shutdown
        self.driver.stop()

        # Ideally, we would actually verify the expected results.
        # See KAFKA-10202

        random.shuffle(self.processors)
        for p in self.processors:
            account = p.node.account
            with account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "SMOKE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'SMOKE-TEST-CLIENT-CLOSED' on " +
                    str(account))

    def start_all_nodes_with(self, version):
        """
        Start processor1..processor3 on ``version`` and wait until they are working.

        All three get their version set, then all three are started. After
        that each verification step is completed for every processor before the
        next step begins: version line in the log file, the
        SMOKE-TEST-CLIENT-STARTED marker on stdout, and finally the
        ``processed_msg`` pattern on stdout.
        """
        trio = (self.processor1, self.processor2, self.processor3)

        for proc in trio:
            self.set_version(proc, version)
        for proc in trio:
            proc.start()

        kafka_version_str = self.get_version_string(version)
        # (expected pattern, name of the processor attribute holding the file)
        checks = (
            (kafka_version_str, "LOG_FILE"),  # double-check the version
            ("SMOKE-TEST-CLIENT-STARTED", "STDOUT_FILE"),  # members joined
            (self.processed_msg, "STDOUT_FILE"),  # something was processed
        )
        for pattern, file_attr in checks:
            for proc in trio:
                self.wait_for_verification(proc, pattern,
                                           getattr(proc, file_attr))

    def restart_all_nodes_with(self, version):
        """
        Stop processor1..processor3 together and bring them back on ``version``.

        Each phase is carried out for all three processors before the next
        phase starts: stop, wait for SMOKE-TEST-CLIENT-CLOSED on stdout, roll
        the files with suffix ".1-1", set the version, start, and then the
        same three verification steps used after the initial start.
        """
        trio = (self.processor1, self.processor2, self.processor3)

        for proc in trio:
            proc.stop_node(proc.node)

        # make sure the members have stopped
        for proc in trio:
            self.wait_for_verification(proc, "SMOKE-TEST-CLIENT-CLOSED",
                                       proc.STDOUT_FILE)

        for proc in trio:
            self.roll_logs(proc, ".1-1")
        for proc in trio:
            self.set_version(proc, version)
        for proc in trio:
            proc.start_node(proc.node)

        kafka_version_str = self.get_version_string(version)
        # (expected pattern, name of the processor attribute holding the file)
        checks = (
            (kafka_version_str, "LOG_FILE"),  # double-check the version
            ("SMOKE-TEST-CLIENT-STARTED", "STDOUT_FILE"),  # members joined
            (self.processed_msg, "STDOUT_FILE"),  # something was processed
        )
        for pattern, file_attr in checks:
            for proc in trio:
                self.wait_for_verification(proc, pattern,
                                           getattr(proc, file_attr))

    def get_version_string(self, version):
        """
        Build the pattern used to find the "Kafka version" line for ``version``.

        Versions starting with "0", "1", "2.0" or "2.1" use the
        "Kafka version : <version>" form (space before the colon). Any other
        version containing "SNAPSHOT" yields a regular expression built from
        ``base_version_number``; everything else uses "Kafka version: <version>".
        """
        spaced_colon_prefixes = ("0", "1", "2.0", "2.1")
        if version.startswith(spaced_colon_prefixes):
            return "Kafka version : " + version
        if "SNAPSHOT" in version:
            return "Kafka version.*" + self.base_version_number + ".*SNAPSHOT"
        return "Kafka version: " + version

    def wait_for_verification(self, processor, message, file, num_lines=1):
        """
        Wait until ``message`` matches at least ``num_lines`` lines of ``file``.

        The match count comes from verify_from_file() and is polled through
        ``wait_until`` with a 60 second timeout.

        :param processor: service whose node holds the file.
        :param message: pattern to look for.
        :param file: path of the file on the processor's node.
        :param num_lines: minimum number of matching lines required.
        """
        def seen_often_enough():
            return self.verify_from_file(processor, message, file) >= num_lines

        failure_text = "Did expect to read '%s' from %s" % (
            message, processor.node.account)
        wait_until(seen_often_enough, timeout_sec=60, err_msg=failure_text)

    def verify_from_file(self, processor, message, file):
        """
        Count the lines of ``file`` on the processor's node that match ``message``.

        Runs ``grep -E '<message>' <file> | wc -l`` through the node account's
        ``ssh_output`` and converts the output to an integer.

        :param processor: service whose ``node.account`` executes the command.
        :param message: extended regular expression handed to ``grep -E``.
        :param file: path of the file to search.
        :return: the number of matching lines, or 0 (after logging a warning)
                 when the command output cannot be parsed as an integer.
        """
        command = "grep -E '%s' %s | wc -l" % (message, file)
        result = processor.node.account.ssh_output(command, allow_fail=False)
        try:
            return int(result)
        except ValueError:
            # Logger.warn is a deprecated alias of Logger.warning. Lazy %s
            # formatting also keeps this path from raising if the output is
            # not a plain str.
            self.logger.warning("Command failed with ValueError: %s", result)
            return 0

    def set_version(self, processor, version):
        """
        Set the version on ``processor``.

        When ``version`` equals ``str(DEV_VERSION)`` an empty string is passed
        instead, which selects TRUNK; any other value is passed through as is.
        """
        target = "" if version == str(DEV_VERSION) else version
        processor.set_version(target)

    def purge_state_dir(self, processor):
        """
        Remove the processor's ``PERSISTENT_ROOT`` directory on its node.

        Runs ``rm -rf`` over SSH with ``allow_fail=False``.
        """
        wipe_command = "rm -rf " + processor.PERSISTENT_ROOT
        processor.node.account.ssh(wipe_command, allow_fail=False)

    def do_stop_start_bounce(self, processor, upgrade_from, new_version,
                             counter):
        """
        Bounce a single processor onto ``new_version`` while watching the others.

        The processor is stopped, its files are rolled, its version and
        ``upgrade_from`` setting are updated and it is started again. After the
        stop and after the restart the other processors in ``self.processors``
        must keep printing ``processed_msg``; after the restart their stderr
        files are additionally searched for a subscription-decoding error.

        :param processor: the processor to bounce; expected to be one of
                          ``self.processors``.
        :param upgrade_from: value handed to ``processor.set_upgrade_from``;
                             ``None`` selects the ".1-" roll suffix, anything
                             else the ".0-" suffix.
        :param new_version: version the processor is restarted with.
        :param counter: appended to the roll suffix of the rolled files.
        :raises Exception: if one of the other processors logged
                           "unable to decode subscription data: version=2".
        """
        kafka_version_str = self.get_version_string(new_version)

        # Find the other processors: the first one that differs from
        # `processor`, and the last remaining one.
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        # Both "other" processors must exist here; `.node` on None would fail.
        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        # stop processor and wait for rebalance of others
        # (the monitors are opened before the stop so that only output
        # written after this point is considered -- presumably how
        # monitor_log behaves; confirm against ducktape)
        with first_other_node.account.monitor_log(
                first_other_processor.STDOUT_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(
                    second_other_processor.STDOUT_FILE
            ) as second_other_monitor:
                processor.stop_node(processor.node)
                first_other_monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(first_other_node.account))
                second_other_monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(second_other_node.account))
        # The stopped processor must have reported a clean close on stdout.
        node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" %
                                 processor.STDOUT_FILE,
                                 allow_fail=False)

        if upgrade_from is None:  # upgrade disabled -- second round of rolling bounces
            roll_counter = ".1-"  # second round of rolling bounces
        else:
            roll_counter = ".0-"  # first  round of rolling bounces

        # Move the old stdout/stderr/log/config files out of the way.
        self.roll_logs(processor, roll_counter + str(counter))

        self.set_version(processor, new_version)
        processor.set_upgrade_from(upgrade_from)

        # Command prefix; the stderr file to search is appended below.
        grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" "
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                with first_other_node.account.monitor_log(
                        first_other_processor.STDOUT_FILE
                ) as first_other_monitor:
                    with second_other_node.account.monitor_log(
                            second_other_processor.STDOUT_FILE
                    ) as second_other_monitor:
                        processor.start_node(processor.node)

                        # The restarted processor must log the expected version.
                        log_monitor.wait_until(
                            kafka_version_str,
                            timeout_sec=60,
                            err_msg="Could not detect Kafka Streams version " +
                            new_version + " on " + str(node.account))
                        first_other_monitor.wait_until(
                            self.processed_msg,
                            timeout_sec=60,
                            err_msg="Never saw output '%s' on " %
                            self.processed_msg + str(first_other_node.account))
                        # allow_fail=True: finding no match is the expected
                        # outcome here; any captured line means the error
                        # was logged.
                        found = list(
                            first_other_node.account.ssh_capture(
                                grep_metadata_error +
                                first_other_processor.STDERR_FILE,
                                allow_fail=True))
                        if len(found) > 0:
                            raise Exception(
                                "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                            )

                        second_other_monitor.wait_until(
                            self.processed_msg,
                            timeout_sec=60,
                            err_msg="Never saw output '%s' on " %
                            self.processed_msg +
                            str(second_other_node.account))
                        # Same error check for the second other processor.
                        found = list(
                            second_other_node.account.ssh_capture(
                                grep_metadata_error +
                                second_other_processor.STDERR_FILE,
                                allow_fail=True))
                        if len(found) > 0:
                            raise Exception(
                                "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                            )

                        # Finally the restarted processor itself must be
                        # processing records again.
                        monitor.wait_until(
                            self.processed_msg,
                            timeout_sec=60,
                            err_msg="Never saw output '%s' on " %
                            self.processed_msg + str(node.account))

    def roll_logs(self, processor, roll_suffix):
        """
        Rename the processor's output files by appending ``roll_suffix``.

        The stdout, stderr, log and config files are moved, in that order,
        with ``mv`` over SSH (``allow_fail=False``).
        """
        account = processor.node.account
        rolled_files = (processor.STDOUT_FILE, processor.STDERR_FILE,
                        processor.LOG_FILE, processor.CONFIG_FILE)
        for path in rolled_files:
            account.ssh("mv " + path + " " + path + roll_suffix,
                        allow_fail=False)
class StreamsBrokerBounceTest(Test):
    """
    Simple test of Kafka Streams with brokers failing
    """
    def __init__(self, test_context):
        """
        Initialise the test and describe the topics to create.

        Every topic is replicated ``self.replication`` times with
        ``min.insync.replicas`` set to 2. The application topics have
        ``self.partitions`` partitions; ``__consumer_offsets`` has 50.

        :param test_context: context object handed through to the base class.
        """
        super(StreamsBrokerBounceTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 3

        def topic_spec(partition_count):
            return {
                'partitions': partition_count,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            }

        application_topics = ('echo', 'data', 'min', 'max', 'sum', 'dif',
                              'cnt', 'avg', 'wcnt', 'tagg')
        self.topics = {
            topic_name: topic_spec(self.partitions)
            for topic_name in application_topics
        }
        self.topics['__consumer_offsets'] = topic_spec(50)

    def fail_broker_type(self, failure_mode, broker_type):
        """
        Apply ``failure_mode`` to a broker chosen via a random topic.

        A topic name is drawn at random from ``self.topics`` and handed,
        together with ``broker_type``, to the handler registered under
        ``failure_mode`` in the module-level ``failures`` mapping.

        :param failure_mode: key into ``failures``.
        :param broker_type: passed through to the failure handler.
        :raises KeyError: if ``failure_mode`` is not a key of ``failures``.
        """
        # Pick a random topic and bounce its leader.
        # dict.keys() is a non-subscriptable view on Python 3, so materialise
        # the names into a list before indexing.
        topic_names = list(self.topics.keys())
        topic = topic_names[randint(0, len(topic_names) - 1)]
        failures[failure_mode](self, topic, broker_type)

    def fail_many_brokers(self, failure_mode, num_failures):
        """
        Signal the first ``num_failures - 1`` brokers of ``self.kafka.nodes``.

        "clean_shutdown" sends SIGTERM; every other ``failure_mode`` sends
        SIGKILL.
        """
        if failure_mode == "clean_shutdown":
            kill_signal = signal.SIGTERM
        else:
            kill_signal = signal.SIGKILL

        # NOTE(review): the range ends at num_failures - 1, so one broker fewer
        # than num_failures is signalled -- confirm that this is intended.
        for broker_index in range(num_failures - 1):
            signal_node(self, self.kafka.nodes[broker_index], kill_signal)

    def setup_system(self, start_processor=True):
        """
        Start ZooKeeper, Kafka and the smoke-test harness.

        One ZooKeeper node and ``self.replication`` brokers are started with
        the topics from ``self.topics``. The driver is always started; the
        processor is only started when ``start_processor`` is true.
        """
        context = self.test_context

        # Cluster
        self.zk = ZookeeperService(context, num_nodes=1)
        self.zk.start()
        self.kafka = KafkaService(context,
                                  num_nodes=self.replication,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        # Test harness
        self.driver = StreamsSmokeTestDriverService(context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(context, self.kafka)
        self.driver.start()
        if start_processor:
            self.processor1.start()

    def collect_results(self, sleep_time_secs):
        """
        Finish the run and gather the marker lines from the output files.

        Waits for and stops the driver, stops the processor, then greps the
        processor's and the driver's stdout files.

        :param sleep_time_secs: when 0 the processor output is searched for
            SMOKE-TEST-CLIENT-EXCEPTION, otherwise for SMOKE-TEST-CLIENT-CLOSED.
        :return: dict with the last matching line per check under the keys
            "Client closed", "Records Delivered" and "Logic Success/Failure";
            a key is absent when its grep produced no lines.
        """
        # End test
        self.driver.wait()
        self.driver.stop()
        self.processor1.stop()

        summary = {}
        driver_account = self.driver.node.account

        def keep_last_match(key, account, grep_command):
            for matched_line in account.ssh_capture(grep_command,
                                                    allow_fail=False):
                summary[key] = matched_line

        # Success is declared if streams does not crash when sleep time > 0.
        # With a sleep time of 0 an exception is expected instead: the brokers
        # are killed immediately and the topic manager cannot create internal
        # topics with the desired replication factor.
        if sleep_time_secs == 0:
            client_marker = "SMOKE-TEST-CLIENT-EXCEPTION"
        else:
            client_marker = "SMOKE-TEST-CLIENT-CLOSED"
        keep_last_match(
            "Client closed", self.processor1.node.account,
            "grep %s %s" % (client_marker, self.processor1.STDOUT_FILE))

        # Currently it is hard to guarantee anything about Kafka since we don't
        # have exactly once. With exactly once in place, success will be
        # defined as ALL-RECORDS-DELIVERED and SUCCESS.
        keep_last_match(
            "Records Delivered", driver_account,
            "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED|PROCESSED-LESS-THAN-GENERATED' %s"
            % self.driver.STDOUT_FILE)
        keep_last_match(
            "Logic Success/Failure", driver_account,
            "grep -E 'SUCCESS|FAILURE' %s" % self.driver.STDOUT_FILE)

        return summary

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown",
                          "clean_bounce", "hard_bounce"],
            broker_type=["leader", "controller"],
            sleep_time_secs=[120])
    def test_broker_type_bounce(self, failure_mode, broker_type,
                                sleep_time_secs):
        """
        Run the smoke test client, fail one particular broker and report the outcome.

        The system runs for ``sleep_time_secs`` before the broker selected by
        ``broker_type`` is failed using ``failure_mode``. Returns the data
        gathered by collect_results().
        """
        self.setup_system()

        # Let the application make some progress before injecting the failure
        time.sleep(sleep_time_secs)
        self.fail_broker_type(failure_mode, broker_type)

        results = self.collect_results(sleep_time_secs)
        return results

    @ignore
    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown"],
            broker_type=["controller"],
            sleep_time_secs=[0])
    def test_broker_type_bounce_at_start(self, failure_mode, broker_type,
                                         sleep_time_secs):
        """
        Fail one particular broker before the Streams processor starts.

        Streams should throw an exception since it cannot create topics with
        the desired replication factor of 3. Returns the data gathered by
        collect_results().
        """
        # Bring up everything except the processor
        self.setup_system(start_processor=False)

        time.sleep(sleep_time_secs)
        self.fail_broker_type(failure_mode, broker_type)

        # Only now start the processor, after the broker has been failed
        self.processor1.start()

        results = self.collect_results(sleep_time_secs)
        return results

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown",
                          "clean_bounce", "hard_bounce"],
            num_failures=[2])
    def test_many_brokers_bounce(self, failure_mode, num_failures):
        """
        Run the smoke test client, fail several brokers and report the outcome.

        The system runs for 120 seconds before fail_many_brokers() is called.
        Returns the data gathered by collect_results().
        """
        run_time_secs = 120

        self.setup_system()

        # Let the application make some progress before injecting the failures
        time.sleep(run_time_secs)
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(run_time_secs)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_bounce", "hard_bounce"], num_failures=[3])
    def test_all_brokers_bounce(self, failure_mode, num_failures):
        """
        Run the smoke test client, fail brokers with ``num_failures`` set to 3 and report the outcome.

        The system runs for 120 seconds before fail_many_brokers() is called.
        Returns the data gathered by collect_results().
        """
        run_time_secs = 120

        self.setup_system()

        # Let the application make some progress before injecting the failures
        time.sleep(run_time_secs)
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(run_time_secs)
    def test_app_upgrade(self, from_version, to_version, bounce_type):
        """
        Run three smoke-test Streams instances on <from_version> and move them to <to_version>.

        With bounce_type "rolling" the instances are bounced one after another
        in random order; with "full" all of them are stopped before being
        restarted on the new version. Any other bounce_type raises an
        Exception. Nothing is done when both versions are equal.
        """
        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # Every topic uses the same single-replica, five-partition layout.
        topic_names = ('echo', 'data', 'min', 'min-suppressed', 'min-raw',
                       'max', 'sum', 'sws-raw', 'sws-suppressed', 'dif',
                       'cnt', 'avg', 'wcnt', 'tagg')
        topic_specs = {
            topic_name: {
                'partitions': 5,
                'replication-factor': 1
            }
            for topic_name in topic_names
        }
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=topic_specs)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()

        def new_processor():
            return StreamsSmokeTestJobRunnerService(
                self.test_context,
                self.kafka,
                processing_guarantee="at_least_once",
                replication_factor=1)

        self.processor1 = new_processor()
        self.processor2 = new_processor()
        self.processor3 = new_processor()

        for fresh in (self.processor1, self.processor2, self.processor3):
            self.purge_state_dir(fresh)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        if bounce_type == "full":
            self.restart_all_nodes_with(to_version)
        elif bounce_type == "rolling":
            random.seed()
            # upgrade one-by-one via rolling bounce, in random order
            random.shuffle(self.processors)
            for counter, p in enumerate(self.processors, 1):
                p.CLEAN_NODE_ENABLED = False
                self.do_stop_start_bounce(p, None, to_version, counter)
        else:
            raise Exception("Unrecognized bounce_type: " + str(bounce_type))

        # shutdown
        self.driver.stop()

        # Ideally, we would actually verify the expected results.
        # See KAFKA-10202

        random.shuffle(self.processors)
        for p in self.processors:
            account = p.node.account
            with account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "SMOKE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'SMOKE-TEST-CLIENT-CLOSED' on " +
                    str(account))