Example #1
    def setup_system(self, start_processor=True):
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=self.replication,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        wait_until(lambda: self.confirm_topics_on_all_brokers(
            set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        # Start test harness
        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()

        if start_processor:
            self.processor1.start()
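
A note on Example 1: the wait_until lambda calls self.confirm_topics_on_all_brokers, which is not shown in this excerpt. A minimal sketch of such a helper, assuming KafkaService exposes a list_topics(node=...) query (present in recent kafkatest versions; otherwise shell out to kafka-topics.sh via node.account.ssh_capture):

    def confirm_topics_on_all_brokers(self, expected_topic_set):
        # Sketch (helper assumed, not shown in this excerpt): report True only
        # once every broker lists every expected topic.
        for node in self.kafka.nodes:
            topics = set(self.kafka.list_topics(node=node))
            if not expected_topic_set.issubset(topics):
                return False
        return True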
Example #2
    def test_version_probing_upgrade(self):
        """
        Starts 3 KafkaStreams instances and upgrades them one-by-one to the "future version".
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with("")  # run with TRUNK

        self.processors = [self.processor1, self.processor2, self.processor3]
        self.old_processors = [
            self.processor1, self.processor2, self.processor3
        ]
        self.upgraded_processors = []

        counter = 1
        current_generation = 3

        random.seed()
        random.shuffle(self.processors)

        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            current_generation = self.do_rolling_bounce(
                p, counter, current_generation)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                    + str(node.account))
        self.driver.stop()
Example #3
    def test_upgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client, then perform rolling upgrades on the brokers.
        """
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zk, version=KafkaVersion(from_version), topics=self.topics)
        self.kafka.start()
        
        # allow some time for topics to be created
        time.sleep(10)
        
        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False)
        self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False)
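
Example 3 (like Examples 7 and 21-23) waits for topic creation with a fixed time.sleep(10), whereas Example 1 polls with ducktape's wait_until. A sketch of the polling variant, assuming the confirm_topics_on_all_brokers helper sketched under Example 1:

        # Requires: from ducktape.utils.util import wait_until (as in Example 1)
        wait_until(lambda: self.confirm_topics_on_all_brokers(set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds")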
Example #4
    def test_metadata_upgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with version <from_version> and upgrades them one-by-one to <to_version>.
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # first rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_stop_start_bounce(p, from_version[:-2], to_version,
                                      counter)
            counter = counter + 1

        # second rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            self.do_stop_start_bounce(p, None, to_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                    + str(node.account))
Example #5
    def __init__(self, test_context):
        super(StreamsSmokeTest, self).__init__(test_context,
                                               num_zk=1,
                                               num_brokers=3,
                                               topics={
                                                   'echo': {
                                                       'partitions': 5,
                                                       'replication-factor': 1
                                                   },
                                                   'data': {
                                                       'partitions': 5,
                                                       'replication-factor': 1
                                                   },
                                                   'min': {
                                                       'partitions': 5,
                                                       'replication-factor': 1
                                                   },
                                                   'max': {
                                                       'partitions': 5,
                                                       'replication-factor': 1
                                                   },
                                                   'sum': {
                                                       'partitions': 5,
                                                       'replication-factor': 1
                                                   },
                                                   'dif': {
                                                       'partitions': 5,
                                                       'replication-factor': 1
                                                   },
                                                   'cnt': {
                                                       'partitions': 5,
                                                       'replication-factor': 1
                                                   },
                                                   'avg': {
                                                       'partitions': 5,
                                                       'replication-factor': 1
                                                   },
                                                   'wcnt': {
                                                       'partitions': 5,
                                                       'replication-factor': 1
                                                   },
                                                   'tagg': {
                                                       'partitions': 5,
                                                       'replication-factor': 1
                                                   }
                                               })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            test_context, self.kafka)
        self.processor2 = StreamsSmokeTestJobRunnerService(
            test_context, self.kafka)
        self.processor3 = StreamsSmokeTestJobRunnerService(
            test_context, self.kafka)
        self.processor4 = StreamsSmokeTestJobRunnerService(
            test_context, self.kafka)
Example #6
    def __init__(self, test_context):
        super(StreamsUpgradeTest, self).__init__(test_context, num_zk=1, num_brokers=1, topics={
            'echo' : { 'partitions': 5 },
            'data' : { 'partitions': 5 }
        })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(test_context, self.kafka)
Example #7
    def test_upgrade_downgrade_streams(self, from_version, to_version):
        """
        Start a smoke test client, then perform a rolling upgrade (or downgrade) of the Streams library.
        Ensure that all records are delivered.

        Note that, just like tests/core/upgrade_test.py, a prerequisite for this test to succeed
        is the inclusion of all parametrized versions of Kafka in kafka/vagrant/base.sh
        (search for get_kafka()). For Streams in particular, that means that someone has manually
        copied the kafka-stream-$version-test.jar into the right S3 bucket as shown in base.sh.
        """
        if from_version != to_version:
            # Setup phase
            self.zk = ZookeeperService(self.test_context, num_nodes=1)
            self.zk.start()

            # number of nodes needs to be >= 3 for the smoke test
            self.kafka = KafkaService(self.test_context,
                                      num_nodes=3,
                                      zk=self.zk,
                                      version=KafkaVersion(from_version),
                                      topics=self.topics)
            self.kafka.start()

            # allow some time for topics to be created
            time.sleep(10)

            self.driver = StreamsSmokeTestDriverService(
                self.test_context, self.kafka)
            self.driver.node.version = KafkaVersion(from_version)
            self.driver.start()

            self.processor1 = StreamsSmokeTestJobRunnerService(
                self.test_context, self.kafka)
            self.processor1.node.version = KafkaVersion(from_version)
            self.processor1.start()

            time.sleep(15)

            self.perform_streams_upgrade(to_version)

            time.sleep(15)
            self.driver.wait()
            self.driver.stop()

            self.processor1.stop()

            self.driver.node.account.ssh("grep ALL-RECORDS-DELIVERED %s" %
                                         self.driver.STDOUT_FILE,
                                         allow_fail=False)
            self.processor1.node.account.ssh_capture(
                "grep SMOKE-TEST-CLIENT-CLOSED %s" %
                self.processor1.STDOUT_FILE,
                allow_fail=False)
Example #8
    def setup_system(self):
         # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()
        
        self.kafka = KafkaService(self.test_context, num_nodes=self.replication,
                                  zk=self.zk, topics=self.topics)
        self.kafka.start()
        # Start test harness
        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
Example #9
    def __init__(self, test_context):
        super(StreamsMultipleRollingUpgradeTest, self).__init__(test_context,
                                                                topics={
                                                                    'echo': {'partitions': 5, 'replication-factor': 1},
                                                                    'data': {'partitions': 5, 'replication-factor': 1},
                                                                    'min': {'partitions': 5, 'replication-factor': 1},
                                                                    'max': {'partitions': 5, 'replication-factor': 1},
                                                                    'sum': {'partitions': 5, 'replication-factor': 1},
                                                                    'dif': {'partitions': 5, 'replication-factor': 1},
                                                                    'cnt': {'partitions': 5, 'replication-factor': 1},
                                                                    'avg': {'partitions': 5, 'replication-factor': 1},
                                                                    'wcnt': {'partitions': 5, 'replication-factor': 1},
                                                                    'tagg': {'partitions': 5, 'replication-factor': 1}
                                                                })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor_1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor_2 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor_3 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

        # the upgrade pass already ends on the trunk version, so drop it from the downgrade list
        self.streams_downgrade_versions = self.streams_upgrade_versions[:-1]
        self.streams_downgrade_versions.reverse()

        self.processors = [self.processor_1, self.processor_2, self.processor_3]

        self.started = False
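
The constructor above reads self.streams_upgrade_versions, which this excerpt never defines. A plausible definition is sketched below; the exact version constants are an assumption drawn from kafkatest.version, not from this excerpt:

from kafkatest.version import (DEV_VERSION, LATEST_0_10_1, LATEST_0_10_2,
                               LATEST_0_11_0)

# Oldest-to-newest upgrade path ending on trunk; the constructor above drops
# the final (trunk) entry and reverses the rest for the downgrade pass.
streams_upgrade_versions = [str(LATEST_0_10_1), str(LATEST_0_10_2),
                            str(LATEST_0_11_0), str(DEV_VERSION)]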
Example #10
    def test_version_probing_upgrade(self):
        """
        Starts 3 KafkaStreams instances and upgrades them one-by-one to the "future version".
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with("") # run with TRUNK

        self.processors = [self.processor1, self.processor2, self.processor3]
        self.old_processors = [self.processor1, self.processor2, self.processor3]
        self.upgraded_processors = []
        for p in self.processors:
            self.leader_counter[p] = 2

        self.update_leader()
        for p in self.processors:
            self.leader_counter[p] = 0
        self.leader_counter[self.leader] = 3

        counter = 1
        current_generation = 3

        random.seed()
        random.shuffle(self.processors)

        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            current_generation = self.do_rolling_bounce(p, counter, current_generation)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account))

        self.driver.stop()
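
Example 10 additionally tracks the group leader via self.leader_counter, self.leader, and update_leader(), none of which appear in the excerpt. A hedged reconstruction: the leader is found by counting "Finished assignment for group" lines in each processor's log; the marker string, retry count, and sleep interval are assumptions:

    def update_leader(self):
        # Sketch (uses the time module already imported by these tests): the
        # group leader logs one extra "Finished assignment for group" line per
        # rebalance it coordinates, so the processor whose line count just
        # advanced past its recorded counter is the leader.
        self.leader = None
        retries = 10
        while retries > 0:
            for p in self.processors:
                found = list(p.node.account.ssh_capture(
                    "grep \"Finished assignment for group\" %s" % p.LOG_FILE,
                    allow_fail=True))
                if len(found) == self.leader_counter[p] + 1:
                    if self.leader is not None:
                        raise Exception("Could not uniquely identify leader")
                    self.leader = p
                    self.leader_counter[p] += 1
            if self.leader is None:
                retries -= 1
                time.sleep(5)  # assignment may not be logged yet; retry
            else:
                return
        raise Exception("Could not identify leader")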
Example #11
    def __init__(self, test_context):
        super(StreamsSmokeTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics={
            'echo' : { 'partitions': 5, 'replication-factor': 1 },
            'data' : { 'partitions': 5, 'replication-factor': 1 },
            'min' : { 'partitions': 5, 'replication-factor': 1 },
            'min-suppressed' : { 'partitions': 5, 'replication-factor': 1 },
            'min-raw' : { 'partitions': 5, 'replication-factor': 1 },
            'max' : { 'partitions': 5, 'replication-factor': 1 },
            'sum' : { 'partitions': 5, 'replication-factor': 1 },
            'sws-raw' : { 'partitions': 5, 'replication-factor': 1 },
            'sws-suppressed' : { 'partitions': 5, 'replication-factor': 1 },
            'dif' : { 'partitions': 5, 'replication-factor': 1 },
            'cnt' : { 'partitions': 5, 'replication-factor': 1 },
            'avg' : { 'partitions': 5, 'replication-factor': 1 },
            'wcnt' : { 'partitions': 5, 'replication-factor': 1 },
            'tagg' : { 'partitions': 5, 'replication-factor': 1 }
        })

        self.test_context = test_context
        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
Example #12
    def test_metadata_upgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with version <from_version> and upgrades them one-by-one to <to_version>.
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # first rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_rolling_bounce(p, from_version[:-2], to_version, counter)
            counter = counter + 1

        # second rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            self.do_rolling_bounce(p, None, to_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account))

        self.driver.stop()
Example #13
    def setup_system(self, start_processor=True):
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=self.replication, zk=self.zk, topics=self.topics)
        self.kafka.start()
        # Start test harness
        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()

        if start_processor:
            self.processor1.start()
Example #14
    def __init__(self, test_context):
        super(StreamsBounceTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics={
            'echo' : { 'partitions': 5, 'replication-factor': 2 },
            'data' : { 'partitions': 5, 'replication-factor': 2 },
            'min' : { 'partitions': 5, 'replication-factor': 2 },
            'max' : { 'partitions': 5, 'replication-factor': 2 },
            'sum' : { 'partitions': 5, 'replication-factor': 2 },
            'dif' : { 'partitions': 5, 'replication-factor': 2 },
            'cnt' : { 'partitions': 5, 'replication-factor': 2 },
            'avg' : { 'partitions': 5, 'replication-factor': 2 },
            'wcnt' : { 'partitions': 5, 'replication-factor': 2 },
            'tagg' : { 'partitions': 5, 'replication-factor': 2 }
        })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
Example #15
class StreamsSmokeTest(KafkaTest):
    """
    Simple test of Kafka Streams.
    """

    def __init__(self, test_context):
        super(StreamsSmokeTest, self).__init__(test_context, num_zk=1, num_brokers=2, topics={
            'echo' : { 'partitions': 5, 'replication-factor': 1 },
            'data' : { 'partitions': 5, 'replication-factor': 1 },
            'min' : { 'partitions': 5, 'replication-factor': 1 },
            'max' : { 'partitions': 5, 'replication-factor': 1 },
            'sum' : { 'partitions': 5, 'replication-factor': 1 },
            'dif' : { 'partitions': 5, 'replication-factor': 1 },
            'cnt' : { 'partitions': 5, 'replication-factor': 1 },
            'avg' : { 'partitions': 5, 'replication-factor': 1 },
            'wcnt' : { 'partitions': 5, 'replication-factor': 1 },
            'tagg' : { 'partitions': 5, 'replication-factor': 1 }
        })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor2 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor3 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor4 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

    @cluster(num_nodes=8)
    def test_streams(self):
        """
        Start a few smoke test clients, then repeatedly start a new one and cleanly stop a running one a few times.
        Ensure that all results (stats on values computed by Kafka Streams) are correct.
        """

        self.driver.start()

        self.processor1.start()
        self.processor2.start()

        time.sleep(15)

        self.processor3.start()
        self.processor1.stop()

        time.sleep(15)

        self.processor4.start()

        self.driver.wait()
        self.driver.stop()

        self.processor2.stop()
        self.processor3.stop()
        self.processor4.stop()

        node = self.driver.node
        node.account.ssh("grep SUCCESS %s" % self.driver.STDOUT_FILE, allow_fail=False)
Example #16
class StreamsSmokeTest(KafkaTest):
    """
    Simple test of Kafka Streams.
    """

    def __init__(self, test_context):
        super(StreamsSmokeTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics={
            'echo' : { 'partitions': 5, 'replication-factor': 1 },
            'data' : { 'partitions': 5, 'replication-factor': 1 },
            'min' : { 'partitions': 5, 'replication-factor': 1 },
            'max' : { 'partitions': 5, 'replication-factor': 1 },
            'sum' : { 'partitions': 5, 'replication-factor': 1 },
            'dif' : { 'partitions': 5, 'replication-factor': 1 },
            'cnt' : { 'partitions': 5, 'replication-factor': 1 },
            'avg' : { 'partitions': 5, 'replication-factor': 1 },
            'wcnt' : { 'partitions': 5, 'replication-factor': 1 },
            'tagg' : { 'partitions': 5, 'replication-factor': 1 }
        })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor2 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor3 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor4 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

    @cluster(num_nodes=9)
    def test_streams(self):
        """
        Start a few smoke test clients, then repeatedly start a new one and cleanly stop a running one a few times.
        Ensure that all results (stats on values computed by Kafka Streams) are correct.
        """

        self.driver.start()

        self.processor1.start()
        self.processor2.start()

        time.sleep(15)

        self.processor3.start()
        self.processor1.stop()

        time.sleep(15)

        self.processor4.start()

        self.driver.wait()
        self.driver.stop()

        self.processor2.stop()
        self.processor3.stop()
        self.processor4.stop()

        node = self.driver.node
        node.account.ssh("grep SUCCESS %s" % self.driver.STDOUT_FILE, allow_fail=False)
Example #17
class StreamsBounceTest(KafkaTest):
    """
    Simple test of Kafka Streams.
    """

    def __init__(self, test_context):
        super(StreamsBounceTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics={
            'echo' : { 'partitions': 5, 'replication-factor': 2 },
            'data' : { 'partitions': 5, 'replication-factor': 2 },
            'min' : { 'partitions': 5, 'replication-factor': 2 },
            'max' : { 'partitions': 5, 'replication-factor': 2 },
            'sum' : { 'partitions': 5, 'replication-factor': 2 },
            'dif' : { 'partitions': 5, 'replication-factor': 2 },
            'cnt' : { 'partitions': 5, 'replication-factor': 2 },
            'avg' : { 'partitions': 5, 'replication-factor': 2 },
            'wcnt' : { 'partitions': 5, 'replication-factor': 2 },
            'tagg' : { 'partitions': 5, 'replication-factor': 2 }
        })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

    @cluster(num_nodes=6)
    def test_bounce(self):
        """
        Start a smoke test client, then abort (kill -9) and restart it a few times.
        Ensure that all records are delivered.
        """

        self.driver.start()

        self.processor1.start()

        time.sleep(15)

        self.processor1.abortThenRestart()

        time.sleep(15)

        # enable this after we add change log partition replicas
        # self.kafka.signal_leader("data")
        # time.sleep(15)

        self.processor1.abortThenRestart()

        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False)
Example #18
class StreamsBounceTest(KafkaTest):
    """
    Simple test of Kafka Streams.
    """

    def __init__(self, test_context):
        super(StreamsBounceTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics={
            'echo' : { 'partitions': 5, 'replication-factor': 2 },
            'data' : { 'partitions': 5, 'replication-factor': 2 },
            'min' : { 'partitions': 5, 'replication-factor': 2 },
            'max' : { 'partitions': 5, 'replication-factor': 2 },
            'sum' : { 'partitions': 5, 'replication-factor': 2 },
            'dif' : { 'partitions': 5, 'replication-factor': 2 },
            'cnt' : { 'partitions': 5, 'replication-factor': 2 },
            'avg' : { 'partitions': 5, 'replication-factor': 2 },
            'wcnt' : { 'partitions': 5, 'replication-factor': 2 },
            'tagg' : { 'partitions': 5, 'replication-factor': 2 }
        })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

    @cluster(num_nodes=6)
    def test_bounce(self):
        """
        Start a smoke test client, then abort (kill -9) and restart it a few times.
        Ensure that all records are delivered.
        """

        self.driver.start()

        self.processor1.start()

        time.sleep(15)

        self.processor1.abortThenRestart()

        time.sleep(15)

        # enable this after we add change log partition replicas
        # self.kafka.signal_leader("data")
        # time.sleep(15)

        self.processor1.abortThenRestart()

        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False)
Example #19
    def __init__(self, test_context):
        super(StreamsSmokeTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics={
            'echo' : { 'partitions': 5, 'replication-factor': 1 },
            'data' : { 'partitions': 5, 'replication-factor': 1 },
            'min' : { 'partitions': 5, 'replication-factor': 1 },
            'min-suppressed' : { 'partitions': 5, 'replication-factor': 1 },
            'min-raw' : { 'partitions': 5, 'replication-factor': 1 },
            'max' : { 'partitions': 5, 'replication-factor': 1 },
            'sum' : { 'partitions': 5, 'replication-factor': 1 },
            'sws-raw' : { 'partitions': 5, 'replication-factor': 1 },
            'sws-suppressed' : { 'partitions': 5, 'replication-factor': 1 },
            'dif' : { 'partitions': 5, 'replication-factor': 1 },
            'cnt' : { 'partitions': 5, 'replication-factor': 1 },
            'avg' : { 'partitions': 5, 'replication-factor': 1 },
            'wcnt' : { 'partitions': 5, 'replication-factor': 1 },
            'tagg' : { 'partitions': 5, 'replication-factor': 1 }
        })

        self.test_context = test_context
        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
Example #20
    def setup_system(self, start_processor=True):
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=self.replication, zk=self.zk, topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        wait_until(lambda: self.confirm_topics_on_all_brokers(set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        # Start test harness
        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()

        if start_processor:
            self.processor1.start()
Example #21
class StreamsUpgradeTest(Test):
    """
    Tests rolling upgrades and downgrades of the Kafka Streams library.
    """
    def __init__(self, test_context):
        super(StreamsUpgradeTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 1
        self.isr = 2
        self.topics = {
            'echo': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'data': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'min': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'max': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'sum': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'dif': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'cnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'avg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'wcnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'tagg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            }
        }

    def perform_streams_upgrade(self, to_version):
        self.logger.info("First pass bounce - rolling streams upgrade")

        # get the node running the streams app
        node = self.processor1.node
        self.processor1.stop()

        # change its version. This will automatically make it pick up a different
        # JAR when it starts again
        node.version = KafkaVersion(to_version)
        self.processor1.start()

    def perform_broker_upgrade(self, to_version):
        self.logger.info("First pass bounce - rolling broker upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = KafkaVersion(to_version)
            self.kafka.start_node(node)

    @cluster(num_nodes=6)
    @parametrize(from_version=str(LATEST_0_10_1), to_version=str(DEV_BRANCH))
    @parametrize(from_version=str(LATEST_0_10_2), to_version=str(DEV_BRANCH))
    @parametrize(from_version=str(LATEST_0_10_1),
                 to_version=str(LATEST_0_11_0))
    @parametrize(from_version=str(LATEST_0_10_2),
                 to_version=str(LATEST_0_11_0))
    @parametrize(from_version=str(LATEST_0_11_0),
                 to_version=str(LATEST_0_10_2))
    @parametrize(from_version=str(DEV_BRANCH), to_version=str(LATEST_0_10_2))
    def test_upgrade_downgrade_streams(self, from_version, to_version):
        """
        Start a smoke test client, then perform a rolling upgrade (or downgrade) of the Streams library.
        Ensure that all records are delivered.

        Note that, just like tests/core/upgrade_test.py, a prerequisite for this test to succeed
        is the inclusion of all parametrized versions of Kafka in kafka/vagrant/base.sh
        (search for get_kafka()). For Streams in particular, that means that someone has manually
        copied the kafka-stream-$version-test.jar into the right S3 bucket as shown in base.sh.
        """
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_streams_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" %
                         self.driver.STDOUT_FILE,
                         allow_fail=False)
        self.processor1.node.account.ssh_capture(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
            allow_fail=False)

    @cluster(num_nodes=6)
    @parametrize(from_version=str(LATEST_0_10_2), to_version=str(DEV_BRANCH))
    def test_upgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client, then perform rolling upgrades on the brokers.
        """
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" %
                         self.driver.STDOUT_FILE,
                         allow_fail=False)
        self.processor1.node.account.ssh_capture(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
            allow_fail=False)
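
For reference, the services, decorators, and version constants used throughout these examples resolve to imports along the following lines in the ducktape/kafkatest tree (module paths to the best of my knowledge; verify against your checkout):

import random
import time

from ducktape.mark import ignore, matrix, parametrize
from ducktape.mark.resource import cluster
from ducktape.tests.test import Test
from ducktape.utils.util import wait_until

from kafkatest.services.kafka import KafkaService
from kafkatest.services.streams import (StreamsSmokeTestDriverService,
                                        StreamsSmokeTestJobRunnerService,
                                        StreamsUpgradeTestJobRunnerService)
from kafkatest.services.zookeeper import ZookeeperService
from kafkatest.tests.kafka_test import KafkaTest
from kafkatest.version import (DEV_BRANCH, DEV_VERSION, KafkaVersion,
                               LATEST_0_10_1, LATEST_0_10_2, LATEST_0_11_0)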
Example #22
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client, then perform rolling upgrades on the brokers.
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.partitions = 1
        self.isr = 2
        self.topics = {
            'echo' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": self.isr}},
            'data' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": self.isr} },
            'min' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'max' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'sum' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'dif' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'cnt' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'avg' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'wcnt' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": self.isr} },
            'tagg' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": self.isr} }
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zk, version=KafkaVersion(from_version), topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)
        
        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False)
        self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False)
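
The ten per-topic settings above (and in Examples 21 and 23) are identical apart from the name, so a dict comprehension expresses the same configuration more compactly (behavior-equivalent sketch):

        topic_names = ['echo', 'data', 'min', 'max', 'sum',
                       'dif', 'cnt', 'avg', 'wcnt', 'tagg']
        self.topics = {
            name: {'partitions': self.partitions,
                   'replication-factor': self.replication,
                   'configs': {"min.insync.replicas": self.isr}}
            for name in topic_names
        }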
Example #23
class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combinations).
    If the metadata format was changed, the upgrade is more difficult.
    The metadata version was bumped in 0.10.1.0.
    """

    def __init__(self, test_context):
        super(StreamsUpgradeTest, self).__init__(test_context)
        self.topics = {
            'echo' : { 'partitions': 5 },
            'data' : { 'partitions': 5 },
        }
        self.leader = None

    def perform_broker_upgrade(self, to_version):
        self.logger.info("First pass bounce - rolling broker upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = KafkaVersion(to_version)
            self.kafka.start_node(node)

    @ignore
    @cluster(num_nodes=6)
    @matrix(from_version=broker_upgrade_versions, to_version=broker_upgrade_versions)
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client, then perform rolling upgrades on the brokers.
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.partitions = 1
        self.isr = 2
        self.topics = {
            'echo' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": self.isr}},
            'data' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": self.isr} },
            'min' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'max' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'sum' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'dif' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'cnt' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'avg' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'wcnt' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": self.isr} },
            'tagg' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": self.isr} }
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zk, version=KafkaVersion(from_version), topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)
        
        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False)
        self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False)

    @ignore
    @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions)
    def test_simple_upgrade_downgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with <from_version> and upgrades them one-by-one to <to_version>.
        """

        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # upgrade one-by-one via rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_rolling_bounce(p, None, to_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account))

        self.driver.stop()

    #@matrix(from_version=metadata_1_versions, to_version=backward_compatible_metadata_2_versions)
    @ignore
    @matrix(from_version=metadata_1_versions, to_version=metadata_3_versions)
    @matrix(from_version=metadata_2_versions, to_version=metadata_3_versions)
    def test_metadata_upgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with version <from_version> and upgrades them one-by-one to <to_version>.
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # first rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_rolling_bounce(p, from_version[:-2], to_version, counter)
            counter = counter + 1

        # second rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            self.do_rolling_bounce(p, None, to_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account))

        self.driver.stop()

    def start_all_nodes_with(self, version):
        # start first with <version>
        self.prepare_for(self.processor1, version)
        node1 = self.processor1.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor:
            with node1.account.monitor_log(self.processor1.LOG_FILE) as log_monitor:
                self.processor1.start()
                log_monitor.wait_until("Kafka version : " + version,
                                       timeout_sec=60,
                                       err_msg="Could not detect Kafka Streams version " + version + " " + str(node1.account))
                monitor.wait_until("processed 100 records from topic",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account))

        # start second with <version>
        self.prepare_for(self.processor2, version)
        node2 = self.processor2.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor:
                with node2.account.monitor_log(self.processor2.LOG_FILE) as log_monitor:
                    self.processor2.start()
                    log_monitor.wait_until("Kafka version : " + version,
                                           timeout_sec=60,
                                           err_msg="Could not detect Kafka Streams version " + version + " " + str(node2.account))
                    first_monitor.wait_until("processed 100 records from topic",
                                             timeout_sec=60,
                                             err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account))
                    second_monitor.wait_until("processed 100 records from topic",
                                              timeout_sec=60,
                                              err_msg="Never saw output 'processed 100 records from topic' on" + str(node2.account))

        # start third with <version>
        self.prepare_for(self.processor3, version)
        node3 = self.processor3.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor:
                with node3.account.monitor_log(self.processor3.STDOUT_FILE) as third_monitor:
                    with node3.account.monitor_log(self.processor3.LOG_FILE) as log_monitor:
                        self.processor3.start()
                        log_monitor.wait_until("Kafka version : " + version,
                                               timeout_sec=60,
                                               err_msg="Could not detect Kafka Streams version " + version + " " + str(node3.account))
                        first_monitor.wait_until("processed 100 records from topic",
                                                 timeout_sec=60,
                                                 err_msg="Never saw output 'processed 100 records from topic' on " + str(node1.account))
                        second_monitor.wait_until("processed 100 records from topic",
                                                  timeout_sec=60,
                                                  err_msg="Never saw output 'processed 100 records from topic' on " + str(node2.account))
                        third_monitor.wait_until("processed 100 records from topic",
                                                 timeout_sec=60,
                                                 err_msg="Never saw output 'processed 100 records from topic' on " + str(node3.account))

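Since start_all_nodes_with nests one with-block per monitored log, it deepens with every processor added. A flattened sketch using the standard library's contextlib.ExitStack; the helper name start_and_await and its shape are my own, not part of kafkatest:

from contextlib import ExitStack

def start_and_await(processor, peers, version):
    # Start `processor`, then wait until it and every peer report progress,
    # entering all log monitors through a single ExitStack instead of nested
    # with-blocks (sketch only).
    with ExitStack() as stack:
        log_monitor = stack.enter_context(
            processor.node.account.monitor_log(processor.LOG_FILE))
        stdout_monitors = [
            (p, stack.enter_context(p.node.account.monitor_log(p.STDOUT_FILE)))
            for p in peers + [processor]]
        processor.start()
        log_monitor.wait_until(
            "Kafka version : " + version, timeout_sec=60,
            err_msg="Could not detect Kafka Streams version " + version)
        for p, monitor in stdout_monitors:
            monitor.wait_until(
                "processed 100 records from topic", timeout_sec=60,
                err_msg="Never saw output 'processed 100 records from topic' on "
                        + str(p.node.account))
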
    @staticmethod
    def prepare_for(processor, version):
        processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False)
        if version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(version)

    def do_rolling_bounce(self, processor, upgrade_from, new_version, counter):
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        # stop processor and wait for rebalance of others
        with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor:
                processor.stop()
                first_other_monitor.wait_until("processed 100 records from topic",
                                               timeout_sec=60,
                                               err_msg="Never saw output 'processed 100 records from topic' on " + str(first_other_node.account))
                second_other_monitor.wait_until("processed 100 records from topic",
                                                timeout_sec=60,
                                                err_msg="Never saw output 'processed 100 records from topic' on " + str(second_other_node.account))
        node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False)

        if upgrade_from is None:  # upgrade disabled -- second round of rolling bounces
            roll_counter = ".1-"  # second round of rolling bounces
        else:
            roll_counter = ".0-"  # first round of rolling bounces

        node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + roll_counter + str(counter), allow_fail=False)
        node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + roll_counter + str(counter), allow_fail=False)
        node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + roll_counter + str(counter), allow_fail=False)

        if new_version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(new_version)
        processor.set_upgrade_from(upgrade_from)

        grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" "
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor:
                    with second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor:
                        processor.start()

                        log_monitor.wait_until("Kafka version : " + new_version,
                                               timeout_sec=60,
                                               err_msg="Could not detect Kafka Streams version " + new_version + " " + str(node.account))
                        first_other_monitor.wait_until("processed 100 records from topic",
                                                       timeout_sec=60,
                                                       err_msg="Never saw output 'processed 100 records from topic' on " + str(first_other_node.account))
                        found = list(first_other_node.account.ssh_capture(grep_metadata_error + first_other_processor.STDERR_FILE, allow_fail=True))
                        if len(found) > 0:
                            raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'")

                        second_other_monitor.wait_until("processed 100 records from topic",
                                                        timeout_sec=60,
                                                        err_msg="Never saw output 'processed 100 records from topic' on " + str(second_other_node.account))
                        found = list(second_other_node.account.ssh_capture(grep_metadata_error + second_other_processor.STDERR_FILE, allow_fail=True))
                        if len(found) > 0:
                            raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'")

                        monitor.wait_until("processed 100 records from topic",
                                           timeout_sec=60,
                                           err_msg="Never saw output 'processed 100 records from topic' on " + str(node.account))
Example No. 24
class StreamsBrokerBounceTest(Test):
    """
    Simple test of Kafka Streams with brokers failing
    """

    def __init__(self, test_context):
        super(StreamsBrokerBounceTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 3
        self.topics = {
            'echo' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": 2}},
            'data' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": 2} },
            'min' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": 2} },
            'max' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": 2} },
            'sum' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": 2} },
            'dif' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": 2} },
            'cnt' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": 2} },
            'avg' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": 2} },
            'wcnt' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": 2} },
            'tagg' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": 2} },
            '__consumer_offsets' : { 'partitions': 50, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": 2} }
        }

    def fail_broker_type(self, failure_mode, broker_type):
        # Pick a random topic and bounce its leader
        topic_index = randint(0, len(self.topics.keys()) - 1)
        topic = list(self.topics.keys())[topic_index]
        failures[failure_mode](self, topic, broker_type)

    def fail_many_brokers(self, failure_mode, num_failures):
        if (failure_mode == "clean_shutdown"):
            sig = signal.SIGTERM
        else:
            sig = signal.SIGKILL
            
        for num in range(0, num_failures - 1):
            signal_node(self, self.kafka.nodes[num], sig)

    def confirm_topics_on_all_brokers(self, expected_topic_set):
        for node in self.kafka.nodes:
            match_count = 0
            # kafka.list_topics() returns a Python generator, so values are
            # fetched lazily; we can't compare the result directly and must
            # iterate over what's returned
            topic_list_generator = self.kafka.list_topics(node=node)
            for topic in topic_list_generator:
                if topic in expected_topic_set:
                    match_count += 1

            if len(expected_topic_set) != match_count:
                return False

        return True

    def setup_system(self, start_processor=True):
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=self.replication, zk=self.zk, topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        wait_until(lambda: self.confirm_topics_on_all_brokers(set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        # Start test harness
        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()

        if (start_processor):
            self.processor1.start()

    def collect_results(self, sleep_time_secs):
        data = {}
        # End test
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        
        # Success is declared if streams does not crash when sleep time > 0
        # It should give an exception when sleep time is 0 since we kill the brokers immediately
        # and the topic manager cannot create internal topics with the desired replication factor
        if (sleep_time_secs == 0):
            output_streams = self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-EXCEPTION %s" % self.processor1.STDOUT_FILE, allow_fail=False)
        else:
            output_streams = self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False)
            
        for line in output_streams:
            data["Client closed"] = line

        # Currently it is hard to guarantee anything about Kafka since we don't have exactly once.
        # With exactly once in place, success will be defined as ALL-RECORDS-DELIVERED and SUCCESS
        output = node.account.ssh_capture("grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED|PROCESSED-LESS-THAN-GENERATED' %s" % self.driver.STDOUT_FILE, allow_fail=False)
        for line in output:
            data["Records Delivered"] = line
        output = node.account.ssh_capture("grep -E 'SUCCESS|FAILURE' %s" % self.driver.STDOUT_FILE, allow_fail=False)
        for line in output:
            data["Logic Success/Failure"] = line
            
        
        return data

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            broker_type=["leader", "controller"],
            sleep_time_secs=[120])
    def test_broker_type_bounce(self, failure_mode, broker_type, sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker and ensure data is still received
        Record if records are delivered. 
        """
        self.setup_system() 

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        return self.collect_results(sleep_time_secs)

    @ignore
    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown"],
            broker_type=["controller"],
            sleep_time_secs=[0])
    def test_broker_type_bounce_at_start(self, failure_mode, broker_type, sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker immediately before streams starts.
        Streams should throw an exception since it cannot create topics with the desired
        replication factor of 3
        """
        self.setup_system(start_processor=False)

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        self.processor1.start()

        return self.collect_results(sleep_time_secs)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            num_failures=[2])
    def test_many_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received
        Record if records are delivered
        """
        self.setup_system() 

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_bounce", "hard_bounce"],
            num_failures=[3])
    def test_all_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received
        Record if records are delivered
        """
        self.setup_system() 

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)
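
A note on collect_results() above: ducktape's ssh_capture() returns a generator over the remote command's output lines, so results arrive lazily and must be iterated rather than indexed. A minimal sketch of that access pattern (node, driver, and data are placeholders):

# sketch only -- allow_fail=True keeps a non-matching grep (exit code 1) from raising
output = node.account.ssh_capture("grep -E 'SUCCESS|FAILURE' %s" % driver.STDOUT_FILE,
                                  allow_fail=True)
for line in output:  # lines are fetched lazily from the remote command
    data["Logic Success/Failure"] = line
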
Example No. 25
class StreamsSmokeTest(KafkaTest):
    """
    Simple test of Kafka Streams.
    """

    def __init__(self, test_context):
        super(StreamsSmokeTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics={
            'echo' : { 'partitions': 5, 'replication-factor': 1 },
            'data' : { 'partitions': 5, 'replication-factor': 1 },
            'min' : { 'partitions': 5, 'replication-factor': 1 },
            'min-suppressed' : { 'partitions': 5, 'replication-factor': 1 },
            'min-raw' : { 'partitions': 5, 'replication-factor': 1 },
            'max' : { 'partitions': 5, 'replication-factor': 1 },
            'sum' : { 'partitions': 5, 'replication-factor': 1 },
            'sws-raw' : { 'partitions': 5, 'replication-factor': 1 },
            'sws-suppressed' : { 'partitions': 5, 'replication-factor': 1 },
            'dif' : { 'partitions': 5, 'replication-factor': 1 },
            'cnt' : { 'partitions': 5, 'replication-factor': 1 },
            'avg' : { 'partitions': 5, 'replication-factor': 1 },
            'wcnt' : { 'partitions': 5, 'replication-factor': 1 },
            'tagg' : { 'partitions': 5, 'replication-factor': 1 }
        })

        self.test_context = test_context
        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)

    @cluster(num_nodes=8)
    @matrix(eos=[True, False], crash=[True, False])
    def test_streams(self, eos, crash):
        if eos:
            processor1 = StreamsSmokeTestEOSJobRunnerService(self.test_context, self.kafka)
            processor2 = StreamsSmokeTestEOSJobRunnerService(self.test_context, self.kafka)
            processor3 = StreamsSmokeTestEOSJobRunnerService(self.test_context, self.kafka)
        else:
            processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)
            processor2 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)
            processor3 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        with processor1.node.account.monitor_log(processor1.STDOUT_FILE) as monitor1:
            processor1.start()
            monitor1.wait_until('REBALANCING -> RUNNING',
                                timeout_sec=60,
                                err_msg="Never saw 'REBALANCING -> RUNNING' message " + str(processor1.node.account)
                                )

            self.driver.start()

            monitor1.wait_until('processed',
                                timeout_sec=30,
                                err_msg="Didn't see any processing messages " + str(processor1.node.account)
                                )

            # make sure we're not already done processing (which would invalidate the test)
            self.driver.node.account.ssh("! grep 'Result Verification' %s" % self.driver.STDOUT_FILE, allow_fail=False)

            processor1.stop_nodes(not crash)

        with processor2.node.account.monitor_log(processor2.STDOUT_FILE) as monitor2:
            processor2.start()
            monitor2.wait_until('REBALANCING -> RUNNING',
                                timeout_sec=120,
                                err_msg="Never saw 'REBALANCING -> RUNNING' message " + str(processor2.node.account)
                                )
            monitor2.wait_until('processed',
                                timeout_sec=30,
                                err_msg="Didn't see any processing messages " + str(processor2.node.account)
                                )

        # make sure we're not already done processing (which would invalidate the test)
        self.driver.node.account.ssh("! grep 'Result Verification' %s" % self.driver.STDOUT_FILE, allow_fail=False)

        processor2.stop_nodes(not crash)

        with processor3.node.account.monitor_log(processor3.STDOUT_FILE) as monitor3:
            processor3.start()
            monitor3.wait_until('REBALANCING -> RUNNING',
                                timeout_sec=120,
                                err_msg="Never saw 'REBALANCING -> RUNNING' message " + str(processor3.node.account)
                                )
            # there should still be some data left for this processor to work on.
            monitor3.wait_until('processed',
                                timeout_sec=30,
                                err_msg="Didn't see any processing messages " + str(processor3.node.account)
                                )

        self.driver.wait()
        self.driver.stop()

        processor3.stop()

        if crash and not eos:
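            # under at-least-once (eos=False), a crash can cause re-delivery and
            # duplicate processing, so PROCESSED-MORE-THAN-GENERATED is acceptable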
            self.driver.node.account.ssh("grep -E 'SUCCESS|PROCESSED-MORE-THAN-GENERATED' %s" % self.driver.STDOUT_FILE, allow_fail=False)
        else:
            self.driver.node.account.ssh("grep SUCCESS %s" % self.driver.STDOUT_FILE, allow_fail=False)
Example No. 26
class StreamsBrokerBounceTest(Test):
    """
    Simple test of Kafka Streams with brokers failing
    """
    def __init__(self, test_context):
        super(StreamsBrokerBounceTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 3
        self.topics = {
            'echo': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'data': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'min': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'max': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'sum': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'dif': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'cnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'avg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'wcnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'tagg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            '__consumer_offsets': {
                'partitions': 50,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            }
        }

    def fail_broker_type(self, failure_mode, broker_type):
        # Pick a random topic and bounce its leader
        topic_index = randint(0, len(self.topics.keys()) - 1)
        topic = list(self.topics.keys())[topic_index]
        failures[failure_mode](self, topic, broker_type)

    def fail_many_brokers(self, failure_mode, num_failures):
        if (failure_mode == "clean_shutdown"):
            sig = signal.SIGTERM
        else:
            sig = signal.SIGKILL

        for num in range(0, num_failures - 1):
            signal_node(self, self.kafka.nodes[num], sig)

    def confirm_topics_on_all_brokers(self, expected_topic_set):
        for node in self.kafka.nodes:
            match_count = 0
            # kafka.list_topics() returns a Python generator, so values are
            # fetched lazily; we can't compare the result directly and must
            # iterate over what's returned
            topic_list_generator = self.kafka.list_topics(node=node)
            for topic in topic_list_generator:
                if topic in expected_topic_set:
                    match_count += 1

            if len(expected_topic_set) != match_count:
                return False

        return True

    def setup_system(self, start_processor=True, num_threads=3):
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=self.replication,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        wait_until(lambda: self.confirm_topics_on_all_brokers(
            set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        # Start test harness
        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka, "at_least_once", num_threads)

        self.driver.start()

        if (start_processor):
            self.processor1.start()

    def collect_results(self, sleep_time_secs):
        data = {}
        # End test
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node

        # Success is declared if streams does not crash when sleep time > 0
        # It should give an exception when sleep time is 0 since we kill the brokers immediately
        # and the topic manager cannot create internal topics with the desired replication factor
        if (sleep_time_secs == 0):
            output_streams = self.processor1.node.account.ssh_capture(
                "grep SMOKE-TEST-CLIENT-EXCEPTION %s" %
                self.processor1.STDOUT_FILE,
                allow_fail=False)
        else:
            output_streams = self.processor1.node.account.ssh_capture(
                "grep SMOKE-TEST-CLIENT-CLOSED %s" %
                self.processor1.STDOUT_FILE,
                allow_fail=False)

        for line in output_streams:
            data["Client closed"] = line

        # Currently it is hard to guarantee anything about Kafka since we don't have exactly once.
        # With exactly once in place, success will be defined as ALL-RECORDS-DELIVERED and SUCCESS
        output = node.account.ssh_capture(
            "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED|PROCESSED-LESS-THAN-GENERATED' %s"
            % self.driver.STDOUT_FILE,
            allow_fail=False)
        for line in output:
            data["Records Delivered"] = line
        output = node.account.ssh_capture("grep -E 'SUCCESS|FAILURE' %s" %
                                          self.driver.STDOUT_FILE,
                                          allow_fail=False)
        for line in output:
            data["Logic Success/Failure"] = line

        return data

    @cluster(num_nodes=7)
    @matrix(failure_mode=[
        "clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"
    ],
            broker_type=["leader", "controller"],
            num_threads=[1, 3],
            sleep_time_secs=[120])
    def test_broker_type_bounce(self, failure_mode, broker_type,
                                sleep_time_secs, num_threads):
        """
        Start a smoke test client, then kill one particular broker and ensure data is still received
        Record if records are delivered.
        We also add a single-threaded streams client to make sure all partitions get reassigned in
        the next generation, verifying that the partition-lost event is triggered correctly.
        """
        self.setup_system(num_threads=num_threads)

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        return self.collect_results(sleep_time_secs)

    @ignore
    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown"],
            broker_type=["controller"],
            sleep_time_secs=[0])
    def test_broker_type_bounce_at_start(self, failure_mode, broker_type,
                                         sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker immediately before streams starts.
        Streams should throw an exception since it cannot create topics with the desired
        replication factor of 3
        """
        self.setup_system(start_processor=False)

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        self.processor1.start()

        return self.collect_results(sleep_time_secs)

    @cluster(num_nodes=7)
    @matrix(failure_mode=[
        "clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"
    ],
            num_failures=[2])
    def test_many_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received
        Record if records are delivered
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_bounce", "hard_bounce"], num_failures=[3])
    def test_all_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received
        Record if records are delivered
        """

        # Set min.insync.replicas to 1 because in the last stage of the test there is only one broker left.
        # Otherwise the last offset commit will never succeed, timing out and potentially taking longer
        # than the duration passed to the close method of the Kafka Streams client.
        self.topics['__consumer_offsets'] = {
            'partitions': 50,
            'replication-factor': self.replication,
            'configs': {
                "min.insync.replicas": 1
            }
        }

        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)
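
A note on fail_broker_type() above: the call failures[failure_mode](self, topic, broker_type) goes through a module-level dispatch table that maps each failure-mode string from the test matrix to a function. A hedged sketch of that shape (the broker_node helper and the function bodies are illustrative, not the module's actual implementations):

import signal

def broker_node(test, topic, broker_type):
    # illustrative: resolve the broker of interest to a concrete cluster node
    return test.kafka.leader(topic) if broker_type == "leader" else test.kafka.controller()

def clean_shutdown(test, topic, broker_type):
    # stop the selected broker gracefully
    test.kafka.signal_node(broker_node(test, topic, broker_type), signal.SIGTERM)

def hard_shutdown(test, topic, broker_type):
    # kill the selected broker without cleanup
    test.kafka.signal_node(broker_node(test, topic, broker_type), signal.SIGKILL)

failures = {
    "clean_shutdown": clean_shutdown,
    "hard_shutdown": hard_shutdown,
}
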
Example No. 27
class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combinations).
    If the metadata format was changed, the upgrade is more difficult.
    The metadata version was bumped in 0.10.1.0 and
    subsequently bumped again in 2.0.0.
    """
    def __init__(self, test_context):
        super(StreamsUpgradeTest, self).__init__(test_context)
        self.topics = {
            'echo': {
                'partitions': 5
            },
            'data': {
                'partitions': 5
            },
        }
        self.leader = None
        self.leader_counter = {}

    processed_msg = "processed [0-9]* records"
    base_version_number = str(DEV_VERSION).split("-")[0]

    def perform_broker_upgrade(self, to_version):
        self.logger.info("First pass bounce - rolling broker upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = KafkaVersion(to_version)
            self.kafka.start_node(node)

    @ignore
    @cluster(num_nodes=6)
    @matrix(from_version=broker_upgrade_versions,
            to_version=broker_upgrade_versions)
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client then perform rolling upgrades on the broker.
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.num_kafka_nodes = 3
        self.partitions = 1
        self.isr = 2
        self.topics = {
            'echo': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'data': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'min': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'max': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'sum': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'dif': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'cnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'avg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'wcnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'tagg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            }
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=self.num_kafka_nodes,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        wait_until(lambda: self.confirm_topics_on_all_brokers(
            set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)

        processor = StreamsSmokeTestJobRunnerService(self.test_context,
                                                     self.kafka)

        with self.driver.node.account.monitor_log(
                self.driver.STDOUT_FILE) as driver_monitor:
            self.driver.start()

            with processor.node.account.monitor_log(
                    processor.STDOUT_FILE) as monitor:
                processor.start()
                monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(processor.node))

            connected_message = "Discovered group coordinator"
            with processor.node.account.monitor_log(
                    processor.LOG_FILE) as log_monitor:
                with processor.node.account.monitor_log(
                        processor.STDOUT_FILE) as stdout_monitor:
                    self.perform_broker_upgrade(to_version)

                    log_monitor.wait_until(
                        connected_message,
                        timeout_sec=120,
                        err_msg=("Never saw output '%s' on " %
                                 connected_message) +
                        str(processor.node.account))

                    stdout_monitor.wait_until(
                        self.processed_msg,
                        timeout_sec=60,
                        err_msg="Never saw output '%s' on" % self.processed_msg
                        + str(processor.node.account))

            # SmokeTestDriver allows up to 6 minutes to consume all
            # records for the verification step so this timeout is set to
            # 6 minutes (360 seconds) for consuming of verification records
            # and a very conservative additional 2 minutes (120 seconds) to process
            # the records in the verification step
            driver_monitor.wait_until(
                'ALL-RECORDS-DELIVERED\|PROCESSED-MORE-THAN-GENERATED',
                timeout_sec=480,
                err_msg="Never saw output '%s' on" %
                'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED' +
                str(self.driver.node.account))

        self.driver.stop()
        processor.stop()
        processor.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" %
                                           processor.STDOUT_FILE,
                                           allow_fail=False)

    @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions)
    def test_simple_upgrade_downgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with <from_version>, and upgrades them one-by-one to <to_version>
        """

        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # upgrade one-by-one via rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_stop_start_bounce(p, None, to_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                    + str(node.account))

    @matrix(from_version=metadata_1_versions,
            to_version=backward_compatible_metadata_2_versions)
    @matrix(from_version=metadata_1_versions,
            to_version=metadata_3_or_higher_versions)
    @matrix(from_version=metadata_2_versions,
            to_version=metadata_3_or_higher_versions)
    def test_metadata_upgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with version <from_version> and upgrades one-by-one to <to_version>
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # first rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_stop_start_bounce(p, from_version[:-2], to_version,
                                      counter)
            counter = counter + 1

        # second rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            self.do_stop_start_bounce(p, None, to_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                    + str(node.account))

    def test_version_probing_upgrade(self):
        """
        Starts 3 KafkaStreams instances, and upgrades one-by-one to "future version"
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with("")  # run with TRUNK

        self.processors = [self.processor1, self.processor2, self.processor3]
        self.old_processors = [
            self.processor1, self.processor2, self.processor3
        ]
        self.upgraded_processors = []
        for p in self.processors:
            self.leader_counter[p] = 2

        self.update_leader()
        for p in self.processors:
            self.leader_counter[p] = 0
        self.leader_counter[self.leader] = 3

        counter = 1
        current_generation = 3

        random.seed()
        random.shuffle(self.processors)

        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            current_generation = self.do_rolling_bounce(
                p, counter, current_generation)
            counter = counter + 1

        # shutdown
        self.driver.stop()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                    + str(node.account))

    def update_leader(self):
        self.leader = None
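        # Heuristic: the group leader is the member that computes the assignment,
        # so the instance whose log gained a new "Finished assignment for group"
        # line (relative to its recorded count) must be the current leader.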
        retries = 10
        while retries > 0:
            for p in self.processors:
                found = list(
                    p.node.account.ssh_capture(
                        "grep \"Finished assignment for group\" %s" %
                        p.LOG_FILE,
                        allow_fail=True))
                if len(found) >= self.leader_counter[p] + 1:
                    if self.leader is not None:
                        raise Exception("Could not uniquely identify leader")
                    self.leader = p
                    self.leader_counter[p] = self.leader_counter[p] + 1

            if self.leader is None:
                retries = retries - 1
                time.sleep(5)
            else:
                break

        if self.leader is None:
            raise Exception("Could not identify leader")

    def get_version_string(self, version):
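        # The version banner format changed over time: releases up to 2.1 log
        # "Kafka version : x.y.z" (with a space before the colon), newer releases
        # log "Kafka version: x.y.z", and a DEV build is matched by its base
        # version plus the SNAPSHOT suffix rather than an exact string.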
        if version.startswith("0") or version.startswith("1") \
          or version.startswith("2.0") or version.startswith("2.1"):
            return "Kafka version : " + version
        elif "SNAPSHOT" in version:
            return "Kafka version.*" + self.base_version_number + ".*SNAPSHOT"
        else:
            return "Kafka version: " + version

    def start_all_nodes_with(self, version):
        kafka_version_str = self.get_version_string(version)

        # start first with <version>
        self.prepare_for(self.processor1, version)
        node1 = self.processor1.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor:
            with node1.account.monitor_log(
                    self.processor1.LOG_FILE) as log_monitor:
                self.processor1.start()
                log_monitor.wait_until(
                    kafka_version_str,
                    timeout_sec=60,
                    err_msg="Could not detect Kafka Streams version " +
                    version + " " + str(node1.account))
                monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(node1.account))

        # start second with <version>
        self.prepare_for(self.processor2, version)
        node2 = self.processor2.node
        with node1.account.monitor_log(
                self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(
                    self.processor2.STDOUT_FILE) as second_monitor:
                with node2.account.monitor_log(
                        self.processor2.LOG_FILE) as log_monitor:
                    self.processor2.start()
                    log_monitor.wait_until(
                        kafka_version_str,
                        timeout_sec=60,
                        err_msg="Could not detect Kafka Streams version " +
                        version + " on " + str(node2.account))
                    first_monitor.wait_until(
                        self.processed_msg,
                        timeout_sec=60,
                        err_msg="Never saw output '%s' on " %
                        self.processed_msg + str(node1.account))
                    second_monitor.wait_until(
                        self.processed_msg,
                        timeout_sec=60,
                        err_msg="Never saw output '%s' on " %
                        self.processed_msg + str(node2.account))

        # start third with <version>
        self.prepare_for(self.processor3, version)
        node3 = self.processor3.node
        with node1.account.monitor_log(
                self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(
                    self.processor2.STDOUT_FILE) as second_monitor:
                with node3.account.monitor_log(
                        self.processor3.STDOUT_FILE) as third_monitor:
                    with node3.account.monitor_log(
                            self.processor3.LOG_FILE) as log_monitor:
                        self.processor3.start()
                        log_monitor.wait_until(
                            kafka_version_str,
                            timeout_sec=60,
                            err_msg="Could not detect Kafka Streams version " +
                            version + " on " + str(node3.account))
                        first_monitor.wait_until(
                            self.processed_msg,
                            timeout_sec=60,
                            err_msg="Never saw output '%s' on " %
                            self.processed_msg + str(node1.account))
                        second_monitor.wait_until(
                            self.processed_msg,
                            timeout_sec=60,
                            err_msg="Never saw output '%s' on " %
                            self.processed_msg + str(node2.account))
                        third_monitor.wait_until(
                            self.processed_msg,
                            timeout_sec=60,
                            err_msg="Never saw output '%s' on " %
                            self.processed_msg + str(node3.account))

    @staticmethod
    def prepare_for(processor, version):
        processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT,
                                   allow_fail=False)
        if version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(version)

    def do_stop_start_bounce(self, processor, upgrade_from, new_version,
                             counter):
        kafka_version_str = self.get_version_string(new_version)

        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        # stop processor and wait for rebalance of others
        with first_other_node.account.monitor_log(
                first_other_processor.STDOUT_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(
                    second_other_processor.STDOUT_FILE
            ) as second_other_monitor:
                processor.stop()
                first_other_monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(first_other_node.account))
                second_other_monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(second_other_node.account))
        node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" %
                                 processor.STDOUT_FILE,
                                 allow_fail=False)

        if upgrade_from is None:  # upgrade disabled -- second round of rolling bounces
            roll_counter = ".1-"  # second round of rolling bounces
        else:
            roll_counter = ".0-"  # first round of rolling bounces

        node.account.ssh("mv " + processor.STDOUT_FILE + " " +
                         processor.STDOUT_FILE + roll_counter + str(counter),
                         allow_fail=False)
        node.account.ssh("mv " + processor.STDERR_FILE + " " +
                         processor.STDERR_FILE + roll_counter + str(counter),
                         allow_fail=False)
        node.account.ssh("mv " + processor.LOG_FILE + " " +
                         processor.LOG_FILE + roll_counter + str(counter),
                         allow_fail=False)

        if new_version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(new_version)
        processor.set_upgrade_from(upgrade_from)

        grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" "
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                with first_other_node.account.monitor_log(
                        first_other_processor.STDOUT_FILE
                ) as first_other_monitor:
                    with second_other_node.account.monitor_log(
                            second_other_processor.STDOUT_FILE
                    ) as second_other_monitor:
                        processor.start()

                        log_monitor.wait_until(
                            kafka_version_str,
                            timeout_sec=60,
                            err_msg="Could not detect Kafka Streams version " +
                            new_version + " on " + str(node.account))
                        first_other_monitor.wait_until(
                            self.processed_msg,
                            timeout_sec=60,
                            err_msg="Never saw output '%s' on " %
                            self.processed_msg + str(first_other_node.account))
                        found = list(
                            first_other_node.account.ssh_capture(
                                grep_metadata_error +
                                first_other_processor.STDERR_FILE,
                                allow_fail=True))
                        if len(found) > 0:
                            raise Exception(
                                "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                            )

                        second_other_monitor.wait_until(
                            self.processed_msg,
                            timeout_sec=60,
                            err_msg="Never saw output '%s' on " %
                            self.processed_msg +
                            str(second_other_node.account))
                        found = list(
                            second_other_node.account.ssh_capture(
                                grep_metadata_error +
                                second_other_processor.STDERR_FILE,
                                allow_fail=True))
                        if len(found) > 0:
                            raise Exception(
                                "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                            )

                        monitor.wait_until(
                            self.processed_msg,
                            timeout_sec=60,
                            err_msg="Never saw output '%s' on " %
                            self.processed_msg + str(node.account))

    def do_rolling_bounce(self, processor, counter, current_generation):
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        with first_other_node.account.monitor_log(
                first_other_processor.LOG_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(
                    second_other_processor.LOG_FILE) as second_other_monitor:
                # stop processor
                processor.stop()
                node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" %
                                         processor.STDOUT_FILE,
                                         allow_fail=False)

                node.account.ssh("mv " + processor.STDOUT_FILE + " " +
                                 processor.STDOUT_FILE + "." + str(counter),
                                 allow_fail=False)
                node.account.ssh("mv " + processor.STDERR_FILE + " " +
                                 processor.STDERR_FILE + "." + str(counter),
                                 allow_fail=False)
                node.account.ssh("mv " + processor.LOG_FILE + " " +
                                 processor.LOG_FILE + "." + str(counter),
                                 allow_fail=False)
                self.leader_counter[processor] = 0

                with node.account.monitor_log(
                        processor.LOG_FILE) as log_monitor:
                    processor.set_upgrade_to("future_version")
                    processor.start()
                    self.old_processors.remove(processor)
                    self.upgraded_processors.append(processor)

                    # checking for the dev version which should be the only SNAPSHOT
                    log_monitor.wait_until(
                        "Kafka version.*" + self.base_version_number +
                        ".*SNAPSHOT",
                        timeout_sec=60,
                        err_msg="Could not detect Kafka Streams version " +
                        str(DEV_VERSION) + " in " + str(node.account))
                    log_monitor.offset = 5
                    log_monitor.wait_until(
                        "partition\.assignment\.strategy = \[org\.apache\.kafka\.streams\.tests\.StreamsUpgradeTest$FutureStreamsPartitionAssignor\]",
                        timeout_sec=60,
                        err_msg=
                        "Could not detect FutureStreamsPartitionAssignor in " +
                        str(node.account))

                    if processor == self.leader:
                        self.update_leader()
                    else:
                        self.leader_counter[
                            self.leader] = self.leader_counter[self.leader] + 1

                    if processor == self.leader:
                        leader_monitor = log_monitor
                    elif first_other_processor == self.leader:
                        leader_monitor = first_other_monitor
                    elif second_other_processor == self.leader:
                        leader_monitor = second_other_monitor
                    else:
                        raise Exception("Could not identify leader.")

                    monitors = {}
                    monitors[processor] = log_monitor
                    monitors[first_other_processor] = first_other_monitor
                    monitors[second_other_processor] = second_other_monitor
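
                    # Version probing: the upgraded member advertises subscription
                    # version 5; a leader that only supports version 4 answers with
                    # an empty assignment at its own supported version, forcing the
                    # member to downgrade, rejoin, and trigger another rebalance.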

                    leader_monitor.wait_until(
                        "Received a future (version probing) subscription (version: 5). Sending empty assignment back (with supported version 4).",
                        timeout_sec=60,
                        err_msg=
                        "Could not detect 'version probing' attempt at leader "
                        + str(self.leader.node.account))

                    if len(self.old_processors) > 0:
                        log_monitor.wait_until(
                            "Sent a version 5 subscription and got version 4 assignment back (successful version probing). Downgrading subscription metadata to received version and trigger new rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Could not detect 'successful version probing' at upgrading node "
                            + str(node.account))
                    else:
                        log_monitor.wait_until(
                            "Sent a version 5 subscription and got version 4 assignment back (successful version probing). Setting subscription metadata to leaders supported version 5 and trigger new rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Could not detect 'successful version probing with upgraded leader' at upgrading node "
                            + str(node.account))
                        first_other_monitor.wait_until(
                            "Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'Upgrade metadata to version 4' on"
                            + str(first_other_node.account))
                        second_other_monitor.wait_until(
                            "Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'Upgrade metadata to version 4' on"
                            + str(second_other_node.account))

                    log_monitor.wait_until(
                        "Version probing detected. Triggering new rebalance.",
                        timeout_sec=60,
                        err_msg=
                        "Could not detect 'Triggering new rebalance' at upgrading node "
                        + str(node.account))

                    # version probing should trigger second rebalance
                    # now we check that after consecutive rebalances we have synchronized generation
                    generation_synchronized = False
                    retries = 0

                    while retries < 10:
                        processor_found = extract_generation_from_logs(
                            processor)
                        first_other_processor_found = extract_generation_from_logs(
                            first_other_processor)
                        second_other_processor_found = extract_generation_from_logs(
                            second_other_processor)

                        if len(processor_found) > 0 and len(
                                first_other_processor_found) > 0 and len(
                                    second_other_processor_found) > 0:
                            self.logger.info("processor: " +
                                             str(processor_found))
                            self.logger.info("first other processor: " +
                                             str(first_other_processor_found))
                            self.logger.info("second other processor: " +
                                             str(second_other_processor_found))

                            processor_generation = self.extract_highest_generation(
                                processor_found)
                            first_other_processor_generation = self.extract_highest_generation(
                                first_other_processor_found)
                            second_other_processor_generation = self.extract_highest_generation(
                                second_other_processor_found)

                            if processor_generation == first_other_processor_generation == second_other_processor_generation:
                                current_generation = processor_generation
                                generation_synchronized = True
                                break

                        time.sleep(5)
                        retries = retries + 1

                    if not generation_synchronized:
                        raise Exception(
                            "Never saw all three processors have the synchronized generation number"
                        )

                    if processor == self.leader:
                        self.update_leader()
                    else:
                        self.leader_counter[
                            self.leader] = self.leader_counter[self.leader] + 1

                    if len(self.old_processors) > 0:
                        self.verify_metadata_no_upgraded_yet()

        return current_generation

    def extract_highest_generation(self, found_generations):
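        # generations appear in the logs in order, so the most recent (last)
        # entry is taken as the highest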
        return int(found_generations[-1])
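
    # Note: extract_generation_from_logs(processor), called in
    # do_rolling_bounce() above, is not shown in this snippet. A minimal
    # sketch of such a helper (hypothetical, assuming the logs contain lines
    # like "Successfully joined group with generation N"):
    #
    #     def extract_generation_from_logs(processor):
    #         found = processor.node.account.ssh_capture(
    #             "grep -o 'Successfully joined group with generation [0-9]*' %s"
    #             % processor.LOG_FILE, allow_fail=True)
    #         return [line.strip().split(" ")[-1] for line in found]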

    def verify_metadata_no_upgraded_yet(self):
        for p in self.processors:
            found = list(
                p.node.account.ssh_capture(
                    "grep \"Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.\" "
                    + p.LOG_FILE,
                    allow_fail=True))
            if len(found) > 0:
                raise Exception(
                    "Kafka Streams failed with 'group member upgraded to metadata 5 too early'"
                )

    def confirm_topics_on_all_brokers(self, expected_topic_set):
        for node in self.kafka.nodes:
            match_count = 0
            # kafka.list_topics() returns a Python generator, so values are
            # fetched lazily; we cannot compare the collections directly and
            # must iterate over what is returned
            topic_list_generator = self.kafka.list_topics(node=node)
            for topic in topic_list_generator:
                if topic in expected_topic_set:
                    match_count += 1

            if len(expected_topic_set) != match_count:
                return False

        return True
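
# A set-based variant of the check above avoids the manual counting; a sketch
# (not part of the original test, but behaviorally equivalent since topic
# names are unique):
#
#     def confirm_topics_on_all_brokers(self, expected_topic_set):
#         for node in self.kafka.nodes:
#             listed = set(self.kafka.list_topics(node=node))
#             if not expected_topic_set.issubset(listed):
#                 return False
#         return True
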
class StreamsMultipleRollingUpgradeTest(BaseStreamsTest):
    """
     This test verifies a rolling upgrade of multiple Streams
     applications, across all Streams versions, against a single
     broker version.

     As new releases come out, update the streams_upgrade_versions array to
     include the latest version.

     A prerequisite for this test to succeed is the inclusion of all
     parametrized versions of Kafka in kafka/vagrant/base.sh (search for
     get_kafka()).
     As new versions are released, the kafka/tests/kafkatest/version.py file
     needs to be updated as well.

     You can find what has been uploaded to S3 with the following command:

     aws s3api list-objects --bucket kafka-packages --query 'Contents[].{Key:Key}'
    """
    # adding a new version to this list covers both the broker and streams versions
    streams_upgrade_versions = [str(LATEST_0_10_2), str(LATEST_0_11_0), str(LATEST_1_0), str(DEV_BRANCH)]

    def __init__(self, test_context):
        super(StreamsMultipleRollingUpgradeTest, self).__init__(test_context,
                                                                topics={
                                                                    'echo': {'partitions': 5, 'replication-factor': 1},
                                                                    'data': {'partitions': 5, 'replication-factor': 1},
                                                                    'min': {'partitions': 5, 'replication-factor': 1},
                                                                    'max': {'partitions': 5, 'replication-factor': 1},
                                                                    'sum': {'partitions': 5, 'replication-factor': 1},
                                                                    'dif': {'partitions': 5, 'replication-factor': 1},
                                                                    'cnt': {'partitions': 5, 'replication-factor': 1},
                                                                    'avg': {'partitions': 5, 'replication-factor': 1},
                                                                    'wcnt': {'partitions': 5, 'replication-factor': 1},
                                                                    'tagg': {'partitions': 5, 'replication-factor': 1}
                                                                })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor_1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor_2 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor_3 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

        # the final upgrade step already lands on the trunk version, so drop it
        # and walk the remaining versions in reverse for the downgrade path
        self.streams_downgrade_versions = self.streams_upgrade_versions[:-1]
        self.streams_downgrade_versions.reverse()

        self.processors = [self.processor_1, self.processor_2, self.processor_3]

        self.started = False

    def setUp(self):
        self.zk.start()

    def upgrade_and_verify_start(self, processors, to_version):
        for processor in processors:
            self.logger.info("Updating node %s to version %s" % (processor.node.account, to_version))
            node = processor.node
            if self.started:
                self.stop(processor)
            node.version = KafkaVersion(to_version)
            processor.start()
            self.wait_for_verification(processor, "initializing processor: topic", processor.STDOUT_FILE)

        self.started = True

    def stop(self, processor):
        processor.stop()
        self.wait_for_verification(processor, "SMOKE-TEST-CLIENT-CLOSED", processor.STDOUT_FILE)

    def update_processors_and_verify(self, versions):
        for version in versions:
            self.upgrade_and_verify_start(self.processors, version)
        self.run_data_and_verify()

    def run_data_and_verify(self):
        self.driver.start()
        self.wait_for_verification(self.driver, "ALL-RECORDS-DELIVERED", self.driver.STDOUT_FILE)
        self.driver.stop()

    @ignore
    @cluster(num_nodes=9)
    @matrix(broker_version=streams_upgrade_versions)
    def test_rolling_upgrade_downgrade_multiple_apps(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        # verification step run after each upgrade
        self.update_processors_and_verify(self.streams_upgrade_versions)

        # order reversed: now test downgrading, verifying after each downgrade
        self.update_processors_and_verify(self.streams_downgrade_versions)

        for processor in self.processors:
            self.stop(processor)
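
The wait_for_verification helper used throughout this example comes from
BaseStreamsTest and is not shown here. A minimal sketch of what such a helper
might look like, assuming ducktape's wait_until and ssh_capture (the helper
name and signature are illustrative):

    def wait_for_verification(self, processor, message, file, num_lines=1):
        # poll until `message` has appeared at least `num_lines` times in `file`
        def message_count():
            lines = list(processor.node.account.ssh_capture(
                "grep '%s' %s" % (message, file), allow_fail=True))
            return len(lines)
        wait_until(lambda: message_count() >= num_lines,
                   timeout_sec=60,
                   err_msg="Never saw '%s' in %s" % (message, file))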
Example #29
class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combinations).
    If metadata was changed, the upgrade is more difficult.
    The metadata version was bumped in 0.10.1.0.
    """

    def __init__(self, test_context):
        super(StreamsUpgradeTest, self).__init__(test_context)
        self.topics = {
            'echo' : { 'partitions': 5 },
            'data' : { 'partitions': 5 },
        }
        self.leader = None
        self.leader_counter = {}

    def perform_broker_upgrade(self, to_version):
        self.logger.info("First pass bounce - rolling broker upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = KafkaVersion(to_version)
            self.kafka.start_node(node)

    @ignore
    @cluster(num_nodes=6)
    @matrix(from_version=broker_upgrade_versions, to_version=broker_upgrade_versions)
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client then perform rolling upgrades on the broker.
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.partitions = 1
        self.isr = 2
        self.topics = {
            'echo' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": self.isr}},
            'data' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": self.isr} },
            'min' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'max' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'sum' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'dif' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'cnt' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'avg' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                      'configs': {"min.insync.replicas": self.isr} },
            'wcnt' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": self.isr} },
            'tagg' : { 'partitions': self.partitions, 'replication-factor': self.replication,
                       'configs': {"min.insync.replicas": self.isr} }
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zk, version=KafkaVersion(from_version), topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)
        
        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False)
        self.processor1.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False)

    @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions)
    def test_simple_upgrade_downgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_version>
        """

        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # upgrade one-by-one via rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_stop_start_bounce(p, None, to_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account))

        self.driver.stop()

    @matrix(from_version=metadata_1_versions, to_version=backward_compatible_metadata_2_versions)
    @matrix(from_version=metadata_1_versions, to_version=metadata_3_or_higher_versions)
    @matrix(from_version=metadata_2_versions, to_version=metadata_3_or_higher_versions)
    def test_metadata_upgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with version <from_version> and upgrades one-by-one to <to_version>
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # first rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_stop_start_bounce(p, from_version[:-2], to_version, counter)
            counter = counter + 1

        # second rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            self.do_stop_start_bounce(p, None, to_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account))

        self.driver.stop()

    def test_version_probing_upgrade(self):
        """
        Starts 3 KafkaStreams instances, and upgrades one-by-one to "future version"
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=1, zk=self.zk, topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with("") # run with TRUNK

        self.processors = [self.processor1, self.processor2, self.processor3]
        self.old_processors = [self.processor1, self.processor2, self.processor3]
        self.upgraded_processors = []
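        # three startups mean three rebalances; seeding every counter with 2
        # lets update_leader() pick out the node that has logged "Finished
        # assignment for group" three times, after which the counters are
        # reset to reflect the post-startup state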
        for p in self.processors:
            self.leader_counter[p] = 2

        self.update_leader()
        for p in self.processors:
            self.leader_counter[p] = 0
        self.leader_counter[self.leader] = 3

        counter = 1
        current_generation = 3

        random.seed()
        random.shuffle(self.processors)

        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            current_generation = self.do_rolling_bounce(p, counter, current_generation)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until("UPGRADE-TEST-CLIENT-CLOSED",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on" + str(node.account))

        self.driver.stop()

    def update_leader(self):
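        # the group leader logs "Finished assignment for group" once per
        # rebalance; the node whose count increased by exactly one since the
        # last check is taken to be the new leader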
        self.leader = None
        retries = 10
        while retries > 0:
            for p in self.processors:
                found = list(p.node.account.ssh_capture("grep \"Finished assignment for group\" %s" % p.LOG_FILE, allow_fail=True))
                if len(found) == self.leader_counter[p] + 1:
                    if self.leader is not None:
                        raise Exception("Could not uniquely identify leader")
                    self.leader = p
                    self.leader_counter[p] = self.leader_counter[p] + 1

            if self.leader is None:
                retries = retries - 1
                time.sleep(5)
            else:
                break

        if self.leader is None:
            raise Exception("Could not identify leader")

    def start_all_nodes_with(self, version):
        # start first with <version>
        self.prepare_for(self.processor1, version)
        node1 = self.processor1.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor:
            with node1.account.monitor_log(self.processor1.LOG_FILE) as log_monitor:
                self.processor1.start()
                log_monitor.wait_until("Kafka version : " + version,
                                       timeout_sec=60,
                                       err_msg="Could not detect Kafka Streams version " + version + " " + str(node1.account))
                monitor.wait_until("processed 100 records from topic",
                                   timeout_sec=60,
                                   err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account))

        # start second with <version>
        self.prepare_for(self.processor2, version)
        node2 = self.processor2.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor:
                with node2.account.monitor_log(self.processor2.LOG_FILE) as log_monitor:
                    self.processor2.start()
                    log_monitor.wait_until("Kafka version : " + version,
                                           timeout_sec=60,
                                           err_msg="Could not detect Kafka Streams version " + version + " " + str(node2.account))
                    first_monitor.wait_until("processed 100 records from topic",
                                             timeout_sec=60,
                                             err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account))
                    second_monitor.wait_until("processed 100 records from topic",
                                              timeout_sec=60,
                                              err_msg="Never saw output 'processed 100 records from topic' on" + str(node2.account))

        # start third with <version>
        self.prepare_for(self.processor3, version)
        node3 = self.processor3.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(self.processor2.STDOUT_FILE) as second_monitor:
                with node3.account.monitor_log(self.processor3.STDOUT_FILE) as third_monitor:
                    with node3.account.monitor_log(self.processor3.LOG_FILE) as log_monitor:
                        self.processor3.start()
                        log_monitor.wait_until("Kafka version : " + version,
                                               timeout_sec=60,
                                               err_msg="Could not detect Kafka Streams version " + version + " " + str(node3.account))
                        first_monitor.wait_until("processed 100 records from topic",
                                                 timeout_sec=60,
                                                 err_msg="Never saw output 'processed 100 records from topic' on" + str(node1.account))
                        second_monitor.wait_until("processed 100 records from topic",
                                                  timeout_sec=60,
                                                  err_msg="Never saw output 'processed 100 records from topic' on" + str(node2.account))
                        third_monitor.wait_until("processed 100 records from topic",
                                                  timeout_sec=60,
                                                  err_msg="Never saw output 'processed 100 records from topic' on" + str(node3.account))

    @staticmethod
    def prepare_for(processor, version):
        processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT, allow_fail=False)
        if version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(version)

    def do_stop_start_bounce(self, processor, upgrade_from, new_version, counter):
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        # stop processor and wait for rebalance of others
        with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor:
                processor.stop()
                first_other_monitor.wait_until("processed 100 records from topic",
                                               timeout_sec=60,
                                               err_msg="Never saw output 'processed 100 records from topic' on" + str(first_other_node.account))
                second_other_monitor.wait_until("processed 100 records from topic",
                                                timeout_sec=60,
                                                err_msg="Never saw output 'processed 100 records from topic' on" + str(second_other_node.account))
        node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False)

        if upgrade_from is None:  # upgrade disabled -- second round of rolling bounces
            roll_counter = ".1-"
        else:
            roll_counter = ".0-"  # first round of rolling bounces

        node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + roll_counter + str(counter), allow_fail=False)
        node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + roll_counter + str(counter), allow_fail=False)
        node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + roll_counter + str(counter), allow_fail=False)

        if new_version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(new_version)
        processor.set_upgrade_from(upgrade_from)

        grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" "
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                with first_other_node.account.monitor_log(first_other_processor.STDOUT_FILE) as first_other_monitor:
                    with second_other_node.account.monitor_log(second_other_processor.STDOUT_FILE) as second_other_monitor:
                        processor.start()

                        log_monitor.wait_until("Kafka version : " + new_version,
                                               timeout_sec=60,
                                               err_msg="Could not detect Kafka Streams version " + new_version + " " + str(node.account))
                        first_other_monitor.wait_until("processed 100 records from topic",
                                                       timeout_sec=60,
                                                       err_msg="Never saw output 'processed 100 records from topic' on" + str(first_other_node.account))
                        found = list(first_other_node.account.ssh_capture(grep_metadata_error + first_other_processor.STDERR_FILE, allow_fail=True))
                        if len(found) > 0:
                            raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'")

                        second_other_monitor.wait_until("processed 100 records from topic",
                                                        timeout_sec=60,
                                                        err_msg="Never saw output 'processed 100 records from topic' on" + str(second_other_node.account))
                        found = list(second_other_node.account.ssh_capture(grep_metadata_error + second_other_processor.STDERR_FILE, allow_fail=True))
                        if len(found) > 0:
                            raise Exception("Kafka Streams failed with 'unable to decode subscription data: version=2'")

                        monitor.wait_until("processed 100 records from topic",
                                           timeout_sec=60,
                                           err_msg="Never saw output 'processed 100 records from topic' on" + str(node.account))

    def do_rolling_bounce(self, processor, counter, current_generation):
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        with first_other_node.account.monitor_log(first_other_processor.LOG_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(second_other_processor.LOG_FILE) as second_other_monitor:
                # stop processor
                processor.stop()
                node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False)

                node.account.ssh("mv " + processor.STDOUT_FILE + " " + processor.STDOUT_FILE + "." + str(counter), allow_fail=False)
                node.account.ssh("mv " + processor.STDERR_FILE + " " + processor.STDERR_FILE + "." + str(counter), allow_fail=False)
                node.account.ssh("mv " + processor.LOG_FILE + " " + processor.LOG_FILE + "." + str(counter), allow_fail=False)
                self.leader_counter[processor] = 0

                with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                    processor.set_upgrade_to("future_version")
                    processor.start()
                    self.old_processors.remove(processor)
                    self.upgraded_processors.append(processor)

                    current_generation = current_generation + 1

                    log_monitor.wait_until("Kafka version : " + str(DEV_VERSION),
                                           timeout_sec=60,
                                           err_msg="Could not detect Kafka Streams version " + str(DEV_VERSION) + " in " + str(node.account))
                    log_monitor.offset = 5
                    log_monitor.wait_until("partition\.assignment\.strategy = \[org\.apache\.kafka\.streams\.tests\.StreamsUpgradeTest$FutureStreamsPartitionAssignor\]",
                                           timeout_sec=60,
                                           err_msg="Could not detect FutureStreamsPartitionAssignor in " + str(node.account))

                    log_monitor.wait_until("Successfully joined group with generation " + str(current_generation),
                                           timeout_sec=60,
                                           err_msg="Never saw output 'Successfully joined group with generation " + str(current_generation) + "' on" + str(node.account))
                    first_other_monitor.wait_until("Successfully joined group with generation " + str(current_generation),
                                                   timeout_sec=60,
                                                   err_msg="Never saw output 'Successfully joined group with generation " + str(current_generation) + "' on" + str(first_other_node.account))
                    second_other_monitor.wait_until("Successfully joined group with generation " + str(current_generation),
                                                    timeout_sec=60,
                                                    err_msg="Never saw output 'Successfully joined group with generation " + str(current_generation) + "' on" + str(second_other_node.account))

                    if processor == self.leader:
                        self.update_leader()
                    else:
                        self.leader_counter[self.leader] = self.leader_counter[self.leader] + 1

                    if processor == self.leader:
                        leader_monitor = log_monitor
                    elif first_other_processor == self.leader:
                        leader_monitor = first_other_monitor
                    elif second_other_processor == self.leader:
                        leader_monitor = second_other_monitor
                    else:
                        raise Exception("Could not identify leader.")

                    monitors = {
                        processor: log_monitor,
                        first_other_processor: first_other_monitor,
                        second_other_processor: second_other_monitor
                    }

                    leader_monitor.wait_until("Received a future (version probing) subscription (version: 5). Sending empty assignment back (with supported version 4).",
                                              timeout_sec=60,
                                              err_msg="Could not detect 'version probing' attempt at leader " + str(self.leader.node.account))

                    if len(self.old_processors) > 0:
                        log_monitor.wait_until("Sent a version 5 subscription and got version 4 assignment back (successful version probing). Downgrading subscription metadata to received version and trigger new rebalance.",
                                               timeout_sec=60,
                                               err_msg="Could not detect 'successful version probing' at upgrading node " + str(node.account))
                    else:
                        log_monitor.wait_until("Sent a version 5 subscription and got version 4 assignment back (successful version probing). Setting subscription metadata to leaders supported version 5 and trigger new rebalance.",
                                               timeout_sec=60,
                                               err_msg="Could not detect 'successful version probing with upgraded leader' at upgrading node " + str(node.account))
                        first_other_monitor.wait_until("Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.",
                                                       timeout_sec=60,
                                                       err_msg="Never saw output 'Upgrade metadata to version 4' on" + str(first_other_node.account))
                        second_other_monitor.wait_until("Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.",
                                                        timeout_sec=60,
                                                        err_msg="Never saw output 'Upgrade metadata to version 4' on" + str(second_other_node.account))

                    log_monitor.wait_until("Version probing detected. Triggering new rebalance.",
                                           timeout_sec=60,
                                           err_msg="Could not detect 'Triggering new rebalance' at upgrading node " + str(node.account))

                    # version probing should trigger second rebalance
                    current_generation = current_generation + 1

                    for p in self.processors:
                        monitors[p].wait_until("Successfully joined group with generation " + str(current_generation),
                                               timeout_sec=60,
                                               err_msg="Never saw output 'Successfully joined group with generation " + str(current_generation) + "' on" + str(p.node.account))

                    if processor == self.leader:
                        self.update_leader()
                    else:
                        self.leader_counter[self.leader] = self.leader_counter[self.leader] + 1

                    if len(self.old_processors) > 0:
                        self.verify_metadata_no_upgraded_yet()

        return current_generation

    def verify_metadata_no_upgraded_yet(self):
        for p in self.processors:
            found = list(p.node.account.ssh_capture("grep \"Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.\" " + p.LOG_FILE, allow_fail=True))
            if len(found) > 0:
                raise Exception("Kafka Streams failed with 'group member upgraded to metadata 4 too early'")
Example #30
class StreamsUpgradeTest(KafkaTest):
    """
    Test upgrading Kafka Streams (all version combinations).
    If metadata was changed, the upgrade is more difficult.
    The metadata version was bumped in 0.10.1.0.
    """
    def __init__(self, test_context):
        super(StreamsUpgradeTest, self).__init__(test_context,
                                                 num_zk=1,
                                                 num_brokers=1,
                                                 topics={
                                                     'echo': {
                                                         'partitions': 5
                                                     },
                                                     'data': {
                                                         'partitions': 5
                                                     }
                                                 })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            test_context, self.kafka)

    @parametrize(old_version=str(LATEST_0_10_1),
                 new_version=str(LATEST_0_10_2))
    @parametrize(old_version=str(LATEST_0_10_1), new_version=str(DEV_VERSION))
    @parametrize(old_version=str(LATEST_0_10_2), new_version=str(DEV_VERSION))
    def test_simple_upgrade(self, old_version, new_version):
        """
        Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_version>
        """

        self.driver.start()
        self.start_all_nodes_with(old_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_rolling_bounce(p, "", new_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                    + str(node.account))

        self.driver.stop()

    #@parametrize(new_version=str(LATEST_0_10_1)) we cannot run this test until Kafka 0.10.1.2 is released
    #@parametrize(new_version=str(LATEST_0_10_2)) we cannot run this test until Kafka 0.10.2.2 is released
    @parametrize(new_version=str(DEV_VERSION))
    def test_metadata_upgrade(self, new_version):
        """
        Starts 3 KafkaStreams instances with version 0.10.0, and upgrades one-by-one to <new_version>
        """

        self.driver.start()
        self.start_all_nodes_with(str(LATEST_0_10_0))

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # first rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_rolling_bounce(p, "0.10.0", new_version, counter)
            counter = counter + 1

        # second rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            self.do_rolling_bounce(p, "", new_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                    + str(node.account))

        self.driver.stop()

    def start_all_nodes_with(self, version):
        # start first with <version>
        self.prepare_for(self.processor1, version)
        node1 = self.processor1.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor:
            with node1.account.monitor_log(
                    self.processor1.LOG_FILE) as log_monitor:
                self.processor1.start()
                log_monitor.wait_until(
                    "Kafka version : " + version,
                    timeout_sec=60,
                    err_msg="Could not detect Kafka Streams version " +
                    version + " " + str(node1.account))
                monitor.wait_until(
                    "processed 100 records from topic",
                    timeout_sec=60,
                    err_msg=
                    "Never saw output 'processed 100 records from topic' on" +
                    str(node1.account))

        # start second with <version>
        self.prepare_for(self.processor2, version)
        node2 = self.processor2.node
        with node1.account.monitor_log(
                self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(
                    self.processor2.STDOUT_FILE) as second_monitor:
                with node2.account.monitor_log(
                        self.processor2.LOG_FILE) as log_monitor:
                    self.processor2.start()
                    log_monitor.wait_until(
                        "Kafka version : " + version,
                        timeout_sec=60,
                        err_msg="Could not detect Kafka Streams version " +
                        version + " " + str(node2.account))
                    first_monitor.wait_until(
                        "processed 100 records from topic",
                        timeout_sec=60,
                        err_msg=
                        "Never saw output 'processed 100 records from topic' on"
                        + str(node1.account))
                    second_monitor.wait_until(
                        "processed 100 records from topic",
                        timeout_sec=60,
                        err_msg=
                        "Never saw output 'processed 100 records from topic' on"
                        + str(node2.account))

        # start third with <version>
        self.prepare_for(self.processor3, version)
        node3 = self.processor3.node
        with node1.account.monitor_log(
                self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(
                    self.processor2.STDOUT_FILE) as second_monitor:
                with node3.account.monitor_log(
                        self.processor3.STDOUT_FILE) as third_monitor:
                    with node3.account.monitor_log(
                            self.processor3.LOG_FILE) as log_monitor:
                        self.processor3.start()
                        log_monitor.wait_until(
                            "Kafka version : " + version,
                            timeout_sec=60,
                            err_msg="Could not detect Kafka Streams version " +
                            version + " " + str(node3.account))
                        first_monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(node1.account))
                        second_monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(node2.account))
                        third_monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(node3.account))

    @staticmethod
    def prepare_for(processor, version):
        processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT,
                                   allow_fail=False)
        processor.set_version(version)

    def do_rolling_bounce(self, processor, upgrade_from, new_version, counter):
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        # stop processor and wait for rebalance of others
        with first_other_node.account.monitor_log(
                first_other_processor.STDOUT_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(
                    second_other_processor.STDOUT_FILE
            ) as second_other_monitor:
                processor.stop()
                first_other_monitor.wait_until(
                    "processed 100 records from topic",
                    timeout_sec=60,
                    err_msg=
                    "Never saw output 'processed 100 records from topic' on" +
                    str(first_other_node.account))
                second_other_monitor.wait_until(
                    "processed 100 records from topic",
                    timeout_sec=60,
                    err_msg=
                    "Never saw output 'processed 100 records from topic' on" +
                    str(second_other_node.account))
        node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" %
                                 processor.STDOUT_FILE,
                                 allow_fail=False)

        if upgrade_from == "":  # upgrade disabled -- second round of rolling bounces
            roll_counter = ".1-"  # second round of rolling bounces
        else:
            roll_counter = ".0-"  # first  round of rolling boundes

        node.account.ssh("mv " + processor.STDOUT_FILE + " " +
                         processor.STDOUT_FILE + roll_counter + str(counter),
                         allow_fail=False)
        node.account.ssh("mv " + processor.STDERR_FILE + " " +
                         processor.STDERR_FILE + roll_counter + str(counter),
                         allow_fail=False)
        node.account.ssh("mv " + processor.LOG_FILE + " " +
                         processor.LOG_FILE + roll_counter + str(counter),
                         allow_fail=False)

        if new_version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(new_version)
        processor.set_upgrade_from(upgrade_from)

        grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" "
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                with first_other_node.account.monitor_log(
                        first_other_processor.STDOUT_FILE
                ) as first_other_monitor:
                    with second_other_node.account.monitor_log(
                            second_other_processor.STDOUT_FILE
                    ) as second_other_monitor:
                        processor.start()

                        log_monitor.wait_until(
                            "Kafka version : " + new_version,
                            timeout_sec=60,
                            err_msg="Could not detect Kafka Streams version " +
                            new_version + " " + str(node.account))
                        first_other_monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(first_other_node.account))
                        found = list(
                            first_other_node.account.ssh_capture(
                                grep_metadata_error +
                                first_other_processor.STDERR_FILE,
                                allow_fail=True))
                        if len(found) > 0:
                            raise Exception(
                                "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                            )

                        second_other_monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(second_other_node.account))
                        found = list(
                            second_other_node.account.ssh_capture(
                                grep_metadata_error +
                                second_other_processor.STDERR_FILE,
                                allow_fail=True))
                        if len(found) > 0:
                            raise Exception(
                                "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                            )

                        monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(node.account))
Example #31
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client then perform rolling upgrades on the broker.
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.partitions = 1
        self.isr = 2
        self.topics = {
            topic: {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            }
            for topic in [
                'echo', 'data', 'min', 'max', 'sum',
                'dif', 'cnt', 'avg', 'wcnt', 'tagg'
            ]
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)
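        # NOTE: a fixed sleep is fragile; other examples in this document poll
        # instead via wait_until(lambda: self.confirm_topics_on_all_brokers(...))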

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh(
            "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED' %s"
            % self.driver.STDOUT_FILE,
            allow_fail=False)
        self.processor1.node.account.ssh_capture(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
            allow_fail=False)
Ejemplo n.º 32
0
class StreamsSmokeTest(KafkaTest):
    """
    Simple test of Kafka Streams.
    """

    def __init__(self, test_context):
        super(StreamsSmokeTest, self).__init__(test_context, num_zk=1, num_brokers=3, topics={
            'echo' : { 'partitions': 5, 'replication-factor': 1 },
            'data' : { 'partitions': 5, 'replication-factor': 1 },
            'min' : { 'partitions': 5, 'replication-factor': 1 },
            'min-suppressed' : { 'partitions': 5, 'replication-factor': 1 },
            'min-raw' : { 'partitions': 5, 'replication-factor': 1 },
            'max' : { 'partitions': 5, 'replication-factor': 1 },
            'sum' : { 'partitions': 5, 'replication-factor': 1 },
            'sws-raw' : { 'partitions': 5, 'replication-factor': 1 },
            'sws-suppressed' : { 'partitions': 5, 'replication-factor': 1 },
            'dif' : { 'partitions': 5, 'replication-factor': 1 },
            'cnt' : { 'partitions': 5, 'replication-factor': 1 },
            'avg' : { 'partitions': 5, 'replication-factor': 1 },
            'wcnt' : { 'partitions': 5, 'replication-factor': 1 },
            'tagg' : { 'partitions': 5, 'replication-factor': 1 }
        })

        self.test_context = test_context
        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)

    @cluster(num_nodes=8)
    @matrix(processing_guarantee=['at_least_once'], crash=[True, False], metadata_quorum=quorum.all_non_upgrade)
    @matrix(processing_guarantee=['exactly_once', 'exactly_once_v2'], crash=[True, False])
    def test_streams(self, processing_guarantee, crash, metadata_quorum=quorum.zk):
        processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka, processing_guarantee)
        processor2 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka, processing_guarantee)
        processor3 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka, processing_guarantee)

        with processor1.node.account.monitor_log(processor1.STDOUT_FILE) as monitor1:
            processor1.start()
            monitor1.wait_until('REBALANCING -> RUNNING',
                               timeout_sec=60,
                               err_msg="Never saw 'REBALANCING -> RUNNING' message " + str(processor1.node.account)
                               )

            self.driver.start()

            monitor1.wait_until('processed',
                                timeout_sec=30,
                                err_msg="Didn't see any processing messages " + str(processor1.node.account)
                                )

            # make sure we're not already done processing (which would invalidate the test)
            self.driver.node.account.ssh("! grep 'Result Verification' %s" % self.driver.STDOUT_FILE, allow_fail=False)

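            # stop_nodes(not crash): a clean shutdown when crash=False; with
            # crash=True the instance is presumably killed hard, exercising
            # recovery under the chosen processing guarantee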
            processor1.stop_nodes(not crash)

        with processor2.node.account.monitor_log(processor2.STDOUT_FILE) as monitor2:
            processor2.start()
            monitor2.wait_until('REBALANCING -> RUNNING',
                                timeout_sec=120,
                                err_msg="Never saw 'REBALANCING -> RUNNING' message " + str(processor2.node.account)
                                )
            monitor2.wait_until('processed',
                                timeout_sec=30,
                                err_msg="Didn't see any processing messages " + str(processor2.node.account)
                                )

        # make sure we're not already done processing (which would invalidate the test)
        self.driver.node.account.ssh("! grep 'Result Verification' %s" % self.driver.STDOUT_FILE, allow_fail=False)

        processor2.stop_nodes(not crash)

        with processor3.node.account.monitor_log(processor3.STDOUT_FILE) as monitor3:
            processor3.start()
            monitor3.wait_until('REBALANCING -> RUNNING',
                                timeout_sec=120,
                                err_msg="Never saw 'REBALANCING -> RUNNING' message " + str(processor3.node.account)
                                )
            # there should still be some data left for this processor to work on.
            monitor3.wait_until('processed',
                                timeout_sec=30,
                                err_msg="Didn't see any processing messages " + str(processor3.node.account)
                                )

        self.driver.wait()
        self.driver.stop()

        processor3.stop()

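        # Under at_least_once, a crash can cause records to be reprocessed
        # after recovery, so the driver may legitimately report
        # PROCESSED-MORE-THAN-GENERATED; exactly-once runs must print SUCCESS.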
        if crash and processing_guarantee == 'at_least_once':
            self.driver.node.account.ssh("grep -E 'SUCCESS|PROCESSED-MORE-THAN-GENERATED' %s" % self.driver.STDOUT_FILE, allow_fail=False)
        else:
            self.driver.node.account.ssh("grep SUCCESS %s" % self.driver.STDOUT_FILE, allow_fail=False)
Ejemplo n.º 33
0
class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combinations).
    If the metadata format changed, the upgrade is more involved.
    The metadata version was bumped in 0.10.1.0.
    """
    def __init__(self, test_context):
        super(StreamsUpgradeTest, self).__init__(test_context)
        self.topics = {
            'echo': {
                'partitions': 5
            },
            'data': {
                'partitions': 5
            },
        }
        self.leader = None

    def perform_broker_upgrade(self, to_version):
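        # one broker at a time: stop, bump the version, restart -- the other
        # replicas keep the cluster available throughout the rolling bounce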
        self.logger.info("First pass bounce - rolling broker upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = KafkaVersion(to_version)
            self.kafka.start_node(node)

    @cluster(num_nodes=6)
    @matrix(from_version=broker_upgrade_versions,
            to_version=broker_upgrade_versions)
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client, then perform rolling upgrades on the brokers.
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.partitions = 1
        self.isr = 2
        self.topics = {
            topic: {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            }
            for topic in [
                'echo', 'data', 'min', 'max', 'sum',
                'dif', 'cnt', 'avg', 'wcnt', 'tagg'
            ]
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" %
                         self.driver.STDOUT_FILE,
                         allow_fail=False)
        self.processor1.node.account.ssh_capture(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
            allow_fail=False)

    @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions)
    def test_simple_upgrade_downgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with <old_version>, and upgrades them one-by-one to <new_version>.
        """

        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # upgrade one-by-one via rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_rolling_bounce(p, None, to_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                    + str(node.account))

        self.driver.stop()

    #@matrix(from_version=metadata_1_versions, to_version=backward_compatible_metadata_2_versions)
    @matrix(from_version=metadata_1_versions, to_version=metadata_3_versions)
    @matrix(from_version=metadata_2_versions, to_version=metadata_3_versions)
    def test_metadata_upgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with version <from_version> and upgrades them one-by-one to <to_version>.
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # first rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
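            # from_version[:-2] drops the bugfix part (e.g. "0.10.1.1" -> "0.10.1"),
            # the format expected by the 'upgrade.from' config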
            self.do_rolling_bounce(p, from_version[:-2], to_version, counter)
            counter = counter + 1

        # second rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            self.do_rolling_bounce(p, None, to_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                    + str(node.account))

        self.driver.stop()

    def start_all_nodes_with(self, version):
        # start first with <version>
        self.prepare_for(self.processor1, version)
        node1 = self.processor1.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor:
            with node1.account.monitor_log(
                    self.processor1.LOG_FILE) as log_monitor:
                self.processor1.start()
                log_monitor.wait_until(
                    "Kafka version : " + version,
                    timeout_sec=60,
                    err_msg="Could not detect Kafka Streams version " +
                    version + " " + str(node1.account))
                monitor.wait_until(
                    "processed 100 records from topic",
                    timeout_sec=60,
                    err_msg=
                    "Never saw output 'processed 100 records from topic' on" +
                    str(node1.account))

        # start second with <version>
        self.prepare_for(self.processor2, version)
        node2 = self.processor2.node
        with node1.account.monitor_log(
                self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(
                    self.processor2.STDOUT_FILE) as second_monitor:
                with node2.account.monitor_log(
                        self.processor2.LOG_FILE) as log_monitor:
                    self.processor2.start()
                    log_monitor.wait_until(
                        "Kafka version : " + version,
                        timeout_sec=60,
                        err_msg="Could not detect Kafka Streams version " +
                        version + " " + str(node2.account))
                    first_monitor.wait_until(
                        "processed 100 records from topic",
                        timeout_sec=60,
                        err_msg=
                        "Never saw output 'processed 100 records from topic' on"
                        + str(node1.account))
                    second_monitor.wait_until(
                        "processed 100 records from topic",
                        timeout_sec=60,
                        err_msg=
                        "Never saw output 'processed 100 records from topic' on"
                        + str(node2.account))

        # start third with <version>
        self.prepare_for(self.processor3, version)
        node3 = self.processor3.node
        with node1.account.monitor_log(
                self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(
                    self.processor2.STDOUT_FILE) as second_monitor:
                with node3.account.monitor_log(
                        self.processor3.STDOUT_FILE) as third_monitor:
                    with node3.account.monitor_log(
                            self.processor3.LOG_FILE) as log_monitor:
                        self.processor3.start()
                        log_monitor.wait_until(
                            "Kafka version : " + version,
                            timeout_sec=60,
                            err_msg="Could not detect Kafka Streams version " +
                            version + " " + str(node3.account))
                        first_monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(node1.account))
                        second_monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(node2.account))
                        third_monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(node3.account))

    @staticmethod
    def prepare_for(processor, version):
        processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT,
                                   allow_fail=False)
        if version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(version)

    def do_rolling_bounce(self, processor, upgrade_from, new_version, counter):
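        # Bounce a single instance: stop it and wait for the other two to keep
        # processing (i.e. to rebalance), roll its log files aside, then restart
        # it with <new_version> and verify that all three instances process again.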
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        # stop processor and wait for rebalance of others
        with first_other_node.account.monitor_log(
                first_other_processor.STDOUT_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(
                    second_other_processor.STDOUT_FILE
            ) as second_other_monitor:
                processor.stop()
                first_other_monitor.wait_until(
                    "processed 100 records from topic",
                    timeout_sec=60,
                    err_msg=
                    "Never saw output 'processed 100 records from topic' on" +
                    str(first_other_node.account))
                second_other_monitor.wait_until(
                    "processed 100 records from topic",
                    timeout_sec=60,
                    err_msg=
                    "Never saw output 'processed 100 records from topic' on" +
                    str(second_other_node.account))
        node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" %
                                 processor.STDOUT_FILE,
                                 allow_fail=False)

        if upgrade_from is None:  # upgrade_from disabled -- second round of rolling bounces
            roll_counter = ".1-"
        else:
            roll_counter = ".0-"  # first round of rolling bounces

        node.account.ssh("mv " + processor.STDOUT_FILE + " " +
                         processor.STDOUT_FILE + roll_counter + str(counter),
                         allow_fail=False)
        node.account.ssh("mv " + processor.STDERR_FILE + " " +
                         processor.STDERR_FILE + roll_counter + str(counter),
                         allow_fail=False)
        node.account.ssh("mv " + processor.LOG_FILE + " " +
                         processor.LOG_FILE + roll_counter + str(counter),
                         allow_fail=False)

        if new_version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(new_version)
        processor.set_upgrade_from(upgrade_from)
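        # 'upgrade.from' makes the restarted instance keep sending the old
        # subscription metadata during the first bounce; None (second bounce)
        # lets it switch to the new metadata version.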

        grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" "
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                with first_other_node.account.monitor_log(
                        first_other_processor.STDOUT_FILE
                ) as first_other_monitor:
                    with second_other_node.account.monitor_log(
                            second_other_processor.STDOUT_FILE
                    ) as second_other_monitor:
                        processor.start()

                        log_monitor.wait_until(
                            "Kafka version : " + new_version,
                            timeout_sec=60,
                            err_msg="Could not detect Kafka Streams version " +
                            new_version + " " + str(node.account))
                        first_other_monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(first_other_node.account))
                        found = list(
                            first_other_node.account.ssh_capture(
                                grep_metadata_error +
                                first_other_processor.STDERR_FILE,
                                allow_fail=True))
                        if len(found) > 0:
                            raise Exception(
                                "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                            )

                        second_other_monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(second_other_node.account))
                        found = list(
                            second_other_node.account.ssh_capture(
                                grep_metadata_error +
                                second_other_processor.STDERR_FILE,
                                allow_fail=True))
                        if len(found) > 0:
                            raise Exception(
                                "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                            )

                        monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(node.account))
Ejemplo n.º 34
0
class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combinations).
    If the metadata format changed, the upgrade is more involved.
    The metadata version was bumped in 0.10.1.0 and again in 2.0.0.
    """
    def __init__(self, test_context):
        super(StreamsUpgradeTest, self).__init__(test_context)
        self.topics = {
            'echo': {
                'partitions': 5
            },
            'data': {
                'partitions': 5
            },
        }

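    # 'processed_msg' is a regex matched against the processors' stdout;
    # 'base_version_number' strips the -SNAPSHOT suffix, e.g. "3.0.0-SNAPSHOT" -> "3.0.0"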
    processed_msg = "processed [0-9]* records"
    base_version_number = str(DEV_VERSION).split("-")[0]

    def perform_broker_upgrade(self, to_version):
        self.logger.info("First pass bounce - rolling broker upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = KafkaVersion(to_version)
            self.kafka.start_node(node)

    @cluster(num_nodes=6)
    @matrix(from_version=smoke_test_versions,
            to_version=dev_version,
            bounce_type=["full"])
    def test_app_upgrade(self, from_version, to_version, bounce_type):
        """
        Starts 3 KafkaStreams instances with <old_version>, and upgrades them one-by-one to <new_version>.
        """

        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={
                                      topic: {
                                          'partitions': 5,
                                          'replication-factor': 1
                                      }
                                      for topic in [
                                          'echo', 'data', 'min',
                                          'min-suppressed', 'min-raw', 'max',
                                          'sum', 'sws-raw', 'sws-suppressed',
                                          'dif', 'cnt', 'avg', 'wcnt', 'tagg'
                                      ]
                                  })
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context,
            self.kafka,
            processing_guarantee="at_least_once",
            replication_factor=1)
        self.processor2 = StreamsSmokeTestJobRunnerService(
            self.test_context,
            self.kafka,
            processing_guarantee="at_least_once",
            replication_factor=1)
        self.processor3 = StreamsSmokeTestJobRunnerService(
            self.test_context,
            self.kafka,
            processing_guarantee="at_least_once",
            replication_factor=1)

        self.purge_state_dir(self.processor1)
        self.purge_state_dir(self.processor2)
        self.purge_state_dir(self.processor3)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        if bounce_type == "rolling":
            counter = 1
            random.seed()
            # upgrade one-by-one via rolling bounce
            random.shuffle(self.processors)
            for p in self.processors:
                p.CLEAN_NODE_ENABLED = False
                self.do_stop_start_bounce(p, None, to_version, counter)
                counter = counter + 1
        elif bounce_type == "full":
            self.restart_all_nodes_with(to_version)
        else:
            raise Exception("Unrecognized bounce_type: " + str(bounce_type))

        # shutdown
        self.driver.stop()

        # Ideally, we would actually verify the expected results.
        # See KAFKA-10202

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "SMOKE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'SMOKE-TEST-CLIENT-CLOSED' on " +
                    str(node.account))

    def start_all_nodes_with(self, version):

        self.set_version(self.processor1, version)
        self.set_version(self.processor2, version)
        self.set_version(self.processor3, version)

        self.processor1.start()
        self.processor2.start()
        self.processor3.start()

        # double-check the version
        kafka_version_str = self.get_version_string(version)
        self.wait_for_verification(self.processor1, kafka_version_str,
                                   self.processor1.LOG_FILE)
        self.wait_for_verification(self.processor2, kafka_version_str,
                                   self.processor2.LOG_FILE)
        self.wait_for_verification(self.processor3, kafka_version_str,
                                   self.processor3.LOG_FILE)

        # wait for the members to join
        self.wait_for_verification(self.processor1,
                                   "SMOKE-TEST-CLIENT-STARTED",
                                   self.processor1.STDOUT_FILE)
        self.wait_for_verification(self.processor2,
                                   "SMOKE-TEST-CLIENT-STARTED",
                                   self.processor2.STDOUT_FILE)
        self.wait_for_verification(self.processor3,
                                   "SMOKE-TEST-CLIENT-STARTED",
                                   self.processor3.STDOUT_FILE)

        # make sure they've processed something
        self.wait_for_verification(self.processor1, self.processed_msg,
                                   self.processor1.STDOUT_FILE)
        self.wait_for_verification(self.processor2, self.processed_msg,
                                   self.processor2.STDOUT_FILE)
        self.wait_for_verification(self.processor3, self.processed_msg,
                                   self.processor3.STDOUT_FILE)

    def restart_all_nodes_with(self, version):
        self.processor1.stop_node(self.processor1.node)
        self.processor2.stop_node(self.processor2.node)
        self.processor3.stop_node(self.processor3.node)

        # make sure the members have stopped
        self.wait_for_verification(self.processor1, "SMOKE-TEST-CLIENT-CLOSED",
                                   self.processor1.STDOUT_FILE)
        self.wait_for_verification(self.processor2, "SMOKE-TEST-CLIENT-CLOSED",
                                   self.processor2.STDOUT_FILE)
        self.wait_for_verification(self.processor3, "SMOKE-TEST-CLIENT-CLOSED",
                                   self.processor3.STDOUT_FILE)

        self.roll_logs(self.processor1, ".1-1")
        self.roll_logs(self.processor2, ".1-1")
        self.roll_logs(self.processor3, ".1-1")

        self.set_version(self.processor1, version)
        self.set_version(self.processor2, version)
        self.set_version(self.processor3, version)

        self.processor1.start_node(self.processor1.node)
        self.processor2.start_node(self.processor2.node)
        self.processor3.start_node(self.processor3.node)

        # double-check the version
        kafka_version_str = self.get_version_string(version)
        self.wait_for_verification(self.processor1, kafka_version_str,
                                   self.processor1.LOG_FILE)
        self.wait_for_verification(self.processor2, kafka_version_str,
                                   self.processor2.LOG_FILE)
        self.wait_for_verification(self.processor3, kafka_version_str,
                                   self.processor3.LOG_FILE)

        # wait for the members to join
        self.wait_for_verification(self.processor1,
                                   "SMOKE-TEST-CLIENT-STARTED",
                                   self.processor1.STDOUT_FILE)
        self.wait_for_verification(self.processor2,
                                   "SMOKE-TEST-CLIENT-STARTED",
                                   self.processor2.STDOUT_FILE)
        self.wait_for_verification(self.processor3,
                                   "SMOKE-TEST-CLIENT-STARTED",
                                   self.processor3.STDOUT_FILE)

        # make sure they've processed something
        self.wait_for_verification(self.processor1, self.processed_msg,
                                   self.processor1.STDOUT_FILE)
        self.wait_for_verification(self.processor2, self.processed_msg,
                                   self.processor2.STDOUT_FILE)
        self.wait_for_verification(self.processor3, self.processed_msg,
                                   self.processor3.STDOUT_FILE)

    def get_version_string(self, version):
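        # Releases through 2.1 log "Kafka version : <v>" (space before the
        # colon); newer releases log "Kafka version: <v>". SNAPSHOT builds are
        # matched loosely by regex on the base version number.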
        if version.startswith("0") or version.startswith("1") \
          or version.startswith("2.0") or version.startswith("2.1"):
            return "Kafka version : " + version
        elif "SNAPSHOT" in version:
            return "Kafka version.*" + self.base_version_number + ".*SNAPSHOT"
        else:
            return "Kafka version: " + version

    def wait_for_verification(self, processor, message, file, num_lines=1):
        wait_until(lambda: self.verify_from_file(processor, message, file
                                                 ) >= num_lines,
                   timeout_sec=60,
                   err_msg="Did expect to read '%s' from %s" %
                   (message, processor.node.account))

    def verify_from_file(self, processor, message, file):
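        # count matching lines via grep | wc -l; fall back to 0 if the output
        # cannot be parsed as an integer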
        result = processor.node.account.ssh_output("grep -E '%s' %s | wc -l" %
                                                   (message, file),
                                                   allow_fail=False)
        try:
            return int(result)
        except ValueError:
            self.logger.warn("Command failed with ValueError: " + result)
            return 0

    def set_version(self, processor, version):
        if version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(version)

    def purge_state_dir(self, processor):
        processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT,
                                   allow_fail=False)

    def do_stop_start_bounce(self, processor, upgrade_from, new_version,
                             counter):
        kafka_version_str = self.get_version_string(new_version)

        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        # stop processor and wait for rebalance of others
        with first_other_node.account.monitor_log(
                first_other_processor.STDOUT_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(
                    second_other_processor.STDOUT_FILE
            ) as second_other_monitor:
                processor.stop_node(processor.node)
                first_other_monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(first_other_node.account))
                second_other_monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(second_other_node.account))
        node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" %
                                 processor.STDOUT_FILE,
                                 allow_fail=False)

        if upgrade_from is None:  # upgrade_from disabled -- second round of rolling bounces
            roll_counter = ".1-"
        else:
            roll_counter = ".0-"  # first round of rolling bounces

        self.roll_logs(processor, roll_counter + str(counter))

        self.set_version(processor, new_version)
        processor.set_upgrade_from(upgrade_from)

        grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" "
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                with first_other_node.account.monitor_log(
                        first_other_processor.STDOUT_FILE
                ) as first_other_monitor:
                    with second_other_node.account.monitor_log(
                            second_other_processor.STDOUT_FILE
                    ) as second_other_monitor:
                        processor.start_node(processor.node)

                        log_monitor.wait_until(
                            kafka_version_str,
                            timeout_sec=60,
                            err_msg="Could not detect Kafka Streams version " +
                            new_version + " on " + str(node.account))
                        first_other_monitor.wait_until(
                            self.processed_msg,
                            timeout_sec=60,
                            err_msg="Never saw output '%s' on " %
                            self.processed_msg + str(first_other_node.account))
                        found = list(
                            first_other_node.account.ssh_capture(
                                grep_metadata_error +
                                first_other_processor.STDERR_FILE,
                                allow_fail=True))
                        if len(found) > 0:
                            raise Exception(
                                "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                            )

                        second_other_monitor.wait_until(
                            self.processed_msg,
                            timeout_sec=60,
                            err_msg="Never saw output '%s' on " %
                            self.processed_msg +
                            str(second_other_node.account))
                        found = list(
                            second_other_node.account.ssh_capture(
                                grep_metadata_error +
                                second_other_processor.STDERR_FILE,
                                allow_fail=True))
                        if len(found) > 0:
                            raise Exception(
                                "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                            )

                        monitor.wait_until(
                            self.processed_msg,
                            timeout_sec=60,
                            err_msg="Never saw output '%s' on " %
                            self.processed_msg + str(node.account))

    def roll_logs(self, processor, roll_suffix):
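        # Rename the output files with a per-bounce suffix so the next
        # monitor_log starts on a fresh file and earlier output is preserved
        # for debugging.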
        processor.node.account.ssh("mv " + processor.STDOUT_FILE + " " +
                                   processor.STDOUT_FILE + roll_suffix,
                                   allow_fail=False)
        processor.node.account.ssh("mv " + processor.STDERR_FILE + " " +
                                   processor.STDERR_FILE + roll_suffix,
                                   allow_fail=False)
        processor.node.account.ssh("mv " + processor.LOG_FILE + " " +
                                   processor.LOG_FILE + roll_suffix,
                                   allow_fail=False)
        processor.node.account.ssh("mv " + processor.CONFIG_FILE + " " +
                                   processor.CONFIG_FILE + roll_suffix,
                                   allow_fail=False)
Ejemplo n.º 35
0
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client, then perform rolling upgrades on the brokers.
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.num_kafka_nodes = 3
        self.partitions = 1
        self.isr = 2
        self.topics = {
            topic: {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            }
            for topic in [
                'echo', 'data', 'min', 'max', 'sum',
                'dif', 'cnt', 'avg', 'wcnt', 'tagg'
            ]
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=self.num_kafka_nodes,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        wait_until(lambda: self.confirm_topics_on_all_brokers(
            set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)

        processor = StreamsSmokeTestJobRunnerService(self.test_context,
                                                     self.kafka)

        with self.driver.node.account.monitor_log(
                self.driver.STDOUT_FILE) as driver_monitor:
            self.driver.start()

            with processor.node.account.monitor_log(
                    processor.STDOUT_FILE) as monitor:
                processor.start()
                monitor.wait_until(
                    self.processed_msg,
                    timeout_sec=60,
                    err_msg="Never saw output '%s' on " % self.processed_msg +
                    str(processor.node))

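            # after the rolling broker bounce, the Streams client should log
            # this once it re-discovers the group coordinator on an upgraded broker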
            connected_message = "Discovered group coordinator"
            with processor.node.account.monitor_log(
                    processor.LOG_FILE) as log_monitor:
                with processor.node.account.monitor_log(
                        processor.STDOUT_FILE) as stdout_monitor:
                    self.perform_broker_upgrade(to_version)

                    log_monitor.wait_until(
                        connected_message,
                        timeout_sec=120,
                        err_msg=("Never saw output '%s' on " %
                                 connected_message) +
                        str(processor.node.account))

                    stdout_monitor.wait_until(
                        self.processed_msg,
                        timeout_sec=60,
                        err_msg="Never saw output '%s' on" % self.processed_msg
                        + str(processor.node.account))

            # SmokeTestDriver allows up to 6 minutes to consume all
            # records for the verification step, so this timeout is set to
            # 6 minutes (360 seconds) for consuming the verification records
            # plus a very conservative additional 2 minutes (120 seconds) to
            # process the records in the verification step
            driver_monitor.wait_until(
                'ALL-RECORDS-DELIVERED\|PROCESSED-MORE-THAN-GENERATED',
                timeout_sec=480,
                err_msg="Never saw output '%s' on" %
                'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED' +
                str(self.driver.node.account))

        self.driver.stop()
        processor.stop()
        processor.node.account.ssh_capture("grep SMOKE-TEST-CLIENT-CLOSED %s" %
                                           processor.STDOUT_FILE,
                                           allow_fail=False)
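    # The helper used above is not shown in this example. A minimal sketch,
    # assuming self.kafka.list_topics(node=...) lazily yields topic names
    # (the exact signature may differ across kafkatest versions):
    def confirm_topics_on_all_brokers(self, expected_topic_set):
        for node in self.kafka.nodes:
            match_count = 0
            # list_topics() returns a generator, so count matches while iterating
            for topic in self.kafka.list_topics(node=node):
                if topic in expected_topic_set:
                    match_count += 1
            if match_count != len(expected_topic_set):
                return False
        return True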
Ejemplo n.º 36
0
    def test_app_upgrade(self, from_version, to_version, bounce_type):
        """
        Starts 3 KafkaStreams instances with <old_version>, and upgrades them one-by-one to <new_version>.
        """

        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={
                                      topic: {
                                          'partitions': 5,
                                          'replication-factor': 1
                                      }
                                      for topic in [
                                          'echo', 'data', 'min',
                                          'min-suppressed', 'min-raw', 'max',
                                          'sum', 'sws-raw', 'sws-suppressed',
                                          'dif', 'cnt', 'avg', 'wcnt', 'tagg'
                                      ]
                                  })
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context,
            self.kafka,
            processing_guarantee="at_least_once",
            replication_factor=1)
        self.processor2 = StreamsSmokeTestJobRunnerService(
            self.test_context,
            self.kafka,
            processing_guarantee="at_least_once",
            replication_factor=1)
        self.processor3 = StreamsSmokeTestJobRunnerService(
            self.test_context,
            self.kafka,
            processing_guarantee="at_least_once",
            replication_factor=1)

        self.purge_state_dir(self.processor1)
        self.purge_state_dir(self.processor2)
        self.purge_state_dir(self.processor3)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        if bounce_type == "rolling":
            counter = 1
            random.seed()
            # upgrade one-by-one via rolling bounce
            random.shuffle(self.processors)
            for p in self.processors:
                p.CLEAN_NODE_ENABLED = False
                self.do_stop_start_bounce(p, None, to_version, counter)
                counter = counter + 1
        elif bounce_type == "full":
            self.restart_all_nodes_with(to_version)
        else:
            raise Exception("Unrecognized bounce_type: " + str(bounce_type))

        # shutdown
        self.driver.stop()

        # Ideally, we would actually verify the expected results.
        # See KAFKA-10202

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "SMOKE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'SMOKE-TEST-CLIENT-CLOSED' on " +
                    str(node.account))
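
    # Illustrative refactor (not part of the original tests): the stop-and-
    # verify loop above recurs throughout this file and could be factored
    # into a helper. It uses only the ducktape calls already shown here.
    def stop_and_confirm_closed(self, processor, marker="SMOKE-TEST-CLIENT-CLOSED"):
        node = processor.node
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.stop()
            monitor.wait_until(marker,
                               timeout_sec=60,
                               err_msg="Never saw output '%s' on %s" %
                               (marker, str(node.account)))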
Example No. 37
0
class StreamsUpgradeTest(Test):
    """
    Test upgrading Kafka Streams (all version combinations).
    If the metadata format was changed, the upgrade is more involved.
    The metadata version was bumped in 0.10.1.0.
    """
    def __init__(self, test_context):
        super(StreamsUpgradeTest, self).__init__(test_context)
        self.topics = {
            'echo': {
                'partitions': 5
            },
            'data': {
                'partitions': 5
            },
        }
        self.leader = None
        self.leader_counter = {}

    def perform_broker_upgrade(self, to_version):
        self.logger.info("First pass bounce - rolling broker upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = KafkaVersion(to_version)
            self.kafka.start_node(node)

    @ignore
    @cluster(num_nodes=6)
    @matrix(from_version=broker_upgrade_versions,
            to_version=broker_upgrade_versions)
    def test_upgrade_downgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client then perform rolling upgrades on the broker.
        """

        if from_version == to_version:
            return

        self.replication = 3
        self.partitions = 1
        self.isr = 2
        self.topics = {
            'echo': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'data': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'min': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'max': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'sum': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'dif': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'cnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'avg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'wcnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            },
            'tagg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": self.isr
                }
            }
        }

        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  version=KafkaVersion(from_version),
                                  topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)
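        # (a fixed sleep is brittle; polling the brokers for the expected
        # topics with a timeout would make this step deterministic)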

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh(
            "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED' %s"
            % self.driver.STDOUT_FILE,
            allow_fail=False)
        self.processor1.node.account.ssh_capture(
            "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
            allow_fail=False)

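    # Illustrative helper (not used by the original tests): the grep-based
    # assertions above share one shape. allow_fail=False makes ssh raise on
    # grep's non-zero exit status, failing the test when the expected marker
    # never appeared in the log.
    def _expect_in_log(self, node, pattern, log_file):
        node.account.ssh("grep -E '%s' %s" % (pattern, log_file),
                         allow_fail=False)
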
    @matrix(from_version=metadata_2_versions, to_version=metadata_2_versions)
    def test_simple_upgrade_downgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with <old_version>, and upgrades one-by-one to <new_version>
        """

        if from_version == to_version:
            return

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # upgrade one-by-one via rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_stop_start_bounce(p, None, to_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                    + str(node.account))

        self.driver.stop()

    @matrix(from_version=metadata_1_versions,
            to_version=backward_compatible_metadata_2_versions)
    @matrix(from_version=metadata_1_versions,
            to_version=metadata_3_or_higher_versions)
    @matrix(from_version=metadata_2_versions,
            to_version=metadata_3_or_higher_versions)
    def test_metadata_upgrade(self, from_version, to_version):
        """
        Starts 3 KafkaStreams instances with version <from_version> and upgrades one-by-one to <to_version>
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with(from_version)

        self.processors = [self.processor1, self.processor2, self.processor3]

        counter = 1
        random.seed()

        # first rolling bounce
        random.shuffle(self.processors)
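        # from_version[:-2] drops the trailing ".<bugfix>" component so the
        # value matches the "upgrade.from" config format
        # (e.g. "0.10.1.1" -> "0.10.1")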
        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            self.do_stop_start_bounce(p, from_version[:-2], to_version,
                                      counter)
            counter = counter + 1

        # second rolling bounce
        random.shuffle(self.processors)
        for p in self.processors:
            self.do_stop_start_bounce(p, None, to_version, counter)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                    + str(node.account))

        self.driver.stop()

    def test_version_probing_upgrade(self):
        """
        Starts 3 KafkaStreams instances, and upgrades one-by-one to "future version"
        """

        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()

        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.driver.disable_auto_terminate()
        self.processor1 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor2 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)
        self.processor3 = StreamsUpgradeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()
        self.start_all_nodes_with("")  # run with TRUNK

        self.processors = [self.processor1, self.processor2, self.processor3]
        self.old_processors = [
            self.processor1, self.processor2, self.processor3
        ]
        self.upgraded_processors = []
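        # Starting all three instances triggers three rebalances; the test
        # assumes the same member led all of them. Priming every counter to 2
        # makes update_leader() pick the processor whose log now shows
        # 3 (= 2 + 1) "Finished assignment" lines; the counters are then
        # reset so only the identified leader is credited with those 3.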
        for p in self.processors:
            self.leader_counter[p] = 2

        self.update_leader()
        for p in self.processors:
            self.leader_counter[p] = 0
        self.leader_counter[self.leader] = 3

        counter = 1
        current_generation = 3

        random.seed()
        random.shuffle(self.processors)

        for p in self.processors:
            p.CLEAN_NODE_ENABLED = False
            current_generation = self.do_rolling_bounce(
                p, counter, current_generation)
            counter = counter + 1

        # shutdown
        self.driver.stop()
        self.driver.wait()

        random.shuffle(self.processors)
        for p in self.processors:
            node = p.node
            with node.account.monitor_log(p.STDOUT_FILE) as monitor:
                p.stop()
                monitor.wait_until(
                    "UPGRADE-TEST-CLIENT-CLOSED",
                    timeout_sec=60,
                    err_msg="Never saw output 'UPGRADE-TEST-CLIENT-CLOSED' on"
                    + str(node.account))

        self.driver.stop()

    def update_leader(self):
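        # Heuristic: the group leader logs "Finished assignment for group"
        # once per rebalance it leads. leader_counter[p] holds how many such
        # lines were already attributed to p, so the processor whose log
        # shows exactly one more occurrence led the latest rebalance; two
        # matching candidates would make the leader ambiguous.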
        self.leader = None
        retries = 10
        while retries > 0:
            for p in self.processors:
                found = list(
                    p.node.account.ssh_capture(
                        "grep \"Finished assignment for group\" %s" %
                        p.LOG_FILE,
                        allow_fail=True))
                if len(found) == self.leader_counter[p] + 1:
                    if self.leader is not None:
                        raise Exception("Could not uniquely identify leader")
                    self.leader = p
                    self.leader_counter[p] = self.leader_counter[p] + 1

            if self.leader is None:
                retries = retries - 1
                time.sleep(5)
            else:
                break

        if self.leader is None:
            raise Exception("Could not identify leader")

    def start_all_nodes_with(self, version):
        # start first with <version>
        self.prepare_for(self.processor1, version)
        node1 = self.processor1.node
        with node1.account.monitor_log(self.processor1.STDOUT_FILE) as monitor:
            with node1.account.monitor_log(
                    self.processor1.LOG_FILE) as log_monitor:
                self.processor1.start()
                log_monitor.wait_until(
                    "Kafka version : " + version,
                    timeout_sec=60,
                    err_msg="Could not detect Kafka Streams version " +
                    version + " " + str(node1.account))
                monitor.wait_until(
                    "processed 100 records from topic",
                    timeout_sec=60,
                    err_msg=
                    "Never saw output 'processed 100 records from topic' on" +
                    str(node1.account))

        # start second with <version>
        self.prepare_for(self.processor2, version)
        node2 = self.processor2.node
        with node1.account.monitor_log(
                self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(
                    self.processor2.STDOUT_FILE) as second_monitor:
                with node2.account.monitor_log(
                        self.processor2.LOG_FILE) as log_monitor:
                    self.processor2.start()
                    log_monitor.wait_until(
                        "Kafka version : " + version,
                        timeout_sec=60,
                        err_msg="Could not detect Kafka Streams version " +
                        version + " " + str(node2.account))
                    first_monitor.wait_until(
                        "processed 100 records from topic",
                        timeout_sec=60,
                        err_msg=
                        "Never saw output 'processed 100 records from topic' on"
                        + str(node1.account))
                    second_monitor.wait_until(
                        "processed 100 records from topic",
                        timeout_sec=60,
                        err_msg=
                        "Never saw output 'processed 100 records from topic' on"
                        + str(node2.account))

        # start third with <version>
        self.prepare_for(self.processor3, version)
        node3 = self.processor3.node
        with node1.account.monitor_log(
                self.processor1.STDOUT_FILE) as first_monitor:
            with node2.account.monitor_log(
                    self.processor2.STDOUT_FILE) as second_monitor:
                with node3.account.monitor_log(
                        self.processor3.STDOUT_FILE) as third_monitor:
                    with node3.account.monitor_log(
                            self.processor3.LOG_FILE) as log_monitor:
                        self.processor3.start()
                        log_monitor.wait_until(
                            "Kafka version : " + version,
                            timeout_sec=60,
                            err_msg="Could not detect Kafka Streams version " +
                            version + " " + str(node3.account))
                        first_monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(node1.account))
                        second_monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(node2.account))
                        third_monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(node3.account))

    @staticmethod
    def prepare_for(processor, version):
        processor.node.account.ssh("rm -rf " + processor.PERSISTENT_ROOT,
                                   allow_fail=False)
        if version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(version)

    def do_stop_start_bounce(self, processor, upgrade_from, new_version,
                             counter):
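        # upgrade_from is passed through to the Streams "upgrade.from" config:
        # set during the first rolling bounce so that new-version instances
        # keep sending old-format subscription metadata, then cleared (None)
        # for the second bounce once every instance runs the new code.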
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        # stop processor and wait for rebalance of others
        with first_other_node.account.monitor_log(
                first_other_processor.STDOUT_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(
                    second_other_processor.STDOUT_FILE
            ) as second_other_monitor:
                processor.stop()
                first_other_monitor.wait_until(
                    "processed 100 records from topic",
                    timeout_sec=60,
                    err_msg=
                    "Never saw output 'processed 100 records from topic' on" +
                    str(first_other_node.account))
                second_other_monitor.wait_until(
                    "processed 100 records from topic",
                    timeout_sec=60,
                    err_msg=
                    "Never saw output 'processed 100 records from topic' on" +
                    str(second_other_node.account))
        node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" %
                                 processor.STDOUT_FILE,
                                 allow_fail=False)

        if upgrade_from is None:  # upgrade disabled -- second round of rolling bounces
            roll_counter = ".1-"  # second round of rolling bounces
        else:
            roll_counter = ".0-"  # first  round of rolling boundes

        node.account.ssh("mv " + processor.STDOUT_FILE + " " +
                         processor.STDOUT_FILE + roll_counter + str(counter),
                         allow_fail=False)
        node.account.ssh("mv " + processor.STDERR_FILE + " " +
                         processor.STDERR_FILE + roll_counter + str(counter),
                         allow_fail=False)
        node.account.ssh("mv " + processor.LOG_FILE + " " +
                         processor.LOG_FILE + roll_counter + str(counter),
                         allow_fail=False)

        if new_version == str(DEV_VERSION):
            processor.set_version("")  # set to TRUNK
        else:
            processor.set_version(new_version)
        processor.set_upgrade_from(upgrade_from)

        grep_metadata_error = "grep \"org.apache.kafka.streams.errors.TaskAssignmentException: unable to decode subscription data: version=2\" "
        with node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            with node.account.monitor_log(processor.LOG_FILE) as log_monitor:
                with first_other_node.account.monitor_log(
                        first_other_processor.STDOUT_FILE
                ) as first_other_monitor:
                    with second_other_node.account.monitor_log(
                            second_other_processor.STDOUT_FILE
                    ) as second_other_monitor:
                        processor.start()

                        log_monitor.wait_until(
                            "Kafka version : " + new_version,
                            timeout_sec=60,
                            err_msg="Could not detect Kafka Streams version " +
                            new_version + " " + str(node.account))
                        first_other_monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(first_other_node.account))
                        found = list(
                            first_other_node.account.ssh_capture(
                                grep_metadata_error +
                                first_other_processor.STDERR_FILE,
                                allow_fail=True))
                        if len(found) > 0:
                            raise Exception(
                                "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                            )

                        second_other_monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(second_other_node.account))
                        found = list(
                            second_other_node.account.ssh_capture(
                                grep_metadata_error +
                                second_other_processor.STDERR_FILE,
                                allow_fail=True))
                        if len(found) > 0:
                            raise Exception(
                                "Kafka Streams failed with 'unable to decode subscription data: version=2'"
                            )

                        monitor.wait_until(
                            "processed 100 records from topic",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'processed 100 records from topic' on"
                            + str(node.account))

    def do_rolling_bounce(self, processor, counter, current_generation):
        first_other_processor = None
        second_other_processor = None
        for p in self.processors:
            if p != processor:
                if first_other_processor is None:
                    first_other_processor = p
                else:
                    second_other_processor = p

        node = processor.node
        first_other_node = first_other_processor.node
        second_other_node = second_other_processor.node

        with first_other_node.account.monitor_log(
                first_other_processor.LOG_FILE) as first_other_monitor:
            with second_other_node.account.monitor_log(
                    second_other_processor.LOG_FILE) as second_other_monitor:
                # stop processor
                processor.stop()
                node.account.ssh_capture("grep UPGRADE-TEST-CLIENT-CLOSED %s" %
                                         processor.STDOUT_FILE,
                                         allow_fail=False)

                node.account.ssh("mv " + processor.STDOUT_FILE + " " +
                                 processor.STDOUT_FILE + "." + str(counter),
                                 allow_fail=False)
                node.account.ssh("mv " + processor.STDERR_FILE + " " +
                                 processor.STDERR_FILE + "." + str(counter),
                                 allow_fail=False)
                node.account.ssh("mv " + processor.LOG_FILE + " " +
                                 processor.LOG_FILE + "." + str(counter),
                                 allow_fail=False)
                self.leader_counter[processor] = 0

                with node.account.monitor_log(
                        processor.LOG_FILE) as log_monitor:
                    processor.set_upgrade_to("future_version")
                    processor.start()
                    self.old_processors.remove(processor)
                    self.upgraded_processors.append(processor)

                    log_monitor.wait_until(
                        "Kafka version : " + str(DEV_VERSION),
                        timeout_sec=60,
                        err_msg="Could not detect Kafka Streams version " +
                        str(DEV_VERSION) + " in " + str(node.account))
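                    # rewind the monitor to (nearly) the start of the log so
                    # the next wait_until re-scans the startup config dump,
                    # which is printed before the "Kafka version" line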
                    log_monitor.offset = 5
                    log_monitor.wait_until(
                        "partition\.assignment\.strategy = \[org\.apache\.kafka\.streams\.tests\.StreamsUpgradeTest$FutureStreamsPartitionAssignor\]",
                        timeout_sec=60,
                        err_msg=
                        "Could not detect FutureStreamsPartitionAssignor in " +
                        str(node.account))

                    if processor == self.leader:
                        self.update_leader()
                    else:
                        self.leader_counter[
                            self.leader] = self.leader_counter[self.leader] + 1

                    if processor == self.leader:
                        leader_monitor = log_monitor
                    elif first_other_processor == self.leader:
                        leader_monitor = first_other_monitor
                    elif second_other_processor == self.leader:
                        leader_monitor = second_other_monitor
                    else:
                        raise Exception("Could not identify leader.")

                    monitors = {}
                    monitors[processor] = log_monitor
                    monitors[first_other_processor] = first_other_monitor
                    monitors[second_other_processor] = second_other_monitor

                    leader_monitor.wait_until(
                        "Received a future (version probing) subscription (version: 5). Sending empty assignment back (with supported version 4).",
                        timeout_sec=60,
                        err_msg=
                        "Could not detect 'version probing' attempt at leader "
                        + str(self.leader.node.account))

                    if len(self.old_processors) > 0:
                        log_monitor.wait_until(
                            "Sent a version 5 subscription and got version 4 assignment back (successful version probing). Downgrading subscription metadata to received version and trigger new rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Could not detect 'successful version probing' at upgrading node "
                            + str(node.account))
                    else:
                        log_monitor.wait_until(
                            "Sent a version 5 subscription and got version 4 assignment back (successful version probing). Setting subscription metadata to leaders supported version 5 and trigger new rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Could not detect 'successful version probing with upgraded leader' at upgrading node "
                            + str(node.account))
                        first_other_monitor.wait_until(
                            "Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'Upgrade metadata to version 4' on"
                            + str(first_other_node.account))
                        second_other_monitor.wait_until(
                            "Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.",
                            timeout_sec=60,
                            err_msg=
                            "Never saw output 'Upgrade metadata to version 4' on"
                            + str(second_other_node.account))

                    log_monitor.wait_until(
                        "Version probing detected. Triggering new rebalance.",
                        timeout_sec=60,
                        err_msg=
                        "Could not detect 'Triggering new rebalance' at upgrading node "
                        + str(node.account))

                    # version probing should trigger second rebalance
                    # now we check that after consecutive rebalances we have synchronized generation
                    generation_synchronized = False
                    retries = 0

                    while retries < 10:
                        processor_found = self.extract_generation_from_logs(
                            processor)
                        first_other_processor_found = self.extract_generation_from_logs(
                            first_other_processor)
                        second_other_processor_found = self.extract_generation_from_logs(
                            second_other_processor)

                        if len(processor_found) > 0 and len(
                                first_other_processor_found) > 0 and len(
                                    second_other_processor_found) > 0:
                            self.logger.info("processor: " +
                                             str(processor_found))
                            self.logger.info("first other processor: " +
                                             str(first_other_processor_found))
                            self.logger.info("second other processor: " +
                                             str(second_other_processor_found))

                            processor_generation = self.extract_highest_generation(
                                processor_found)
                            first_other_processor_generation = self.extract_highest_generation(
                                first_other_processor_found)
                            second_other_processor_generation = self.extract_highest_generation(
                                second_other_processor_found)

                            if processor_generation == first_other_processor_generation and processor_generation == second_other_processor_generation:
                                current_generation = processor_generation
                                generation_synchronized = True
                                break

                        time.sleep(5)
                        retries = retries + 1

                    if not generation_synchronized:
                        raise Exception(
                            "Never saw all three processors have the synchronized generation number"
                        )

                    if processor == self.leader:
                        self.update_leader()
                    else:
                        self.leader_counter[
                            self.leader] = self.leader_counter[self.leader] + 1

                    if self.leader in self.old_processors or len(
                            self.old_processors) > 0:
                        self.verify_metadata_no_upgraded_yet()

        return current_generation

    def extract_generation_from_logs(self, processor):
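        # Greps the "Successfully joined group with generation <N>" lines and
        # uses awk to print the token(s) between the word "generation" and the
        # AbstractCoordinator logger name, i.e. the generation number, one
        # match per line.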
        return list(
            processor.node.account.ssh_capture(
                "grep \"Successfully joined group with generation\" %s| awk \'{for(i=1;i<=NF;i++) {if ($i == \"generation\") beginning=i+1; if($i== \"(org.apache.kafka.clients.consumer.internals.AbstractCoordinator)\") ending=i }; for (j=beginning;j<ending;j++) printf $j; printf \"\\n\"}\'"
                % processor.LOG_FILE,
                allow_fail=True))
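
    # Client-side equivalent of the awk pipeline above (illustrative sketch;
    # assumes the standard "Successfully joined group with generation <N>"
    # coordinator log line). Given the grepped lines, a regex pulls the
    # generation numbers directly:
    @staticmethod
    def _generations_from_lines(lines):
        import re  # local import; this excerpt shows no module import block
        return [int(m) for line in lines
                for m in re.findall(r"generation (\d+)", line)]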

    def extract_highest_generation(self, found_generations):
        return int(found_generations[-1])

    def verify_metadata_no_upgraded_yet(self):
        for p in self.processors:
            found = list(
                p.node.account.ssh_capture(
                    "grep \"Sent a version 4 subscription and group leader.s latest supported version is 5. Upgrading subscription metadata version to 5 for next rebalance.\" "
                    + p.LOG_FILE,
                    allow_fail=True))
            if len(found) > 0:
                raise Exception(
                    "Kafka Streams failed with 'group member upgraded to metadata 4 too early'"
                )

class StreamsMultipleRollingUpgradeTest(BaseStreamsTest):
    """
     This test verifies a rolling upgrade of multiple streams
     applications across all streams versions, run against a single
     broker version.

     As new releases come out, just update the streams_upgrade_versions array to have the latest version
     included in the list.

     A prerequisite for this test to succeed
     is the inclusion of all parametrized versions of kafka in kafka/vagrant/base.sh
     (search for get_kafka()).
     As new versions are released the kafka/tests/kafkatest/version.py file
     needs to be updated as well.

     You can find what's been uploaded to S3 with the following command

     aws s3api list-objects --bucket kafka-packages --query 'Contents[].{Key:Key}'
    """
    # adding new version to this list will cover broker and streams version
    streams_upgrade_versions = [
        str(LATEST_0_10_2),
        str(LATEST_0_11_0),
        str(LATEST_1_0),
        str(LATEST_1_1),
        str(LATEST_2_0),
        str(LATEST_2_1),
        str(LATEST_2_2),
        str(LATEST_2_3),
        str(DEV_BRANCH)
    ]

    def __init__(self, test_context):
        super(StreamsMultipleRollingUpgradeTest,
              self).__init__(test_context,
                             topics={
                                 'echo': {
                                     'partitions': 5,
                                     'replication-factor': 1
                                 },
                                 'data': {
                                     'partitions': 5,
                                     'replication-factor': 1
                                 },
                                 'min': {
                                     'partitions': 5,
                                     'replication-factor': 1
                                 },
                                 'max': {
                                     'partitions': 5,
                                     'replication-factor': 1
                                 },
                                 'sum': {
                                     'partitions': 5,
                                     'replication-factor': 1
                                 },
                                 'dif': {
                                     'partitions': 5,
                                     'replication-factor': 1
                                 },
                                 'cnt': {
                                     'partitions': 5,
                                     'replication-factor': 1
                                 },
                                 'avg': {
                                     'partitions': 5,
                                     'replication-factor': 1
                                 },
                                 'wcnt': {
                                     'partitions': 5,
                                     'replication-factor': 1
                                 },
                                 'tagg': {
                                     'partitions': 5,
                                     'replication-factor': 1
                                 }
                             })

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor_1 = StreamsSmokeTestJobRunnerService(
            test_context, self.kafka)
        self.processor_2 = StreamsSmokeTestJobRunnerService(
            test_context, self.kafka)
        self.processor_3 = StreamsSmokeTestJobRunnerService(
            test_context, self.kafka)

        # already on trunk version at end of upgrades so get rid of it
        self.streams_downgrade_versions = self.streams_upgrade_versions[:-1]
        self.streams_downgrade_versions.reverse()
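        # e.g. with streams_upgrade_versions = [0.10.2.x, ..., 2.3.x, DEV],
        # the downgrade pass walks [2.3.x, 2.2.x, ..., 0.10.2.x]: DEV is
        # dropped (the upgrade pass already ends on it), the rest reversed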

        self.processors = [
            self.processor_1, self.processor_2, self.processor_3
        ]

        self.started = False

    def setUp(self):
        self.zk.start()

    def upgrade_and_verify_start(self, processors, to_version):
        for processor in processors:
            self.logger.info("Updating node %s to version %s" %
                             (processor.node.account, to_version))
            node = processor.node
            if self.started:
                self.stop(processor)
            node.version = KafkaVersion(to_version)
            processor.start()
            self.wait_for_verification(processor,
                                       "initializing processor: topic",
                                       processor.STDOUT_FILE)

        self.started = True

    def stop(self, processor):
        processor.stop()
        self.wait_for_verification(processor, "SMOKE-TEST-CLIENT-CLOSED",
                                   processor.STDOUT_FILE)

    def update_processors_and_verify(self, versions):
        for version in versions:
            self.upgrade_and_verify_start(self.processors, version)
        self.run_data_and_verify()

    def run_data_and_verify(self):
        self.driver.start()
        self.wait_for_verification(self.driver, "ALL-RECORDS-DELIVERED",
                                   self.driver.STDOUT_FILE)
        self.driver.stop()

    @ignore
    @cluster(num_nodes=9)
    @matrix(broker_version=streams_upgrade_versions)
    def test_rolling_upgrade_downgrade_multiple_apps(self, broker_version):
        self.kafka.set_version(KafkaVersion(broker_version))
        self.kafka.start()

        # verification step run after each upgrade
        self.update_processors_and_verify(self.streams_upgrade_versions)

        # with order reversed now we test downgrading, verification run after each downgrade
        self.update_processors_and_verify(self.streams_downgrade_versions)

        for processor in self.processors:
            self.stop(processor)

class StreamsBrokerBounceTest(Test):
    """
    Simple test of Kafka Streams with brokers failing
    """
    def __init__(self, test_context):
        super(StreamsBrokerBounceTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 3
        self.topics = {
            'echo': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'data': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'min': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'max': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'sum': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'dif': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'cnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'avg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'wcnt': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            'tagg': {
                'partitions': self.partitions,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            },
            '__consumer_offsets': {
                'partitions': 50,
                'replication-factor': self.replication,
                'configs': {
                    "min.insync.replicas": 2
                }
            }
        }

    def fail_broker_type(self, failure_mode, broker_type):
        # Pick a random topic and bounce its leader
        topic_index = randint(0, len(self.topics.keys()) - 1)
        # wrap in list() so this also works on Python 3, where dict.keys()
        # returns a non-indexable view
        topic = list(self.topics.keys())[topic_index]
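        # `failures` is a module-level dispatch table (defined elsewhere in
        # this file, outside this excerpt) mapping each failure-mode string
        # from the @matrix annotations to a handler invoked as
        # handler(test, topic, broker_type)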
        failures[failure_mode](self, topic, broker_type)

    def fail_many_brokers(self, failure_mode, num_failures):
        if failure_mode == "clean_shutdown":
            sig = signal.SIGTERM
        else:
            sig = signal.SIGKILL

        # signal exactly num_failures brokers (range's upper bound is
        # exclusive, so this covers nodes 0 .. num_failures - 1)
        for num in range(0, num_failures):
            signal_node(self, self.kafka.nodes[num], sig)

    def setup_system(self, start_processor=True):
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=self.replication,
                                  zk=self.zk,
                                  topics=self.topics)
        self.kafka.start()
        # Start test harness
        self.driver = StreamsSmokeTestDriverService(self.test_context,
                                                    self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka)

        self.driver.start()

        if (start_processor):
            self.processor1.start()

    def collect_results(self, sleep_time_secs):
        data = {}
        # End test
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node

        # Success is declared if streams does not crash when sleep time > 0
        # It should give an exception when sleep time is 0 since we kill the brokers immediately
        # and the topic manager cannot create internal topics with the desired replication factor
        if (sleep_time_secs == 0):
            output_streams = self.processor1.node.account.ssh_capture(
                "grep SMOKE-TEST-CLIENT-EXCEPTION %s" %
                self.processor1.STDOUT_FILE,
                allow_fail=False)
        else:
            output_streams = self.processor1.node.account.ssh_capture(
                "grep SMOKE-TEST-CLIENT-CLOSED %s" %
                self.processor1.STDOUT_FILE,
                allow_fail=False)

        for line in output_streams:
            data["Client closed"] = line

        # Currently it is hard to guarantee anything about Kafka since we don't have exactly-once.
        # With exactly-once in place, success will be defined as ALL-RECORDS-DELIVERED and SUCCESS
        output = node.account.ssh_capture(
            "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED|PROCESSED-LESS-THAN-GENERATED' %s"
            % self.driver.STDOUT_FILE,
            allow_fail=False)
        for line in output:
            data["Records Delivered"] = line
        output = node.account.ssh_capture("grep -E 'SUCCESS|FAILURE' %s" %
                                          self.driver.STDOUT_FILE,
                                          allow_fail=False)
        for line in output:
            data["Logic Success/Failure"] = line

        return data

    @cluster(num_nodes=7)
    @matrix(failure_mode=[
        "clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"
    ],
            broker_type=["leader", "controller"],
            sleep_time_secs=[120])
    def test_broker_type_bounce(self, failure_mode, broker_type,
                                sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker and ensure data is still received
        Record if records are delivered. 
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        return self.collect_results(sleep_time_secs)

    @ignore
    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown"],
            broker_type=["controller"],
            sleep_time_secs=[0])
    def test_broker_type_bounce_at_start(self, failure_mode, broker_type,
                                         sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker immediately before streams starts.
        Streams should throw an exception since it cannot create topics with the desired
        replication factor of 3.
        """
        self.setup_system(start_processor=False)

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        self.processor1.start()

        return self.collect_results(sleep_time_secs)

    @cluster(num_nodes=7)
    @matrix(failure_mode=[
        "clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"
    ],
            num_failures=[2])
    def test_many_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received
        Record if records are delivered
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_bounce", "hard_bounce"], num_failures=[3])
    def test_all_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received
        Record if records are delivered
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)