Example #1
class TestUpgrade(ProduceConsumeValidateTest):

    def __init__(self, test_context):
        super(TestUpgrade, self).__init__(test_context=test_context)

    def setUp(self):
        self.topic = "test_topic"
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=LATEST_0_8_2, topics={self.topic: {
                                                                    "partitions": 3,
                                                                    "replication-factor": 3,
                                                                    'configs': {"min.insync.replicas": 2}}})
        self.zk.start()
        self.kafka.start()

        # Producer and consumer
        self.producer_throughput = 10000
        self.num_producers = 1
        self.num_consumers = 1
        self.producer = VerifiableProducer(
            self.test_context, self.num_producers, self.kafka, self.topic,
            throughput=self.producer_throughput, version=LATEST_0_8_2)

        # TODO - reduce the timeout
        self.consumer = ConsoleConsumer(
            self.test_context, self.num_consumers, self.kafka, self.topic,
            consumer_timeout_ms=30000, message_validator=is_int, version=LATEST_0_8_2)

    def perform_upgrade(self):
        self.logger.info("First pass bounce - rolling upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = TRUNK
            node.config[config_property.INTER_BROKER_PROTOCOL_VERSION] = "0.8.2.X"
            self.kafka.start_node(node)

        self.logger.info("Second pass bounce - remove inter.broker.protocol.version config")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            del node.config[config_property.INTER_BROKER_PROTOCOL_VERSION]
            self.kafka.start_node(node)

    def test_upgrade(self):
        """Test upgrade of Kafka broker cluster from 0.8.2 to 0.9.0

        - Start 3 node broker cluster on version 0.8.2
        - Start producer and consumer in the background
        - Perform two-phase rolling upgrade
            - First phase: upgrade brokers to 0.9.0 with inter.broker.protocol.version set to 0.8.2.X
            - Second phase: remove inter.broker.protocol.version config with rolling bounce
        - Finally, validate that every message acked by the producer was consumed by the consumer
        """

        self.run_produce_consume_validate(core_test_action=self.perform_upgrade)
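
Several of these examples inherit from ProduceConsumeValidateTest and call run_produce_consume_validate(). A minimal sketch of the contract they rely on, assuming the flow visible in the examples (start producer and consumer in the background, run the core action such as perform_upgrade, stop both, then check acked vs. consumed). This is not the actual kafkatest base class; the helper name below is illustrative only:

def validate_acked_vs_consumed(acked_values, consumed_values):
    """Every value the producer saw acked must appear among the consumed values.
    Duplicates on the consumer side are tolerated here; missing values are not."""
    missing = set(acked_values) - set(consumed_values)
    assert not missing, "%d acked messages were never consumed: %s" % (
        len(missing), sorted(missing)[:20])

# Example: passes, since 1..3 were all consumed (2 was re-delivered, 4 is extra).
validate_acked_vs_consumed(acked_values=[1, 2, 3], consumed_values=[1, 2, 2, 3, 4])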
Example #3
class TestSecurityRollingUpgrade(ProduceConsumeValidateTest):
    """Tests a rolling upgrade from PLAINTEXT to a secured cluster
    """

    def __init__(self, test_context):
        super(TestSecurityRollingUpgrade, self).__init__(test_context=test_context)

    def setUp(self):
        self.acls = ACLs(self.test_context)
        self.topic = "test_topic"
        self.group = "group"
        self.producer_throughput = 100
        self.num_producers = 1
        self.num_consumers = 1
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={self.topic: {
            "partitions": 3,
            "replication-factor": 3,
            'configs': {"min.insync.replicas": 2}}})
        self.zk.start()

    def create_producer_and_consumer(self):
        self.producer = VerifiableProducer(
            self.test_context, self.num_producers, self.kafka, self.topic,
            throughput=self.producer_throughput)

        self.consumer = ConsoleConsumer(
            self.test_context, self.num_consumers, self.kafka, self.topic,
            consumer_timeout_ms=60000, message_validator=is_int, new_consumer=True)

        self.consumer.group_id = "group"

    def bounce(self):
        self.kafka.start_minikdc()
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            self.kafka.start_node(node)
            time.sleep(10)

    def roll_in_secured_settings(self, client_protocol, broker_protocol):

        # Roll cluster to include inter broker security protocol.
        self.kafka.interbroker_security_protocol = broker_protocol
        self.kafka.open_port(client_protocol)
        self.kafka.open_port(broker_protocol)
        self.bounce()

        # Roll cluster to disable PLAINTEXT port
        self.kafka.close_port('PLAINTEXT')
        self.set_authorizer_and_bounce(client_protocol, broker_protocol)

    def set_authorizer_and_bounce(self, client_protocol, broker_protocol):
        self.kafka.authorizer_class_name = KafkaService.SIMPLE_AUTHORIZER
        self.acls.set_acls(client_protocol, self.kafka, self.zk, self.topic, self.group)
        self.acls.set_acls(broker_protocol, self.kafka, self.zk, self.topic, self.group)
        self.bounce()

    def open_secured_port(self, client_protocol):
        self.kafka.security_protocol = client_protocol
        self.kafka.open_port(client_protocol)
        self.kafka.start_minikdc()
        self.bounce()

    def add_sasl_mechanism(self, new_client_sasl_mechanism):
        self.kafka.client_sasl_mechanism = new_client_sasl_mechanism
        self.kafka.start_minikdc()
        self.bounce()

    def roll_in_sasl_mechanism(self, security_protocol, new_sasl_mechanism):
        # Roll cluster to update inter-broker SASL mechanism. This disables the old mechanism.
        self.kafka.interbroker_sasl_mechanism = new_sasl_mechanism
        self.bounce()

        # Bounce again with ACLs for new mechanism
        self.set_authorizer_and_bounce(security_protocol, security_protocol)

    @matrix(client_protocol=["SSL", "SASL_PLAINTEXT", "SASL_SSL"])
    def test_rolling_upgrade_phase_one(self, client_protocol):
        """
        Start with a PLAINTEXT cluster and open a SECURED port via a rolling upgrade, ensuring we can produce
        and consume throughout over PLAINTEXT. Finally check we can produce and consume via the new secured port.
        """
        self.kafka.interbroker_security_protocol = "PLAINTEXT"
        self.kafka.security_protocol = "PLAINTEXT"
        self.kafka.start()

        # Create PLAINTEXT producer and consumer
        self.create_producer_and_consumer()

        # Rolling upgrade, opening a secure protocol, ensuring the Plaintext producer/consumer continues to run
        self.run_produce_consume_validate(self.open_secured_port, client_protocol)

        # Now we can produce and consume via the secured port
        self.kafka.security_protocol = client_protocol
        self.create_producer_and_consumer()
        self.run_produce_consume_validate(lambda: time.sleep(1))

    @matrix(client_protocol=["SASL_SSL", "SSL", "SASL_PLAINTEXT"], broker_protocol=["SASL_SSL", "SSL", "SASL_PLAINTEXT"])
    def test_rolling_upgrade_phase_two(self, client_protocol, broker_protocol):
        """
        Start with a PLAINTEXT cluster with a second SECURED port open (i.e. the result of phase one).
        Start a Producer and Consumer via the SECURED port.
        Incrementally upgrade the brokers to use the secure protocol for inter-broker communication.
        Incrementally upgrade again to add ACLs and disable the PLAINTEXT port.
        Ensure the producer and consumer ran throughout.
        """
        #Given we have a broker that has both secure and PLAINTEXT ports open
        self.kafka.security_protocol = client_protocol
        self.kafka.interbroker_security_protocol = "PLAINTEXT"
        self.kafka.start()

        #Create Secured Producer and Consumer
        self.create_producer_and_consumer()

        #Roll in the security protocol. Disable Plaintext. Ensure we can produce and Consume throughout
        self.run_produce_consume_validate(self.roll_in_secured_settings, client_protocol, broker_protocol)

    @parametrize(new_client_sasl_mechanism='PLAIN')
    def test_rolling_upgrade_sasl_mechanism_phase_one(self, new_client_sasl_mechanism):
        """
        Start with a SASL/GSSAPI cluster and add a new SASL mechanism via a rolling upgrade, ensuring we can produce
        and consume throughout over SASL/GSSAPI. Finally check we can produce and consume using the new mechanism.
        """
        self.kafka.interbroker_security_protocol = "SASL_SSL"
        self.kafka.security_protocol = "SASL_SSL"
        self.kafka.client_sasl_mechanism = "GSSAPI"
        self.kafka.interbroker_sasl_mechanism = "GSSAPI"
        self.kafka.start()

        # Create SASL/GSSAPI producer and consumer
        self.create_producer_and_consumer()

        # Rolling upgrade, adding new SASL mechanism, ensuring the GSSAPI producer/consumer continues to run
        self.run_produce_consume_validate(self.add_sasl_mechanism, new_client_sasl_mechanism)

        # Now we can produce and consume using the new SASL mechanism
        self.kafka.client_sasl_mechanism = new_client_sasl_mechanism
        self.create_producer_and_consumer()
        self.run_produce_consume_validate(lambda: time.sleep(1))

    @parametrize(new_sasl_mechanism='PLAIN')
    def test_rolling_upgrade_sasl_mechanism_phase_two(self, new_sasl_mechanism):
        """
        Start with a SASL cluster with GSSAPI for inter-broker and a second mechanism for clients (i.e. result of phase one).
        Start Producer and Consumer using the second mechanism
        Incrementally upgrade to set inter-broker to the second mechanism and disable GSSAPI
        Incrementally upgrade again to add ACLs
        Ensure the producer and consumer run throughout
        """
        #Start with a broker that has GSSAPI for inter-broker and a second mechanism for clients
        self.kafka.security_protocol = "SASL_SSL"
        self.kafka.interbroker_security_protocol = "SASL_SSL"
        self.kafka.client_sasl_mechanism = new_sasl_mechanism
        self.kafka.interbroker_sasl_mechanism = "GSSAPI"
        self.kafka.start()

        #Create Producer and Consumer using second mechanism
        self.create_producer_and_consumer()

        #Roll in the second SASL mechanism for inter-broker, disabling first mechanism. Ensure we can produce and consume throughout
        self.run_produce_consume_validate(self.roll_in_sasl_mechanism, self.kafka.security_protocol, new_sasl_mechanism)
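
The bounce() and roll_in_secured_settings() helpers above apply a settings change one broker at a time so the cluster stays available. A generic sketch of that rolling-bounce pattern, decoupled from ducktape (the callables are hypothetical stand-ins for self.kafka.stop_node/start_node and the per-pass reconfiguration):

import time

def rolling_bounce(nodes, stop_node, start_node, apply_change=None, settle_sec=10):
    """Restart nodes one at a time so the cluster remains available throughout."""
    for node in nodes:
        stop_node(node)
        if apply_change is not None:
            apply_change(node)   # e.g. switch the security protocol or broker version
        start_node(node)
        time.sleep(settle_sec)   # give the broker time to rejoin before the next bounce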
class StreamsBrokerDownResilience(Test):
    """
    This test validates that Streams is resilient to a broker
    being down longer than specified timeouts in configs
    """

    inputTopic = "streamsResilienceSource"
    outputTopic = "streamsResilienceSink"
    num_messages = 5

    def __init__(self, test_context):
        super(StreamsBrokerDownResilience,
              self).__init__(test_context=test_context)
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={
                                      self.inputTopic: {
                                          'partitions': 3,
                                          'replication-factor': 1
                                      },
                                      self.outputTopic: {
                                          'partitions': 1,
                                          'replication-factor': 1
                                      }
                                  })

    def get_consumer(self, num_messages):
        return VerifiableConsumer(self.test_context,
                                  1,
                                  self.kafka,
                                  self.outputTopic,
                                  "stream-broker-resilience-verify-consumer",
                                  max_messages=num_messages)

    def get_producer(self, num_messages):
        return VerifiableProducer(self.test_context,
                                  1,
                                  self.kafka,
                                  self.inputTopic,
                                  max_messages=num_messages,
                                  acks=1)

    def assert_produce_consume(self, test_state, num_messages=5):
        producer = self.get_producer(num_messages)
        producer.start()

        wait_until(lambda: producer.num_acked >= num_messages,
                   timeout_sec=30,
                   err_msg="At %s failed to send messages " % test_state)

        consumer = self.get_consumer(num_messages)
        consumer.start()

        wait_until(
            lambda: consumer.total_consumed() >= num_messages,
            timeout_sec=60,
            err_msg="At %s streams did not process messages in 60 seconds " %
            test_state)

    @staticmethod
    def get_configs(extra_configs=""):
        # Consumer max.poll.interval > min(max.block.ms, (retries + 1) * request.timeout)
        consumer_poll_ms = "consumer.max.poll.interval.ms=50000"
        retries_config = "producer.retries=2"
        request_timeout = "producer.request.timeout.ms=15000"
        max_block_ms = "producer.max.block.ms=30000"

        # java code expects configs in key=value,key=value format
        updated_configs = consumer_poll_ms + "," + retries_config + "," + request_timeout + "," + max_block_ms + extra_configs

        return updated_configs

    def wait_for_verification(self, processor, message, file, num_lines=1):
        wait_until(lambda: self.verify_from_file(processor, message, file) >= num_lines,
                   timeout_sec=60,
                   err_msg="Expected to read '%s' from %s" %
                   (message, processor.node.account))

    @staticmethod
    def verify_from_file(processor, message, file):
        result = processor.node.account.ssh_output("grep '%s' %s | wc -l" %
                                                   (message, file),
                                                   allow_fail=False)
        return int(result)

    def setUp(self):
        self.zk.start()

    def test_streams_resilient_to_broker_down(self):
        self.kafka.start()

        # Broker should be down for more than 2x of (retries * request.timeout.ms)
        # With retries * request.timeout.ms = 2 * 15000 ms = 30 seconds, we'll set downtime to 70 seconds
        broker_down_time_in_seconds = 70

        processor = StreamsBrokerDownResilienceService(self.test_context,
                                                       self.kafka,
                                                       self.get_configs())
        processor.start()

        # Until KIP-91 is merged we'll only send 5 messages to assert Kafka Streams is running before taking the broker down
        # After KIP-91 is merged we'll continue to send messages for the duration of the test
        self.assert_produce_consume("before_broker_stop")

        node = self.kafka.leader(self.inputTopic)

        self.kafka.stop_node(node)

        time.sleep(broker_down_time_in_seconds)

        self.kafka.start_node(node)

        self.assert_produce_consume("after_broker_stop")

        self.kafka.stop()

    def test_streams_runs_with_broker_down_initially(self):
        self.kafka.start()
        node = self.kafka.leader(self.inputTopic)
        self.kafka.stop_node(node)

        configs = self.get_configs(
            extra_configs=",application.id=starting_wo_broker_id")

        # start streams with broker down initially
        processor = StreamsBrokerDownResilienceService(self.test_context,
                                                       self.kafka, configs)
        processor.start()

        processor_2 = StreamsBrokerDownResilienceService(
            self.test_context, self.kafka, configs)
        processor_2.start()

        processor_3 = StreamsBrokerDownResilienceService(
            self.test_context, self.kafka, configs)
        processor_3.start()

        broker_unavailable_message = "Broker may not be available"

        # verify streams instances unable to connect to broker, kept trying
        self.wait_for_verification(processor, broker_unavailable_message,
                                   processor.LOG_FILE, 100)
        self.wait_for_verification(processor_2, broker_unavailable_message,
                                   processor_2.LOG_FILE, 100)
        self.wait_for_verification(processor_3, broker_unavailable_message,
                                   processor_3.LOG_FILE, 100)

        # now start broker
        self.kafka.start_node(node)

        # assert streams can process when starting with broker down
        self.assert_produce_consume("running_with_broker_down_initially",
                                    num_messages=9)

        message = "processed3messages"
        # need to show all 3 instances processed messages
        self.wait_for_verification(processor, message, processor.STDOUT_FILE)
        self.wait_for_verification(processor_2, message,
                                   processor_2.STDOUT_FILE)
        self.wait_for_verification(processor_3, message,
                                   processor_3.STDOUT_FILE)

        self.kafka.stop()

    def test_streams_should_scale_in_while_brokers_down(self):
        self.kafka.start()

        configs = self.get_configs(
            extra_configs=",application.id=shutdown_with_broker_down")

        processor = StreamsBrokerDownResilienceService(self.test_context,
                                                       self.kafka, configs)
        processor.start()

        processor_2 = StreamsBrokerDownResilienceService(
            self.test_context, self.kafka, configs)
        processor_2.start()

        processor_3 = StreamsBrokerDownResilienceService(
            self.test_context, self.kafka, configs)
        processor_3.start()

        # need to wait for rebalance once
        self.wait_for_verification(
            processor_3, "State transition from REBALANCING to RUNNING",
            processor_3.LOG_FILE)

        # assert streams can process when starting with broker down
        self.assert_produce_consume("waiting for rebalance to complete",
                                    num_messages=9)

        message = "processed3messages"

        self.wait_for_verification(processor, message, processor.STDOUT_FILE)
        self.wait_for_verification(processor_2, message,
                                   processor_2.STDOUT_FILE)
        self.wait_for_verification(processor_3, message,
                                   processor_3.STDOUT_FILE)

        node = self.kafka.leader(self.inputTopic)
        self.kafka.stop_node(node)

        processor.stop()
        processor_2.stop()

        shutdown_message = "Complete shutdown of streams resilience test app now"
        self.wait_for_verification(processor, shutdown_message,
                                   processor.STDOUT_FILE)
        self.wait_for_verification(processor_2, shutdown_message,
                                   processor_2.STDOUT_FILE)

        self.kafka.start_node(node)

        self.assert_produce_consume(
            "sending_message_after_stopping_streams_instance_bouncing_broker",
            num_messages=9)

        self.wait_for_verification(processor_3, "processed9messages",
                                   processor_3.STDOUT_FILE)

        self.kafka.stop()
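
The get_configs() helper above concatenates Streams overrides into the key=value,key=value string the Java test application expects. A dict-based variant (an illustration only, not part of kafkatest) makes the merging of extra configs explicit:

def build_streams_configs(extra_configs=None):
    """Render override configs in the key=value,key=value format the Java app expects."""
    configs = {
        "consumer.max.poll.interval.ms": "50000",
        "producer.retries": "2",
        "producer.request.timeout.ms": "15000",
        "producer.max.block.ms": "30000",
    }
    configs.update(extra_configs or {})
    return ",".join("%s=%s" % (k, v) for k, v in sorted(configs.items()))

# e.g. build_streams_configs({"application.id": "starting_wo_broker_id"})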
Example #5
class TestSnapshots(ProduceConsumeValidateTest):

    TOPIC_NAME_PREFIX = "test_topic_"

    def __init__(self, test_context):
        super(TestSnapshots, self).__init__(test_context=test_context)
        self.topics_created = 0
        self.topic = "test_topic"
        self.partitions = 3
        self.replication_factor = 3
        self.num_nodes = 3

        # Producer and consumer
        self.producer_throughput = 1000
        self.num_producers = 1
        self.num_consumers = 1

        security_protocol = 'PLAINTEXT'
        # Setup Custom Config to ensure snapshot will be generated deterministically
        self.kafka = KafkaService(
            self.test_context,
            self.num_nodes,
            zk=None,
            topics={
                self.topic: {
                    "partitions": self.partitions,
                    "replication-factor": self.replication_factor,
                    'configs': {
                        "min.insync.replicas": 2
                    }
                }
            },
            server_prop_overrides=[
                [
                    config_property.METADATA_LOG_DIR,
                    KafkaService.METADATA_LOG_DIR
                ], [config_property.METADATA_LOG_SEGMENT_MS, "10000"],
                [config_property.METADATA_LOG_RETENTION_BYTES, "2048"],
                [config_property.METADATA_LOG_BYTES_BETWEEN_SNAPSHOTS, "2048"]
            ])

        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.security_protocol = security_protocol

    def setUp(self):
        # Start the cluster and ensure that a snapshot is generated
        self.logger.info(
            "Starting the cluster and running until snapshot creation")

        assert quorum.for_test(self.test_context) in quorum.all_kraft, \
                "Snapshot tests should only be run in KRaft mode"

        self.kafka.start()

        topic_count = 10
        self.topics_created += self.create_n_topics(topic_count)

        if self.kafka.remote_controller_quorum:
            self.controller_nodes = self.kafka.remote_controller_quorum.nodes
        else:
            self.controller_nodes = self.kafka.nodes[:self.kafka.num_nodes_controller_role]

        # Waiting for snapshot creation and first log segment
        # cleanup on all controller nodes
        for node in self.controller_nodes:
            self.logger.debug("Waiting for snapshot on: %s" %
                              self.kafka.who_am_i(node))
            self.wait_for_log_segment_delete(node)
            self.wait_for_snapshot(node)
        self.logger.debug("Verified Snapshots exist on controller nodes")

    def create_n_topics(self, topic_count):
        # Name each new topic with the running counter so repeated calls create fresh topics
        for i in range(self.topics_created, self.topics_created + topic_count):
            topic = "%s%d" % (TestSnapshots.TOPIC_NAME_PREFIX, i)
            self.logger.debug("Creating topic %s" % topic)
            topic_cfg = {
                "topic": topic,
                "partitions": self.partitions,
                "replication-factor": self.replication_factor,
                "configs": {
                    "min.insync.replicas": 2
                }
            }
            self.kafka.create_topic(topic_cfg)
        self.logger.debug("Created %d more topics" % topic_count)
        return topic_count

    def wait_for_log_segment_delete(self, node):
        file_path = self.kafka.METADATA_FIRST_LOG
        # Wait until the first log segment in metadata log is marked for deletion
        wait_until(
            lambda: not self.file_exists(node, file_path),
            timeout_sec=100,
            backoff_sec=1,
            err_msg=
            "Not able to verify cleanup of log file %s in a reasonable amount of time"
            % file_path)

    def wait_for_snapshot(self, node):
        # Wait for a snapshot file to show up
        file_path = self.kafka.METADATA_SNAPSHOT_SEARCH_STR
        wait_until(
            lambda: self.file_exists(node, file_path),
            timeout_sec=100,
            backoff_sec=1,
            err_msg=
            "Not able to verify snapshot existence in a reasonable amount of time"
        )

    def file_exists(self, node, file_path):
        # Check if the first log segment is cleaned up
        self.logger.debug("Checking if file %s exists" % file_path)
        cmd = "ls %s" % file_path
        files = node.account.ssh_output(cmd,
                                        allow_fail=True,
                                        combine_stderr=False)

        if len(files) == 0:
            self.logger.debug("File %s does not exist" % file_path)
            return False
        else:
            self.logger.debug("File %s was found" % file_path)
            return True

    def validate_success(self, topic=None):
        if topic is None:
            # Create a new topic
            topic = "%s%d" % (TestSnapshots.TOPIC_NAME_PREFIX,
                              self.topics_created)
            self.topics_created += self.create_n_topics(topic_count=1)

        # Produce to the newly created topic to ensure broker has caught up
        self.producer = VerifiableProducer(self.test_context,
                                           self.num_producers,
                                           self.kafka,
                                           topic,
                                           throughput=self.producer_throughput,
                                           message_validator=is_int)

        self.consumer = ConsoleConsumer(self.test_context,
                                        self.num_consumers,
                                        self.kafka,
                                        topic,
                                        consumer_timeout_ms=30000,
                                        message_validator=is_int)
        self.start_producer_and_consumer()
        self.stop_producer_and_consumer()
        self.validate()

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_kraft)
    def test_broker(self, metadata_quorum=quorum.colocated_kraft):
        """ Test the ability of a broker to consume metadata snapshots
        and to recover the cluster metadata state using them

        The test ensures that there is at least one snapshot created on
        the controller quorum during the setup phase and that at least the first
        log segment in the metadata log has been marked for deletion, thereby ensuring
        that any observer of the log needs to always load a snapshot to catch
        up to the current metadata state.

        Each scenario is a progression over the previous one.
        The scenarios build on top of each other by:
        * Loading a snapshot
        * Loading a snapshot and some delta records
        * Loading a snapshot and delta and ensuring that the most recent metadata state
          has been caught up.

        Even though a subsequent scenario covers the previous one, they are all
        left in the test to make debugging a test failure easier,
        e.g. if the first scenario passes and the second fails, it hints at
        a problem with applying delta records while catching up.
        """

        # Scenario -- Re-init broker after cleaning up all persistent state
        node = random.choice(self.kafka.nodes)
        self.logger.debug("Scenario: kill-clean-start on broker node %s",
                          self.kafka.who_am_i(node))
        self.kafka.clean_node(node)
        self.kafka.start_node(node)

        # Scenario -- Re-init broker after cleaning up all persistent state
        # Create some metadata changes for the broker to consume as well.
        node = random.choice(self.kafka.nodes)
        self.logger.debug(
            "Scenario: kill-clean-create_topics-start on broker node %s",
            self.kafka.who_am_i(node))
        self.kafka.clean_node(node)
        # Now modify the cluster to create more metadata changes
        self.topics_created += self.create_n_topics(topic_count=10)
        self.kafka.start_node(node)

        # Scenario -- Re-init broker after cleaning up all persistent state
        # And ensure that the broker has replicated the metadata log
        node = random.choice(self.kafka.nodes)
        self.logger.debug(
            "Scenario: kill-clean-start-verify-produce on broker node %s",
            self.kafka.who_am_i(node))
        self.kafka.clean_node(node)
        self.kafka.start_node(node)
        # Create a topic where the affected broker must be the leader
        broker_topic = "%s%d" % (TestSnapshots.TOPIC_NAME_PREFIX,
                                 self.topics_created)
        self.topics_created += 1
        self.logger.debug("Creating topic %s" % broker_topic)
        topic_cfg = {
            "topic": broker_topic,
            "replica-assignment": self.kafka.idx(node),
            "configs": {
                "min.insync.replicas": 1
            }
        }
        self.kafka.create_topic(topic_cfg)

        # Produce to the newly created topic and make sure it works.
        self.validate_success(broker_topic)

    @cluster(num_nodes=9)
    @matrix(metadata_quorum=quorum.all_kraft)
    def test_controller(self, metadata_quorum=quorum.colocated_kraft):
        """ Test the ability of controllers to consume metadata snapshots
        and to recover the cluster metadata state using them

        The test ensures that there is at least one snapshot created on
        the controller quorum during the setup phase and that at least the first
        log segment in the metadata log has been marked for deletion, thereby ensuring
        that any observer of the log needs to always load a snapshot to catch
        up to the current metadata state.

        Each scenario is a progression over the previous one.
        The scenarios build on top of each other by:
        * Loading a snapshot
        * Loading a snapshot and some delta records
        * Loading a snapshot and delta and ensuring that the most recent metadata state
          has been caught up.

        Even though a subsequent scenario covers the previous one, they are all
        left in the test to make debugging a test failure easier,
        e.g. if the first scenario passes and the second fails, it hints at
        a problem with applying delta records while catching up.
        """

        # Scenario -- Re-init controllers with a clean kafka dir
        self.logger.debug("Scenario: kill-clean-start controller node")
        for node in self.controller_nodes:
            self.logger.debug("Restarting node: %s",
                              self.kafka.controller_quorum.who_am_i(node))
            self.kafka.controller_quorum.clean_node(node)
            self.kafka.controller_quorum.start_node(node)

        # Scenario -- Re-init controllers with a clean kafka dir and
        # make metadata changes while they are down.
        # This will force the entire quorum to load from snapshots
        # and verify the quorum's ability to catch up to the latest metadata
        self.logger.debug(
            "Scenario: kill-clean-create_topics-start on controller nodes")
        for node in self.controller_nodes:
            self.logger.debug("Restarting node: %s",
                              self.kafka.controller_quorum.who_am_i(node))
            self.kafka.controller_quorum.clean_node(node)
            # Now modify the cluster to create more metadata changes
            self.topics_created += self.create_n_topics(topic_count=5)
            self.kafka.controller_quorum.start_node(node)

        # Produce to a newly created topic and make sure it works.
        self.validate_success()
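
The snapshot waits above, like most of these examples, poll with ducktape's wait_until helper (ducktape.utils.util.wait_until). A bare-bones equivalent for readers unfamiliar with ducktape, illustrative only and without the real helper's richer error handling:

import time

def wait_until(condition, timeout_sec, backoff_sec=1, err_msg=""):
    """Poll condition() until it returns a truthy value or the timeout expires."""
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        if condition():
            return
        time.sleep(backoff_sec)
    raise TimeoutError(err_msg or "Condition not met within %s seconds" % timeout_sec)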
class TestSecurityRollingUpgrade(ProduceConsumeValidateTest):
    """Tests a rolling upgrade from PLAINTEXT to a secured cluster
    """

    def __init__(self, test_context):
        super(TestSecurityRollingUpgrade, self).__init__(test_context=test_context)

    def setUp(self):
        self.topic = "test_topic"
        self.producer_throughput = 100
        self.num_producers = 1
        self.num_consumers = 1
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={self.topic: {
            "partitions": 3,
            "replication-factor": 3,
            'configs': {"min.insync.replicas": 2}}})
        self.zk.start()

        #reduce replica.lag.time.max.ms due to KAFKA-2827
        self.kafka.replica_lag = 2000

    def create_producer_and_consumer(self):
        self.producer = VerifiableProducer(
            self.test_context, self.num_producers, self.kafka, self.topic,
            throughput=self.producer_throughput)

        self.consumer = ConsoleConsumer(
            self.test_context, self.num_consumers, self.kafka, self.topic,
            consumer_timeout_ms=60000, message_validator=is_int, new_consumer=True)

        self.consumer.group_id = "unique-test-group-" + str(random.random())

    def bounce(self):
        #Sleeps reduce the intermittent failures reported in KAFKA-2891. Should be removed once resolved.
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            time.sleep(10)
            self.kafka.start_node(node)
            time.sleep(10)

    def roll_in_secured_settings(self, upgrade_protocol):
        self.kafka.interbroker_security_protocol = upgrade_protocol

        # Roll cluster to include inter broker security protocol.
        self.kafka.open_port(upgrade_protocol)
        self.bounce()

        # Roll cluster to disable PLAINTEXT port
        self.kafka.close_port('PLAINTEXT')
        self.bounce()

    def open_secured_port(self, upgrade_protocol):
        self.kafka.security_protocol = upgrade_protocol
        self.kafka.open_port(upgrade_protocol)
        self.kafka.start_minikdc()
        self.bounce()

    @matrix(upgrade_protocol=["SSL", "SASL_PLAINTEXT", "SASL_SSL"])
    def test_rolling_upgrade_phase_one(self, upgrade_protocol):
        """
        Start with a PLAINTEXT cluster and open a SECURED port via a rolling upgrade, ensuring we can produce
        and consume throughout over PLAINTEXT. Finally check we can produce and consume via the new secured port.
        """
        self.kafka.interbroker_security_protocol = "PLAINTEXT"
        self.kafka.security_protocol = "PLAINTEXT"
        self.kafka.start()

        #Create PLAINTEXT producer and consumer
        self.create_producer_and_consumer()

        # Rolling upgrade, opening a secure protocol, ensuring the Plaintext producer/consumer continues to run
        self.run_produce_consume_validate(self.open_secured_port, upgrade_protocol)

        # Now we can produce and consume via the secured port
        self.kafka.security_protocol = upgrade_protocol
        self.create_producer_and_consumer()
        self.run_produce_consume_validate(lambda: time.sleep(1))

    @matrix(upgrade_protocol=["SSL", "SASL_PLAINTEXT", "SASL_SSL"])
    def test_rolling_upgrade_phase_two(self, upgrade_protocol):
        """
        Start with a PLAINTEXT cluster with a second SECURED port open (i.e. the result of phase one).
        Start a Producer and Consumer via the SECURED port.
        Perform a rolling upgrade to use the secure protocol for inter-broker communication.
        Perform another rolling upgrade to disable the PLAINTEXT port.
        Ensure the producer and consumer ran throughout.
        """
        #Given we have a broker that has both secure and PLAINTEXT ports open
        self.kafka.security_protocol = upgrade_protocol
        self.kafka.interbroker_security_protocol = "PLAINTEXT"
        self.kafka.start()

        #Create Secured Producer and Consumer
        self.create_producer_and_consumer()

        #Roll in the security protocol. Disable Plaintext. Ensure we can produce and Consume throughout
        self.run_produce_consume_validate(self.roll_in_secured_settings, upgrade_protocol)
Example #7
class TransactionsTest(Test):
    """Tests transactions by transactionally copying data from a source topic to
    a destination topic and killing the copy process as well as the broker
    at random points during the process. In the end we verify that the final output
    topic contains exactly one committed copy of each message in the input
    topic.
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(TransactionsTest, self).__init__(test_context=test_context)

        self.input_topic = "input-topic"
        self.output_topic = "output-topic"

        self.num_brokers = 3

        # Test parameters
        self.num_input_partitions = 2
        self.num_output_partitions = 3
        self.num_seed_messages = 100000
        self.transaction_size = 750
        self.first_transactional_id = "my-first-transactional-id"
        self.second_transactional_id = "my-second-transactional-id"
        self.consumer_group = "transactions-test-consumer-group"

        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=self.num_brokers,
                                  zk=self.zk,
                                  topics={
                                      self.input_topic: {
                                          "partitions":
                                          self.num_input_partitions,
                                          "replication-factor": 3,
                                          "configs": {
                                              "min.insync.replicas": 2
                                          }
                                      },
                                      self.output_topic: {
                                          "partitions":
                                          self.num_output_partitions,
                                          "replication-factor": 3,
                                          "configs": {
                                              "min.insync.replicas": 2
                                          }
                                      }
                                  })

    def setUp(self):
        self.zk.start()

    def seed_messages(self):
        seed_timeout_sec = 10000
        seed_producer = VerifiableProducer(context=self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=self.input_topic,
                                           message_validator=is_int,
                                           max_messages=self.num_seed_messages,
                                           enable_idempotence=True)
        seed_producer.start()
        wait_until(lambda: seed_producer.num_acked >= self.num_seed_messages,
                   timeout_sec=seed_timeout_sec,
                   err_msg="Producer failed to produce messages %d in  %ds." %\
                   (self.num_seed_messages, seed_timeout_sec))
        return seed_producer.acked

    def get_messages_from_output_topic(self):
        consumer = self.start_consumer(self.output_topic,
                                       group_id="verifying_consumer")
        return self.drain_consumer(consumer)

    def bounce_brokers(self, clean_shutdown):
        for node in self.kafka.nodes:
            if clean_shutdown:
                self.kafka.restart_node(node, clean_shutdown=True)
            else:
                self.kafka.stop_node(node, clean_shutdown=False)
                wait_until(lambda: len(self.kafka.pids(node)) == 0 and
                           not self.kafka.is_registered(node),
                           timeout_sec=self.kafka.zk_session_timeout + 5,
                           err_msg="Failed to see timely deregistration of "
                                   "hard-killed broker %s" % str(node.account))
                self.kafka.start_node(node)

    def create_and_start_message_copier(self, input_partition,
                                        transactional_id):
        message_copier = TransactionalMessageCopier(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            transactional_id=transactional_id,
            consumer_group=self.consumer_group,
            input_topic=self.input_topic,
            input_partition=input_partition,
            output_topic=self.output_topic,
            max_messages=-1,
            transaction_size=self.transaction_size)
        message_copier.start()
        wait_until(lambda: message_copier.alive(message_copier.nodes[0]),
                   timeout_sec=10,
                   err_msg="Message copier failed to start after 10 s")
        return message_copier

    def bounce_copiers(self, copiers, clean_shutdown):
        for _ in range(3):
            for copier in copiers:
                wait_until(lambda: copier.progress_percent() >= 20.0,
                           timeout_sec=30,
                           err_msg="%s : Message copier didn't make enough progress in 30s. Current progress: %s" \
                           % (copier.transactional_id, str(copier.progress_percent())))
                self.logger.info(
                    "%s - progress: %s" %
                    (copier.transactional_id, str(copier.progress_percent())))
                copier.restart(clean_shutdown)

    def create_and_start_copiers(self):
        copiers = []
        copiers.append(
            self.create_and_start_message_copier(
                input_partition=0,
                transactional_id=self.first_transactional_id))
        copiers.append(
            self.create_and_start_message_copier(
                input_partition=1,
                transactional_id=self.second_transactional_id))
        return copiers

    def start_consumer(self, topic_to_read, group_id):
        consumer = ConsoleConsumer(context=self.test_context,
                                   num_nodes=1,
                                   kafka=self.kafka,
                                   topic=topic_to_read,
                                   group_id=group_id,
                                   new_consumer=True,
                                   message_validator=is_int,
                                   from_beginning=True,
                                   isolation_level="read_committed")
        consumer.start()
        # ensure that the consumer is up.
        wait_until(lambda: len(consumer.messages_consumed[1]) > 0,
                   timeout_sec=60,
                   err_msg="Consumer failed to consume any messages for %ds" %\
                   60)
        return consumer

    def drain_consumer(self, consumer):
        # wait until we read at least the expected number of messages.
        # This is a safe check because both failure modes will be caught:
        #  1. If we have 'num_seed_messages' but there are duplicates, then
        #     this is checked for later.
        #
        #  2. If we never reach 'num_seed_messages', then this will cause the
        #     test to fail.
        wait_until(lambda: len(consumer.messages_consumed[1]) >= self.num_seed_messages,
                   timeout_sec=90,
                   err_msg="Consumer consumed only %d out of %d messages in %ds" %\
                   (len(consumer.messages_consumed[1]), self.num_seed_messages, 90))
        consumer.stop()
        return consumer.messages_consumed[1]

    def copy_messages_transactionally(self, failure_mode, bounce_target):
        """Copies messages transactionally from the seeded input topic to the
        output topic, either bouncing brokers or clients in a hard and soft
        way as it goes.

        This method also consumes messages in read_committed mode from the
        output topic while the bounces and copy is going on.

        It returns the concurrently consumed messages.
        """
        copiers = self.create_and_start_copiers()
        concurrent_consumer = self.start_consumer(
            self.output_topic, group_id="concurrent_consumer")
        clean_shutdown = False
        if failure_mode == "clean_bounce":
            clean_shutdown = True

        if bounce_target == "brokers":
            self.bounce_brokers(clean_shutdown)
        elif bounce_target == "clients":
            self.bounce_copiers(copiers, clean_shutdown)

        for copier in copiers:
            wait_until(lambda: copier.is_done,
                       timeout_sec=60,
                       err_msg="%s - Failed to copy all messages in  %ds." %\
                       (copier.transactional_id, 60))
        self.logger.info("finished copying messages")
        return self.drain_consumer(concurrent_consumer)

    @cluster(num_nodes=9)
    @matrix(failure_mode=["hard_bounce", "clean_bounce"],
            bounce_target=["brokers", "clients"])
    def test_transactions(self, failure_mode, bounce_target):
        security_protocol = 'PLAINTEXT'
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.logs["kafka_data"]["collect_default"] = True
        self.kafka.logs["kafka_operational_logs_debug"][
            "collect_default"] = True
        self.kafka.start()
        input_messages = self.seed_messages()
        concurrently_consumed_messages = self.copy_messages_transactionally(
            failure_mode, bounce_target)
        output_messages = self.get_messages_from_output_topic()

        concurrently_consumed_message_set = set(concurrently_consumed_messages)
        output_message_set = set(output_messages)
        input_message_set = set(input_messages)

        num_dups = abs(len(output_messages) - len(output_message_set))
        num_dups_in_concurrent_consumer = abs(
            len(concurrently_consumed_messages) -
            len(concurrently_consumed_message_set))
        assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups
        assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\
            (len(input_message_set), len(output_message_set))

        assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer
        assert input_message_set == concurrently_consumed_message_set, \
            "Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" %\
            (len(input_message_set), len(concurrently_consumed_message_set))
class StreamsBrokerDownResilience(Test):
    """
    This test validates that Streams is resilient to a broker
    being down longer than specified timeouts in configs
    """

    inputTopic = "streamsResilienceSource"
    outputTopic = "streamsResilienceSink"
    num_messages = 5

    def __init__(self, test_context):
        super(StreamsBrokerDownResilience,
              self).__init__(test_context=test_context)
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=1,
                                  zk=self.zk,
                                  topics={
                                      self.inputTopic: {
                                          'partitions': 1,
                                          'replication-factor': 1
                                      },
                                      self.outputTopic: {
                                          'partitions': 1,
                                          'replication-factor': 1
                                      }
                                  })

    def get_consumer(self):
        return VerifiableConsumer(self.test_context,
                                  1,
                                  self.kafka,
                                  self.outputTopic,
                                  "stream-broker-resilience-verify-consumer",
                                  max_messages=self.num_messages)

    def get_producer(self):
        return VerifiableProducer(self.test_context,
                                  1,
                                  self.kafka,
                                  self.inputTopic,
                                  max_messages=self.num_messages,
                                  acks=1)

    def assert_produce_consume(self, test_state):
        producer = self.get_producer()
        producer.start()

        wait_until(lambda: producer.num_acked > 0,
                   timeout_sec=30,
                   err_msg="At %s failed to send messages " % test_state)

        consumer = self.get_consumer()
        consumer.start()

        wait_until(
            lambda: consumer.total_consumed() > 0,
            timeout_sec=120,
            err_msg="At %s streams did not process messages in 120 seconds " %
            test_state)

    def setUp(self):
        self.zk.start()

    def test_streams_resilient_to_broker_down(self):
        self.kafka.start()

        # Consumer max.poll.interval > min(max.block.ms, (retries + 1) * request.timeout)
        consumer_poll_ms = "consumer.max.poll.interval.ms=50000"
        retries_config = "producer.retries=2"
        request_timeout = "producer.request.timeout.ms=15000"
        max_block_ms = "producer.max.block.ms=30000"

        # Broker should be down for more than 2x of (retries * request.timeout.ms)
        # With retries * request.timeout.ms = 2 * 15000 ms = 30 seconds, we'll set downtime to 70 seconds
        broker_down_time_in_seconds = 70

        # java code expects configs in key=value,key=value format
        updated_configs = consumer_poll_ms + "," + retries_config + "," + request_timeout + "," + max_block_ms

        processor = StreamsBrokerDownResilienceService(self.test_context,
                                                       self.kafka,
                                                       updated_configs)
        processor.start()

        # Until KIP-91 is merged we'll only send 5 messages to assert Kafka Streams is running before taking the broker down
        # After KIP-91 is merged we'll continue to send messages for the duration of the test
        self.assert_produce_consume("before_broker_stop")

        node = self.kafka.leader(self.inputTopic)

        self.kafka.stop_node(node)

        time.sleep(broker_down_time_in_seconds)

        self.kafka.start_node(node)

        self.assert_produce_consume("after_broker_stop")

        self.kafka.stop()
class ZookeeperTlsTest(ProduceConsumeValidateTest):
    """Tests TLS connectivity to zookeeper.
    """
    def __init__(self, test_context):
        super(ZookeeperTlsTest, self).__init__(test_context=test_context)

    def setUp(self):
        self.topic = "test_topic"
        self.group = "group"
        self.producer_throughput = 100
        self.num_producers = 1
        self.num_consumers = 1

        self.zk = ZookeeperService(self.test_context, num_nodes=3)

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  topics={
                                      self.topic: {
                                          "partitions": 3,
                                          "replication-factor": 3,
                                          'configs': {
                                              "min.insync.replicas": 2
                                          }
                                      }
                                  })

    def create_producer_and_consumer(self):
        self.producer = VerifiableProducer(self.test_context,
                                           self.num_producers,
                                           self.kafka,
                                           self.topic,
                                           throughput=self.producer_throughput)

        self.consumer = ConsoleConsumer(self.test_context,
                                        self.num_consumers,
                                        self.kafka,
                                        self.topic,
                                        consumer_timeout_ms=60000,
                                        message_validator=is_int)

        self.consumer.group_id = self.group

    def perform_produce_consume_validation(self):
        self.create_producer_and_consumer()
        self.run_produce_consume_validate()
        self.producer.free()
        self.consumer.free()

    def enable_zk_tls(self):
        self.test_context.logger.debug(
            "Enabling the TLS port in Zookeeper (we won't use it from Kafka yet)"
        )
        # change zk config (enable TLS, but also keep non-TLS)
        self.zk.zk_client_secure_port = True
        self.zk.restart_cluster()
        # bounce a Kafka broker -- allows us to detect a broker restart failure as a simple sanity check
        self.kafka.stop_node(self.kafka.nodes[0])
        self.kafka.start_node(self.kafka.nodes[0])

    def enable_kafka_zk_tls(self):
        self.test_context.logger.debug(
            "Configuring Kafka to use the TLS port in Zookeeper")
        # change Kafka config (enable TLS to Zookeeper) and restart the Kafka cluster
        self.kafka.zk_client_secure = True
        self.kafka.restart_cluster()

    def disable_zk_non_tls(self):
        self.test_context.logger.debug(
            "Disabling the non-TLS port in Zookeeper (as a simple sanity check)"
        )
        # change zk config (disable non-TLS, keep TLS) and restart the ZooKeeper cluster
        self.zk.zk_client_port = False
        self.zk.restart_cluster()
        # bounce a Kafka broker -- allows us to detect a broker restart failure as a simple sanity check
        self.kafka.stop_node(self.kafka.nodes[0])
        self.kafka.start_node(self.kafka.nodes[0])

    @cluster(num_nodes=9)
    def test_zk_tls(self):
        self.zk.start()
        self.kafka.security_protocol = self.kafka.interbroker_security_protocol = "PLAINTEXT"

        self.kafka.start()

        # Enable TLS port in Zookeeper in addition to the regular non-TLS port
        # Bounces the ZooKeeper cluster (and a single broker as a sanity check)
        self.enable_zk_tls()

        # Leverage ZooKeeper TLS port in Kafka
        # Bounces the Kafka cluster
        self.enable_kafka_zk_tls()
        self.perform_produce_consume_validation()

        # Disable ZooKeeper non-TLS port to make sure we aren't using it
        # Bounces the ZooKeeper cluster (and a single broker as a sanity check)
        self.disable_zk_non_tls()

        # Make sure the ZooKeeper command line is able to talk to a TLS-enabled ZooKeeper quorum
        # Test both create() and query(), each of which leverages the ZooKeeper command line
        # This tests the code in org.apache.zookeeper.ZooKeeperMainWithTlsSupportForKafka
        path = "/foo"
        value = "{\"bar\": 0}"
        self.zk.create(path, value=value)
        if self.zk.query(path) != value:
            raise Exception(
                "Error creating and then querying a znode using the CLI with a TLS-enabled ZooKeeper quorum"
            )

        # Make sure the ConfigCommand CLI is able to talk to a TLS-enabled ZooKeeper quorum
        # This is necessary for the bootstrap use case despite direct ZooKeeper connectivity being deprecated
        self.zk.describe(self.topic)

        # Make sure the AclCommand CLI is able to talk to a TLS-enabled ZooKeeper quorum
        # This is necessary for the bootstrap use case despite direct ZooKeeper connectivity being deprecated
        self.zk.list_acls(self.topic)

        #
        # Test zookeeper.set.acl with just TLS mutual authentication (no SASL)
        #
        # Step 1: run migration tool
        self.zk.zookeeper_migration(self.zk.nodes[0], "secure")
        # Step 2: restart brokers with zookeeper.set.acl=true and acls (with TLS but no SASL)
        self.kafka.zk_set_acl = True
        self.kafka.restart_cluster()
        self.perform_produce_consume_validation()

        #
        # Test zookeeper.set.acl with both SASL and TLS mutual authentication
        #
        # Step 1: remove ACLs created previously
        self.kafka.zk_set_acl = False
        self.kafka.restart_cluster()
        self.zk.zookeeper_migration(self.zk.nodes[0], "unsecure")
        # Step 2: enable ZooKeeper SASL authentication, but don't take advantage of it in Kafka yet
        self.zk.zk_sasl = True
        self.kafka.start_minikdc_if_necessary(self.zk.zk_principals)
        self.zk.restart_cluster()
        # bounce a Kafka broker -- allows us to detect a broker restart failure as a simple sanity check
        self.kafka.stop_node(self.kafka.nodes[0])
        self.kafka.start_node(self.kafka.nodes[0])
        # Step 3: run migration tool
        self.zk.zookeeper_migration(self.zk.nodes[0], "secure")
        # Step 4: restart brokers with zookeeper.set.acl=true and acls (with both TLS and SASL)
        self.kafka.zk_set_acl = True
        self.kafka.restart_cluster()
        self.perform_produce_consume_validation()
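
A minimal sketch of the CLI round-trip check used in the test above, factored into a hypothetical helper; it assumes a ZookeeperService handle exposing create() and query() as shown in the test:

def verify_znode_roundtrip(zk, path="/foo", value="{\"bar\": 0}"):
    # Create a znode via the ZooKeeper CLI and read it back over TLS;
    # raise if the stored value does not match what was written.
    zk.create(path, value=value)
    if zk.query(path) != value:
        raise Exception("znode round-trip failed for %s" % path)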
Example #10
class TransactionsTest(Test):
    """Tests transactions by transactionally copying data from a source topic to
    a destination topic and killing the copy process as well as the broker
    randomly through the process. In the end we verify that the final output
    topic contains exactly one committed copy of each message in the input
    topic
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(TransactionsTest, self).__init__(test_context=test_context)

        self.input_topic = "input-topic"
        self.output_topic = "output-topic"

        self.num_brokers = 3

        # Test parameters
        self.num_input_partitions = 2
        self.num_output_partitions = 3
        self.num_seed_messages = 100000
        self.transaction_size = 750
        self.consumer_group = "transactions-test-consumer-group"

        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=self.num_brokers,
                                  zk=self.zk)

    def setUp(self):
        self.zk.start()

    def seed_messages(self, topic, num_seed_messages):
        seed_timeout_sec = 10000
        seed_producer = VerifiableProducer(context=self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=topic,
                                           message_validator=is_int,
                                           max_messages=num_seed_messages,
                                           enable_idempotence=True)
        seed_producer.start()
        wait_until(lambda: seed_producer.num_acked >= num_seed_messages,
                   timeout_sec=seed_timeout_sec,
                   err_msg="Producer failed to produce messages %d in  %ds." %\
                   (self.num_seed_messages, seed_timeout_sec))
        return seed_producer.acked

    def get_messages_from_topic(self, topic, num_messages):
        consumer = self.start_consumer(topic, group_id="verifying_consumer")
        return self.drain_consumer(consumer, num_messages)

    def bounce_brokers(self, clean_shutdown):
        for node in self.kafka.nodes:
            if clean_shutdown:
                self.kafka.restart_node(node, clean_shutdown=True)
            else:
                self.kafka.stop_node(node, clean_shutdown=False)
                wait_until(lambda: len(self.kafka.pids(node)) == 0 and not self.kafka.is_registered(node),
                           timeout_sec=self.kafka.zk_session_timeout + 5,
                           err_msg="Failed to see timely deregistration of \
                           hard-killed broker %s" % str(node.account))
                self.kafka.start_node(node)

    def create_and_start_message_copier(self, input_topic, input_partition, output_topic, transactional_id):
        message_copier = TransactionalMessageCopier(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            transactional_id=transactional_id,
            consumer_group=self.consumer_group,
            input_topic=input_topic,
            input_partition=input_partition,
            output_topic=output_topic,
            max_messages=-1,
            transaction_size=self.transaction_size
        )
        message_copier.start()
        wait_until(lambda: message_copier.alive(message_copier.nodes[0]),
                   timeout_sec=10,
                   err_msg="Message copier failed to start after 10 s")
        return message_copier

    def bounce_copiers(self, copiers, clean_shutdown):
        for _ in range(3):
            for copier in copiers:
                wait_until(lambda: copier.progress_percent() >= 20.0,
                           timeout_sec=30,
                           err_msg="%s : Message copier didn't make enough progress in 30s. Current progress: %s" \
                           % (copier.transactional_id, str(copier.progress_percent())))
                self.logger.info("%s - progress: %s" % (copier.transactional_id,
                                                        str(copier.progress_percent())))
                copier.restart(clean_shutdown)

    def create_and_start_copiers(self, input_topic, output_topic, num_copiers):
        copiers = []
        for i in range(0, num_copiers):
            copiers.append(self.create_and_start_message_copier(
                input_topic=input_topic,
                output_topic=output_topic,
                input_partition=i,
                transactional_id="copier-" + str(i)
            ))
        return copiers

    def start_consumer(self, topic_to_read, group_id):
        consumer = ConsoleConsumer(context=self.test_context,
                                   num_nodes=1,
                                   kafka=self.kafka,
                                   topic=topic_to_read,
                                   group_id=group_id,
                                   message_validator=is_int,
                                   from_beginning=True,
                                   isolation_level="read_committed")
        consumer.start()
        # ensure that the consumer is up.
        wait_until(lambda: len(consumer.messages_consumed[1]) > 0,
                   timeout_sec=60,
                   err_msg="Consumer failed to consume any messages for %ds" %\
                   60)
        return consumer

    def drain_consumer(self, consumer, num_messages):
        # wait until we read at least the expected number of messages.
        # This is a safe check because both failure modes will be caught:
        #  1. If we have 'num_seed_messages' but there are duplicates, then
        #     this is checked for later.
        #
        #  2. If we never reach 'num_seed_messages', then this will cause the
        #     test to fail.
        wait_until(lambda: len(consumer.messages_consumed[1]) >= num_messages,
                   timeout_sec=90,
                   err_msg="Consumer consumed only %d out of %d messages in %ds" %\
                   (len(consumer.messages_consumed[1]), num_messages, 90))
        consumer.stop()
        return consumer.messages_consumed[1]

    def copy_messages_transactionally(self, failure_mode, bounce_target,
                                      input_topic, output_topic,
                                      num_copiers, num_messages_to_copy):
        """Copies messages transactionally from the seeded input topic to the
        output topic, either bouncing brokers or clients in a hard and soft
        way as it goes.

        This method also consumes messages in read_committed mode from the
        output topic while the bounces and copy is going on.

        It returns the concurrently consumed messages.
        """
        copiers = self.create_and_start_copiers(input_topic=input_topic,
                                                output_topic=output_topic,
                                                num_copiers=num_copiers)
        concurrent_consumer = self.start_consumer(output_topic,
                                                  group_id="concurrent_consumer")
        clean_shutdown = False
        if failure_mode == "clean_bounce":
            clean_shutdown = True

        if bounce_target == "brokers":
            self.bounce_brokers(clean_shutdown)
        elif bounce_target == "clients":
            self.bounce_copiers(copiers, clean_shutdown)

        for copier in copiers:
            wait_until(lambda: copier.is_done,
                       timeout_sec=120,
                       err_msg="%s - Failed to copy all messages in  %ds." %\
                       (copier.transactional_id, 120))
        self.logger.info("finished copying messages")

        return self.drain_consumer(concurrent_consumer, num_messages_to_copy)

    def setup_topics(self):
        self.kafka.topics = {
            self.input_topic: {
                "partitions": self.num_input_partitions,
                "replication-factor": 3,
                "configs": {
                    "min.insync.replicas": 2
                }
            },
            self.output_topic: {
                "partitions": self.num_output_partitions,
                "replication-factor": 3,
                "configs": {
                    "min.insync.replicas": 2
                }
            }
        }

    @cluster(num_nodes=9)
    @matrix(failure_mode=["hard_bounce", "clean_bounce"],
            bounce_target=["brokers", "clients"],
            check_order=[True, False])
    def test_transactions(self, failure_mode, bounce_target, check_order):
        security_protocol = 'PLAINTEXT'
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.logs["kafka_data_1"]["collect_default"] = True
        self.kafka.logs["kafka_data_2"]["collect_default"] = True
        self.kafka.logs["kafka_operational_logs_debug"]["collect_default"] = True
        if check_order:
            # To check ordering, we simply create input and output topics
            # with a single partition.
            # We reduce the number of seed messages to copy to account for the fewer output
            # partitions, and thus lower parallelism. This helps keep the test
            # time shorter.
            self.num_seed_messages = self.num_seed_messages // 3
            self.num_input_partitions = 1
            self.num_output_partitions = 1

        self.setup_topics()
        self.kafka.start()

        input_messages = self.seed_messages(self.input_topic, self.num_seed_messages)
        concurrently_consumed_messages = self.copy_messages_transactionally(
            failure_mode, bounce_target, input_topic=self.input_topic,
            output_topic=self.output_topic, num_copiers=self.num_input_partitions,
            num_messages_to_copy=self.num_seed_messages)
        output_messages = self.get_messages_from_topic(self.output_topic, self.num_seed_messages)

        concurrently_consumed_message_set = set(concurrently_consumed_messages)
        output_message_set = set(output_messages)
        input_message_set = set(input_messages)

        num_dups = abs(len(output_messages) - len(output_message_set))
        num_dups_in_concurrent_consumer = abs(len(concurrently_consumed_messages)
                                              - len(concurrently_consumed_message_set))
        assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups
        assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\
            (len(input_message_set), len(output_message_set))

        assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer
        assert input_message_set == concurrently_consumed_message_set, \
            "Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" %\
            (len(input_message_set), len(concurrently_consumed_message_set))
        if check_order:
            assert input_messages == sorted(input_messages), "The seed messages themselves were not in order"
            assert output_messages == input_messages, "Output messages are not in order"
            assert concurrently_consumed_messages == output_messages, "Concurrently consumed messages are not in order"
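
A minimal sketch of the exactly-once validation performed at the end of this test, assuming plain Python lists of integer messages:

def validate_exactly_once(input_messages, output_messages):
    # Every acked input message must appear in the output exactly once:
    # no duplicates, and the sets of unique messages must be equal.
    output_set = set(output_messages)
    num_dups = len(output_messages) - len(output_set)
    assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups
    assert set(input_messages) == output_set, "Input and output message sets are not equal"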
Example #11
class TestSecurityRollingUpgrade(ProduceConsumeValidateTest):
    """Tests a rolling upgrade from PLAINTEXT to a secured cluster
    """
    def __init__(self, test_context):
        super(TestSecurityRollingUpgrade,
              self).__init__(test_context=test_context)

    def setUp(self):
        self.acls = ACLs(self.test_context)
        self.topic = "test_topic"
        self.group = "group"
        self.producer_throughput = 100
        self.num_producers = 1
        self.num_consumers = 1
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  topics={
                                      self.topic: {
                                          "partitions": 3,
                                          "replication-factor": 3,
                                          'configs': {
                                              "min.insync.replicas": 2
                                          }
                                      }
                                  })
        self.zk.start()

    def create_producer_and_consumer(self):
        self.producer = VerifiableProducer(self.test_context,
                                           self.num_producers,
                                           self.kafka,
                                           self.topic,
                                           throughput=self.producer_throughput)

        self.consumer = ConsoleConsumer(self.test_context,
                                        self.num_consumers,
                                        self.kafka,
                                        self.topic,
                                        consumer_timeout_ms=60000,
                                        message_validator=is_int)

        self.consumer.group_id = "group"

    def bounce(self):
        self.kafka.start_minikdc()
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            self.kafka.start_node(node)
            time.sleep(10)

    def roll_in_secured_settings(self, client_protocol, broker_protocol):
        # Roll cluster to include inter broker security protocol.
        self.kafka.setup_interbroker_listener(broker_protocol)
        self.bounce()

        # Roll cluster to disable PLAINTEXT port
        self.kafka.close_port(SecurityConfig.PLAINTEXT)
        self.set_authorizer_and_bounce(client_protocol, broker_protocol)

    def set_authorizer_and_bounce(self, client_protocol, broker_protocol):
        self.kafka.authorizer_class_name = KafkaService.SIMPLE_AUTHORIZER
        self.acls.set_acls(client_protocol, self.kafka, self.topic, self.group)
        self.acls.set_acls(broker_protocol, self.kafka, self.topic, self.group)
        self.bounce()

    def open_secured_port(self, client_protocol):
        self.kafka.security_protocol = client_protocol
        self.kafka.open_port(client_protocol)
        self.kafka.start_minikdc()
        self.bounce()

    def add_sasl_mechanism(self, new_client_sasl_mechanism):
        self.kafka.client_sasl_mechanism = new_client_sasl_mechanism
        self.kafka.start_minikdc()
        self.bounce()

    def roll_in_sasl_mechanism(self, security_protocol, new_sasl_mechanism):
        # Roll cluster to update inter-broker SASL mechanism. This disables the old mechanism.
        self.kafka.interbroker_sasl_mechanism = new_sasl_mechanism
        self.bounce()

        # Bounce again with ACLs for new mechanism
        self.set_authorizer_and_bounce(security_protocol, security_protocol)

    def add_separate_broker_listener(self, broker_security_protocol,
                                     broker_sasl_mechanism):
        self.kafka.setup_interbroker_listener(broker_security_protocol, True)
        self.kafka.interbroker_sasl_mechanism = broker_sasl_mechanism
        # kafka opens interbroker port automatically in start() but not in bounce()
        self.kafka.open_port(self.kafka.INTERBROKER_LISTENER_NAME)
        self.bounce()

    def remove_separate_broker_listener(self, client_security_protocol,
                                        client_sasl_mechanism):
        # separate interbroker listener port will be closed automatically in setup_interbroker_listener
        # if not using separate interbroker listener
        self.kafka.setup_interbroker_listener(client_security_protocol, False)
        self.kafka.interbroker_sasl_mechanism = client_sasl_mechanism
        self.bounce()

    @cluster(num_nodes=8)
    @matrix(client_protocol=[SecurityConfig.SSL])
    @cluster(num_nodes=9)
    @matrix(client_protocol=[
        SecurityConfig.SASL_PLAINTEXT, SecurityConfig.SASL_SSL
    ])
    def test_rolling_upgrade_phase_one(self, client_protocol):
        """
        Start with a PLAINTEXT cluster, open a SECURED port via a rolling upgrade, ensuring we can produce
        and consume throughout over PLAINTEXT. Finally check we can produce and consume via the new secured port.
        """
        self.kafka.setup_interbroker_listener(SecurityConfig.PLAINTEXT)
        self.kafka.security_protocol = SecurityConfig.PLAINTEXT
        self.kafka.start()

        # Create PLAINTEXT producer and consumer
        self.create_producer_and_consumer()

        # Rolling upgrade, opening a secure protocol, ensuring the Plaintext producer/consumer continues to run
        self.run_produce_consume_validate(self.open_secured_port,
                                          client_protocol)

        # Now we can produce and consume via the secured port
        self.kafka.security_protocol = client_protocol
        self.create_producer_and_consumer()
        self.run_produce_consume_validate(lambda: time.sleep(1))

    @cluster(num_nodes=8)
    @matrix(client_protocol=[
        SecurityConfig.SASL_SSL, SecurityConfig.SSL,
        SecurityConfig.SASL_PLAINTEXT
    ],
            broker_protocol=[
                SecurityConfig.SASL_SSL, SecurityConfig.SSL,
                SecurityConfig.SASL_PLAINTEXT
            ])
    def test_rolling_upgrade_phase_two(self, client_protocol, broker_protocol):
        """
        Start with a PLAINTEXT cluster with a second Secured port open (i.e. result of phase one).
        A third secure port is also open if inter-broker and client protocols are different.
        Start a Producer and Consumer via the SECURED client port
        Incrementally upgrade to use the secure broker protocol for inter-broker communication
        Incrementally upgrade again to add ACLs and disable the PLAINTEXT port
        Ensure the producer and consumer run throughout
        """
        #Given we have a broker that has both secure and PLAINTEXT ports open
        self.kafka.security_protocol = client_protocol
        self.kafka.setup_interbroker_listener(SecurityConfig.PLAINTEXT,
                                              use_separate_listener=False)
        self.kafka.open_port(broker_protocol)
        self.kafka.start()

        #Create Secured Producer and Consumer
        self.create_producer_and_consumer()

        #Roll in the security protocol. Disable Plaintext. Ensure we can produce and Consume throughout
        self.run_produce_consume_validate(self.roll_in_secured_settings,
                                          client_protocol, broker_protocol)

    @cluster(num_nodes=9)
    @matrix(new_client_sasl_mechanism=[SecurityConfig.SASL_MECHANISM_PLAIN])
    def test_rolling_upgrade_sasl_mechanism_phase_one(
            self, new_client_sasl_mechanism):
        """
        Start with a SASL/GSSAPI cluster, add a new SASL mechanism via a rolling upgrade, ensuring we can produce
        and consume throughout over SASL/GSSAPI. Finally check we can produce and consume using the new mechanism.
        """
        self.kafka.setup_interbroker_listener(SecurityConfig.SASL_SSL,
                                              use_separate_listener=False)
        self.kafka.security_protocol = SecurityConfig.SASL_SSL
        self.kafka.client_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI
        self.kafka.interbroker_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI
        self.kafka.start()

        # Create SASL/GSSAPI producer and consumer
        self.create_producer_and_consumer()

        # Rolling upgrade, adding new SASL mechanism, ensuring the GSSAPI producer/consumer continues to run
        self.run_produce_consume_validate(self.add_sasl_mechanism,
                                          new_client_sasl_mechanism)

        # Now we can produce and consume using the new SASL mechanism
        self.kafka.client_sasl_mechanism = new_client_sasl_mechanism
        self.create_producer_and_consumer()
        self.run_produce_consume_validate(lambda: time.sleep(1))

    @cluster(num_nodes=8)
    @matrix(new_sasl_mechanism=[SecurityConfig.SASL_MECHANISM_PLAIN])
    def test_rolling_upgrade_sasl_mechanism_phase_two(self,
                                                      new_sasl_mechanism):
        """
        Start with a SASL cluster with GSSAPI for inter-broker and a second mechanism for clients (i.e. result of phase one).
        Start Producer and Consumer using the second mechanism
        Incrementally upgrade to set inter-broker to the second mechanism and disable GSSAPI
        Incrementally upgrade again to add ACLs
        Ensure the producer and consumer run throughout
        """
        #Start with a broker that has GSSAPI for inter-broker and a second mechanism for clients
        self.kafka.security_protocol = SecurityConfig.SASL_SSL
        self.kafka.setup_interbroker_listener(SecurityConfig.SASL_SSL,
                                              use_separate_listener=False)
        self.kafka.client_sasl_mechanism = new_sasl_mechanism
        self.kafka.interbroker_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI
        self.kafka.start()

        #Create Producer and Consumer using second mechanism
        self.create_producer_and_consumer()

        #Roll in the second SASL mechanism for inter-broker, disabling first mechanism. Ensure we can produce and consume throughout
        self.run_produce_consume_validate(self.roll_in_sasl_mechanism,
                                          self.kafka.security_protocol,
                                          new_sasl_mechanism)

    @cluster(num_nodes=9)
    def test_enable_separate_interbroker_listener(self):
        """
        Start with a cluster that has a single PLAINTEXT listener.
        Start producing/consuming on PLAINTEXT port.
        While doing that, do a rolling restart to enable separate secured interbroker port
        """
        self.kafka.security_protocol = SecurityConfig.PLAINTEXT
        self.kafka.setup_interbroker_listener(SecurityConfig.PLAINTEXT,
                                              use_separate_listener=False)

        self.kafka.start()

        self.create_producer_and_consumer()

        self.run_produce_consume_validate(self.add_separate_broker_listener,
                                          SecurityConfig.SASL_SSL,
                                          SecurityConfig.SASL_MECHANISM_PLAIN)

    @cluster(num_nodes=9)
    def test_disable_separate_interbroker_listener(self):
        """
        Start with a cluster that has two listeners, one on SSL (clients), another on SASL_SSL (broker-to-broker).
        Start producer and consumer on SSL listener.
        Close dedicated interbroker listener via rolling restart.
        Ensure we can produce and consume via SSL listener throughout.
        """
        client_protocol = SecurityConfig.SSL
        client_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI

        self.kafka.security_protocol = client_protocol
        self.kafka.client_sasl_mechanism = client_sasl_mechanism
        self.kafka.setup_interbroker_listener(SecurityConfig.SASL_SSL,
                                              use_separate_listener=True)
        self.kafka.interbroker_sasl_mechanism = SecurityConfig.SASL_MECHANISM_GSSAPI

        self.kafka.start()
        # create producer and consumer via client security protocol
        self.create_producer_and_consumer()

        # run produce/consume/validate loop while disabling a separate interbroker listener via rolling restart
        self.run_produce_consume_validate(self.remove_separate_broker_listener,
                                          client_protocol,
                                          client_sasl_mechanism)
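
A minimal sketch of the rolling bounce pattern this example relies on, assuming a KafkaService-like object that exposes stop_node() and start_node() as used above:

import time

def rolling_bounce(kafka, pause_sec=10):
    # Restart brokers one at a time so the cluster stays available while
    # new listener/security settings are rolled in.
    for node in kafka.nodes:
        kafka.stop_node(node)
        kafka.start_node(node)
        time.sleep(pause_sec)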
class ZooKeeperSecurityUpgradeTest(ProduceConsumeValidateTest):
    """Tests a rolling upgrade for zookeeper.
    """

    def __init__(self, test_context):
        super(ZooKeeperSecurityUpgradeTest, self).__init__(test_context=test_context)

    def setUp(self):
        self.topic = "test_topic"
        self.group = "group"
        self.producer_throughput = 100
        self.num_producers = 1
        self.num_consumers = 1
        self.acls = ACLs()

        self.zk = ZookeeperService(self.test_context, num_nodes=3)

        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, topics={self.topic: {
            "partitions": 3,
            "replication-factor": 3,
            'configs': {"min.insync.replicas": 2}}})

    def create_producer_and_consumer(self):
        self.producer = VerifiableProducer(
            self.test_context, self.num_producers, self.kafka, self.topic,
            throughput=self.producer_throughput)

        self.consumer = ConsoleConsumer(
            self.test_context, self.num_consumers, self.kafka, self.topic,
            consumer_timeout_ms=60000, message_validator=is_int, new_consumer=True)

        self.consumer.group_id = self.group

    @property
    def no_sasl(self):
        return self.kafka.security_protocol == "PLAINTEXT" or self.kafka.security_protocol == "SSL"

    @property
    def is_secure(self):
        return self.kafka.security_protocol == "SASL_PLAINTEXT" \
               or self.kafka.security_protocol == "SSL" \
               or self.kafka.security_protocol == "SASL_SSL"

    def run_zk_migration(self):
        # change zk config (auth provider + jaas login)
        self.zk.kafka_opts = self.zk.security_system_properties
        self.zk.zk_sasl = True
        if self.no_sasl:
            self.kafka.start_minikdc(self.zk.zk_principals)
        # restart zk
        for node in self.zk.nodes:
            self.zk.stop_node(node)
            self.zk.start_node(node)

        # restart broker with jaas login
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            self.kafka.start_node(node)

        # run migration tool
        for node in self.zk.nodes:
            self.zk.zookeeper_migration(node, "secure")

        # restart broker with zookeeper.set.acl=true and acls
        self.kafka.zk_set_acl = "true"
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            self.kafka.start_node(node)

    @matrix(security_protocol=["PLAINTEXT","SSL","SASL_SSL","SASL_PLAINTEXT"])
    def test_zk_security_upgrade(self, security_protocol):
        self.zk.start()
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol

        # set acls
        if self.is_secure:
            self.kafka.authorizer_class_name = KafkaService.SIMPLE_AUTHORIZER
            self.acls.set_acls(security_protocol, self.kafka, self.zk, self.topic, self.group)

        if self.no_sasl:
            self.kafka.start()
        else:
            self.kafka.start(self.zk.zk_principals)

        #Create Producer and Consumer
        self.create_producer_and_consumer()

        #Run upgrade
        self.run_produce_consume_validate(self.run_zk_migration)
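
A minimal sketch of the protocol classification behind the no_sasl and is_secure properties above, written as standalone helpers over the protocol name (hypothetical names, same logic):

def uses_sasl(security_protocol):
    # Only the SASL_* protocols require Kerberos (MiniKdc) credentials.
    return security_protocol in ("SASL_PLAINTEXT", "SASL_SSL")

def is_secure(security_protocol):
    # Anything other than PLAINTEXT counts as a secured client protocol.
    return security_protocol in ("SSL", "SASL_PLAINTEXT", "SASL_SSL")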
Example #13
class TestSecurityRollingUpgrade(ProduceConsumeValidateTest):
    """Tests a rolling upgrade from PLAINTEXT to a secured cluster
    """
    def __init__(self, test_context):
        super(TestSecurityRollingUpgrade,
              self).__init__(test_context=test_context)

    def setUp(self):
        self.topic = "test_topic"
        self.producer_throughput = 100
        self.num_producers = 1
        self.num_consumers = 1
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  topics={
                                      self.topic: {
                                          "partitions": 3,
                                          "replication-factor": 3,
                                          'configs': {
                                              "min.insync.replicas": 2
                                          }
                                      }
                                  })
        self.zk.start()

    def create_producer_and_consumer(self):
        self.producer = VerifiableProducer(self.test_context,
                                           self.num_producers,
                                           self.kafka,
                                           self.topic,
                                           throughput=self.producer_throughput)

        self.consumer = ConsoleConsumer(self.test_context,
                                        self.num_consumers,
                                        self.kafka,
                                        self.topic,
                                        consumer_timeout_ms=60000,
                                        message_validator=is_int,
                                        new_consumer=True)

        self.consumer.group_id = "unique-test-group-" + str(random.random())

    def bounce(self):
        self.kafka.start_minikdc()
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            self.kafka.start_node(node)
            time.sleep(10)

    def roll_in_secured_settings(self, client_protocol, broker_protocol):

        # Roll cluster to include inter broker security protocol.
        self.kafka.interbroker_security_protocol = broker_protocol
        self.kafka.open_port(client_protocol)
        self.kafka.open_port(broker_protocol)
        self.bounce()

        # Roll cluster to disable PLAINTEXT port
        self.kafka.close_port('PLAINTEXT')
        self.bounce()

    def open_secured_port(self, client_protocol):
        self.kafka.security_protocol = client_protocol
        self.kafka.open_port(client_protocol)
        self.kafka.start_minikdc()
        self.bounce()

    @matrix(client_protocol=["SSL", "SASL_PLAINTEXT", "SASL_SSL"])
    def test_rolling_upgrade_phase_one(self, client_protocol):
        """
        Start with a PLAINTEXT cluster, open a SECURED port via a rolling upgrade, ensuring we can produce
        and consume throughout over PLAINTEXT. Finally check we can produce and consume via the new secured port.
        """
        self.kafka.interbroker_security_protocol = "PLAINTEXT"
        self.kafka.security_protocol = "PLAINTEXT"
        self.kafka.start()

        # Create PLAINTEXT producer and consumer
        self.create_producer_and_consumer()

        # Rolling upgrade, opening a secure protocol, ensuring the Plaintext producer/consumer continues to run
        self.run_produce_consume_validate(self.open_secured_port,
                                          client_protocol)

        # Now we can produce and consume via the secured port
        self.kafka.security_protocol = client_protocol
        self.create_producer_and_consumer()
        self.run_produce_consume_validate(lambda: time.sleep(1))

    @matrix(client_protocol=["SASL_SSL", "SSL", "SASL_PLAINTEXT"],
            broker_protocol=["SASL_SSL", "SSL", "SASL_PLAINTEXT"])
    def test_rolling_upgrade_phase_two(self, client_protocol, broker_protocol):
        """
        Start with a PLAINTEXT cluster with a second Secured port open (i.e. result of phase one).
        Start a Producer and Consumer via the SECURED port
        Rolling upgrade to use the secure protocol for inter-broker communication
        Rolling upgrade again to disable PLAINTEXT
        Ensure the producer and consumer run throughout
        """
        #Given we have a broker that has both secure and PLAINTEXT ports open
        self.kafka.security_protocol = client_protocol
        self.kafka.interbroker_security_protocol = "PLAINTEXT"
        self.kafka.start()

        #Create Secured Producer and Consumer
        self.create_producer_and_consumer()

        #Roll in the security protocol. Disable Plaintext. Ensure we can produce and Consume throughout
        self.run_produce_consume_validate(self.roll_in_secured_settings,
                                          client_protocol, broker_protocol)
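
A condensed sketch of the phase-two rollout order from roll_in_secured_settings above, assuming a KafkaService-like object with open_port()/close_port() and the test's bounce() helper passed in:

def roll_to_secured(kafka, bounce, client_protocol, broker_protocol):
    # First bounce: brokers pick up the secured inter-broker protocol while
    # the PLAINTEXT port stays open so running clients are unaffected.
    kafka.interbroker_security_protocol = broker_protocol
    kafka.open_port(client_protocol)
    kafka.open_port(broker_protocol)
    bounce()
    # Second bounce: only now is the PLAINTEXT port closed.
    kafka.close_port('PLAINTEXT')
    bounce()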
Example #14
class TransactionsTest(Test):
    """Tests transactions by transactionally copying data from a source topic to
    a destination topic and killing the copy process as well as the broker
    randomly through the process. In the end we verify that the final output
    topic contains exactly one committed copy of each message in the input
    topic
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(TransactionsTest, self).__init__(test_context=test_context)

        self.input_topic = "input-topic"
        self.output_topic = "output-topic"

        self.num_brokers = 3

        # Test parameters
        self.num_input_partitions = 2
        self.num_output_partitions = 3
        self.num_seed_messages = 20000
        self.transaction_size = 500
        self.first_transactional_id = "my-first-transactional-id"
        self.second_transactional_id = "my-second-transactional-id"
        self.consumer_group = "transactions-test-consumer-group"

        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=self.num_brokers,
                                  zk=self.zk,
                                  topics={
                                      self.input_topic: {
                                          "partitions":
                                          self.num_input_partitions,
                                          "replication-factor": 3,
                                          "configs": {
                                              "min.insync.replicas": 2
                                          }
                                      },
                                      self.output_topic: {
                                          "partitions":
                                          self.num_output_partitions,
                                          "replication-factor": 3,
                                          "configs": {
                                              "min.insync.replicas": 2
                                          }
                                      }
                                  })

    def setUp(self):
        self.zk.start()

    def seed_messages(self):
        seed_timeout_sec = 10000
        seed_producer = VerifiableProducer(context=self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=self.input_topic,
                                           message_validator=is_int,
                                           max_messages=self.num_seed_messages,
                                           enable_idempotence=True)

        seed_producer.start()
        wait_until(lambda: seed_producer.num_acked >= self.num_seed_messages,
                   timeout_sec=seed_timeout_sec,
                   err_msg="Producer failed to produce messages %d in  %ds." %\
                   (self.num_seed_messages, seed_timeout_sec))
        return seed_producer.acked

    def get_messages_from_output_topic(self):
        consumer = ConsoleConsumer(context=self.test_context,
                                   num_nodes=1,
                                   kafka=self.kafka,
                                   topic=self.output_topic,
                                   new_consumer=True,
                                   message_validator=is_int,
                                   from_beginning=True,
                                   consumer_timeout_ms=5000,
                                   isolation_level="read_committed")
        consumer.start()
        # ensure that the consumer is up.
        wait_until(lambda: consumer.alive(consumer.nodes[0]),
                   timeout_sec=60,
                   err_msg="Consumer failed to start for %ds" %\
                   60)
        # wait until the consumer closes, which will be 5 seconds after
        # receiving the last message.
        wait_until(lambda: not consumer.alive(consumer.nodes[0]),
                   timeout_sec=60,
                   err_msg="Consumer failed to consume %d messages in %ds" %\
                   (self.num_seed_messages, 60))
        return consumer.messages_consumed[1]

    def bounce_brokers(self, clean_shutdown):
        for node in self.kafka.nodes:
            if clean_shutdown:
                self.kafka.restart_node(node, clean_shutdown=True)
            else:
                self.kafka.stop_node(node, clean_shutdown=False)
                wait_until(lambda: len(self.kafka.pids(node)) == 0 and not self
                           .kafka.is_registered(node),
                           timeout_sec=self.kafka.zk_session_timeout + 5,
                           err_msg="Failed to see timely deregistration of \
                           hard-killed broker %s" % str(node.account))
                self.kafka.start_node(node)

    def create_and_start_message_copier(self, input_partition,
                                        transactional_id):
        message_copier = TransactionalMessageCopier(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            transactional_id=transactional_id,
            consumer_group=self.consumer_group,
            input_topic=self.input_topic,
            input_partition=input_partition,
            output_topic=self.output_topic,
            max_messages=-1,
            transaction_size=self.transaction_size)
        message_copier.start()
        wait_until(lambda: message_copier.alive(message_copier.nodes[0]),
                   timeout_sec=10,
                   err_msg="Message copier failed to start after 10 s")
        return message_copier

    def bounce_copiers(self, copiers, clean_shutdown):
        for _ in range(3):
            for copier in copiers:
                wait_until(lambda: copier.progress_percent() >= 20.0,
                           timeout_sec=30,
                           err_msg="%s : Message copier didn't make enough progress in 30s. Current progress: %s" \
                           % (copier.transactional_id, str(copier.progress_percent())))
                self.logger.info(
                    "%s - progress: %s" %
                    (copier.transactional_id, str(copier.progress_percent())))
                copier.restart(clean_shutdown)

    def create_and_start_copiers(self):
        copiers = []
        copiers.append(
            self.create_and_start_message_copier(
                input_partition=0,
                transactional_id=self.first_transactional_id))
        copiers.append(
            self.create_and_start_message_copier(
                input_partition=1,
                transactional_id=self.second_transactional_id))
        return copiers

    def copy_messages_transactionally(self, failure_mode, bounce_target):
        copiers = self.create_and_start_copiers()
        clean_shutdown = False
        if failure_mode == "clean_bounce":
            clean_shutdown = True

        if bounce_target == "brokers":
            self.bounce_brokers(clean_shutdown)
        elif bounce_target == "clients":
            self.bounce_copiers(copiers, clean_shutdown)

        for copier in copiers:
            wait_until(lambda: copier.is_done,
                       timeout_sec=60,
                       err_msg="%s - Failed to copy all messages in  %ds." %\
                       (copier.transactional_id, 60))
        self.logger.info("finished copying messages")

    @cluster(num_nodes=8)
    @matrix(failure_mode=["clean_bounce", "hard_bounce"],
            bounce_target=["brokers", "clients"])
    def test_transactions(self, failure_mode, bounce_target):
        security_protocol = 'PLAINTEXT'
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.start()
        input_messages = self.seed_messages()
        self.copy_messages_transactionally(failure_mode, bounce_target)
        output_messages = self.get_messages_from_output_topic()
        output_message_set = set(output_messages)
        input_message_set = set(input_messages)
        num_dups = abs(len(output_messages) - len(output_message_set))
        assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups
        assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\
            (len(input_message_set), len(output_message_set))
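
A minimal sketch of the hard-bounce wait used in bounce_brokers above, assuming ducktape's wait_until and a KafkaService exposing pids() and is_registered() as in the test:

from ducktape.utils.util import wait_until

def hard_bounce_broker(kafka, node, grace_sec=5):
    # Kill the broker without a clean shutdown, wait for its process to die and
    # for ZooKeeper to drop its registration, then start it again.
    kafka.stop_node(node, clean_shutdown=False)
    wait_until(lambda: len(kafka.pids(node)) == 0 and not kafka.is_registered(node),
               timeout_sec=kafka.zk_session_timeout + grace_sec,
               err_msg="Failed to see timely deregistration of hard-killed broker %s" % str(node.account))
    kafka.start_node(node)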
Example #15
class ReplicaScaleTest(Test):
    def __init__(self, test_context):
        super(ReplicaScaleTest, self).__init__(test_context=test_context)
        self.test_context = test_context
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=8, zk=self.zk)

    def setUp(self):
        self.zk.start()
        self.kafka.start()

    def teardown(self):
        # Need to increase the timeout due to partition count
        for node in self.kafka.nodes:
            self.kafka.stop_node(node, clean_shutdown=False, timeout_sec=60)
        self.kafka.stop()
        self.zk.stop()

    @cluster(num_nodes=12)
    @parametrize(topic_count=500, partition_count=34, replication_factor=3)
    def test_produce_consume(self, topic_count, partition_count, replication_factor):
        topics_create_start_time = time.time()
        for i in range(topic_count):
            topic = "replicas_produce_consume_%d" % i
            print("Creating topic %s" % topic)  # Force some stdout for Jenkins
            topic_cfg = {
                "topic": topic,
                "partitions": partition_count,
                "replication-factor": replication_factor,
                "configs": {"min.insync.replicas": 2}
            }
            self.kafka.create_topic(topic_cfg)

        topics_create_end_time = time.time()
        self.logger.info("Time to create topics: %d" % (topics_create_end_time - topics_create_start_time))

        producer_workload_service = ProduceBenchWorkloadService(self.test_context, self.kafka)
        consumer_workload_service = ConsumeBenchWorkloadService(self.test_context, self.kafka)
        trogdor = TrogdorService(context=self.test_context,
                                 client_services=[self.kafka, producer_workload_service, consumer_workload_service])
        trogdor.start()

        produce_spec = ProduceBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                producer_workload_service.producer_node,
                                                producer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=10000,
                                                max_messages=3400000,
                                                producer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                inactive_topics={},
                                                active_topics={"replicas_produce_consume_[0-2]": {
                                                    "numPartitions": partition_count, "replicationFactor": replication_factor
                                                }})
        produce_workload = trogdor.create_task("replicas-produce-workload", produce_spec)
        produce_workload.wait_for_done(timeout_sec=600)
        self.logger.info("Completed produce bench")

        consume_spec = ConsumeBenchWorkloadSpec(0, TaskSpec.MAX_DURATION_MS,
                                                consumer_workload_service.consumer_node,
                                                consumer_workload_service.bootstrap_servers,
                                                target_messages_per_sec=10000,
                                                max_messages=3400000,
                                                consumer_conf={},
                                                admin_client_conf={},
                                                common_client_conf={},
                                                active_topics=["replicas_produce_consume_[0-2]"])
        consume_workload = trogdor.create_task("replicas-consume-workload", consume_spec)
        consume_workload.wait_for_done(timeout_sec=600)
        self.logger.info("Completed consume bench")

        trogdor.stop()

    @cluster(num_nodes=12)
    @parametrize(topic_count=500, partition_count=34, replication_factor=3)
    def test_clean_bounce(self, topic_count, partition_count, replication_factor):
        topics_create_start_time = time.time()
        for i in range(topic_count):
            topic = "topic-%04d" % i
            print("Creating topic %s" % topic)  # Force some stdout for Jenkins
            topic_cfg = {
                "topic": topic,
                "partitions": partition_count,
                "replication-factor": replication_factor,
                "configs": {"min.insync.replicas": 2}
            }
            self.kafka.create_topic(topic_cfg)
        topics_create_end_time = time.time()
        self.logger.info("Time to create topics: %d" % (topics_create_end_time - topics_create_start_time))

        restart_times = []
        for node in self.kafka.nodes:
            broker_bounce_start_time = time.time()
            self.kafka.stop_node(node, clean_shutdown=True, timeout_sec=600)
            self.kafka.start_node(node, timeout_sec=600)
            broker_bounce_end_time = time.time()
            restart_times.append(broker_bounce_end_time - broker_bounce_start_time)
            self.logger.info("Time to restart %s: %d" % (node.name, broker_bounce_end_time - broker_bounce_start_time))

        self.logger.info("Restart times: %s" % restart_times)

        delete_start_time = time.time()
        for i in range(topic_count):
            topic = "topic-%04d" % i
            self.logger.info("Deleting topic %s" % topic)
            self.kafka.delete_topic(topic)
        delete_end_time = time.time()
        self.logger.info("Time to delete topics: %d" % (delete_end_time - delete_start_time))
Example #16
class ZooKeeperSecurityUpgradeTest(ProduceConsumeValidateTest):
    """Tests a rolling upgrade for zookeeper.
    """
    def __init__(self, test_context):
        super(ZooKeeperSecurityUpgradeTest,
              self).__init__(test_context=test_context)

    def setUp(self):
        self.topic = "test_topic"
        self.group = "group"
        self.producer_throughput = 100
        self.num_producers = 1
        self.num_consumers = 1
        self.acls = ACLs()

        self.zk = ZookeeperService(self.test_context, num_nodes=3)

        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  topics={
                                      self.topic: {
                                          "partitions": 3,
                                          "replication-factor": 3,
                                          'configs': {
                                              "min.insync.replicas": 2
                                          }
                                      }
                                  })

    def create_producer_and_consumer(self):
        self.producer = VerifiableProducer(self.test_context,
                                           self.num_producers,
                                           self.kafka,
                                           self.topic,
                                           throughput=self.producer_throughput)

        self.consumer = ConsoleConsumer(self.test_context,
                                        self.num_consumers,
                                        self.kafka,
                                        self.topic,
                                        consumer_timeout_ms=60000,
                                        message_validator=is_int,
                                        new_consumer=True)

        self.consumer.group_id = self.group

    @property
    def no_sasl(self):
        return self.kafka.security_protocol == "PLAINTEXT" or self.kafka.security_protocol == "SSL"

    @property
    def is_secure(self):
        return self.kafka.security_protocol == "SASL_PLAINTEXT" \
               or self.kafka.security_protocol == "SSL" \
               or self.kafka.security_protocol == "SASL_SSL"

    def run_zk_migration(self):
        # change zk config (auth provider + jaas login)
        self.zk.kafka_opts = self.zk.security_system_properties
        self.zk.zk_sasl = True
        if self.no_sasl:
            self.kafka.start_minikdc(self.zk.zk_principals)
        # restart zk
        for node in self.zk.nodes:
            self.zk.stop_node(node)
            self.zk.start_node(node)

        # restart broker with jaas login
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            self.kafka.start_node(node)

        # run migration tool
        for node in self.zk.nodes:
            self.zk.zookeeper_migration(node, "secure")

        # restart broker with zookeeper.set.acl=true and acls
        self.kafka.zk_set_acl = "true"
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            self.kafka.start_node(node)

    @matrix(
        security_protocol=["PLAINTEXT", "SSL", "SASL_SSL", "SASL_PLAINTEXT"])
    def test_zk_security_upgrade(self, security_protocol):
        self.zk.start()
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol

        # set acls
        if self.is_secure:
            self.kafka.authorizer_class_name = KafkaService.SIMPLE_AUTHORIZER
            self.acls.set_acls(security_protocol, self.kafka, self.zk,
                               self.topic, self.group)

        if (self.no_sasl):
            self.kafka.start()
        else:
            self.kafka.start(self.zk.zk_principals)

        #Create Producer and Consumer
        self.create_producer_and_consumer()

        #Run upgrade
        self.run_produce_consume_validate(self.run_zk_migration)
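
A minimal sketch of the MiniKdc precondition from run_zk_migration above, as a hypothetical helper with the same no_sasl logic: the KDC only needs to be started when the broker protocol is not already SASL-based.

def maybe_start_minikdc(kafka, zk):
    # ZooKeeper SASL needs Kerberos principals even when Kafka itself runs
    # PLAINTEXT or SSL, so the MiniKdc is started only in that case.
    if kafka.security_protocol in ("PLAINTEXT", "SSL"):
        kafka.start_minikdc(zk.zk_principals)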
Example #17
class TransactionsTest(Test):
    """Tests transactions by transactionally copying data from a source topic to
    a destination topic and killing the copy process as well as the broker
    randomly through the process. In the end we verify that the final output
    topic contains exactly one committed copy of each message in the input
    topic.
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(TransactionsTest, self).__init__(test_context=test_context)

        self.input_topic = "input-topic"
        self.output_topic = "output-topic"

        self.num_brokers = 3

        # Test parameters
        self.num_input_partitions = 2
        self.num_output_partitions = 3
        self.num_seed_messages = 100000
        self.transaction_size = 750

        # The transaction timeout should be lower than the progress timeout, but at
        # least as high as the request timeout (which is 30s by default). When the
        # client is hard-bounced, progress may depend on the previous transaction
        # being aborted. When the broker is hard-bounced, we may have to wait as
        # long as the request timeout to get a `Produce` response and we do not
        # want the coordinator timing out the transaction.
        self.transaction_timeout = 40000
        self.progress_timeout_sec = 60
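        # Illustrative ordering implied by the comment above: request timeout
        # (30s by default) <= transaction_timeout (40s) < progress_timeout_sec (60s).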
        self.consumer_group = "transactions-test-consumer-group"

        self.zk = ZookeeperService(test_context,
                                   num_nodes=1) if quorum.for_test(
                                       test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context,
                                  num_nodes=self.num_brokers,
                                  zk=self.zk,
                                  controller_num_nodes_override=1)

    def setUp(self):
        if self.zk:
            self.zk.start()

    def seed_messages(self, topic, num_seed_messages):
        seed_timeout_sec = 10000
        seed_producer = VerifiableProducer(context=self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=topic,
                                           message_validator=is_int,
                                           max_messages=num_seed_messages,
                                           enable_idempotence=True)
        seed_producer.start()
        wait_until(lambda: seed_producer.num_acked >= num_seed_messages,
                   timeout_sec=seed_timeout_sec,
                   err_msg="Producer failed to produce messages %d in %ds." %\
                   (self.num_seed_messages, seed_timeout_sec))
        return seed_producer.acked

    def get_messages_from_topic(self, topic, num_messages):
        consumer = self.start_consumer(topic, group_id="verifying_consumer")
        return self.drain_consumer(consumer, num_messages)

    def bounce_brokers(self, clean_shutdown):
        for node in self.kafka.nodes:
            if clean_shutdown:
                self.kafka.restart_node(node, clean_shutdown=True)
            else:
                self.kafka.stop_node(node, clean_shutdown=False)
                gracePeriodSecs = 5
                if self.zk:
                    wait_until(
                        lambda: len(self.kafka.pids(
                            node)) == 0 and not self.kafka.is_registered(node),
                        timeout_sec=self.kafka.zk_session_timeout +
                        gracePeriodSecs,
                        err_msg=
                        "Failed to see timely deregistration of hard-killed broker %s"
                        % str(node.account))
                else:
                    brokerSessionTimeoutSecs = 18
                    wait_until(
                        lambda: len(self.kafka.pids(node)) == 0,
                        timeout_sec=brokerSessionTimeoutSecs + gracePeriodSecs,
                        err_msg=
                        "Failed to see timely disappearance of process for hard-killed broker %s"
                        % str(node.account))
                    time.sleep(brokerSessionTimeoutSecs + gracePeriodSecs)
                self.kafka.start_node(node)

    def create_and_start_message_copier(self, input_topic, input_partition,
                                        output_topic, transactional_id,
                                        use_group_metadata):
        message_copier = TransactionalMessageCopier(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            transactional_id=transactional_id,
            consumer_group=self.consumer_group,
            input_topic=input_topic,
            input_partition=input_partition,
            output_topic=output_topic,
            max_messages=-1,
            transaction_size=self.transaction_size,
            transaction_timeout=self.transaction_timeout,
            use_group_metadata=use_group_metadata)
        message_copier.start()
        wait_until(lambda: message_copier.alive(message_copier.nodes[0]),
                   timeout_sec=10,
                   err_msg="Message copier failed to start after 10 s")
        return message_copier

    def bounce_copiers(self, copiers, clean_shutdown):
        for _ in range(3):
            for copier in copiers:
                wait_until(lambda: copier.progress_percent() >= 20.0,
                           timeout_sec=self.progress_timeout_sec,
                           err_msg="%s : Message copier didn't make enough progress in %ds. Current progress: %s" \
                           % (copier.transactional_id, self.progress_timeout_sec, str(copier.progress_percent())))
                self.logger.info(
                    "%s - progress: %s" %
                    (copier.transactional_id, str(copier.progress_percent())))
                copier.restart(clean_shutdown)

    def create_and_start_copiers(self, input_topic, output_topic, num_copiers,
                                 use_group_metadata):
        copiers = []
        for i in range(0, num_copiers):
            copiers.append(
                self.create_and_start_message_copier(
                    input_topic=input_topic,
                    output_topic=output_topic,
                    input_partition=i,
                    transactional_id="copier-" + str(i),
                    use_group_metadata=use_group_metadata))
        return copiers

    def start_consumer(self, topic_to_read, group_id):
        consumer = ConsoleConsumer(context=self.test_context,
                                   num_nodes=1,
                                   kafka=self.kafka,
                                   topic=topic_to_read,
                                   group_id=group_id,
                                   message_validator=is_int,
                                   from_beginning=True,
                                   isolation_level="read_committed")
        consumer.start()
        # ensure that the consumer is up.
        wait_until(lambda: len(consumer.messages_consumed[1]) > 0,
                   timeout_sec=60,
                   err_msg="Consumer failed to consume any messages for %ds" %\
                   60)
        return consumer

    def drain_consumer(self, consumer, num_messages):
        # wait until we read at least the expected number of messages.
        # This is a safe check because both failure modes will be caught:
        #  1. If we have 'num_seed_messages' but there are duplicates, then
        #     this is checked for later.
        #
        #  2. If we never reach 'num_seed_messages', then this will cause the
        #     test to fail.
        wait_until(lambda: len(consumer.messages_consumed[1]) >= num_messages,
                   timeout_sec=90,
                   err_msg="Consumer consumed only %d out of %d messages in %ds" %\
                   (len(consumer.messages_consumed[1]), num_messages, 90))
        consumer.stop()
        return consumer.messages_consumed[1]

    def copy_messages_transactionally(self, failure_mode, bounce_target,
                                      input_topic, output_topic, num_copiers,
                                      num_messages_to_copy,
                                      use_group_metadata):
        """Copies messages transactionally from the seeded input topic to the
        output topic, either bouncing brokers or clients in a hard and soft
        way as it goes.

        This method also consumes messages in read_committed mode from the
        output topic while the bounces and copy is going on.

        It returns the concurrently consumed messages.
        """
        copiers = self.create_and_start_copiers(
            input_topic=input_topic,
            output_topic=output_topic,
            num_copiers=num_copiers,
            use_group_metadata=use_group_metadata)
        concurrent_consumer = self.start_consumer(
            output_topic, group_id="concurrent_consumer")
        clean_shutdown = False
        if failure_mode == "clean_bounce":
            clean_shutdown = True

        if bounce_target == "brokers":
            self.bounce_brokers(clean_shutdown)
        elif bounce_target == "clients":
            self.bounce_copiers(copiers, clean_shutdown)

        copier_timeout_sec = 120
        for copier in copiers:
            wait_until(lambda: copier.is_done,
                       timeout_sec=copier_timeout_sec,
                       err_msg="%s - Failed to copy all messages in  %ds." %\
                       (copier.transactional_id, copier_timeout_sec))
        self.logger.info("finished copying messages")

        return self.drain_consumer(concurrent_consumer, num_messages_to_copy)

    def setup_topics(self):
        self.kafka.topics = {
            self.input_topic: {
                "partitions": self.num_input_partitions,
                "replication-factor": 3,
                "configs": {
                    "min.insync.replicas": 2
                }
            },
            self.output_topic: {
                "partitions": self.num_output_partitions,
                "replication-factor": 3,
                "configs": {
                    "min.insync.replicas": 2
                }
            }
        }

    @cluster(num_nodes=9)
    @matrix(failure_mode=["hard_bounce", "clean_bounce"],
            bounce_target=["brokers", "clients"],
            check_order=[True, False],
            use_group_metadata=[True, False])
    def test_transactions(self,
                          failure_mode,
                          bounce_target,
                          check_order,
                          use_group_metadata,
                          metadata_quorum=quorum.all):
        security_protocol = 'PLAINTEXT'
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.logs["kafka_data_1"]["collect_default"] = True
        self.kafka.logs["kafka_data_2"]["collect_default"] = True
        self.kafka.logs["kafka_operational_logs_debug"][
            "collect_default"] = True
        if check_order:
            # To check ordering, we simply create input and output topics
            # with a single partition.
            # We reduce the number of seed messages to copy to account for the fewer output
            # partitions, and thus lower parallelism. This helps keep the test
            # time shorter.
            self.num_seed_messages = self.num_seed_messages // 3
            self.num_input_partitions = 1
            self.num_output_partitions = 1

        self.setup_topics()
        self.kafka.start()

        input_messages = self.seed_messages(self.input_topic,
                                            self.num_seed_messages)
        concurrently_consumed_messages = self.copy_messages_transactionally(
            failure_mode,
            bounce_target,
            input_topic=self.input_topic,
            output_topic=self.output_topic,
            num_copiers=self.num_input_partitions,
            num_messages_to_copy=self.num_seed_messages,
            use_group_metadata=use_group_metadata)
        output_messages = self.get_messages_from_topic(self.output_topic,
                                                       self.num_seed_messages)

        concurrently_consumed_message_set = set(concurrently_consumed_messages)
        output_message_set = set(output_messages)
        input_message_set = set(input_messages)

        num_dups = abs(len(output_messages) - len(output_message_set))
        num_dups_in_concurrent_consumer = abs(
            len(concurrently_consumed_messages) -
            len(concurrently_consumed_message_set))
        assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups
        assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\
            (len(input_message_set), len(output_message_set))

        assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer
        assert input_message_set == concurrently_consumed_message_set, \
            "Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" %\
            (len(input_message_set), len(concurrently_consumed_message_set))
        if check_order:
            assert input_messages == sorted(
                input_messages
            ), "The seed messages themselves were not in order"
            assert output_messages == input_messages, "Output messages are not in order"
            assert concurrently_consumed_messages == output_messages, "Concurrently consumed messages are not in order"
class GroupModeTransactionsTest(Test):
    """Essentially testing the same functionality as TransactionsTest by transactionally copying data
    from a source topic to a destination topic and killing the copy process as well as the broker
    randomly through the process. The major difference is that we choose to work as a collaborated
    group with same topic subscription instead of individual copiers.

    In the end we verify that the final output topic contains exactly one committed copy of
    each message from the original producer.
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(GroupModeTransactionsTest,
              self).__init__(test_context=test_context)

        self.input_topic = "input-topic"
        self.output_topic = "output-topic"

        self.num_brokers = 3

        # Test parameters
        self.num_input_partitions = 9
        self.num_output_partitions = 9
        self.num_copiers = 3
        self.num_seed_messages = 100000
        self.transaction_size = 750
        # The transaction timeout should be lower than the progress timeout, but at
        # least as high as the request timeout (which is 30s by default). When the
        # client is hard-bounced, progress may depend on the previous transaction
        # being aborted. When the broker is hard-bounced, we may have to wait as
        # long as the request timeout to get a `Produce` response and we do not
        # want the coordinator timing out the transaction.
        self.transaction_timeout = 40000
        self.progress_timeout_sec = 60
        self.consumer_group = "grouped-transactions-test-consumer-group"

        self.zk = ZookeeperService(test_context,
                                   num_nodes=1) if quorum.for_test(
                                       test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context,
                                  num_nodes=self.num_brokers,
                                  zk=self.zk,
                                  controller_num_nodes_override=1)

    def setUp(self):
        if self.zk:
            self.zk.start()

    def seed_messages(self, topic, num_seed_messages):
        seed_timeout_sec = 10000
        seed_producer = VerifiableProducer(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            topic=topic,
            message_validator=is_int,
            max_messages=num_seed_messages,
            enable_idempotence=True,
            repeating_keys=self.num_input_partitions)
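        # NOTE: repeating_keys is assumed to make the producer cycle message
        # keys through 0..num_input_partitions-1, so the seeded records are
        # keyed and can later be validated per partition.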
        seed_producer.start()
        wait_until(lambda: seed_producer.num_acked >= num_seed_messages,
                   timeout_sec=seed_timeout_sec,
                   err_msg="Producer failed to produce messages %d in  %ds." % \
                           (self.num_seed_messages, seed_timeout_sec))
        return seed_producer.acked_by_partition

    def get_messages_from_topic(self, topic, num_messages):
        consumer = self.start_consumer(topic, group_id="verifying_consumer")
        return self.drain_consumer(consumer, num_messages)

    def bounce_brokers(self, clean_shutdown):
        for node in self.kafka.nodes:
            if clean_shutdown:
                self.kafka.restart_node(node, clean_shutdown=True)
            else:
                self.kafka.stop_node(node, clean_shutdown=False)
                gracePeriodSecs = 5
                if self.zk:
                    wait_until(
                        lambda: len(self.kafka.pids(
                            node)) == 0 and not self.kafka.is_registered(node),
                        timeout_sec=self.kafka.zk_session_timeout +
                        gracePeriodSecs,
                        err_msg=
                        "Failed to see timely deregistration of hard-killed broker %s"
                        % str(node.account))
                else:
                    brokerSessionTimeoutSecs = 18
                    wait_until(
                        lambda: len(self.kafka.pids(node)) == 0,
                        timeout_sec=brokerSessionTimeoutSecs + gracePeriodSecs,
                        err_msg=
                        "Failed to see timely disappearance of process for hard-killed broker %s"
                        % str(node.account))
                    time.sleep(brokerSessionTimeoutSecs + gracePeriodSecs)
                self.kafka.start_node(node)

    def create_and_start_message_copier(self, input_topic, output_topic,
                                        transactional_id):
        message_copier = TransactionalMessageCopier(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            transactional_id=transactional_id,
            consumer_group=self.consumer_group,
            input_topic=input_topic,
            input_partition=-1,
            output_topic=output_topic,
            max_messages=-1,
            transaction_size=self.transaction_size,
            transaction_timeout=self.transaction_timeout,
            use_group_metadata=True,
            group_mode=True)
        message_copier.start()
        wait_until(lambda: message_copier.alive(message_copier.nodes[0]),
                   timeout_sec=10,
                   err_msg="Message copier failed to start after 10 s")
        return message_copier

    def bounce_copiers(self, copiers, clean_shutdown, timeout_sec=240):
        for _ in range(3):
            for copier in copiers:
                wait_until(lambda: copier.progress_percent() >= 20.0,
                           timeout_sec=self.progress_timeout_sec,
                           err_msg="%s : Message copier didn't make enough progress in %ds. Current progress: %s" \
                                   % (copier.transactional_id, self.progress_timeout_sec, str(copier.progress_percent())))
                self.logger.info(
                    "%s - progress: %s" %
                    (copier.transactional_id, str(copier.progress_percent())))
                copier.restart(clean_shutdown)

    def create_and_start_copiers(self, input_topic, output_topic, num_copiers):
        copiers = []
        for i in range(0, num_copiers):
            copiers.append(
                self.create_and_start_message_copier(
                    input_topic=input_topic,
                    output_topic=output_topic,
                    transactional_id="copier-" + str(i)))
        return copiers

    @staticmethod
    def valid_value_and_partition(msg):
        """Method used to check whether the given message is a valid tab
        separated value + partition

        return value and partition as a size-two array represented tuple: [value, partition]
        """
        try:
            splitted_msg = msg.split('\t')
            value = int(splitted_msg[1])
            partition = int(splitted_msg[0].split(":")[1])
            return [value, partition]

        except ValueError:
            raise Exception(
                "Unexpected message format (expected a tab separated [value, partition] tuple). Message: %s"
                % (msg))
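
    # Usage sketch (the exact console-consumer line format is an assumption;
    # the parser above only relies on the shape "<prefix>:<partition>\t<value>"):
    #   GroupModeTransactionsTest.valid_value_and_partition("Partition:1\t42")
    #   returns [42, 1]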

    def start_consumer(self, topic_to_read, group_id):
        consumer = ConsoleConsumer(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            topic=topic_to_read,
            group_id=group_id,
            message_validator=self.valid_value_and_partition,
            from_beginning=True,
            print_partition=True,
            isolation_level="read_committed")
        consumer.start()
        # ensure that the consumer is up.
        wait_until(lambda: len(consumer.messages_consumed[1]) > 0,
                   timeout_sec=60,
                   err_msg="Consumer failed to consume any messages for %ds" % \
                           60)
        return consumer

    @staticmethod
    def split_by_partition(messages_consumed):
        messages_by_partition = {}

        for msg in messages_consumed:
            partition = msg[1]
            if partition not in messages_by_partition:
                messages_by_partition[partition] = []
            messages_by_partition[partition].append(msg[0])
        return messages_by_partition
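
    # For example, with the [value, partition] pairs produced by
    # valid_value_and_partition above:
    #   split_by_partition([[10, 0], [11, 1], [12, 0]])
    #   returns {0: [10, 12], 1: [11]}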

    def drain_consumer(self, consumer, num_messages):
        # wait until we read at least the expected number of messages.
        # This is a safe check because both failure modes will be caught:
        #  1. If we have 'num_seed_messages' but there are duplicates, then
        #     this is checked for later.
        #
        #  2. If we never reach 'num_seed_messages', then this will cause the
        #     test to fail.
        wait_until(lambda: len(consumer.messages_consumed[1]) >= num_messages,
                   timeout_sec=90,
                   err_msg="Consumer consumed only %d out of %d messages in %ds" % \
                           (len(consumer.messages_consumed[1]), num_messages, 90))
        consumer.stop()
        return self.split_by_partition(consumer.messages_consumed[1])

    def copy_messages_transactionally(self, failure_mode, bounce_target,
                                      input_topic, output_topic, num_copiers,
                                      num_messages_to_copy):
        """Copies messages transactionally from the seeded input topic to the
        output topic, either bouncing brokers or clients in a hard and soft
        way as it goes.

        This method also consumes messages in read_committed mode from the
        output topic while the bounces and copy is going on.

        It returns the concurrently consumed messages.
        """
        copiers = self.create_and_start_copiers(input_topic=input_topic,
                                                output_topic=output_topic,
                                                num_copiers=num_copiers)
        concurrent_consumer = self.start_consumer(
            output_topic, group_id="concurrent_consumer")
        clean_shutdown = False
        if failure_mode == "clean_bounce":
            clean_shutdown = True

        if bounce_target == "brokers":
            self.bounce_brokers(clean_shutdown)
        elif bounce_target == "clients":
            self.bounce_copiers(copiers, clean_shutdown)

        copier_timeout_sec = 240
        for copier in copiers:
            wait_until(lambda: copier.is_done,
                       timeout_sec=copier_timeout_sec,
                       err_msg="%s - Failed to copy all messages in %ds." % \
                               (copier.transactional_id, copier_timeout_sec))
        self.logger.info("finished copying messages")

        return self.drain_consumer(concurrent_consumer, num_messages_to_copy)

    def setup_topics(self):
        self.kafka.topics = {
            self.input_topic: {
                "partitions": self.num_input_partitions,
                "replication-factor": 3,
                "configs": {
                    "min.insync.replicas": 2
                }
            },
            self.output_topic: {
                "partitions": self.num_output_partitions,
                "replication-factor": 3,
                "configs": {
                    "min.insync.replicas": 2
                }
            }
        }

    @cluster(num_nodes=10)
    @matrix(failure_mode=["hard_bounce", "clean_bounce"],
            bounce_target=["brokers", "clients"])
    def test_transactions(self,
                          failure_mode,
                          bounce_target,
                          metadata_quorum=quorum.zk):
        security_protocol = 'PLAINTEXT'
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.logs["kafka_data_1"]["collect_default"] = True
        self.kafka.logs["kafka_data_2"]["collect_default"] = True
        self.kafka.logs["kafka_operational_logs_debug"][
            "collect_default"] = True

        self.setup_topics()
        self.kafka.start()

        input_messages_by_partition = self.seed_messages(
            self.input_topic, self.num_seed_messages)
        concurrently_consumed_message_by_partition = self.copy_messages_transactionally(
            failure_mode,
            bounce_target,
            input_topic=self.input_topic,
            output_topic=self.output_topic,
            num_copiers=self.num_copiers,
            num_messages_to_copy=self.num_seed_messages)
        output_messages_by_partition = self.get_messages_from_topic(
            self.output_topic, self.num_seed_messages)

        assert len(input_messages_by_partition) == \
               len(concurrently_consumed_message_by_partition), "The partition counts don't match: " \
                                                                "input partitions count %d, " \
                                                                "concurrently consumed partitions count %d" % \
                                                                (len(input_messages_by_partition), len(concurrently_consumed_message_by_partition))

        assert len(input_messages_by_partition) == \
               len(output_messages_by_partition), "The partition counts don't match: " \
                                                  "input partitions count %d, " \
                                                  "output partitions count %d" % \
                                                  (len(input_messages_by_partition), len(output_messages_by_partition))

        for p in range(self.num_input_partitions):
            if p not in input_messages_by_partition:
                continue

            assert p in output_messages_by_partition, "Partition %d not in output messages" % p
            assert p in concurrently_consumed_message_by_partition, "Partition %d not in concurrently consumed messages" % p

            # Per-partition views used by the checks below, including the
            # ordering assertions at the end of the loop.
            input_messages = input_messages_by_partition[p]
            output_messages = output_messages_by_partition[p]
            concurrently_consumed_messages = concurrently_consumed_message_by_partition[p]

            input_message_set = set(input_messages)
            output_message_set = set(output_messages)
            concurrently_consumed_message_set = set(concurrently_consumed_messages)

            num_dups = abs(len(output_messages) - len(output_message_set))
            num_dups_in_concurrent_consumer = abs(
                len(concurrently_consumed_messages) -
                len(concurrently_consumed_message_set))
            assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups
            assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" % \
                                                        (len(input_message_set), len(output_message_set))

            assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer
            assert input_message_set == concurrently_consumed_message_set, \
                "Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" % \
                (len(input_message_set), len(concurrently_consumed_message_set))

            assert input_messages == sorted(
                input_messages
            ), "The seed messages themselves were not in order"
            assert output_messages == input_messages, "Output messages are not in order"
            assert concurrently_consumed_messages == output_messages, "Concurrently consumed messages are not in order"