Example 1
class SecurityTest(ProduceConsumeValidateTest):
    """
    These tests validate security features.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(SecurityTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk, topics={self.topic: {
                                                                    "partitions": 2,
                                                                    "replication-factor": 1}
                                                                })
        self.num_partitions = 2
        self.timeout_sec = 10000
        self.producer_throughput = 1000
        self.num_producers = 1
        self.num_consumers = 1

    def setUp(self):
        self.zk.start()

    @parametrize(security_protocol='PLAINTEXT', interbroker_security_protocol='SSL')
    @parametrize(security_protocol='SSL', interbroker_security_protocol='PLAINTEXT')
    def test_client_ssl_endpoint_validation_failure(self, security_protocol, interbroker_security_protocol):
        """
        Test that invalid hostname in certificate results in connection failures.
        When security_protocol=SSL, client SSL handshakes are expected to fail due to hostname verification failure.
        When security_protocol=PLAINTEXT and interbroker_security_protocol=SSL, controller connections fail
        with hostname verification failure. Hence clients are expected to fail with LEADER_NOT_AVAILABLE.
        """

        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = interbroker_security_protocol
        SecurityConfig.ssl_stores = TestSslStores()

        SecurityConfig.ssl_stores.invalid_hostname = True
        self.kafka.start()
        self.create_producer_and_consumer()
        self.producer.log_level = "TRACE"
        self.producer.start()
        self.consumer.start()
        time.sleep(10)
        assert self.producer.num_acked == 0, "Messages published successfully, endpoint validation did not fail with invalid hostname"
        error = 'SSLHandshakeException' if security_protocol == 'SSL' else 'LEADER_NOT_AVAILABLE'
        for node in self.producer.nodes:
            node.account.ssh("grep %s %s" % (error, self.producer.LOG_FILE))
        for node in self.consumer.nodes:
            node.account.ssh("grep %s %s" % (error, self.consumer.LOG_FILE))

        self.producer.stop()
        self.consumer.stop()
        self.producer.log_level = "INFO"

        SecurityConfig.ssl_stores.invalid_hostname = False
        for node in self.kafka.nodes:
            self.kafka.restart_node(node, clean_shutdown=True)
        self.create_producer_and_consumer()
        self.run_produce_consume_validate()

    def create_producer_and_consumer(self):
        self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput)
        self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=10000, message_validator=is_int)
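
For reference, the grep-based verification above can be factored into a small helper. This is a minimal sketch (not part of the original test), assuming a ducktape-style client that exposes `nodes`, `node.account.ssh`, and a `LOG_FILE` path, as VerifiableProducer and ConsoleConsumer do here; `account.ssh` raises on a non-zero exit status, so an absent error string fails the check.

def assert_error_in_logs(client, error):
    """Fail unless `error` appears in the client's log on every node (illustrative helper)."""
    for node in client.nodes:
        # grep exits non-zero when there is no match, which makes ssh() raise.
        node.account.ssh("grep %s %s" % (error, client.LOG_FILE))
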
Example 2
class TransactionsTest(Test):
    """Tests transactions by transactionally copying data from a source topic to
    a destination topic and killing the copy process as well as the broker
    randomly through the process. In the end we verify that the final output
    topic contains exactly one committed copy of each message in the input
    topic.
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(TransactionsTest, self).__init__(test_context=test_context)

        self.input_topic = "input-topic"
        self.output_topic = "output-topic"

        self.num_brokers = 3

        # Test parameters
        self.num_input_partitions = 2
        self.num_output_partitions = 3
        self.num_seed_messages = 100000
        self.transaction_size = 750

        # The transaction timeout should be lower than the progress timeout, but at
        # least as high as the request timeout (which is 30s by default). When the
        # client is hard-bounced, progress may depend on the previous transaction
        # being aborted. When the broker is hard-bounced, we may have to wait as
        # long as the request timeout to get a `Produce` response and we do not
        # want the coordinator timing out the transaction.
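        # With the values chosen below: request timeout 30s <= transaction timeout 40s < progress timeout 60s.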
        self.transaction_timeout = 40000
        self.progress_timeout_sec = 60
        self.consumer_group = "transactions-test-consumer-group"

        self.zk = ZookeeperService(test_context,
                                   num_nodes=1) if quorum.for_test(
                                       test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context,
                                  num_nodes=self.num_brokers,
                                  zk=self.zk,
                                  controller_num_nodes_override=1)

    def setUp(self):
        if self.zk:
            self.zk.start()

    def seed_messages(self, topic, num_seed_messages):
        seed_timeout_sec = 10000
        seed_producer = VerifiableProducer(context=self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=topic,
                                           message_validator=is_int,
                                           max_messages=num_seed_messages,
                                           enable_idempotence=True)
        seed_producer.start()
        wait_until(lambda: seed_producer.num_acked >= num_seed_messages,
                   timeout_sec=seed_timeout_sec,
                   err_msg="Producer failed to produce messages %d in %ds." %\
                   (self.num_seed_messages, seed_timeout_sec))
        return seed_producer.acked

    def get_messages_from_topic(self, topic, num_messages):
        consumer = self.start_consumer(topic, group_id="verifying_consumer")
        return self.drain_consumer(consumer, num_messages)

    def bounce_brokers(self, clean_shutdown):
        for node in self.kafka.nodes:
            if clean_shutdown:
                self.kafka.restart_node(node, clean_shutdown=True)
            else:
                self.kafka.stop_node(node, clean_shutdown=False)
                gracePeriodSecs = 5
                if self.zk:
                    wait_until(
                        lambda: len(self.kafka.pids(
                            node)) == 0 and not self.kafka.is_registered(node),
                        timeout_sec=self.kafka.zk_session_timeout +
                        gracePeriodSecs,
                        err_msg=
                        "Failed to see timely deregistration of hard-killed broker %s"
                        % str(node.account))
                else:
                    brokerSessionTimeoutSecs = 18
                    wait_until(
                        lambda: len(self.kafka.pids(node)) == 0,
                        timeout_sec=brokerSessionTimeoutSecs + gracePeriodSecs,
                        err_msg=
                        "Failed to see timely disappearance of process for hard-killed broker %s"
                        % str(node.account))
                    time.sleep(brokerSessionTimeoutSecs + gracePeriodSecs)
                self.kafka.start_node(node)

    def create_and_start_message_copier(self, input_topic, input_partition,
                                        output_topic, transactional_id,
                                        use_group_metadata):
        message_copier = TransactionalMessageCopier(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            transactional_id=transactional_id,
            consumer_group=self.consumer_group,
            input_topic=input_topic,
            input_partition=input_partition,
            output_topic=output_topic,
            max_messages=-1,
            transaction_size=self.transaction_size,
            transaction_timeout=self.transaction_timeout,
            use_group_metadata=use_group_metadata)
        message_copier.start()
        wait_until(lambda: message_copier.alive(message_copier.nodes[0]),
                   timeout_sec=10,
                   err_msg="Message copier failed to start after 10 s")
        return message_copier

    def bounce_copiers(self, copiers, clean_shutdown):
        for _ in range(3):
            for copier in copiers:
                wait_until(lambda: copier.progress_percent() >= 20.0,
                           timeout_sec=self.progress_timeout_sec,
                           err_msg="%s : Message copier didn't make enough progress in %ds. Current progress: %s" \
                           % (copier.transactional_id, self.progress_timeout_sec, str(copier.progress_percent())))
                self.logger.info(
                    "%s - progress: %s" %
                    (copier.transactional_id, str(copier.progress_percent())))
                copier.restart(clean_shutdown)

    def create_and_start_copiers(self, input_topic, output_topic, num_copiers,
                                 use_group_metadata):
        copiers = []
        for i in range(0, num_copiers):
            copiers.append(
                self.create_and_start_message_copier(
                    input_topic=input_topic,
                    output_topic=output_topic,
                    input_partition=i,
                    transactional_id="copier-" + str(i),
                    use_group_metadata=use_group_metadata))
        return copiers

    def start_consumer(self, topic_to_read, group_id):
        consumer = ConsoleConsumer(context=self.test_context,
                                   num_nodes=1,
                                   kafka=self.kafka,
                                   topic=topic_to_read,
                                   group_id=group_id,
                                   message_validator=is_int,
                                   from_beginning=True,
                                   isolation_level="read_committed")
        consumer.start()
        # ensure that the consumer is up.
        wait_until(lambda: len(consumer.messages_consumed[1]) > 0,
                   timeout_sec=60,
                   err_msg="Consumer failed to consume any messages for %ds" %\
                   60)
        return consumer

    def drain_consumer(self, consumer, num_messages):
        # wait until we read at least the expected number of messages.
        # This is a safe check because both failure modes will be caught:
        #  1. If we have 'num_seed_messages' but there are duplicates, then
        #     this is checked for later.
        #
        #  2. If we never reach 'num_seed_messages', then this will cause the
        #     test to fail.
        wait_until(lambda: len(consumer.messages_consumed[1]) >= num_messages,
                   timeout_sec=90,
                   err_msg="Consumer consumed only %d out of %d messages in %ds" %\
                   (len(consumer.messages_consumed[1]), num_messages, 90))
        consumer.stop()
        return consumer.messages_consumed[1]

    def copy_messages_transactionally(self, failure_mode, bounce_target,
                                      input_topic, output_topic, num_copiers,
                                      num_messages_to_copy,
                                      use_group_metadata):
        """Copies messages transactionally from the seeded input topic to the
        output topic, either bouncing brokers or clients in a hard or soft
        way as it goes.

        This method also consumes messages in read_committed mode from the
        output topic while the bounces and copy is going on.

        It returns the concurrently consumed messages.
        """
        copiers = self.create_and_start_copiers(
            input_topic=input_topic,
            output_topic=output_topic,
            num_copiers=num_copiers,
            use_group_metadata=use_group_metadata)
        concurrent_consumer = self.start_consumer(
            output_topic, group_id="concurrent_consumer")
        clean_shutdown = False
        if failure_mode == "clean_bounce":
            clean_shutdown = True

        if bounce_target == "brokers":
            self.bounce_brokers(clean_shutdown)
        elif bounce_target == "clients":
            self.bounce_copiers(copiers, clean_shutdown)

        copier_timeout_sec = 120
        for copier in copiers:
            wait_until(lambda: copier.is_done,
                       timeout_sec=copier_timeout_sec,
                       err_msg="%s - Failed to copy all messages in  %ds." %\
                       (copier.transactional_id, copier_timeout_sec))
        self.logger.info("finished copying messages")

        return self.drain_consumer(concurrent_consumer, num_messages_to_copy)

    def setup_topics(self):
        self.kafka.topics = {
            self.input_topic: {
                "partitions": self.num_input_partitions,
                "replication-factor": 3,
                "configs": {
                    "min.insync.replicas": 2
                }
            },
            self.output_topic: {
                "partitions": self.num_output_partitions,
                "replication-factor": 3,
                "configs": {
                    "min.insync.replicas": 2
                }
            }
        }

    @cluster(num_nodes=9)
    @matrix(failure_mode=["hard_bounce", "clean_bounce"],
            bounce_target=["brokers", "clients"],
            check_order=[True, False],
            use_group_metadata=[True, False])
    def test_transactions(self,
                          failure_mode,
                          bounce_target,
                          check_order,
                          use_group_metadata,
                          metadata_quorum=quorum.all):
        security_protocol = 'PLAINTEXT'
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.logs["kafka_data_1"]["collect_default"] = True
        self.kafka.logs["kafka_data_2"]["collect_default"] = True
        self.kafka.logs["kafka_operational_logs_debug"][
            "collect_default"] = True
        if check_order:
            # To check ordering, we simply create input and output topics
            # with a single partition.
            # We reduce the number of seed messages to copy to account for the fewer output
            # partitions, and thus lower parallelism. This helps keep the test
            # time shorter.
            self.num_seed_messages = self.num_seed_messages // 3
            self.num_input_partitions = 1
            self.num_output_partitions = 1

        self.setup_topics()
        self.kafka.start()

        input_messages = self.seed_messages(self.input_topic,
                                            self.num_seed_messages)
        concurrently_consumed_messages = self.copy_messages_transactionally(
            failure_mode,
            bounce_target,
            input_topic=self.input_topic,
            output_topic=self.output_topic,
            num_copiers=self.num_input_partitions,
            num_messages_to_copy=self.num_seed_messages,
            use_group_metadata=use_group_metadata)
        output_messages = self.get_messages_from_topic(self.output_topic,
                                                       self.num_seed_messages)

        concurrently_consumed_message_set = set(concurrently_consumed_messages)
        output_message_set = set(output_messages)
        input_message_set = set(input_messages)

        num_dups = abs(len(output_messages) - len(output_message_set))
        num_dups_in_concurrent_consumer = abs(
            len(concurrently_consumed_messages) -
            len(concurrently_consumed_message_set))
        assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups
        assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\
            (len(input_message_set), len(output_message_set))

        assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer
        assert input_message_set == concurrently_consumed_message_set, \
            "Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" %\
            (len(input_message_set), len(concurrently_consumed_message_set))
        if check_order:
            assert input_messages == sorted(
                input_messages
            ), "The seed messages themselves were not in order"
            assert output_messages == input_messages, "Output messages are not in order"
            assert concurrently_consumed_messages == output_messages, "Concurrently consumed messages are not in order"
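
The final assertions above reduce to set arithmetic over the delivered values. A self-contained sketch of that exactly-once check (function and variable names here are illustrative, not part of the test):

def verify_exactly_once(input_messages, output_messages):
    """Return (num_duplicates, missing_values) for a transactional copy.

    Exactly-once delivery holds when num_duplicates == 0 and missing_values is
    empty: every input value shows up in the output exactly once.
    """
    num_duplicates = len(output_messages) - len(set(output_messages))
    missing_values = set(input_messages) - set(output_messages)
    return num_duplicates, missing_values

# Example: the value 2 was committed twice, nothing is missing.
assert verify_exactly_once([1, 2, 3], [1, 2, 2, 3]) == (1, set())
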
Example 3
class LogDirFailureTest(ProduceConsumeValidateTest):
    """
    Note that consuming is a bit tricky, at least with console consumer. The goal is to consume all messages
    (for each partition) in the topic. In this case, waiting for the last message may cause the consumer to stop
    too soon since console consumer is consuming multiple partitions from a single thread and therefore we lose
    ordering guarantees.

    Waiting on a count of consumed messages can be unreliable: if we stop consuming when num_consumed == num_acked,
    we might exit early if some messages are duplicated (though not an issue here since producer retries==0)

    Therefore we rely here on the consumer.timeout.ms setting, which times out on the interval between successively
    consumed messages. Since we run the producer to completion before running the consumer, this is a reliable
    indicator that nothing is left to consume.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(LogDirFailureTest, self).__init__(test_context=test_context)

        self.topic1 = "test_topic_1"
        self.topic2 = "test_topic_2"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  topics={
                                      self.topic1: {"partitions": 1, "replication-factor": 3, "configs": {"min.insync.replicas": 1}},
                                      self.topic2: {"partitions": 1, "replication-factor": 3, "configs": {"min.insync.replicas": 2}}
                                  },
                                  # Set log.roll.ms to 3 seconds so that broker will detect disk error sooner when it creates log segment
                                  # Otherwise broker will still be able to read/write the log file even if the log directory is inaccessible.
                                  server_prop_overides=[
                                      [config_property.OFFSETS_TOPIC_NUM_PARTITIONS, "1"],
                                      [config_property.LOG_FLUSH_INTERVAL_MESSAGE, "5"],
                                      [config_property.REPLICA_HIGHWATERMARK_CHECKPOINT_INTERVAL_MS, "60000"],
                                      [config_property.LOG_ROLL_TIME_MS, "3000"]
                                  ])

        self.producer_throughput = 1000
        self.num_producers = 1
        self.num_consumers = 1

    def setUp(self):
        self.zk.start()

    def min_cluster_size(self):
        """Override this since we're adding services outside of the constructor"""
        return super(LogDirFailureTest, self).min_cluster_size() + self.num_producers * 2 + self.num_consumers * 2

    @cluster(num_nodes=9)
    @matrix(bounce_broker=[False, True], broker_type=["leader", "follower"], security_protocol=["PLAINTEXT"])
    def test_replication_with_disk_failure(self, bounce_broker, security_protocol, broker_type):
        """Replication tests.
        These tests verify that replication provides simple durability guarantees by checking that data acked by
        brokers is still available for consumption in the face of various failure scenarios.

        Setup: 1 zk, 3 kafka nodes, one topic with partitions=1, replication-factor=3, and min.insync.replicas=1,
               and another topic with partitions=1, replication-factor=3, and min.insync.replicas=2
            - Produce messages in the background
            - Consume messages in the background
            - Drive broker failures (shutdown, or bounce repeatedly with kill -15 or kill -9)
            - When done driving failures, stop producing, and finish consuming
            - Validate that every acked message was consumed
        """

        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.start()

        try:
            # Initialize producer/consumer for topic2
            self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic2,
                                               throughput=self.producer_throughput)
            self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic2, group_id="test-consumer-group-1",
                                            consumer_timeout_ms=60000, message_validator=is_int)
            self.start_producer_and_consumer()

            # Get a replica of the partition of topic2 and make its log directory offline by changing the log dir's permission.
            # We assume that partition of topic2 is created in the second log directory of respective brokers.
            broker_node = select_node(self, broker_type, self.topic2)
            broker_idx = self.kafka.idx(broker_node)
            assert broker_idx in self.kafka.isr_idx_list(self.topic2), \
                   "Broker %d should be in isr set %s" % (broker_idx, str(self.kafka.isr_idx_list(self.topic2)))

            # Verify that topic1 and the consumer offset topic are in the first log directory and topic2 is in the second log directory
            topic_1_partition_0 = KafkaService.DATA_LOG_DIR_1 + "/test_topic_1-0"
            topic_2_partition_0 = KafkaService.DATA_LOG_DIR_2 + "/test_topic_2-0"
            offset_topic_partition_0 = KafkaService.DATA_LOG_DIR_1 + "/__consumer_offsets-0"
            for path in [topic_1_partition_0, topic_2_partition_0, offset_topic_partition_0]:
                assert path_exists(broker_node, path), "%s should exist" % path

            self.logger.debug("Making log dir %s inaccessible" % (KafkaService.DATA_LOG_DIR_2))
            cmd = "chmod a-w %s -R" % (KafkaService.DATA_LOG_DIR_2)
            broker_node.account.ssh(cmd, allow_fail=False)

            if bounce_broker:
                self.kafka.restart_node(broker_node, clean_shutdown=True)

            # Verify the following:
            # 1) The broker with offline log directory is not the leader of the partition of topic2
            # 2) The broker with offline log directory is not in the ISR
            # 3) The broker with offline log directory is still online
            # 4) Messages can still be produced and consumed from topic2
            wait_until(lambda: self.kafka.leader(self.topic2, partition=0) != broker_node,
                       timeout_sec=60,
                       err_msg="Broker %d should not be leader of topic %s and partition 0" % (broker_idx, self.topic2))
            assert self.kafka.alive(broker_node), "Broker %d should still be online" % (broker_idx)
            wait_until(lambda: broker_idx not in self.kafka.isr_idx_list(self.topic2),
                       timeout_sec=60,
                       err_msg="Broker %d should not be in isr set %s" % (broker_idx, str(self.kafka.isr_idx_list(self.topic2))))

            self.stop_producer_and_consumer()
            self.validate()

            # Shutdown all other brokers so that the broker with offline log dir is the only online broker
            offline_nodes = []
            for node in self.kafka.nodes:
                if broker_node != node:
                    offline_nodes.append(node)
                    self.logger.debug("Hard shutdown broker %d" % (self.kafka.idx(node)))
                    self.kafka.stop_node(node)

            # Verify the following:
            # 1) The broker with offline directory is the only in-sync broker of the partition of topic1
            # 2) Messages can still be produced and consumed from topic1
            self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic1,
                                               throughput=self.producer_throughput, offline_nodes=offline_nodes)
            self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic1, group_id="test-consumer-group-2",
                                            consumer_timeout_ms=90000, message_validator=is_int)
            self.consumer_start_timeout_sec = 90
            self.start_producer_and_consumer()

            assert self.kafka.isr_idx_list(self.topic1) == [broker_idx], \
                   "In-sync replicas of topic %s and partition 0 should be %s" % (self.topic1, str([broker_idx]))

            self.stop_producer_and_consumer()
            self.validate()

        except BaseException as e:
            for s in self.test_context.services:
                self.mark_for_collect(s)
            raise
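
The disk failure in this test is injected purely through file permissions (the chmod call above). A hedged, illustrative helper capturing that step, assuming the same ducktape node interface and a log-dir path such as KafkaService.DATA_LOG_DIR_2:

def set_log_dir_writable(node, log_dir, writable):
    """Toggle write access on `log_dir` for all users, the way the test simulates a disk failure."""
    mode = "a+w" if writable else "a-w"
    node.account.ssh("chmod %s %s -R" % (mode, log_dir), allow_fail=False)
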
class LogDirFailureTest(ProduceConsumeValidateTest):
    """
    Note that consuming is a bit tricky, at least with console consumer. The goal is to consume all messages
    (for each partition) in the topic. In this case, waiting for the last message may cause the consumer to stop
    too soon since console consumer is consuming multiple partitions from a single thread and therefore we lose
    ordering guarantees.

    Waiting on a count of consumed messages can be unreliable: if we stop consuming when num_consumed == num_acked,
    we might exit early if some messages are duplicated (though not an issue here since producer retries==0)

    Therefore we rely here on the consumer.timeout.ms setting, which times out on the interval between successively
    consumed messages. Since we run the producer to completion before running the consumer, this is a reliable
    indicator that nothing is left to consume.
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(LogDirFailureTest, self).__init__(test_context=test_context)

        self.topic1 = "test_topic_1"
        self.topic2 = "test_topic_2"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(
            test_context,
            num_nodes=3,
            zk=self.zk,
            topics={
                self.topic1: {
                    "partitions": 1,
                    "replication-factor": 3,
                    "configs": {
                        "min.insync.replicas": 2
                    }
                },
                self.topic2: {
                    "partitions": 1,
                    "replication-factor": 3,
                    "configs": {
                        "min.insync.replicas": 1
                    }
                }
            },
            # Set log.roll.ms to 3 seconds so that broker will detect disk error sooner when it creates log segment
            # Otherwise broker will still be able to read/write the log file even if the log directory is inaccessible.
            server_prop_overides=[
                [config_property.LOG_FLUSH_INTERVAL_MESSAGE, "5"],
                [
                    config_property.
                    REPLICA_HIGHWATERMARK_CHECKPOINT_INTERVAL_MS, "60000"
                ], [config_property.LOG_ROLL_TIME_MS, "3000"]
            ])

        self.producer_throughput = 1000
        self.num_producers = 1
        self.num_consumers = 1

    def setUp(self):
        self.zk.start()

    def min_cluster_size(self):
        """Override this since we're adding services outside of the constructor"""
        return super(LogDirFailureTest, self).min_cluster_size(
        ) + self.num_producers * 2 + self.num_consumers * 2

    @cluster(num_nodes=9)
    @matrix(bounce_broker=[False, True],
            broker_type=["leader", "follower"],
            security_protocol=["PLAINTEXT"])
    def test_replication_with_disk_failure(self, bounce_broker,
                                           security_protocol, broker_type):
        """Replication tests.
        These tests verify that replication provides simple durability guarantees by checking that data acked by
        brokers is still available for consumption in the face of various failure scenarios.

        Setup: 1 zk, 3 kafka nodes, one topic with partitions=1, replication-factor=3, and min.insync.replicas=2,
               and another topic with partitions=1, replication-factor=3, and min.insync.replicas=1

            - Produce messages in the background
            - Consume messages in the background
            - Drive broker failures (shutdown, or bounce repeatedly with kill -15 or kill -9)
            - When done driving failures, stop producing, and finish consuming
            - Validate that every acked message was consumed
        """

        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.start()

        try:
            # Initialize producer/consumer for topic1
            self.producer = VerifiableProducer(
                self.test_context,
                self.num_producers,
                self.kafka,
                self.topic1,
                throughput=self.producer_throughput)
            self.consumer = ConsoleConsumer(self.test_context,
                                            self.num_consumers,
                                            self.kafka,
                                            self.topic1,
                                            group_id="test-consumer-group-1",
                                            new_consumer=False,
                                            consumer_timeout_ms=60000,
                                            message_validator=is_int)
            self.start_producer_and_consumer()

            # Get a replica of the partition of topic1 and make its first log directory offline by changing the log dir's permission.
            # We assume that partition of topic1 is created in the first log directory of respective brokers.
            broker_node = select_node(self, broker_type, self.topic1)
            broker_idx = self.kafka.idx(broker_node)
            assert broker_idx in self.kafka.isr_idx_list(self.topic1), \
                   "Broker %d should be in isr set %s" % (broker_idx, str(self.kafka.isr_idx_list(self.topic1)))

            self.logger.debug("Making log dir %s inaccessible" %
                              (KafkaService.DATA_LOG_DIR_1))
            cmd = "chmod a-w %s -R" % (KafkaService.DATA_LOG_DIR_1)
            broker_node.account.ssh(cmd, allow_fail=False)

            if bounce_broker:
                self.kafka.restart_node(broker_node, clean_shutdown=True)

            # Verify the following:
            # 1) The broker with offline log directory is not the leader of the partition of topic1
            # 2) The broker with offline log directory is not in the ISR
            # 3) The broker with offline log directory is still online
            # 4) Messages can still be produced and consumed from topic1
            wait_until(
                lambda: self.kafka.leader(self.topic1, partition=0
                                          ) != broker_node,
                timeout_sec=60,
                err_msg=
                "Broker %d should not be leader of topic %s and partition 0" %
                (broker_idx, self.topic1))
            assert self.kafka.alive(
                broker_node), "Broker %d should be still online" % (broker_idx)
            wait_until(
                lambda: broker_idx not in self.kafka.isr_idx_list(self.topic1),
                timeout_sec=60,
                err_msg="Broker %d should not be in isr set %s" %
                (broker_idx, str(self.kafka.isr_idx_list(self.topic1))))

            self.stop_producer_and_consumer()
            self.validate()

            # Shutdown all other brokers so that the broker with offline log dir is the only online broker
            offline_nodes = []
            for node in self.kafka.nodes:
                if broker_node != node:
                    offline_nodes.append(node)
                    self.logger.debug("Hard shutdown broker %d" %
                                      (self.kafka.idx(node)))
                    self.kafka.stop_node(node)

            # Verify the following:
            # 1) The broker with offline directory is the only in-sync broker of the partition of topic2
            # 2) Messages can still be produced and consumed from topic2
            self.producer = VerifiableProducer(
                self.test_context,
                self.num_producers,
                self.kafka,
                self.topic2,
                throughput=self.producer_throughput,
                offline_nodes=offline_nodes)
            self.consumer = ConsoleConsumer(self.test_context,
                                            self.num_consumers,
                                            self.kafka,
                                            self.topic2,
                                            group_id="test-consumer-group-2",
                                            new_consumer=False,
                                            consumer_timeout_ms=60000,
                                            message_validator=is_int)
            self.start_producer_and_consumer()

            assert self.kafka.isr_idx_list(self.topic2) == [broker_idx], \
                   "In-sync replicas of topic %s and partition 0 should be %s" % (self.topic2, str([broker_idx]))

            self.stop_producer_and_consumer()
            self.validate()

        except BaseException as e:
            for s in self.test_context.services:
                self.mark_for_collect(s)
            raise
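
Both variants wait for the affected broker to drop out of the in-sync replica set before validating. A minimal sketch of that wait, assuming ducktape's `wait_until` helper (typically imported from `ducktape.utils.util` in these tests) and the `KafkaService.isr_idx_list` call used above:

from ducktape.utils.util import wait_until

def wait_until_out_of_isr(kafka, topic, broker_idx, timeout_sec=60):
    """Block until `broker_idx` has been removed from the ISR of `topic`."""
    wait_until(lambda: broker_idx not in kafka.isr_idx_list(topic),
               timeout_sec=timeout_sec,
               err_msg="Broker %d should not be in isr set %s" %
                       (broker_idx, str(kafka.isr_idx_list(topic))))
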
Example 5
class TransactionsTest(Test):
    """Tests transactions by transactionally copying data from a source topic to
    a destination topic and killing the copy process as well as the broker
    randomly through the process. In the end we verify that the final output
    topic contains exactly one committed copy of each message in the input
    topic
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(TransactionsTest, self).__init__(test_context=test_context)

        self.input_topic = "input-topic"
        self.output_topic = "output-topic"

        self.num_brokers = 3

        # Test parameters
        self.num_input_partitions = 2
        self.num_output_partitions = 3
        self.num_seed_messages = 100000
        self.transaction_size = 750
        self.first_transactional_id = "my-first-transactional-id"
        self.second_transactional_id = "my-second-transactional-id"
        self.consumer_group = "transactions-test-consumer-group"

        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=self.num_brokers,
                                  zk=self.zk,
                                  topics={
                                      self.input_topic: {
                                          "partitions":
                                          self.num_input_partitions,
                                          "replication-factor": 3,
                                          "configs": {
                                              "min.insync.replicas": 2
                                          }
                                      },
                                      self.output_topic: {
                                          "partitions":
                                          self.num_output_partitions,
                                          "replication-factor": 3,
                                          "configs": {
                                              "min.insync.replicas": 2
                                          }
                                      }
                                  })

    def setUp(self):
        self.zk.start()

    def seed_messages(self):
        seed_timeout_sec = 10000
        seed_producer = VerifiableProducer(context=self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=self.input_topic,
                                           message_validator=is_int,
                                           max_messages=self.num_seed_messages,
                                           enable_idempotence=True)
        seed_producer.start()
        wait_until(lambda: seed_producer.num_acked >= self.num_seed_messages,
                   timeout_sec=seed_timeout_sec,
                   err_msg="Producer failed to produce messages %d in  %ds." %\
                   (self.num_seed_messages, seed_timeout_sec))
        return seed_producer.acked

    def get_messages_from_output_topic(self):
        consumer = self.start_consumer(self.output_topic,
                                       group_id="verifying_consumer")
        return self.drain_consumer(consumer)

    def bounce_brokers(self, clean_shutdown):
        for node in self.kafka.nodes:
            if clean_shutdown:
                self.kafka.restart_node(node, clean_shutdown=True)
            else:
                self.kafka.stop_node(node, clean_shutdown=False)
                wait_until(lambda: len(self.kafka.pids(node)) == 0 and
                                   not self.kafka.is_registered(node),
                           timeout_sec=self.kafka.zk_session_timeout + 5,
                           err_msg="Failed to see timely deregistration of "
                                   "hard-killed broker %s" % str(node.account))
                self.kafka.start_node(node)

    def create_and_start_message_copier(self, input_partition,
                                        transactional_id):
        message_copier = TransactionalMessageCopier(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            transactional_id=transactional_id,
            consumer_group=self.consumer_group,
            input_topic=self.input_topic,
            input_partition=input_partition,
            output_topic=self.output_topic,
            max_messages=-1,
            transaction_size=self.transaction_size)
        message_copier.start()
        wait_until(lambda: message_copier.alive(message_copier.nodes[0]),
                   timeout_sec=10,
                   err_msg="Message copier failed to start after 10 s")
        return message_copier

    def bounce_copiers(self, copiers, clean_shutdown):
        for _ in range(3):
            for copier in copiers:
                wait_until(lambda: copier.progress_percent() >= 20.0,
                           timeout_sec=30,
                           err_msg="%s : Message copier didn't make enough progress in 30s. Current progress: %s" \
                           % (copier.transactional_id, str(copier.progress_percent())))
                self.logger.info(
                    "%s - progress: %s" %
                    (copier.transactional_id, str(copier.progress_percent())))
                copier.restart(clean_shutdown)

    def create_and_start_copiers(self):
        copiers = []
        copiers.append(
            self.create_and_start_message_copier(
                input_partition=0,
                transactional_id=self.first_transactional_id))
        copiers.append(
            self.create_and_start_message_copier(
                input_partition=1,
                transactional_id=self.second_transactional_id))
        return copiers

    def start_consumer(self, topic_to_read, group_id):
        consumer = ConsoleConsumer(context=self.test_context,
                                   num_nodes=1,
                                   kafka=self.kafka,
                                   topic=topic_to_read,
                                   group_id=group_id,
                                   new_consumer=True,
                                   message_validator=is_int,
                                   from_beginning=True,
                                   isolation_level="read_committed")
        consumer.start()
        # ensure that the consumer is up.
        wait_until(lambda: len(consumer.messages_consumed[1]) > 0,
                   timeout_sec=60,
                   err_msg="Consumer failed to consume any messages for %ds" %\
                   60)
        return consumer

    def drain_consumer(self, consumer):
        # wait until we read at least the expected number of messages.
        # This is a safe check because both failure modes will be caught:
        #  1. If we have 'num_seed_messages' but there are duplicates, then
        #     this is checked for later.
        #
        #  2. If we never reach 'num_seed_messages', then this will cause the
        #     test to fail.
        wait_until(lambda: len(consumer.messages_consumed[1]) >= self.num_seed_messages,
                   timeout_sec=90,
                   err_msg="Consumer consumed only %d out of %d messages in %ds" %\
                   (len(consumer.messages_consumed[1]), self.num_seed_messages, 90))
        consumer.stop()
        return consumer.messages_consumed[1]

    def copy_messages_transactionally(self, failure_mode, bounce_target):
        """Copies messages transactionally from the seeded input topic to the
        output topic, either bouncing brokers or clients in a hard or soft
        way as it goes.

        This method also consumes messages in read_committed mode from the
        output topic while the bounces and copy is going on.

        It returns the concurrently consumed messages.
        """
        copiers = self.create_and_start_copiers()
        concurrent_consumer = self.start_consumer(
            self.output_topic, group_id="concurrent_consumer")
        clean_shutdown = False
        if failure_mode == "clean_bounce":
            clean_shutdown = True

        if bounce_target == "brokers":
            self.bounce_brokers(clean_shutdown)
        elif bounce_target == "clients":
            self.bounce_copiers(copiers, clean_shutdown)

        for copier in copiers:
            wait_until(lambda: copier.is_done,
                       timeout_sec=60,
                       err_msg="%s - Failed to copy all messages in  %ds." %\
                       (copier.transactional_id, 60))
        self.logger.info("finished copying messages")
        return self.drain_consumer(concurrent_consumer)

    @cluster(num_nodes=9)
    @matrix(failure_mode=["hard_bounce", "clean_bounce"],
            bounce_target=["brokers", "clients"])
    def test_transactions(self, failure_mode, bounce_target):
        security_protocol = 'PLAINTEXT'
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.logs["kafka_data"]["collect_default"] = True
        self.kafka.logs["kafka_operational_logs_debug"][
            "collect_default"] = True
        self.kafka.start()
        input_messages = self.seed_messages()
        concurrently_consumed_messages = self.copy_messages_transactionally(
            failure_mode, bounce_target)
        output_messages = self.get_messages_from_output_topic()

        concurrently_consumed_message_set = set(concurrently_consumed_messages)
        output_message_set = set(output_messages)
        input_message_set = set(input_messages)

        num_dups = abs(len(output_messages) - len(output_message_set))
        num_dups_in_concurrent_consumer = abs(
            len(concurrently_consumed_messages) -
            len(concurrently_consumed_message_set))
        assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups
        assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\
            (len(input_message_set), len(output_message_set))

        assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer
        assert input_message_set == concurrently_consumed_message_set, \
            "Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" %\
            (len(input_message_set), len(concurrently_consumed_message_set))
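
The `@matrix` decorator above runs the test once per combination of the listed parameter values, so two failure modes times two bounce targets yield four runs. An illustrative cross product of those parameters (a sketch of the expansion only, not of how ducktape schedules the runs):

from itertools import product

failure_modes = ["hard_bounce", "clean_bounce"]
bounce_targets = ["brokers", "clients"]

# Matches the @matrix(failure_mode=..., bounce_target=...) annotation above.
combinations = list(product(failure_modes, bounce_targets))
assert len(combinations) == 4
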
Example 6
class ThrottlingTest(ProduceConsumeValidateTest):
    """Tests throttled partition reassignment. This is essentially similar
    to the reassign_partitions_test, except that we throttle the reassignment
    and verify that it takes a sensible amount of time given the throttle
    and the amount of data being moved.

    Since the correctness is time dependent, this test also simplifies the
    cluster topology. In particular, we fix the number of brokers, the
    replication-factor, the number of partitions, the partition size, and
    the number of partitions being moved so that we can accurately predict
    the time throttled reassignment should take.
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ThrottlingTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        # Because we are starting the producer/consumer/validate cycle _after_
        # seeding the cluster with big data (to test throttling), we need to
        # start the consumer from the end of the stream. Further, we need to
        # ensure that the consumer is fully started before the producer starts
        # so that we don't miss any messages. This timeout is long enough to
        # guarantee that.
        self.consumer_init_timeout_sec = 20
        self.num_brokers = 6
        self.num_partitions = 3
        self.kafka = KafkaService(test_context,
                                  num_nodes=self.num_brokers,
                                  zk=self.zk,
                                  topics={
                                      self.topic: {
                                          "partitions": self.num_partitions,
                                          "replication-factor": 2,
                                          "configs": {
                                              "segment.bytes": 64 * 1024 * 1024
                                          }
                                      }
                                  })
        self.producer_throughput = 1000
        self.timeout_sec = 400
        self.num_records = 2000
        self.record_size = 4096 * 100  # 400 KB
        # Roughly 270 MB per partition on average.
        self.partition_size = (self.num_records *
                               self.record_size) / self.num_partitions
        self.num_producers = 2
        self.num_consumers = 1
        self.throttle = 4 * 1024 * 1024  # 4 MB/s

    def setUp(self):
        self.zk.start()

    def min_cluster_size(self):
        # Override this since we're adding services outside of the constructor
        return super(ThrottlingTest, self).min_cluster_size() +\
            self.num_producers + self.num_consumers

    def clean_bounce_some_brokers(self):
        """Bounce every other broker"""
        for node in self.kafka.nodes[::2]:
            self.kafka.restart_node(node, clean_shutdown=True)

    def reassign_partitions(self, bounce_brokers, throttle):
        """This method reassigns partitions using a throttle. It makes an
        assertion about the minimum amount of time the reassignment should take
        given the value of the throttle, the number of partitions being moved,
        and the size of each partition.
        """
        partition_info = self.kafka.parse_describe_topic(
            self.kafka.describe_topic(self.topic))
        self.logger.debug("Partitions before reassignment:" +
                          str(partition_info))
        max_num_moves = 0
        for i in range(0, self.num_partitions):
            old_replicas = set(partition_info["partitions"][i]["replicas"])
            new_part = (i + 1) % self.num_partitions
            new_replicas = set(
                partition_info["partitions"][new_part]["replicas"])
            max_num_moves = max(len(new_replicas - old_replicas),
                                max_num_moves)
            partition_info["partitions"][i]["partition"] = new_part
        self.logger.debug("Jumbled partitions: " + str(partition_info))

        self.kafka.execute_reassign_partitions(partition_info,
                                               throttle=throttle)
        start = time.time()
        if bounce_brokers:
            # bounce a few brokers at the same time
            self.clean_bounce_some_brokers()

        # Wait until finished or timeout
        size_per_broker = max_num_moves * self.partition_size
        self.logger.debug("Max amount of data transfer per broker: %fb",
                          size_per_broker)
        estimated_throttled_time = math.ceil(
            float(size_per_broker) / self.throttle)
        estimated_time_with_buffer = estimated_throttled_time * 2
        self.logger.debug("Waiting %ds for the reassignment to complete",
                          estimated_time_with_buffer)
        wait_until(
            lambda: self.kafka.verify_reassign_partitions(partition_info),
            timeout_sec=estimated_time_with_buffer,
            backoff_sec=.5)
        stop = time.time()
        time_taken = stop - start
        self.logger.debug("Transfer took %d second. Estimated time : %ds",
                          time_taken, estimated_throttled_time)
        assert time_taken >= estimated_throttled_time, \
            ("Expected rebalance to take at least %ds, but it took %ds" % (
                estimated_throttled_time,
                time_taken))

    @cluster(num_nodes=10)
    @parametrize(bounce_brokers=True)
    @parametrize(bounce_brokers=False)
    def test_throttled_reassignment(self, bounce_brokers):
        security_protocol = 'PLAINTEXT'
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol

        producer_id = 'bulk_producer'
        bulk_producer = ProducerPerformanceService(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            topic=self.topic,
            num_records=self.num_records,
            record_size=self.record_size,
            throughput=-1,
            client_id=producer_id,
            jmx_object_names=[
                'kafka.producer:type=producer-metrics,client-id=%s' %
                producer_id
            ],
            jmx_attributes=['outgoing-byte-rate'])

        self.producer = VerifiableProducer(context=self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=self.topic,
                                           message_validator=is_int,
                                           throughput=self.producer_throughput)

        self.consumer = ConsoleConsumer(self.test_context,
                                        self.num_consumers,
                                        self.kafka,
                                        self.topic,
                                        consumer_timeout_ms=60000,
                                        message_validator=is_int,
                                        from_beginning=False)

        self.kafka.start()
        bulk_producer.run()
        self.run_produce_consume_validate(
            core_test_action=lambda: self.reassign_partitions(
                bounce_brokers, self.throttle))
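
With the constructor values above, the lower bound asserted in reassign_partitions can be worked out directly. A small sketch of that arithmetic; the max_num_moves value of 2 is illustrative, whereas the test derives it from the jumbled reassignment plan:

import math

num_records = 2000
record_size = 4096 * 100            # 400 KB per record
num_partitions = 3
throttle = 4 * 1024 * 1024          # 4 MB/s

partition_size = (num_records * record_size) / num_partitions      # ~273 MB
max_num_moves = 2                                                   # illustrative
size_per_broker = max_num_moves * partition_size
estimated_throttled_time = math.ceil(float(size_per_broker) / throttle)
estimated_time_with_buffer = estimated_throttled_time * 2

# About 131 s of throttled transfer, so the wait_until timeout is roughly 262 s.
assert estimated_throttled_time == 131
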
class GroupModeTransactionsTest(Test):
    """Essentially testing the same functionality as TransactionsTest by transactionally copying data
    from a source topic to a destination topic and killing the copy process as well as the broker
    randomly through the process. The major difference is that the copiers work as a coordinated
    consumer group with a shared topic subscription instead of as individual per-partition copiers.

    In the end we verify that the final output topic contains exactly one committed copy of
    each message from the original producer.
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(GroupModeTransactionsTest,
              self).__init__(test_context=test_context)

        self.input_topic = "input-topic"
        self.output_topic = "output-topic"

        self.num_brokers = 3

        # Test parameters
        self.num_input_partitions = 9
        self.num_output_partitions = 9
        self.num_copiers = 3
        self.num_seed_messages = 100000
        self.transaction_size = 750
        # The transaction timeout should be lower than the progress timeout, but at
        # least as high as the request timeout (which is 30s by default). When the
        # client is hard-bounced, progress may depend on the previous transaction
        # being aborted. When the broker is hard-bounced, we may have to wait as
        # long as the request timeout to get a `Produce` response and we do not
        # want the coordinator timing out the transaction.
        self.transaction_timeout = 40000
        self.progress_timeout_sec = 60
        self.consumer_group = "grouped-transactions-test-consumer-group"

        self.zk = ZookeeperService(test_context,
                                   num_nodes=1) if quorum.for_test(
                                       test_context) == quorum.zk else None
        self.kafka = KafkaService(test_context,
                                  num_nodes=self.num_brokers,
                                  zk=self.zk,
                                  controller_num_nodes_override=1)

    def setUp(self):
        if self.zk:
            self.zk.start()

    def seed_messages(self, topic, num_seed_messages):
        seed_timeout_sec = 10000
        seed_producer = VerifiableProducer(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            topic=topic,
            message_validator=is_int,
            max_messages=num_seed_messages,
            enable_idempotence=True,
            repeating_keys=self.num_input_partitions)
        seed_producer.start()
        wait_until(lambda: seed_producer.num_acked >= num_seed_messages,
                   timeout_sec=seed_timeout_sec,
                   err_msg="Producer failed to produce messages %d in  %ds." % \
                           (self.num_seed_messages, seed_timeout_sec))
        return seed_producer.acked_by_partition

    def get_messages_from_topic(self, topic, num_messages):
        consumer = self.start_consumer(topic, group_id="verifying_consumer")
        return self.drain_consumer(consumer, num_messages)

    def bounce_brokers(self, clean_shutdown):
        for node in self.kafka.nodes:
            if clean_shutdown:
                self.kafka.restart_node(node, clean_shutdown=True)
            else:
                self.kafka.stop_node(node, clean_shutdown=False)
                gracePeriodSecs = 5
                if self.zk:
                    wait_until(
                        lambda: len(self.kafka.pids(
                            node)) == 0 and not self.kafka.is_registered(node),
                        timeout_sec=self.kafka.zk_session_timeout +
                        gracePeriodSecs,
                        err_msg=
                        "Failed to see timely deregistration of hard-killed broker %s"
                        % str(node.account))
                else:
                    brokerSessionTimeoutSecs = 18
                    wait_until(
                        lambda: len(self.kafka.pids(node)) == 0,
                        timeout_sec=brokerSessionTimeoutSecs + gracePeriodSecs,
                        err_msg=
                        "Failed to see timely disappearance of process for hard-killed broker %s"
                        % str(node.account))
                    time.sleep(brokerSessionTimeoutSecs + gracePeriodSecs)
                self.kafka.start_node(node)

    def create_and_start_message_copier(self, input_topic, output_topic,
                                        transactional_id):
        message_copier = TransactionalMessageCopier(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            transactional_id=transactional_id,
            consumer_group=self.consumer_group,
            input_topic=input_topic,
            input_partition=-1,
            output_topic=output_topic,
            max_messages=-1,
            transaction_size=self.transaction_size,
            transaction_timeout=self.transaction_timeout,
            use_group_metadata=True,
            group_mode=True)
        message_copier.start()
        wait_until(lambda: message_copier.alive(message_copier.nodes[0]),
                   timeout_sec=10,
                   err_msg="Message copier failed to start after 10 s")
        return message_copier

    def bounce_copiers(self, copiers, clean_shutdown, timeout_sec=240):
        for _ in range(3):
            for copier in copiers:
                wait_until(lambda: copier.progress_percent() >= 20.0,
                           timeout_sec=self.progress_timeout_sec,
                           err_msg="%s : Message copier didn't make enough progress in %ds. Current progress: %s" \
                                   % (copier.transactional_id, self.progress_timeout_sec, str(copier.progress_percent())))
                self.logger.info(
                    "%s - progress: %s" %
                    (copier.transactional_id, str(copier.progress_percent())))
                copier.restart(clean_shutdown)

    def create_and_start_copiers(self, input_topic, output_topic, num_copiers):
        copiers = []
        for i in range(0, num_copiers):
            copiers.append(
                self.create_and_start_message_copier(
                    input_topic=input_topic,
                    output_topic=output_topic,
                    transactional_id="copier-" + str(i)))
        return copiers

    @staticmethod
    def valid_value_and_partition(msg):
        """Method used to check whether the given message is a valid tab
        separated value + partition

        return value and partition as a size-two array represented tuple: [value, partition]
        """
        try:
            splitted_msg = msg.split('\t')
            value = int(splitted_msg[1])
            partition = int(splitted_msg[0].split(":")[1])
            return [value, partition]

        except ValueError:
            raise Exception(
                "Unexpected message format (expected a tab separated [value, partition] tuple). Message: %s"
                % (msg))
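        # Illustrative usage (hedged, not part of the original test): the text
        # before the tab carries the partition after a colon and the text after
        # the tab is the integer value, so a line such as "...:2\t42" is parsed
        # into [42, 2].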

    def start_consumer(self, topic_to_read, group_id):
        consumer = ConsoleConsumer(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            topic=topic_to_read,
            group_id=group_id,
            message_validator=self.valid_value_and_partition,
            from_beginning=True,
            print_partition=True,
            isolation_level="read_committed")
        consumer.start()
        # ensure that the consumer is up.
        wait_until(lambda: len(consumer.messages_consumed[1]) > 0,
                   timeout_sec=60,
                   err_msg="Consumer failed to consume any messages for %ds" % \
                           60)
        return consumer

    @staticmethod
    def split_by_partition(messages_consumed):
        messages_by_partition = {}

        for msg in messages_consumed:
            partition = msg[1]
            if partition not in messages_by_partition:
                messages_by_partition[partition] = []
            messages_by_partition[partition].append(msg[0])
        return messages_by_partition
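        # Illustrative usage (hedged, not part of the original test): given
        # [[42, 0], [43, 1], [44, 0]] this returns {0: [42, 44], 1: [43]},
        # preserving per-partition consumption order.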

    def drain_consumer(self, consumer, num_messages):
        # wait until we read at least the expected number of messages.
        # This is a safe check because both failure modes will be caught:
        #  1. If we have 'num_seed_messages' but there are duplicates, then
        #     this is checked for later.
        #
        #  2. If we never reach 'num_seed_messages', then this will cause the
        #     test to fail.
        wait_until(lambda: len(consumer.messages_consumed[1]) >= num_messages,
                   timeout_sec=90,
                   err_msg="Consumer consumed only %d out of %d messages in %ds" % \
                           (len(consumer.messages_consumed[1]), num_messages, 90))
        consumer.stop()
        return self.split_by_partition(consumer.messages_consumed[1])

    def copy_messages_transactionally(self, failure_mode, bounce_target,
                                      input_topic, output_topic, num_copiers,
                                      num_messages_to_copy):
        """Copies messages transactionally from the seeded input topic to the
        output topic, either bouncing brokers or clients in a hard and soft
        way as it goes.

        This method also consumes messages in read_committed mode from the
        output topic while the bounces and copy is going on.

        It returns the concurrently consumed messages.
        """
        copiers = self.create_and_start_copiers(input_topic=input_topic,
                                                output_topic=output_topic,
                                                num_copiers=num_copiers)
        concurrent_consumer = self.start_consumer(
            output_topic, group_id="concurrent_consumer")
        clean_shutdown = False
        if failure_mode == "clean_bounce":
            clean_shutdown = True

        if bounce_target == "brokers":
            self.bounce_brokers(clean_shutdown)
        elif bounce_target == "clients":
            self.bounce_copiers(copiers, clean_shutdown)

        copier_timeout_sec = 240
        for copier in copiers:
            wait_until(lambda: copier.is_done,
                       timeout_sec=copier_timeout_sec,
                       err_msg="%s - Failed to copy all messages in %ds." % \
                               (copier.transactional_id, copier_timeout_sec))
        self.logger.info("finished copying messages")

        return self.drain_consumer(concurrent_consumer, num_messages_to_copy)

    def setup_topics(self):
        self.kafka.topics = {
            self.input_topic: {
                "partitions": self.num_input_partitions,
                "replication-factor": 3,
                "configs": {
                    "min.insync.replicas": 2
                }
            },
            self.output_topic: {
                "partitions": self.num_output_partitions,
                "replication-factor": 3,
                "configs": {
                    "min.insync.replicas": 2
                }
            }
        }

    @cluster(num_nodes=10)
    @matrix(failure_mode=["hard_bounce", "clean_bounce"],
            bounce_target=["brokers", "clients"])
    def test_transactions(self,
                          failure_mode,
                          bounce_target,
                          metadata_quorum=quorum.zk):
        security_protocol = 'PLAINTEXT'
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.logs["kafka_data_1"]["collect_default"] = True
        self.kafka.logs["kafka_data_2"]["collect_default"] = True
        self.kafka.logs["kafka_operational_logs_debug"][
            "collect_default"] = True

        self.setup_topics()
        self.kafka.start()

        input_messages_by_partition = self.seed_messages(
            self.input_topic, self.num_seed_messages)
        concurrently_consumed_message_by_partition = self.copy_messages_transactionally(
            failure_mode,
            bounce_target,
            input_topic=self.input_topic,
            output_topic=self.output_topic,
            num_copiers=self.num_copiers,
            num_messages_to_copy=self.num_seed_messages)
        output_messages_by_partition = self.get_messages_from_topic(
            self.output_topic, self.num_seed_messages)

        assert len(input_messages_by_partition) == \
               len(concurrently_consumed_message_by_partition), "The partition counts don't match: " \
                                                                "input partitions count %d, " \
                                                                "concurrently consumed partitions count %d" % \
                                                                (len(input_messages_by_partition), len(concurrently_consumed_message_by_partition))

        assert len(input_messages_by_partition) == \
               len(output_messages_by_partition), "The partition counts don't match: " \
                                                  "input partitions count %d, " \
                                                  "output partitions count %d" % \
                                                  (len(input_messages_by_partition), len(output_messages_by_partition))

        for p in range(self.num_input_partitions):
            if p not in input_messages_by_partition:
                continue

            assert p in output_messages_by_partition, "Partition %d not in output messages" % p
            assert p in concurrently_consumed_message_by_partition, "Partition %d not in concurrently consumed messages" % p

            output_messages = output_messages_by_partition[p]
            input_messages = input_messages_by_partition[p]
            concurrently_consumed_messages = concurrently_consumed_message_by_partition[p]

            output_message_set = set(output_messages)
            input_message_set = set(input_messages)
            concurrently_consumed_message_set = set(
                concurrently_consumed_messages)

            num_dups = abs(len(output_messages) - len(output_message_set))
            num_dups_in_concurrent_consumer = abs(
                len(concurrently_consumed_messages) -
                len(concurrently_consumed_message_set))
            assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups
            assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" % \
                                                        (len(input_message_set), len(output_message_set))

            assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer
            assert input_message_set == concurrently_consumed_message_set, \
                "Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" % \
                (len(input_message_set), len(concurrently_consumed_message_set))

            assert input_messages == sorted(
                input_messages
            ), "The seed messages themselves were not in order"
            assert output_messages == input_messages, "Output messages are not in order"
            assert concurrently_consumed_messages == output_messages, "Concurrently consumed messages are not in order"
Example n. 8
0
class ReplicationTest(Test):
    """Replication tests.
    These tests verify that replication provides simple durability guarantees by checking that data acked by
    brokers is still available for consumption in the face of various failure scenarios."""
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ReplicationTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  topics={
                                      self.topic: {
                                          "partitions": 3,
                                          "replication-factor": 3,
                                          "min.insync.replicas": 2
                                      }
                                  })
        self.producer_throughput = 10000
        self.num_producers = 1
        self.num_consumers = 1

    def setUp(self):
        self.zk.start()
        self.kafka.start()

    def min_cluster_size(self):
        """Override this since we're adding services outside of the constructor"""
        return super(
            ReplicationTest,
            self).min_cluster_size() + self.num_producers + self.num_consumers

    def run_with_failure(self, failure):
        """This is the top-level test template.

        The steps are:
            Produce messages in the background while driving some failure condition
            When done driving failures, immediately stop producing
            Consume all messages
            Validate that messages acked by brokers were consumed

        Note that consuming is a bit tricky, at least with console consumer. The goal is to consume all messages
        (foreach partition) in the topic. In this case, waiting for the last message may cause the consumer to stop
        too soon since console consumer is consuming multiple partitions from a single thread and therefore we lose
        ordering guarantees.

        Waiting on a count of consumed messages can be unreliable: if we stop consuming when num_consumed == num_acked,
        we might exit early if some messages are duplicated (though not an issue here since producer retries==0)

        Therefore rely here on the consumer.timeout.ms setting which times out on the interval between successively
        consumed messages. Since we run the producer to completion before running the consumer, this is a reliable
        indicator that nothing is left to consume.

        """
        self.producer = VerifiableProducer(self.test_context,
                                           self.num_producers,
                                           self.kafka,
                                           self.topic,
                                           throughput=self.producer_throughput)
        self.consumer = ConsoleConsumer(self.test_context,
                                        self.num_consumers,
                                        self.kafka,
                                        self.topic,
                                        consumer_timeout_ms=3000)

        # Produce in a background thread while driving broker failures
        self.producer.start()
        if not wait_until(lambda: self.producer.num_acked > 5, timeout_sec=5):
            raise RuntimeError(
                "Producer failed to start in a reasonable amount of time.")
        failure()
        self.producer.stop()

        self.acked = self.producer.acked
        self.not_acked = self.producer.not_acked
        self.logger.info("num not acked: %d" % self.producer.num_not_acked)
        self.logger.info("num acked:     %d" % self.producer.num_acked)

        # Consume all messages
        self.consumer.start()
        self.consumer.wait()
        self.consumed = self.consumer.messages_consumed[1]
        self.logger.info("num consumed:  %d" % len(self.consumed))

        # Check produced vs consumed
        success, msg = self.validate()

        if not success:
            self.mark_for_collect(self.producer)

        assert success, msg

    def clean_shutdown(self):
        """Discover leader node for our topic and shut it down cleanly."""
        self.kafka.signal_leader(self.topic, partition=0, sig=signal.SIGTERM)

    def hard_shutdown(self):
        """Discover leader node for our topic and shut it down with a hard kill."""
        self.kafka.signal_leader(self.topic, partition=0, sig=signal.SIGKILL)

    def clean_bounce(self):
        """Chase the leader of one partition and restart it cleanly."""
        for i in range(5):
            prev_leader_node = self.kafka.leader(topic=self.topic, partition=0)
            self.kafka.restart_node(prev_leader_node,
                                    wait_sec=5,
                                    clean_shutdown=True)

    def hard_bounce(self):
        """Chase the leader and restart it cleanly."""
        for i in range(5):
            prev_leader_node = self.kafka.leader(topic=self.topic, partition=0)
            self.kafka.restart_node(prev_leader_node,
                                    wait_sec=5,
                                    clean_shutdown=False)

            # Wait long enough for previous leader to probably be awake again
            time.sleep(6)

    def validate(self):
        """Check that produced messages were consumed."""

        success = True
        msg = ""

        if len(set(self.consumed)) != len(self.consumed):
            # There are duplicates. This is ok, so report it but don't fail the test
            msg += "There are duplicate messages in the log\n"

        if not set(self.consumed).issuperset(set(self.acked)):
            # Every acked message must appear in the logs. I.e. consumed messages must be superset of acked messages.
            acked_minus_consumed = set(self.producer.acked) - set(
                self.consumed)
            success = False
            msg += "At least one acked message did not appear in the consumed messages. acked_minus_consumed: " + str(
                acked_minus_consumed)

        if not success:
            # Collect all the data logs if there was a failure
            self.mark_for_collect(self.kafka)

        return success, msg

    def test_clean_shutdown(self):
        self.run_with_failure(self.clean_shutdown)

    def test_hard_shutdown(self):
        self.run_with_failure(self.hard_shutdown)

    def test_clean_bounce(self):
        self.run_with_failure(self.clean_bounce)

    def test_hard_bounce(self):
        self.run_with_failure(self.hard_bounce)
Example n. 9
0
class TransactionsTest(Test):
    """Tests transactions by transactionally copying data from a source topic to
    a destination topic and killing the copy process as well as the broker
    randomly through the process. In the end we verify that the final output
    topic contains exactly one committed copy of each message in the input
    topic
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(TransactionsTest, self).__init__(test_context=test_context)

        self.input_topic = "input-topic"
        self.output_topic = "output-topic"

        self.num_brokers = 3

        # Test parameters
        self.num_input_partitions = 2
        self.num_output_partitions = 3
        self.num_seed_messages = 100000
        self.transaction_size = 750
        self.consumer_group = "transactions-test-consumer-group"

        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=self.num_brokers,
                                  zk=self.zk)

    def setUp(self):
        self.zk.start()

    def seed_messages(self, topic, num_seed_messages):
        seed_timeout_sec = 10000
        seed_producer = VerifiableProducer(context=self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=topic,
                                           message_validator=is_int,
                                           max_messages=num_seed_messages,
                                           enable_idempotence=True)
        seed_producer.start()
        wait_until(lambda: seed_producer.num_acked >= num_seed_messages,
                   timeout_sec=seed_timeout_sec,
                   err_msg="Producer failed to produce messages %d in  %ds." %\
                   (self.num_seed_messages, seed_timeout_sec))
        return seed_producer.acked

    def get_messages_from_topic(self, topic, num_messages):
        consumer = self.start_consumer(topic, group_id="verifying_consumer")
        return self.drain_consumer(consumer, num_messages)

    def bounce_brokers(self, clean_shutdown):
        for node in self.kafka.nodes:
            if clean_shutdown:
                self.kafka.restart_node(node, clean_shutdown=True)
            else:
                self.kafka.stop_node(node, clean_shutdown=False)
                wait_until(lambda: len(self.kafka.pids(node)) == 0 and not self.kafka.is_registered(node),
                           timeout_sec=self.kafka.zk_session_timeout + 5,
                           err_msg="Failed to see timely deregistration of \
                           hard-killed broker %s" % str(node.account))
                self.kafka.start_node(node)

    def create_and_start_message_copier(self, input_topic, input_partition, output_topic, transactional_id):
        message_copier = TransactionalMessageCopier(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            transactional_id=transactional_id,
            consumer_group=self.consumer_group,
            input_topic=input_topic,
            input_partition=input_partition,
            output_topic=output_topic,
            max_messages=-1,
            transaction_size=self.transaction_size
        )
        message_copier.start()
        wait_until(lambda: message_copier.alive(message_copier.nodes[0]),
                   timeout_sec=10,
                   err_msg="Message copier failed to start after 10 s")
        return message_copier

    def bounce_copiers(self, copiers, clean_shutdown):
        for _ in range(3):
            for copier in copiers:
                wait_until(lambda: copier.progress_percent() >= 20.0,
                           timeout_sec=30,
                           err_msg="%s : Message copier didn't make enough progress in 30s. Current progress: %s" \
                           % (copier.transactional_id, str(copier.progress_percent())))
                self.logger.info("%s - progress: %s" % (copier.transactional_id,
                                                        str(copier.progress_percent())))
                copier.restart(clean_shutdown)

    def create_and_start_copiers(self, input_topic, output_topic, num_copiers):
        copiers = []
        for i in range(0, num_copiers):
            copiers.append(self.create_and_start_message_copier(
                input_topic=input_topic,
                output_topic=output_topic,
                input_partition=i,
                transactional_id="copier-" + str(i)
            ))
        return copiers

    def start_consumer(self, topic_to_read, group_id):
        consumer = ConsoleConsumer(context=self.test_context,
                                   num_nodes=1,
                                   kafka=self.kafka,
                                   topic=topic_to_read,
                                   group_id=group_id,
                                   message_validator=is_int,
                                   from_beginning=True,
                                   isolation_level="read_committed")
        consumer.start()
        # ensure that the consumer is up.
        wait_until(lambda: len(consumer.messages_consumed[1]) > 0,
                   timeout_sec=60,
                   err_msg="Consumer failed to consume any messages for %ds" %\
                   60)
        return consumer

    def drain_consumer(self, consumer, num_messages):
        # wait until we read at least the expected number of messages.
        # This is a safe check because both failure modes will be caught:
        #  1. If we have 'num_seed_messages' but there are duplicates, then
        #     this is checked for later.
        #
        #  2. If we never reach 'num_seed_messages', then this will cause the
        #     test to fail.
        wait_until(lambda: len(consumer.messages_consumed[1]) >= num_messages,
                   timeout_sec=90,
                   err_msg="Consumer consumed only %d out of %d messages in %ds" %\
                   (len(consumer.messages_consumed[1]), num_messages, 90))
        consumer.stop()
        return consumer.messages_consumed[1]

    def copy_messages_transactionally(self, failure_mode, bounce_target,
                                      input_topic, output_topic,
                                      num_copiers, num_messages_to_copy):
        """Copies messages transactionally from the seeded input topic to the
        output topic, either bouncing brokers or clients in a hard and soft
        way as it goes.

        This method also consumes messages in read_committed mode from the
        output topic while the bounces and copy is going on.

        It returns the concurrently consumed messages.
        """
        copiers = self.create_and_start_copiers(input_topic=input_topic,
                                                output_topic=output_topic,
                                                num_copiers=num_copiers)
        concurrent_consumer = self.start_consumer(output_topic,
                                                  group_id="concurrent_consumer")
        clean_shutdown = False
        if failure_mode == "clean_bounce":
            clean_shutdown = True

        if bounce_target == "brokers":
            self.bounce_brokers(clean_shutdown)
        elif bounce_target == "clients":
            self.bounce_copiers(copiers, clean_shutdown)

        for copier in copiers:
            wait_until(lambda: copier.is_done,
                       timeout_sec=120,
                       err_msg="%s - Failed to copy all messages in  %ds." %\
                       (copier.transactional_id, 120))
        self.logger.info("finished copying messages")

        return self.drain_consumer(concurrent_consumer, num_messages_to_copy)

    def setup_topics(self):
        self.kafka.topics = {
            self.input_topic: {
                "partitions": self.num_input_partitions,
                "replication-factor": 3,
                "configs": {
                    "min.insync.replicas": 2
                }
            },
            self.output_topic: {
                "partitions": self.num_output_partitions,
                "replication-factor": 3,
                "configs": {
                    "min.insync.replicas": 2
                }
            }
        }

    @cluster(num_nodes=9)
    @matrix(failure_mode=["hard_bounce", "clean_bounce"],
            bounce_target=["brokers", "clients"],
            check_order=[True, False])
    def test_transactions(self, failure_mode, bounce_target, check_order):
        security_protocol = 'PLAINTEXT'
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.logs["kafka_data_1"]["collect_default"] = True
        self.kafka.logs["kafka_data_2"]["collect_default"] = True
        self.kafka.logs["kafka_operational_logs_debug"]["collect_default"] = True
        if check_order:
            # To check ordering, we simply create input and output topics
            # with a single partition.
            # We reduce the number of seed messages to copy to account for the fewer output
            # partitions, and thus lower parallelism. This helps keep the test
            # time shorter.
            self.num_seed_messages = self.num_seed_messages // 3
            self.num_input_partitions = 1
            self.num_output_partitions = 1

        self.setup_topics()
        self.kafka.start()

        input_messages = self.seed_messages(self.input_topic, self.num_seed_messages)
        concurrently_consumed_messages = self.copy_messages_transactionally(
            failure_mode, bounce_target, input_topic=self.input_topic,
            output_topic=self.output_topic, num_copiers=self.num_input_partitions,
            num_messages_to_copy=self.num_seed_messages)
        output_messages = self.get_messages_from_topic(self.output_topic, self.num_seed_messages)

        concurrently_consumed_message_set = set(concurrently_consumed_messages)
        output_message_set = set(output_messages)
        input_message_set = set(input_messages)

        num_dups = abs(len(output_messages) - len(output_message_set))
        num_dups_in_concurrent_consumer = abs(len(concurrently_consumed_messages)
                                              - len(concurrently_consumed_message_set))
        assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups
        assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\
            (len(input_message_set), len(output_message_set))

        assert num_dups_in_concurrent_consumer == 0, "Detected %d dups in concurrently consumed messages" % num_dups_in_concurrent_consumer
        assert input_message_set == concurrently_consumed_message_set, \
            "Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" %\
            (len(input_message_set), len(concurrently_consumed_message_set))
        if check_order:
            assert input_messages == sorted(input_messages), "The seed messages themselves were not in order"
            assert output_messages == input_messages, "Output messages are not in order"
            assert concurrently_consumed_messages == output_messages, "Concurrently consumed messages are not in order"
Example n. 10
0
class SecurityTest(ProduceConsumeValidateTest):
    """
    These tests validate security features.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(SecurityTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context, num_nodes=1, zk=self.zk, topics={self.topic: {
                                                                    "partitions": 2,
                                                                    "replication-factor": 1}
                                                                })
        self.num_partitions = 2
        self.timeout_sec = 10000
        self.producer_throughput = 1000
        self.num_producers = 1
        self.num_consumers = 1

    def setUp(self):
        self.zk.start()

    def producer_consumer_have_expected_error(self, error):
        try:
            for node in self.producer.nodes:
                node.account.ssh("grep %s %s" % (error, self.producer.LOG_FILE))
            for node in self.consumer.nodes:
                node.account.ssh("grep %s %s" % (error, self.consumer.LOG_FILE))
        except RemoteCommandError:
            return False

        return True
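    # Hedged usage note (not in the original test): this is polled below via
    # wait_until(lambda: self.producer_consumer_have_expected_error(error), ...)
    # because the expected 'SSLHandshakeException' / 'LEADER_NOT_AVAILABLE'
    # lines may take a moment to appear in the client logs; a non-zero grep
    # exit raises RemoteCommandError, which is treated as "not found yet".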

    @cluster(num_nodes=7)
    @parametrize(security_protocol='PLAINTEXT', interbroker_security_protocol='SSL')
    @parametrize(security_protocol='SSL', interbroker_security_protocol='PLAINTEXT')
    def test_client_ssl_endpoint_validation_failure(self, security_protocol, interbroker_security_protocol):
        """
        Test that invalid hostname in certificate results in connection failures.
        When security_protocol=SSL, client SSL handshakes are expected to fail due to hostname verification failure.
        When security_protocol=PLAINTEXT and interbroker_security_protocol=SSL, controller connections fail
        with hostname verification failure. Hence clients are expected to fail with LEADER_NOT_AVAILABLE.
        """

        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = interbroker_security_protocol
        SecurityConfig.ssl_stores = TestSslStores(self.test_context.local_scratch_dir, valid_hostname=False)

        self.kafka.start()
        self.create_producer_and_consumer()
        self.producer.log_level = "TRACE"

        self.producer.start()
        self.consumer.start()
        try:
            wait_until(lambda: self.producer.num_acked > 0, timeout_sec=5)

            # Fail quickly if messages are successfully acked
            raise RuntimeError("Messages published successfully but should not have!"
                               " Endpoint validation did not fail with invalid hostname")
        except TimeoutError:
            # expected
            pass

        error = 'SSLHandshakeException' if security_protocol == 'SSL' else 'LEADER_NOT_AVAILABLE'
        wait_until(lambda: self.producer_consumer_have_expected_error(error), timeout_sec=5)

        self.producer.stop()
        self.consumer.stop()
        self.producer.log_level = "INFO"

        SecurityConfig.ssl_stores.valid_hostname = True
        for node in self.kafka.nodes:
            self.kafka.restart_node(node, clean_shutdown=True)

        self.create_producer_and_consumer()
        self.run_produce_consume_validate()

    def create_producer_and_consumer(self):
        self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput)
        self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic, consumer_timeout_ms=10000, message_validator=is_int)
Example n. 11
0
class ReassignPartitionsTest(ProduceConsumeValidateTest):
    """
    These tests validate partition reassignment.
    Create a topic with a few partitions, load some data, trigger partition re-assignment with and without broker failure,
    check that partition re-assignment can complete and there is no data loss.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ReassignPartitionsTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.num_partitions = 20
        self.zk = ZookeeperService(test_context, num_nodes=1)
        # We set the min.insync.replicas to match the replication factor because
        # it makes the test more stringent. If min.isr = 2 and
        # replication.factor=3, then the test would tolerate the failure of
        # reassignment for up to one replica per partition, which is not
        # desirable for this test in particular.
        self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk,
                                  server_prop_overides=[
                                      [config_property.LOG_ROLL_TIME_MS, "5000"],
                                      [config_property.LOG_RETENTION_CHECK_INTERVAL_MS, "5000"]
                                  ],
                                  topics={self.topic: {
                                      "partitions": self.num_partitions,
                                      "replication-factor": 3,
                                      'configs': {
                                          "min.insync.replicas": 3,
                                      }}
                                  })
        self.timeout_sec = 60
        self.producer_throughput = 1000
        self.num_producers = 1
        self.num_consumers = 1

    def setUp(self):
        self.zk.start()

    def min_cluster_size(self):
        # Override this since we're adding services outside of the constructor
        return super(ReassignPartitionsTest, self).min_cluster_size() + self.num_producers + self.num_consumers

    def clean_bounce_some_brokers(self):
        """Bounce every other broker"""
        for node in self.kafka.nodes[::2]:
            self.kafka.restart_node(node, clean_shutdown=True)

    def reassign_partitions(self, bounce_brokers):
        partition_info = self.kafka.parse_describe_topic(self.kafka.describe_topic(self.topic))
        self.logger.debug("Partitions before reassignment:" + str(partition_info))

        # jumble partition assignment in dictionary
        seed = random.randint(0, 2 ** 31 - 1)
        self.logger.debug("Jumble partition assignment with seed " + str(seed))
        random.seed(seed)
        # The list may still be in order, but that's ok
        shuffled_list = list(range(0, self.num_partitions))
        random.shuffle(shuffled_list)

        for i in range(0, self.num_partitions):
            partition_info["partitions"][i]["partition"] = shuffled_list[i]
        self.logger.debug("Jumbled partitions: " + str(partition_info))

        # send reassign partitions command
        self.kafka.execute_reassign_partitions(partition_info)

        if bounce_brokers:
            # bounce a few brokers at the same time
            self.clean_bounce_some_brokers()

        # Wait until finished or timeout
        wait_until(lambda: self.kafka.verify_reassign_partitions(partition_info),
                   timeout_sec=self.timeout_sec, backoff_sec=.5)

    def move_start_offset(self):
        """We move the start offset of the topic by writing really old messages
        and waiting for them to be cleaned up.
        """
        producer = VerifiableProducer(self.test_context, 1, self.kafka, self.topic,
                                      throughput=-1, enable_idempotence=True,
                                      create_time=1000)
        producer.start()
        wait_until(lambda: producer.num_acked > 0,
                   timeout_sec=30,
                   err_msg="Failed to get an acknowledgement for %ds" % 30)
        # Wait 8 seconds to let the topic be seeded with messages that will
        # be deleted. The 8 seconds is important, since we should get 2 deleted
        # segments in this period based on the configured log roll time and the
        # retention check interval.
        time.sleep(8)
        producer.stop()
        self.logger.info("Seeded topic with %d messages which will be deleted" %\
                         producer.num_acked)
        # Since the configured check interval is 5 seconds, we wait another
        # 6 seconds to ensure that at least one more cleaning pass runs so that
        # the last segment is deleted. An alternative to using timeouts is to poll each
        # partition until the log start offset matches the end offset. The
        # latter is more robust.
        time.sleep(6)
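        # Hedged sketch of the more robust alternative mentioned above (helper
        # names are assumptions, not actual KafkaService API), instead of the
        # fixed sleeps used here:
        #   wait_until(lambda: all(log_start_offset(p) == log_end_offset(p)
        #                          for p in range(self.num_partitions)),
        #              timeout_sec=30)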

    @cluster(num_nodes=8)
    @matrix(bounce_brokers=[True, False],
            reassign_from_offset_zero=[True, False])
    def test_reassign_partitions(self, bounce_brokers, reassign_from_offset_zero):
        """Reassign partitions tests.
        Setup: 1 zk, 4 kafka nodes, 1 topic with partitions=20, replication-factor=3,
        and min.insync.replicas=3

            - Produce messages in the background
            - Consume messages in the background
            - Reassign partitions
            - If bounce_brokers is True, also bounce a few brokers while partition re-assignment is in progress
            - When done reassigning partitions and bouncing brokers, stop producing, and finish consuming
            - Validate that every acked message was consumed
            """
        self.kafka.start()
        if not reassign_from_offset_zero:
            self.move_start_offset()

        self.producer = VerifiableProducer(self.test_context, self.num_producers,
                                           self.kafka, self.topic,
                                           throughput=self.producer_throughput,
                                           enable_idempotence=True)
        self.consumer = ConsoleConsumer(self.test_context, self.num_consumers,
                                        self.kafka, self.topic,
                                        consumer_timeout_ms=60000,
                                        message_validator=is_int)

        self.enable_idempotence = True
        self.run_produce_consume_validate(core_test_action=lambda: self.reassign_partitions(bounce_brokers))
Example n. 12
0
class ReassignPartitionsTest(ProduceConsumeValidateTest):
    """
    These tests validate partition reassignment.
    Create a topic with a few partitions, load some data, trigger partition re-assignment with and without broker failure,
    check that partition re-assignment can complete and there is no data loss.
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ReassignPartitionsTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.num_partitions = 20
        self.zk = ZookeeperService(test_context, num_nodes=1)
        # We set the min.insync.replicas to match the replication factor because
        # it makes the test more stringent. If min.isr = 2 and
        # replication.factor=3, then the test would tolerate the failure of
        # reassignment for up to one replica per partition, which is not
        # desirable for this test in particular.
        self.kafka = KafkaService(
            test_context,
            num_nodes=4,
            zk=self.zk,
            server_prop_overides=[[
                config_property.LOG_ROLL_TIME_MS, "5000"
            ], [config_property.LOG_RETENTION_CHECK_INTERVAL_MS, "5000"]],
            topics={
                self.topic: {
                    "partitions": self.num_partitions,
                    "replication-factor": 3,
                    'configs': {
                        "min.insync.replicas": 3,
                    }
                }
            })
        self.timeout_sec = 60
        self.producer_throughput = 1000
        self.num_producers = 1
        self.num_consumers = 1

    def setUp(self):
        self.zk.start()

    def min_cluster_size(self):
        # Override this since we're adding services outside of the constructor
        return super(
            ReassignPartitionsTest,
            self).min_cluster_size() + self.num_producers + self.num_consumers

    def clean_bounce_some_brokers(self):
        """Bounce every other broker"""
        for node in self.kafka.nodes[::2]:
            self.kafka.restart_node(node, clean_shutdown=True)

    def reassign_partitions(self, bounce_brokers):
        partition_info = self.kafka.parse_describe_topic(
            self.kafka.describe_topic(self.topic))
        self.logger.debug("Partitions before reassignment:" +
                          str(partition_info))

        # jumble partition assignment in dictionary
        seed = random.randint(0, 2**31 - 1)
        self.logger.debug("Jumble partition assignment with seed " + str(seed))
        random.seed(seed)
        # The list may still be in order, but that's ok
        shuffled_list = list(range(0, self.num_partitions))
        random.shuffle(shuffled_list)

        for i in range(0, self.num_partitions):
            partition_info["partitions"][i]["partition"] = shuffled_list[i]
        self.logger.debug("Jumbled partitions: " + str(partition_info))

        # send reassign partitions command
        self.kafka.execute_reassign_partitions(partition_info)

        if bounce_brokers:
            # bounce a few brokers at the same time
            self.clean_bounce_some_brokers()

        # Wait until finished or timeout
        wait_until(
            lambda: self.kafka.verify_reassign_partitions(partition_info),
            timeout_sec=self.timeout_sec,
            backoff_sec=.5)

    def move_start_offset(self):
        """We move the start offset of the topic by writing really old messages
        and waiting for them to be cleaned up.
        """
        producer = VerifiableProducer(self.test_context,
                                      1,
                                      self.kafka,
                                      self.topic,
                                      throughput=-1,
                                      enable_idempotence=True,
                                      create_time=1000)
        producer.start()
        wait_until(lambda: producer.num_acked > 0,
                   timeout_sec=30,
                   err_msg="Failed to get an acknowledgement for %ds" % 30)
        # Wait 8 seconds to let the topic be seeded with messages that will
        # be deleted. The 8 seconds is important, since we should get 2 deleted
        # segments in this period based on the configured log roll time and the
        # retention check interval.
        time.sleep(8)
        producer.stop()
        self.logger.info("Seeded topic with %d messages which will be deleted" %\
                         producer.num_acked)
        # Since the configured check interval is 5 seconds, we wait another
        # 6 seconds to ensure that at least one more cleaning pass runs so that
        # the last segment is deleted. An alternative to using timeouts is to poll each
        # partition until the log start offset matches the end offset. The
        # latter is more robust.
        time.sleep(6)

    @cluster(num_nodes=8)
    @matrix(bounce_brokers=[True, False],
            reassign_from_offset_zero=[True, False])
    def test_reassign_partitions(self, bounce_brokers,
                                 reassign_from_offset_zero):
        """Reassign partitions tests.
        Setup: 1 zk, 4 kafka nodes, 1 topic with partitions=20, replication-factor=3,
        and min.insync.replicas=3

            - Produce messages in the background
            - Consume messages in the background
            - Reassign partitions
            - If bounce_brokers is True, also bounce a few brokers while partition re-assignment is in progress
            - When done reassigning partitions and bouncing brokers, stop producing, and finish consuming
            - Validate that every acked message was consumed
            """
        self.kafka.start()
        if not reassign_from_offset_zero:
            self.move_start_offset()

        self.producer = VerifiableProducer(self.test_context,
                                           self.num_producers,
                                           self.kafka,
                                           self.topic,
                                           throughput=self.producer_throughput,
                                           enable_idempotence=True)
        self.consumer = ConsoleConsumer(self.test_context,
                                        self.num_consumers,
                                        self.kafka,
                                        self.topic,
                                        consumer_timeout_ms=60000,
                                        message_validator=is_int)

        self.enable_idempotence = True
        self.run_produce_consume_validate(
            core_test_action=lambda: self.reassign_partitions(bounce_brokers))
Example n. 13
0
class TransactionsTest(Test):
    """Tests transactions by transactionally copying data from a source topic to
    a destination topic and killing the copy process as well as the broker
    randomly through the process. In the end we verify that the final output
    topic contains exactly one committed copy of each message in the input
    topic
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(TransactionsTest, self).__init__(test_context=test_context)

        self.input_topic = "input-topic"
        self.output_topic = "output-topic"

        self.num_brokers = 3

        # Test parameters
        self.num_input_partitions = 2
        self.num_output_partitions = 3
        self.num_seed_messages = 20000
        self.transaction_size = 500
        self.first_transactional_id = "my-first-transactional-id"
        self.second_transactional_id = "my-second-transactional-id"
        self.consumer_group = "transactions-test-consumer-group"

        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=self.num_brokers,
                                  zk=self.zk,
                                  topics={
                                      self.input_topic: {
                                          "partitions":
                                          self.num_input_partitions,
                                          "replication-factor": 3,
                                          "configs": {
                                              "min.insync.replicas": 2
                                          }
                                      },
                                      self.output_topic: {
                                          "partitions":
                                          self.num_output_partitions,
                                          "replication-factor": 3,
                                          "configs": {
                                              "min.insync.replicas": 2
                                          }
                                      }
                                  })

    def setUp(self):
        self.zk.start()

    def seed_messages(self):
        seed_timeout_sec = 10000
        seed_producer = VerifiableProducer(context=self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka,
                                           topic=self.input_topic,
                                           message_validator=is_int,
                                           max_messages=self.num_seed_messages,
                                           enable_idempotence=True)

        seed_producer.start()
        wait_until(lambda: seed_producer.num_acked >= self.num_seed_messages,
                   timeout_sec=seed_timeout_sec,
                   err_msg="Producer failed to produce messages %d in  %ds." %\
                   (self.num_seed_messages, seed_timeout_sec))
        return seed_producer.acked

    def get_messages_from_output_topic(self):
        consumer = ConsoleConsumer(context=self.test_context,
                                   num_nodes=1,
                                   kafka=self.kafka,
                                   topic=self.output_topic,
                                   new_consumer=True,
                                   message_validator=is_int,
                                   from_beginning=True,
                                   consumer_timeout_ms=5000,
                                   isolation_level="read_committed")
        consumer.start()
        # ensure that the consumer is up.
        wait_until(lambda: consumer.alive(consumer.nodes[0]),
                   timeout_sec=60,
                   err_msg="Consumer failed to start for %ds" %\
                   60)
        # wait until the consumer closes, which will be 5 seconds after
        # receiving the last message.
        wait_until(lambda: not consumer.alive(consumer.nodes[0]),
                   timeout_sec=60,
                   err_msg="Consumer failed to consume %d messages in %ds" %\
                   (self.num_seed_messages, 60))
        return consumer.messages_consumed[1]

    def bounce_brokers(self, clean_shutdown):
        for node in self.kafka.nodes:
            if clean_shutdown:
                self.kafka.restart_node(node, clean_shutdown=True)
            else:
                self.kafka.stop_node(node, clean_shutdown=False)
                wait_until(lambda: len(self.kafka.pids(node)) == 0 and not self
                           .kafka.is_registered(node),
                           timeout_sec=self.kafka.zk_session_timeout + 5,
                           err_msg="Failed to see timely deregistration of \
                           hard-killed broker %s" % str(node.account))
                self.kafka.start_node(node)

    def create_and_start_message_copier(self, input_partition,
                                        transactional_id):
        message_copier = TransactionalMessageCopier(
            context=self.test_context,
            num_nodes=1,
            kafka=self.kafka,
            transactional_id=transactional_id,
            consumer_group=self.consumer_group,
            input_topic=self.input_topic,
            input_partition=input_partition,
            output_topic=self.output_topic,
            max_messages=-1,
            transaction_size=self.transaction_size)
        message_copier.start()
        wait_until(lambda: message_copier.alive(message_copier.nodes[0]),
                   timeout_sec=10,
                   err_msg="Message copier failed to start after 10 s")
        return message_copier

    def bounce_copiers(self, copiers, clean_shutdown):
        for _ in range(3):
            for copier in copiers:
                wait_until(lambda: copier.progress_percent() >= 20.0,
                           timeout_sec=30,
                           err_msg="%s : Message copier didn't make enough progress in 30s. Current progress: %s" \
                           % (copier.transactional_id, str(copier.progress_percent())))
                self.logger.info(
                    "%s - progress: %s" %
                    (copier.transactional_id, str(copier.progress_percent())))
                copier.restart(clean_shutdown)

    def create_and_start_copiers(self):
        copiers = []
        copiers.append(
            self.create_and_start_message_copier(
                input_partition=0,
                transactional_id=self.first_transactional_id))
        copiers.append(
            self.create_and_start_message_copier(
                input_partition=1,
                transactional_id=self.second_transactional_id))
        return copiers

    def copy_messages_transactionally(self, failure_mode, bounce_target):
        copiers = self.create_and_start_copiers()
        clean_shutdown = False
        if failure_mode == "clean_bounce":
            clean_shutdown = True

        if bounce_target == "brokers":
            self.bounce_brokers(clean_shutdown)
        elif bounce_target == "clients":
            self.bounce_copiers(copiers, clean_shutdown)

        for copier in copiers:
            wait_until(lambda: copier.is_done,
                       timeout_sec=60,
                       err_msg="%s - Failed to copy all messages in  %ds." %\
                       (copier.transactional_id, 60))
        self.logger.info("finished copying messages")

    @cluster(num_nodes=8)
    @matrix(failure_mode=["clean_bounce", "hard_bounce"],
            bounce_target=["brokers", "clients"])
    def test_transactions(self, failure_mode, bounce_target):
        security_protocol = 'PLAINTEXT'
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        self.kafka.start()
        input_messages = self.seed_messages()
        self.copy_messages_transactionally(failure_mode, bounce_target)
        output_messages = self.get_messages_from_output_topic()
        output_message_set = set(output_messages)
        input_message_set = set(input_messages)
        num_dups = abs(len(output_messages) - len(output_message_set))
        assert num_dups == 0, "Detected %d duplicates in the output stream" % num_dups
        assert input_message_set == output_message_set, "Input and output message sets are not equal. Num input messages %d. Num output messages %d" %\
            (len(input_message_set), len(output_message_set))
class ReassignPartitionsTest(ProduceConsumeValidateTest):
    """
    These tests validate partition reassignment.
    Create a topic with a few partitions, load some data, trigger partition re-assignment with and without broker failure,
    check that partition re-assignment can complete and there is no data loss.
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ReassignPartitionsTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context,
                                  num_nodes=4,
                                  zk=self.zk,
                                  topics={
                                      self.topic: {
                                          "partitions": 20,
                                          "replication-factor": 3,
                                          'configs': {
                                              "min.insync.replicas": 2
                                          }
                                      }
                                  })
        self.num_partitions = 20
        self.timeout_sec = 60
        self.producer_throughput = 1000
        self.num_producers = 1
        self.num_consumers = 1

    def setUp(self):
        self.zk.start()

    def min_cluster_size(self):
        # Override this since we're adding services outside of the constructor
        return super(
            ReassignPartitionsTest,
            self).min_cluster_size() + self.num_producers + self.num_consumers

    def clean_bounce_some_brokers(self):
        """Bounce every other broker"""
        for node in self.kafka.nodes[::2]:
            self.kafka.restart_node(node, clean_shutdown=True)

    def reassign_partitions(self, bounce_brokers):
        partition_info = self.kafka.parse_describe_topic(
            self.kafka.describe_topic(self.topic))
        self.logger.debug("Partitions before reassignment:" +
                          str(partition_info))

        # jumble partition assignment in dictionary
        seed = random.randint(0, 2**31 - 1)
        self.logger.debug("Jumble partition assignment with seed " + str(seed))
        random.seed(seed)
        # Build a mutable list of partition ids (list() keeps this working on
        # Python 3, where a range object cannot be shuffled in place). The
        # shuffled list may still happen to be in order, but that's ok.
        shuffled_list = list(range(0, self.num_partitions))
        random.shuffle(shuffled_list)

        for i in range(0, self.num_partitions):
            partition_info["partitions"][i]["partition"] = shuffled_list[i]
        self.logger.debug("Jumbled partitions: " + str(partition_info))

        # send reassign partitions command
        self.kafka.execute_reassign_partitions(partition_info)

        if bounce_brokers:
            # bounce a few brokers at the same time
            self.clean_bounce_some_brokers()

        # Wait until finished or timeout
        wait_until(
            lambda: self.kafka.verify_reassign_partitions(partition_info),
            timeout_sec=self.timeout_sec,
            backoff_sec=.5)

    @cluster(num_nodes=7)
    @parametrize(security_protocol="PLAINTEXT", bounce_brokers=True)
    @parametrize(security_protocol="PLAINTEXT", bounce_brokers=False)
    def test_reassign_partitions(self, bounce_brokers, security_protocol):
        """Reassign partitions tests.
        Setup: 1 zk, 4 kafka nodes, 1 topic with partitions=20, replication-factor=3, and min.insync.replicas=2

            - Produce messages in the background
            - Consume messages in the background
            - Reassign partitions
            - If bounce_brokers is True, also bounce a few brokers while partition re-assignment is in progress
            - When done reassigning partitions and bouncing brokers, stop producing, and finish consuming
            - Validate that every acked message was consumed
        """

        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        new_consumer = self.kafka.security_protocol != "PLAINTEXT"
        self.producer = VerifiableProducer(self.test_context,
                                           self.num_producers,
                                           self.kafka,
                                           self.topic,
                                           throughput=self.producer_throughput)
        self.consumer = ConsoleConsumer(self.test_context,
                                        self.num_consumers,
                                        self.kafka,
                                        self.topic,
                                        new_consumer=new_consumer,
                                        consumer_timeout_ms=60000,
                                        message_validator=is_int)
        self.kafka.start()

        self.run_produce_consume_validate(
            core_test_action=lambda: self.reassign_partitions(bounce_brokers))
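
The core of reassign_partitions above is the jumbling step: only the "partition" ids inside the structure returned by parse_describe_topic are rewritten before the plan is handed to execute_reassign_partitions. A small self-contained sketch of that step, with a hypothetical partition_info dict standing in for the real describe-topic output:

import random

# Hypothetical shape; the real dict comes from kafka.parse_describe_topic().
partition_info = {"partitions": [
    {"topic": "test_topic", "partition": 0, "replicas": [1, 2, 3]},
    {"topic": "test_topic", "partition": 1, "replicas": [2, 3, 4]},
    {"topic": "test_topic", "partition": 2, "replicas": [3, 4, 1]},
]}

num_partitions = len(partition_info["partitions"])
shuffled = list(range(num_partitions))  # list() so random.shuffle can work in place
random.shuffle(shuffled)

# Each replica list keeps its brokers but may be assigned a different partition id.
for i in range(num_partitions):
    partition_info["partitions"][i]["partition"] = shuffled[i]

print(partition_info)  # this jumbled plan is what execute_reassign_partitions would receive
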
Esempio n. 15
0
class ReassignPartitionsTest(ProduceConsumeValidateTest):
    """
    These tests validate partition reassignment.
    Create a topic with few partitions, load some data, trigger partition re-assignment with and without broker failure,
    check that partition re-assignment can complete and there is no data loss.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ReassignPartitionsTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(test_context, num_nodes=4, zk=self.zk, topics={self.topic: {
                                                                    "partitions": 20,
                                                                    "replication-factor": 3,
                                                                    'configs': {"min.insync.replicas": 2}}
                                                                })
        self.num_partitions = 20
        self.timeout_sec = 60
        self.producer_throughput = 1000
        self.num_producers = 1
        self.num_consumers = 1

    def setUp(self):
        self.zk.start()

    def min_cluster_size(self):
        # Override this since we're adding services outside of the constructor
        return super(ReassignPartitionsTest, self).min_cluster_size() + self.num_producers + self.num_consumers

    def clean_bounce_some_brokers(self):
        """Bounce every other broker"""
        for node in self.kafka.nodes[::2]:
            self.kafka.restart_node(node, clean_shutdown=True)

    def reassign_partitions(self, bounce_brokers):
        partition_info = self.kafka.parse_describe_topic(self.kafka.describe_topic(self.topic))
        self.logger.debug("Partitions before reassignment:" + str(partition_info))

        # jumble partition assignment in dictionary
        seed = random.randint(0, 2 ** 31 - 1)
        self.logger.debug("Jumble partition assignment with seed " + str(seed))
        random.seed(seed)
        # Build a mutable list of partition ids (list() keeps this working on
        # Python 3, where a range object cannot be shuffled in place). The
        # shuffled list may still happen to be in order, but that's ok.
        shuffled_list = list(range(0, self.num_partitions))
        random.shuffle(shuffled_list)

        for i in range(0, self.num_partitions):
            partition_info["partitions"][i]["partition"] = shuffled_list[i]
        self.logger.debug("Jumbled partitions: " + str(partition_info))

        # send reassign partitions command
        self.kafka.execute_reassign_partitions(partition_info)

        if bounce_brokers:
            # bounce a few brokers at the same time
            self.clean_bounce_some_brokers()

        # Wait until finished or timeout
        wait_until(lambda: self.kafka.verify_reassign_partitions(partition_info), timeout_sec=self.timeout_sec, backoff_sec=.5)

    @parametrize(security_protocol="PLAINTEXT", bounce_brokers=True)
    @parametrize(security_protocol="PLAINTEXT", bounce_brokers=False)
    def test_reassign_partitions(self, bounce_brokers, security_protocol):
        """Reassign partitions tests.
        Setup: 1 zk, 4 kafka nodes, 1 topic with partitions=20, replication-factor=3, and min.insync.replicas=2

            - Produce messages in the background
            - Consume messages in the background
            - Reassign partitions
            - If bounce_brokers is True, also bounce a few brokers while partition re-assignment is in progress
            - When done reassigning partitions and bouncing brokers, stop producing, and finish consuming
            - Validate that every acked message was consumed
        """

        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol
        new_consumer = self.kafka.security_protocol != "PLAINTEXT"
        self.producer = VerifiableProducer(self.test_context, self.num_producers, self.kafka, self.topic, throughput=self.producer_throughput)
        self.consumer = ConsoleConsumer(self.test_context, self.num_consumers, self.kafka, self.topic, new_consumer=new_consumer, consumer_timeout_ms=60000, message_validator=is_int)
        self.kafka.start()

        self.run_produce_consume_validate(core_test_action=lambda: self.reassign_partitions(bounce_brokers))
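
The wait_until calls used throughout these tests (copier progress, copy completion, reassignment verification) all rely on the same polling contract. A simplified stand-in, written here for illustration and not ducktape's actual implementation, conveys the semantics assumed by the calls above:

import time

def wait_until(condition, timeout_sec, backoff_sec=1, err_msg=""):
    # Poll the condition every backoff_sec seconds until it returns something
    # truthy; once timeout_sec has elapsed, fail with err_msg (ducktape raises
    # its own timeout error here).
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        if condition():
            return
        time.sleep(backoff_sec)
    raise TimeoutError(err_msg)
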
Esempio n. 16
0
class SecurityTest(ProduceConsumeValidateTest):
    """
    These tests validate security features.
    """
    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(SecurityTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        self.kafka = KafkaService(
            test_context,
            num_nodes=1,
            zk=self.zk,
            topics={self.topic: {
                "partitions": 2,
                "replication-factor": 1
            }})
        self.num_partitions = 2
        self.timeout_sec = 10000
        self.producer_throughput = 1000
        self.num_producers = 1
        self.num_consumers = 1

    def setUp(self):
        self.zk.start()

    @parametrize(security_protocol='PLAINTEXT',
                 interbroker_security_protocol='SSL')
    @parametrize(security_protocol='SSL',
                 interbroker_security_protocol='PLAINTEXT')
    def test_client_ssl_endpoint_validation_failure(
            self, security_protocol, interbroker_security_protocol):
        """
        Test that invalid hostname in certificate results in connection failures.
        When security_protocol=SSL, client SSL handshakes are expected to fail due to hostname verification failure.
        When security_protocol=PLAINTEXT and interbroker_security_protocol=SSL, controller connections fail
        with hostname verification failure. Hence clients are expected to fail with LEADER_NOT_AVAILABLE.
        """

        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = interbroker_security_protocol
        SecurityConfig.ssl_stores = TestSslStores()

        SecurityConfig.ssl_stores.invalid_hostname = True
        self.kafka.start()
        self.create_producer_and_consumer()
        self.producer.log_level = "TRACE"
        self.producer.start()
        self.consumer.start()
        time.sleep(10)
        assert self.producer.num_acked == 0, "Messages published successfully, endpoint validation did not fail with invalid hostname"
        error = 'SSLHandshakeException' if security_protocol == 'SSL' else 'LEADER_NOT_AVAILABLE'
        for node in self.producer.nodes:
            node.account.ssh("grep %s %s" % (error, self.producer.LOG_FILE))
        for node in self.consumer.nodes:
            node.account.ssh("grep %s %s" % (error, self.consumer.LOG_FILE))

        self.producer.stop()
        self.consumer.stop()
        self.producer.log_level = "INFO"

        SecurityConfig.ssl_stores.invalid_hostname = False
        for node in self.kafka.nodes:
            self.kafka.restart_node(node, clean_shutdown=True)
        self.create_producer_and_consumer()
        self.run_produce_consume_validate()

    def create_producer_and_consumer(self):
        self.producer = VerifiableProducer(self.test_context,
                                           self.num_producers,
                                           self.kafka,
                                           self.topic,
                                           throughput=self.producer_throughput)
        self.consumer = ConsoleConsumer(self.test_context,
                                        self.num_consumers,
                                        self.kafka,
                                        self.topic,
                                        consumer_timeout_ms=10000,
                                        message_validator=is_int)
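
One detail worth noting in the test above: the expected error string is chosen with a plain equality check on security_protocol. Comparing strings with "is" only appears to work because CPython happens to intern short literals, which is not guaranteed. A tiny illustration:

proto = "".join(["S", "S", "L"])  # builds a new string object at runtime
print(proto == "SSL")             # True: value comparison, which is what the test needs
print(proto is "SSL")             # False on CPython (and a SyntaxWarning on 3.8+):
                                  # identity is not the same as equality
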
Esempio n. 17
0
class ThrottlingTest(ProduceConsumeValidateTest):
    """Tests throttled partition reassignment. This is essentially similar
    to the reassign_partitions_test, except that we throttle the reassignment
    and verify that it takes a sensible amount of time given the throttle
    and the amount of data being moved.

    Since the correctness is time dependent, this test also simplifies the
    cluster topology. In particular, we fix the number of brokers, the
    replication-factor, the number of partitions, the partition size, and
    the number of partitions being moved so that we can accurately predict
    the time throttled reassignment should take.
    """

    def __init__(self, test_context):
        """:type test_context: ducktape.tests.test.TestContext"""
        super(ThrottlingTest, self).__init__(test_context=test_context)

        self.topic = "test_topic"
        self.zk = ZookeeperService(test_context, num_nodes=1)
        # Because we are starting the producer/consumer/validate cycle _after_
        # seeding the cluster with big data (to test throttling), we need to
        # start the consumer from the end of the stream. Further, we need to
        # ensure that the consumer is fully started before the producer starts
        # so that we don't miss any messages. This timeout provides a
        # sufficient condition.
        self.consumer_init_timeout_sec = 20
        self.num_brokers = 6
        self.num_partitions = 3
        self.kafka = KafkaService(test_context,
                                  num_nodes=self.num_brokers,
                                  zk=self.zk,
                                  topics={
                                      self.topic: {
                                          "partitions": self.num_partitions,
                                          "replication-factor": 2,
                                          "configs": {
                                              "segment.bytes": 64 * 1024 * 1024
                                          }
                                      }
                                  })
        self.producer_throughput = 1000
        self.timeout_sec = 400
        self.num_records = 2000
        self.record_size = 4096 * 100  # 400 KB
        # Roughly 273 MB per partition on average (2000 records * ~400 KB each / 3 partitions).
        self.partition_size = (self.num_records * self.record_size) / self.num_partitions
        self.num_producers = 2
        self.num_consumers = 1
        self.throttle = 4 * 1024 * 1024  # 4 MB/s

    def setUp(self):
        self.zk.start()

    def min_cluster_size(self):
        # Override this since we're adding services outside of the constructor
        return super(ThrottlingTest, self).min_cluster_size() +\
            self.num_producers + self.num_consumers

    def clean_bounce_some_brokers(self):
        """Bounce every other broker"""
        for node in self.kafka.nodes[::2]:
            self.kafka.restart_node(node, clean_shutdown=True)

    def reassign_partitions(self, bounce_brokers, throttle):
        """This method reassigns partitions using a throttle. It makes an
        assertion about the minimum amount of time the reassignment should take
        given the value of the throttle, the number of partitions being moved,
        and the size of each partition.
        """
        partition_info = self.kafka.parse_describe_topic(
            self.kafka.describe_topic(self.topic))
        self.logger.debug("Partitions before reassignment:" +
                          str(partition_info))
        max_num_moves = 0
        for i in range(0, self.num_partitions):
            old_replicas = set(partition_info["partitions"][i]["replicas"])
            new_part = (i+1) % self.num_partitions
            new_replicas = set(partition_info["partitions"][new_part]["replicas"])
            max_num_moves = max(len(new_replicas - old_replicas), max_num_moves)
            partition_info["partitions"][i]["partition"] = new_part
        self.logger.debug("Jumbled partitions: " + str(partition_info))

        self.kafka.execute_reassign_partitions(partition_info,
                                               throttle=throttle)
        start = time.time()
        if bounce_brokers:
            # bounce a few brokers at the same time
            self.clean_bounce_some_brokers()

        # Wait until finished or timeout
        size_per_broker = max_num_moves * self.partition_size
        self.logger.debug("Max amount of data transfer per broker: %fb",
                          size_per_broker)
        estimated_throttled_time = math.ceil(float(size_per_broker) /
                                             self.throttle)
        estimated_time_with_buffer = estimated_throttled_time * 2
        self.logger.debug("Waiting %ds for the reassignment to complete",
                          estimated_time_with_buffer)
        wait_until(lambda: self.kafka.verify_reassign_partitions(partition_info),
                   timeout_sec=estimated_time_with_buffer, backoff_sec=.5)
        stop = time.time()
        time_taken = stop - start
        self.logger.debug("Transfer took %d second. Estimated time : %ds",
                          time_taken,
                          estimated_throttled_time)
        assert time_taken >= estimated_throttled_time * 0.9, \
            ("Expected rebalance to take at least %ds, but it took %ds" % (
                estimated_throttled_time,
                time_taken))

    @cluster(num_nodes=10)
    @parametrize(bounce_brokers=True)
    @parametrize(bounce_brokers=False)
    def test_throttled_reassignment(self, bounce_brokers):
        security_protocol = 'PLAINTEXT'
        self.kafka.security_protocol = security_protocol
        self.kafka.interbroker_security_protocol = security_protocol

        producer_id = 'bulk_producer'
        bulk_producer = ProducerPerformanceService(
            context=self.test_context, num_nodes=1, kafka=self.kafka,
            topic=self.topic, num_records=self.num_records,
            record_size=self.record_size, throughput=-1, client_id=producer_id)

        self.producer = VerifiableProducer(context=self.test_context,
                                           num_nodes=1,
                                           kafka=self.kafka, topic=self.topic,
                                           message_validator=is_int,
                                           throughput=self.producer_throughput)

        self.consumer = ConsoleConsumer(self.test_context,
                                        self.num_consumers,
                                        self.kafka,
                                        self.topic,
                                        consumer_timeout_ms=60000,
                                        message_validator=is_int,
                                        from_beginning=False)

        self.kafka.start()
        bulk_producer.run()
        self.run_produce_consume_validate(
            core_test_action=lambda: self.reassign_partitions(bounce_brokers,
                                                              self.throttle))

        self.logger.debug("Bulk producer outgoing-byte-rates: %s",
                          (metric.value for k, metrics in
                          bulk_producer.metrics(group='producer-metrics', name='outgoing-byte-rate', client_id=producer_id) for
                          metric in metrics)
        )
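
The timing assertion in ThrottlingTest's reassign_partitions is easier to follow with this test's constants plugged in. A short sketch of the arithmetic, assuming max_num_moves comes out to 1 (i.e. each partition gains at most one new replica; the real value depends on the generated plan):

import math

# Constants copied from ThrottlingTest above.
num_records = 2000
record_size = 4096 * 100          # ~400 KB per record
num_partitions = 3
throttle = 4 * 1024 * 1024        # 4 MB/s replication throttle

partition_size = (num_records * record_size) / num_partitions  # ~273 MB
max_num_moves = 1                 # assumption; depends on the reassignment plan

size_per_broker = max_num_moves * partition_size
estimated_throttled_time = math.ceil(float(size_per_broker) / throttle)  # ~66 s
estimated_time_with_buffer = estimated_throttled_time * 2                # ~132 s

# The test waits at most estimated_time_with_buffer for the reassignment to
# verify, and asserts the observed time is at least 0.9 * estimated_throttled_time
# (roughly 59 seconds with these numbers).
print(estimated_throttled_time, estimated_time_with_buffer)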