Example #1
    def test_self_transfer(self):
        admin = Admin(self.redpanda)
        for topic in self.topics:
            for partition in range(topic.partition_count):
                leader = admin.get_partitions(topic, partition)['leader_id']
                admin.partition_transfer_leadership("kafka", topic, partition,
                                                    leader)
Example #2
    def test_overlapping_changes(self):
        """
        Check that while a movement is in flight, rules about
        overlapping operations are properly enforced.
        """

        self.start_redpanda(num_nodes=4)
        node_ids = {1, 2, 3, 4}

        # Create topic with enough data that inter-node movement
        # will take a while.
        name = f"movetest"
        spec = TopicSpec(name=name, partition_count=1, replication_factor=3)
        self.client().create_topic(spec)

        # Wait for the partition to have a leader (`rpk produce` errors
        # out if it tries to write data before this)
        def partition_ready():
            return KafkaCat(self.redpanda).get_partition_leader(
                name, 0)[0] is not None

        wait_until(partition_ready, timeout_sec=10, backoff_sec=0.5)

        # Write a substantial amount of data to the topic
        msg_size = 512 * 1024
        write_bytes = 512 * 1024 * 1024
        producer = RpkProducer(self._ctx,
                               self.redpanda,
                               name,
                               msg_size=msg_size,
                               msg_count=int(write_bytes / msg_size))
        t1 = time.time()
        producer.start()

        # This is an absurdly low expected throughput, but necessarily
        # so to run reliably on current test runners, which share an EBS
        # backend among many parallel tests.  10MB/s has been empirically
        # shown to be too high an expectation.
        expect_bps = 1 * 1024 * 1024
        expect_runtime = write_bytes / expect_bps
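        # With the values above this works out to 512 MiB / 1 MiB/s = 512 s.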
        producer.wait(timeout_sec=expect_runtime)

        self.logger.info(
            f"Write complete {write_bytes} in {time.time() - t1} seconds")

        # - Admin API redirects writes but not reads.  Because we want synchronous
        #   status after submitting operations, send all operations to the controller
        #   leader.  This is not necessary for operations to work, just to simplify
        #   this test by letting it see synchronous status updates.
        # - Because we will later verify that a 503 is sent in response to
        #   a move request to an in_progress topic, set retry_codes=[] to
        #   disable default retries on 503.
        admin_node = self.redpanda.controller()
        admin = Admin(self.redpanda, default_node=admin_node, retry_codes=[])

        # Start an inter-node move, which should take some time
        # to complete because of recovery network traffic
        assignments = self._get_assignments(admin, name, 0)
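        # Choose a destination node that does not currently host a replica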
        new_node = list(node_ids - set([a['node_id'] for a in assignments]))[0]
        self.logger.info(f"old assignments: {assignments}")
        old_assignments = assignments
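        # Drop the first replica and append the new node (core 0) to force an inter-node move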
        assignments = assignments[1:] + [{'node_id': new_node, 'core': 0}]
        self.logger.info(f"new assignments: {assignments}")
        r = admin.set_partition_replicas(name, 0, assignments)
        r.raise_for_status()
        assert admin.get_partitions(name, 0)['status'] == "in_progress"

        # Another move should fail
        assert admin.get_partitions(name, 0)['status'] == "in_progress"
        try:
            r = admin.set_partition_replicas(name, 0, old_assignments)
        except requests.exceptions.HTTPError as e:
            assert e.response.status_code == 503
        else:
            raise RuntimeError(f"Expected 503 but got {r.status_code}")

        # An update to partition properties should succeed
        # (issue https://github.com/vectorizedio/redpanda/issues/2300)
        rpk = RpkTool(self.redpanda)
        assert admin.get_partitions(name, 0)['status'] == "in_progress"
        rpk.alter_topic_config(name, "retention.ms", "3600000")

        # A deletion should succeed
        assert name in rpk.list_topics()
        assert admin.get_partitions(name, 0)['status'] == "in_progress"
        rpk.delete_topic(name)
        assert name not in rpk.list_topics()
Example #3
class MaintenanceTest(RedpandaTest):
    topics = (TopicSpec(partition_count=10, replication_factor=3),
              TopicSpec(partition_count=20, replication_factor=3))

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.admin = Admin(self.redpanda)
        self.rpk = RpkTool(self.redpanda)
        self._use_rpk = True

    def _has_leadership_role(self, node):
        """
        Returns true if node is leader for some partition, and false otherwise.
        """
        id = self.redpanda.idx(node)
        partitions = self.admin.get_partitions(node=node)
        has_leadership = False
        for p in partitions:
            if p["leader"] == id:
                self.logger.debug(f"{node.name} has leadership for {p}")
                has_leadership = True
        return has_leadership

    def _in_maintenance_mode(self, node):
        status = self.admin.maintenance_status(node)
        return status["draining"]

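    # Fully in maintenance mode: draining has finished, with no errors and a
    # non-zero partition count reported.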
    def _in_maintenance_mode_fully(self, node):
        status = self.admin.maintenance_status(node)
        return status["finished"] and not status["errors"] and \
                status["partitions"] > 0

    def _verify_broker_metadata(self, maintenance_enabled, node):
        """
        Check that both broker interfaces in the admin server return the
        same maintenance-mode status, and that the status reflects whether
        draining has been enabled or disabled.
        """
        node_id = self.redpanda.idx(node)
        broker_target = self.admin.get_broker(node_id)
        broker_filtered = None
        for broker in self.admin.get_brokers():
            if broker['node_id'] == node_id:
                broker_filtered = broker
                break
        # both apis should return the same info
        if broker_filtered is None:
            return False
        status = broker_target['maintenance_status']
        if status != broker_filtered['maintenance_status']:
            return False
        # check for the expected status
        if maintenance_enabled:
            return status['draining'] and status['finished']
        else:
            return not status['draining']

    def _verify_maintenance_status(self, node, draining):
        """
        Check that the cluster reports maintenance status as expected through
        both the rpk status tooling and the raw admin interface.
        """
        # get status for this node via rpk
        node_id = self.redpanda.idx(node)
        statuses = self.rpk.cluster_maintenance_status()
        self.logger.debug(f"finding node_id {node_id} in rpk "
                          "maintenance status: {statuses}")
        rpk_status = None
        for status in statuses:
            if status.node_id == node_id:
                rpk_status = status
                break
        if rpk_status is None:
            return False

        # get status for this node via admin interface
        admin_status = self.admin.maintenance_status(node)
        self.logger.debug(f"maintenance status from admin for "
                          "{node.name}: {admin_status}")

        # ensure that both agree on expected outcome
        return admin_status["draining"] == rpk_status.draining == draining

    def _enable_maintenance(self, node):
        """
        1. Verifies that node is leader for some partitions
        2. Verifies node is not already in maintenance mode
        3. Requests that node enter maintenance mode (persistent interface)
        4. Verifies node enters maintenance mode
        5. Verifies that node has no leadership role
        6. Verifies that maintenance mode completes

        Note that there is a terminology issue that we need to work on. When we
        say that 'maintenance mode completes' it doesn't mean that the node
        leaves maintenance mode. What we mean is that it has entered maintenance
        mode and all of the work associated with that has completed.
        """
        self.logger.debug(
            f"Checking that node {node.name} has a leadership role")
        wait_until(lambda: self._has_leadership_role(node),
                   timeout_sec=60,
                   backoff_sec=10)

        self.logger.debug(
            f"Checking that node {node.name} is not in maintenance mode")
        wait_until(lambda: self._verify_maintenance_status(node, False),
                   timeout_sec=30,
                   backoff_sec=5)

        self.logger.debug(
            f"Waiting for node {node.name} to enter maintenance mode")
        if self._use_rpk:
            self.rpk.cluster_maintenance_enable(node, wait=True)
            # the node should now report itself in maintenance mode
            assert self._in_maintenance_mode(node), \
                    f"{node.name} not in expected maintenance mode"
        else:
            # when using the low-level admin interface, the barrier is
            # implemented with wait_until, querying the node directly
            self.admin.maintenance_start(node)
            wait_until(lambda: self._in_maintenance_mode(node),
                       timeout_sec=30,
                       backoff_sec=5)

        def has_drained():
            """
            as we wait for leadership to drain, also print out maintenance mode
            status. this is useful for debugging to detect if maintenance mode
            has been lost or disabled for some unexpected reason.
            """
            status = self.admin.maintenance_status(node)
            self.logger.debug(f"Maintenance status for {node.name}: {status}")
            return not self._has_leadership_role(node)

        self.logger.debug(f"Waiting for node {node.name} leadership to drain")
        wait_until(has_drained, timeout_sec=60, backoff_sec=10)

        self.logger.debug(
            f"Waiting for node {node.name} maintenance mode to complete")
        wait_until(lambda: self._in_maintenance_mode_fully(node),
                   timeout_sec=60,
                   backoff_sec=10)

        self.logger.debug("Verifying expected broker metadata reported "
                          f"for enabled maintenance mode on node {node.name}")
        wait_until(lambda: self._verify_broker_metadata(True, node),
                   timeout_sec=60,
                   backoff_sec=10)

    def _verify_cluster(self, target, target_expect):
        for node in self.redpanda.nodes:
            expect = False if node != target else target_expect
            wait_until(
                lambda: self._verify_maintenance_status(node, expect),
                timeout_sec=30,
                backoff_sec=5,
                err_msg=f"expected {node.name} maintenance mode: {expect}")

    def _maintenance_disable(self, node):
        if self._use_rpk:
            self.rpk.cluster_maintenance_disable(node)
        else:
            self.admin.maintenance_stop(node)

        wait_until(lambda: not self._in_maintenance_mode(node),
                   timeout_sec=30,
                   backoff_sec=5)

        wait_until(lambda: self._has_leadership_role(node),
                   timeout_sec=120,
                   backoff_sec=10)

        self.logger.debug("Verifying expected broker metadata reported "
                          f"for disabled maintenance mode on node {node.name}")
        wait_until(lambda: self._verify_broker_metadata(False, node),
                   timeout_sec=60,
                   backoff_sec=10)

    @cluster(num_nodes=3)
    @matrix(use_rpk=[True, False])
    def test_maintenance(self, use_rpk):
        self._use_rpk = use_rpk
        target = random.choice(self.redpanda.nodes)
        self._enable_maintenance(target)
        self._maintenance_disable(target)

    @cluster(num_nodes=3, log_allow_list=RESTART_LOG_ALLOW_LIST)
    @matrix(use_rpk=[True, False])
    def test_maintenance_sticky(self, use_rpk):
        self._use_rpk = use_rpk
        nodes = random.sample(self.redpanda.nodes, len(self.redpanda.nodes))
        for node in nodes:
            self._enable_maintenance(node)
            self._verify_cluster(node, True)

            self.redpanda.restart_nodes(node)
            self._verify_cluster(node, True)

            self._maintenance_disable(node)
            self._verify_cluster(node, False)

        self.redpanda.restart_nodes(self.redpanda.nodes)
        self._verify_cluster(None, False)

    @cluster(num_nodes=3)
    @matrix(use_rpk=[True, False])
    def test_exclusive_maintenance(self, use_rpk):
        self._use_rpk = use_rpk
        target, other = random.sample(self.redpanda.nodes, k=2)
        assert target is not other
        self._enable_maintenance(target)
        try:
            self._enable_maintenance(other)
        except RpkException as e:
            assert self._use_rpk
            if "invalid state transition" in e.msg and "400" in e.msg:
                return
        except requests.exceptions.HTTPError as e:
            assert not self._use_rpk
            if "invalid state transition" in e.response.text and e.response.status_code == 400:
                return
            raise
        except:
            raise
        else:
            raise Exception("Expected maintenance enable to fail")