Example #1
0
    def _failure_injector_loop(self):
        """Background loop that keeps injecting failures while enabled.

        Each pass applies the next scheduled failure through a fresh
        FailureInjector, then sleeps for a provider-chosen delay.
        """
        while self.enable_failures:
            injector = FailureInjector(self.redpanda)
            injector.inject_failure(self._next_failure())

            # NOTE(review): attribute spelling "provier" presumably matches
            # its definition elsewhere — don't "fix" only this call site
            pause = self.failure_delay_provier()
            self.redpanda.logger.info(
                f"waiting {pause} seconds before next failure")
            time.sleep(pause)
    def test_follower_isolation(self):
        """
        Isolate one follower of our partition (the second replica) and
        validate that the cluster remains available: the existing leader
        keeps serving produce/consume traffic throughout.
        """
        # Find which node is the leader
        initial_leader_id, replicas = self._wait_for_leader()
        # leadership is expected to sit on the first replica
        assert initial_leader_id == replicas[0]

        self._expect_available()

        leader_node = self.redpanda.get_node(initial_leader_id)
        self.logger.info(
            f"Initial leader {initial_leader_id} {leader_node.account.hostname}"
        )

        with FailureInjector(self.redpanda) as fi:
            # isolate one of the followers
            fi.inject_failure(
                FailureSpec(FailureSpec.FAILURE_ISOLATE,
                            self.redpanda.get_node(replicas[1])))

            # expect messages to be produced and consumed without a timeout
            for i in range(0, 128):
                self._ping_pong()
Example #3
0
    def test_id_allocator_leader_isolation(self):
        """
        Isolate id allocator leader. This test validates whether the cluster
        is still available when `kafka_internal/id_allocator` leader has been isolated.
        """
        admin = Admin(self.redpanda)
        self._expect_available()
        # Find which node is the leader for id allocator partition
        admin.wait_stable_configuration(namespace='kafka_internal',
                                        topic='id_allocator',
                                        replication=3)
        initial_leader_id = admin.get_partition_leader(
            namespace='kafka_internal', topic='id_allocator', partition=0)
        leader_node = self.redpanda.get_node(initial_leader_id)
        self.logger.info(
            f"kafka_internal/id_allocator/0 leader: {initial_leader_id}, node: {leader_node.account.hostname}"
        )

        self._expect_available()

        with FailureInjector(self.redpanda) as fi:
            # isolate id_allocator
            fi.inject_failure(
                FailureSpec(FailureSpec.FAILURE_ISOLATE,
                            self.redpanda.get_node(initial_leader_id)))

            # expect messages to be produced and consumed without a timeout;
            # the first exchange gets an explicit timeout and retries
            # (presumably to ride out leader re-election), the rest use defaults
            connection = self.ping_pong()
            connection.ping_pong(timeout_s=10, retries=10)
            for i in range(0, 127):
                connection.ping_pong()
Example #4
0
class ProcessKill(DisruptiveAction):
    """Disruptive action that kills the redpanda process on a target node.

    The action is reversible: `do_reverse_action` restarts redpanda on the
    most recently killed node and waits for the process to come back.
    """
    # How long to wait (and how often to poll) for a restarted process.
    PROCESS_START_WAIT_SEC = 20
    PROCESS_START_WAIT_BACKOFF = 2

    def __init__(self, redpanda: RedpandaService, config: ActionConfig,
                 admin: Admin):
        super().__init__(redpanda, config, admin)
        self.failure_injector = FailureInjector(self.redpanda)
        self.is_reversible = True

    def max_affected_nodes_reached(self):
        """Return True once the configured number of nodes has been killed."""
        return len(self.affected_nodes) >= self.config.max_affected_nodes

    def do_action(self):
        """Kill redpanda on an eligible node.

        Returns the affected node, or None when no usable node is available.
        """
        node = self.target_node()
        if not node:
            # nothing eligible right now; report and skip this round
            self.redpanda.logger.warning('no usable node')
            return None

        self.redpanda.logger.info(
            f'executing action on {node.account.hostname}')
        self.failure_injector.inject_failure(
            FailureSpec(FailureSpec.FAILURE_KILL, node))
        self.affected_nodes.add(node)
        self.last_affected_node = node

        # Update started_nodes so storage validations are run
        # on the correct set of nodes later.
        self.redpanda.remove_from_started_nodes(node)
        return node

    def do_reverse_action(self):
        """Restart redpanda on the last killed node and return that node."""
        self._start_rp(node=self.last_affected_node)
        self.affected_nodes.remove(self.last_affected_node)
        self.redpanda.add_to_started_nodes(self.last_affected_node)

        # clear the bookkeeping while still returning the node to the caller
        last_affected_node, self.last_affected_node = self.last_affected_node, None
        return last_affected_node

    def _start_rp(self, node):
        """Start redpanda on `node` and block until its pid is observable."""
        self.failure_injector._start(node)
        wait_until(
            lambda: self.redpanda.redpanda_pid(node),
            timeout_sec=self.PROCESS_START_WAIT_SEC,
            backoff_sec=self.PROCESS_START_WAIT_BACKOFF,
            err_msg=
            f'Failed to start redpanda process on {node.account.hostname}')
        def failure_injector_loop():
            """Background loop: inject a random failure, sleep, repeat while
            the closed-over `enable_failures` flag stays truthy."""
            f_injector = FailureInjector(self.redpanda)
            while enable_failures:
                f_type = random.choice(FailureSpec.FAILURE_TYPES)
                length = 0
                # allow suspending any node
                if f_type == FailureSpec.FAILURE_SUSPEND:
                    length = random.randint(1, 10)
                    node = random.choice(self.redpanda.nodes)
                else:
                    # kill/terminate only active nodes (not to influence the test outcome)
                    # NOTE(review): the -1 suggests active_nodes holds 1-based
                    # node ids while self.redpanda.nodes is 0-indexed — confirm
                    idx = random.choice(list(self.active_nodes)) - 1
                    node = self.redpanda.nodes[idx]

                f_injector.inject_failure(
                    FailureSpec(node=node, type=f_type, length=length))

                delay = random.randint(20, 45)
                self.redpanda.logger.info(
                    f"waiting {delay} seconds before next failure")
                time.sleep(delay)
Example #6
0
    def test_controller_node_isolation(self):
        """
        Isolate controller node, expect cluster to be available
        """
        def controller_available():
            # controller() yields None until a controller leader is known
            return self.redpanda.controller() is not None

        admin = Admin(self.redpanda)

        # wait for controller
        wait_until(controller_available,
                   timeout_sec=ELECTION_TIMEOUT * 2,
                   backoff_sec=1)

        initial_leader_id, replicas = self._wait_for_leader()
        # leadership is expected to sit on the first replica
        assert initial_leader_id == replicas[0]
        self._expect_available()

        # make sure id_allocator has a stable, fully replicated configuration
        # before we start cutting nodes off
        allocator_info = admin.wait_stable_configuration(
            "id_allocator",
            namespace="kafka_internal",
            replication=3,
            timeout_s=ELECTION_TIMEOUT * 2)

        # isolate controller
        with FailureInjector(self.redpanda) as fi:
            controller_id = self.redpanda.idx(
                self.redpanda.controller().account.hostname)
            fi.inject_failure(
                FailureSpec(FailureSpec.FAILURE_ISOLATE,
                            self.redpanda.controller()))

            # if the isolated controller also led id_allocator, wait for that
            # leadership to move to one of the still-reachable nodes
            if allocator_info.leader == controller_id:
                hosts = [
                    n.account.hostname for n in self.redpanda.nodes
                    if self.redpanda.idx(n) != controller_id
                ]
                admin.await_stable_leader(
                    "id_allocator",
                    namespace="kafka_internal",
                    replication=3,
                    timeout_s=ELECTION_TIMEOUT * 2,
                    hosts=hosts,
                    check=lambda node_id: node_id != controller_id)

        # verify produce/consume traffic still flows; the first exchange gets
        # an explicit timeout and retries, the remaining 127 use defaults
        connection = self.ping_pong()
        connection.ping_pong(timeout_s=10, retries=10)
        for i in range(0, 127):
            connection.ping_pong()
Example #7
0
        def failure_injector_loop():
            """Keep injecting random failures, with a random pause between
            them, while the closed-over `enable_failures` flag is truthy."""
            injector = FailureInjector(self.redpanda)
            while enable_failures:
                failure_type = random.choice(FailureSpec.FAILURE_TYPES)
                suspend_for = 0
                if failure_type == FailureSpec.FAILURE_SUSPEND:
                    # a suspend is recoverable, so any node may be picked
                    suspend_for = random.randint(
                        1, NodeOperationFuzzyTest.max_suspend_duration_seconds)
                    target = random.choice(self.redpanda.nodes)
                else:
                    # kill/terminate only active nodes (not to influence the test outcome)
                    target = self.redpanda.get_node(
                        random.choice(list(self.active_nodes)))

                injector.inject_failure(
                    FailureSpec(node=target,
                                type=failure_type,
                                length=suspend_for))

                delay = random.randint(
                    NodeOperationFuzzyTest.min_inter_failure_time,
                    NodeOperationFuzzyTest.max_inter_failure_time)
                self.redpanda.logger.info(
                    f"waiting {delay} seconds before next failure")
                time.sleep(delay)
Example #8
0
        def failure_injector_loop():
            """Inject random failures into non-suppressed nodes while the
            closed-over `failures` flag stays truthy.

            `failures` and `suppressed` are closed over from the enclosing
            test scope.
            """
            f_injector = FailureInjector(self.redpanda)
            while failures:
                f_type = random.choice(FailureSpec.FAILURE_TYPES)
                length = 0
                # re-draw until we land on a node that is not suppressed
                node = random.choice(self.redpanda.nodes)
                while self.redpanda.idx(node) in suppressed:
                    node = random.choice(self.redpanda.nodes)

                # allow suspending any node
                if f_type == FailureSpec.FAILURE_SUSPEND:
                    length = random.randint(
                        1,
                        ConsumerOffsetsMigrationTest.max_suspend_duration_sec)

                f_injector.inject_failure(
                    FailureSpec(node=node, type=f_type, length=length))

                delay = random.randint(
                    ConsumerOffsetsMigrationTest.min_inter_failure_time_sec,
                    ConsumerOffsetsMigrationTest.max_inter_failure_time_sec)
                self.redpanda.logger.info(
                    f"waiting {delay} seconds before next failure")
                time.sleep(delay)
Example #9
0
    def test_follower_isolation(self):
        """
        Isolate one follower of our partition — choosing one that is neither
        the partition leader nor the id_allocator leader — and validate that
        the cluster remains available for produce/consume traffic.
        """
        admin = Admin(self.redpanda)
        # Find which node is the leader
        initial_leader_id, replicas = self._wait_for_leader()
        # leadership is expected to sit on the first replica
        assert initial_leader_id == replicas[0]

        self._expect_available()

        leader_node = self.redpanda.get_node(initial_leader_id)
        self.logger.info(
            f"Initial leader {initial_leader_id} {leader_node.account.hostname}"
        )

        # wait for a stable id_allocator leader so we can avoid isolating it
        allocator_info = admin.wait_stable_configuration(
            "id_allocator",
            namespace="kafka_internal",
            replication=3,
            timeout_s=ELECTION_TIMEOUT * 2)

        # pick a follower that is neither the partition leader nor the
        # id_allocator leader
        follower = next((node for node in replicas
                         if node != initial_leader_id
                         and node != allocator_info.leader), None)
        assert follower is not None

        with FailureInjector(self.redpanda) as fi:
            # isolate one of the followers
            fi.inject_failure(
                FailureSpec(FailureSpec.FAILURE_ISOLATE,
                            self.redpanda.get_node(follower)))

            # expect messages to be produced and consumed without a timeout;
            # the first exchange gets an explicit timeout and retries
            connection = self.ping_pong()
            connection.ping_pong(timeout_s=10, retries=10)
            for i in range(0, 127):
                connection.ping_pong()
    def test_controller_node_isolation(self):
        """
        Isolate controller node, expect cluster to be available
        """
        def controller_available():
            # controller() yields None until a controller leader is known
            return self.redpanda.controller() is not None

        # wait for controller
        wait_until(controller_available, timeout_sec=10, backoff_sec=1)

        # isolate controller
        with FailureInjector(self.redpanda) as fi:
            fi.inject_failure(
                FailureSpec(FailureSpec.FAILURE_ISOLATE,
                            self.redpanda.controller()))

        # NOTE(review): this traffic check runs after the `with` block exits,
        # i.e. presumably once the isolation has been lifted — confirm intended
        for i in range(0, 128):
            self._ping_pong()
    def test_metadata_request_does_not_contain_failed_node(
            self, failure, node):
        """
        Check if broker list returned from metadata request does not contain node
        which is not alive

        `failure` selects the failure mode ("isolate" cuts the node off the
        network; any other value stops the process); `node` selects the
        target ("controller" or a random non-controller node).
        """
        # validate initial conditions
        # NOTE(review): `self.controller_present` is not called here — if it
        # is a method rather than a property, this lambda is always truthy;
        # confirm against its definition
        wait_until(lambda: self.controller_present, 10, 1)
        rpk = RpkTool(self.redpanda)
        nodes = rpk.cluster_info()
        assert len(nodes) == 3
        # metadata should initially report exactly the configured brokers
        redpanda_ids = [self.redpanda.idx(n) for n in self.redpanda.nodes]
        node_ids = [n.id for n in nodes]
        assert sorted(redpanda_ids) == sorted(node_ids)

        def get_node():
            # resolve the `node` parameter to a concrete cluster node
            if node == 'controller':
                return self.redpanda.controller()
            else:
                # re-draw until we land on a non-controller node
                n = self.redpanda.nodes[0]
                while n == self.redpanda.controller():
                    n = random.choice(self.redpanda.nodes)
                return n

        node = get_node()
        node_id = self.redpanda.idx(node)
        self.redpanda.logger.info(
            f"Injecting failure on node {node.account.hostname} with id: {node_id}",
        )
        with FailureInjector(self.redpanda) as fi:
            if failure == "isolate":
                fi.inject_failure(
                    FailureSpec(FailureSpec.FAILURE_ISOLATE, node))
            else:
                self.redpanda.stop_node(node)

            rpk = RpkTool(self.redpanda)

            def contains_only_alive_nodes():
                # the failed node must drop out of the reported broker list
                nodes = rpk.cluster_info()
                returned_ids = [n.id for n in nodes]
                return len(nodes) == 2 and node_id not in returned_ids

            wait_until(contains_only_alive_nodes, 30, 1)
Example #12
0
 def inject_failure(self, spec):
     """Build a one-off FailureInjector for this cluster and apply *spec*."""
     FailureInjector(self.redpanda).inject_failure(spec)
Example #13
0
 def __init__(self, redpanda: RedpandaService, config: ActionConfig,
              admin: Admin):
     """Set up the action with its own FailureInjector; marked reversible
     because the killed process can be restarted afterwards."""
     super().__init__(redpanda, config, admin)
     self.failure_injector = FailureInjector(self.redpanda)
     self.is_reversible = True