class TestNodeReplacementWithIO(ManageTest):
    """
    Knip-894 Node replacement proactive with IO

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive_with_io_running(self, pvc_factory,
                                                       pod_factory,
                                                       dc_pod_factory):
        """
        Knip-894 Node Replacement proactive when IO running in the background

        """

        # Get worker nodes
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        osd_node_name = select_osd_node_name()

        log.info("Creating dc pod backed with rbd pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                rbd_dc_pod = dc_pod_factory(interface=constants.CEPHBLOCKPOOL,
                                            node_name=worker_node,
                                            size=20)
                pod.run_io_in_bg(rbd_dc_pod,
                                 expect_to_fail=False,
                                 fedora_dc=True)

        log.info("Creating dc pod backed with cephfs pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                cephfs_dc_pod = dc_pod_factory(
                    interface=constants.CEPHFILESYSTEM,
                    node_name=worker_node,
                    size=20)
                pod.run_io_in_bg(cephfs_dc_pod,
                                 expect_to_fail=False,
                                 fedora_dc=True)

        delete_and_create_osd_node(osd_node_name)

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()

        # Verify everything running fine
        log.info(
            "Verifying all resources are running and match the expected result")
        self.sanity_helpers.health_check(tries=120)
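
The init_sanity fixture used throughout these examples follows a standard pytest idiom: an autouse fixture defined on the test class runs before every test method and attaches a helper object to self. Below is a minimal, self-contained sketch of that pattern; FakeSanity is a hypothetical stand-in for the ocs-ci Sanity helper, used only for illustration.

import pytest


class FakeSanity:
    """Hypothetical stand-in for the ocs-ci Sanity helper."""

    def health_check(self):
        return True


class TestWithAutouseHelper:
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        # Runs automatically before every test method in this class
        self.sanity_helpers = FakeSanity()

    def test_helper_is_available(self):
        # The attribute set by the autouse fixture is visible to the test
        assert self.sanity_helpers.health_check()
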
Example 2
def test_deployment(pvc_factory, pod_factory):
    deploy = config.RUN['cli_params'].get('deploy')
    teardown = config.RUN['cli_params'].get('teardown')
    if not teardown or deploy:
        log.info("Verifying OCP cluster is running")
        assert is_cluster_running(config.ENV_DATA['cluster_path'])
        if not config.ENV_DATA['skip_ocs_deployment']:
            ocs_registry_image = config.DEPLOYMENT.get('ocs_registry_image')
            ocs_install_verification(ocs_registry_image=ocs_registry_image)

            # Default to 1 so a missing key does not break the comparison below
            nb_eps = config.DEPLOYMENT.get('noobaa_endpoints', 1)
            if nb_eps > 1:
                change_noobaa_endpoints_count(nb_eps)

            # Check basic cluster functionality by creating resources
            # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
            # run IO and delete the resources
            sanity_helpers = Sanity()
            sanity_helpers.health_check()
            sanity_helpers.create_resources(pvc_factory, pod_factory)
            sanity_helpers.delete_resources()

    if teardown:
        log.info(
            "Cluster will be destroyed during teardown part of this test.")
Example 3
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes
    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, pvc_factory, pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get the osd associated node name
        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"{osd_node_name} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(f"{osd_node_name} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Check that the pods are in Running state
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=200)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
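
The wait_for_resource_state() calls above implement a poll-until-condition-or-timeout pattern. Below is a generic, self-contained sketch of that pattern; poll_until() is a hypothetical helper, not the ocs-ci implementation.

import time


def poll_until(predicate, timeout=200, interval=5):
    """Call predicate() until it returns True or the timeout (seconds) expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    raise TimeoutError(f"condition not met within {timeout} seconds")


# Usage: wait until a (fake) pod status reports Running
pod_status = {'phase': 'Running'}
poll_until(lambda: pod_status['phase'] == 'Running', timeout=10, interval=1)
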
Example 4
class TestDetachAttachWorkerVolume(ManageTest):
    """
    Test class for detach and attach worker volume

    """
    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in NotReady status, to handle cases where the
        test failed after detaching a volume but before restarting the node,
        leaving nodes in NotReady

        """
        def finalizer():
            not_ready_nodes = [
                n for n in node.get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_NOT_READY
            ]
            if not_ready_nodes:
                logger.warning(
                    f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.polarion_id("OCS-1085")
    def test_detach_attach_worker_volume(self, nodes, pvc_factory,
                                         pod_factory):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be unhealthy)
          by creating resources and running IO
        - Wait for the volumes to be re-attached back to the worker node
        - Restart the node so the volume will get re-mounted

        """
        # Get a data volume
        data_volume = nodes.get_data_volumes()[0]

        # Get the worker node according to the volume attachment
        worker = nodes.get_node_by_attached_volume(data_volume)

        # Detach volume (logging is done inside the function)
        nodes.detach_volume(data_volume)

        # Validate cluster is still functional
        try:
            # In case the node whose volume disk was detached was the one
            # running the ceph tools pod, we'll need to wait for a new ct pod to start.
            # For that, a function that connects to the ct pod is used to check
            # that it's alive
            _ = get_admin_key()
        except CommandFailed as ex:
            if "connection timed out" in str(ex):
                logger.info(
                    "Ceph tools box was running on the node that its data "
                    "volume has been detached. Hence, waiting for a new "
                    "Ceph tools box pod to spin up")
                wait_for_resource_count_change(
                    func_to_use=get_all_pods,
                    previous_num=1,
                    namespace=config.ENV_DATA['cluster_namespace'],
                    timeout=120,
                    selector='app=rook-ceph-tools')
            else:
                raise
        finally:
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Wait for worker volume to be re-attached automatically to the node
        assert nodes.wait_for_volume_attach(data_volume), (
            f"Volume {data_volume.id} failed to be re-attached to a worker node"
        )

        # Restart the instance so the volume will get re-mounted
        nodes.restart_nodes([worker])

        # Cluster health check
        self.sanity_helpers.health_check()

    @pytest.mark.polarion_id("OCS-1086")
    def test_detach_attach_2_data_volumes(self, nodes, pvc_factory,
                                          pod_factory):
        """
        Detach and attach disk from 2 worker nodes

        - Detach 2 of the data volumes from their worker nodes
        - Wait for the volumes to be re-attached back to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Get 2 data volumes
        data_volumes = nodes.get_data_volumes()[:2]
        workers_and_volumes = [{
            'worker': nodes.get_node_by_attached_volume(vol),
            'volume': vol
        } for vol in data_volumes]

        for worker_and_volume in workers_and_volumes:
            # Detach the volume (logging is done inside the function)
            nodes.detach_volume(worker_and_volume['volume'])

        for worker_and_volume in workers_and_volumes:
            # Wait for worker volume to be re-attached automatically to the node
            assert nodes.wait_for_volume_attach(worker_and_volume['volume']), (
                f"Volume {worker_and_volume['volume']} "
                f"failed to be re-attached to a worker node")

        # Restart the instances so the volume will get re-mounted
        nodes.restart_nodes([
            worker_and_volume['worker']
            for worker_and_volume in workers_and_volumes
        ])

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
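
The teardown fixture in the class above relies on request.addfinalizer(), which registers a callback that pytest runs after the test finishes, whether it passed or failed. A minimal sketch of that pattern, with an illustrative cleanup action standing in for the node restart:

import pytest

cleanup_log = []


@pytest.fixture(autouse=True)
def teardown(request):
    def finalizer():
        # Stand-in for "restart NotReady nodes"; runs even if the test fails
        cleanup_log.append("cleanup performed")

    request.addfinalizer(finalizer)


def test_needs_cleanup():
    assert True
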
Example 5
class TestNodesMaintenance(ManageTest):
    """
    Test basic flows of maintenance (unschedule and drain) and
    activate operations, followed by cluster functionality and health checks

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @tier1
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1269")),
            pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1272"))
        ]
    )
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node
        typed_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain)
        node.drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier2
    @pytest.mark.skipif(
        condition=config.ENV_DATA['platform'] != 'AWS',
        reason="Tests are not running on AWS deployed cluster"
    )
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1292")),
            pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1293"))
        ]
    )
    def test_node_maintenance_restart_activate(
        self, ec2_instances, aws_obj, pvc_factory, pod_factory, node_type
    ):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node's ec2 instance
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node
        typed_node = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_node, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_node[0].name

        # Maintenance the node (unschedule and drain). The function contains logging
        node.drain_nodes([typed_node_name])

        instance = aws.get_instances_ids_and_names(typed_node)
        assert instance, f"Failed to get ec2 instances for node {typed_node_name}"

        # Restarting ec2 instance
        aws_obj.restart_ec2_instances(instances=instance, wait=True)

        node.wait_for_nodes_status(
            node_names=[typed_node_name], status=constants.NODE_READY_SCHEDULING_DISABLED
        )
        # Mark the node back to schedulable
        node.schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    @tier2
    @pytest.mark.parametrize(
        argnames=["nodes_type"],
        argvalues=[
            pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1273")),
            pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1271"))
        ]
    )
    def test_2_nodes_maintenance_same_type(
        self, pvc_factory, pod_factory, nodes_type
    ):
        """
        OCS-1273/OCS-1271:
        - Maintenance (mark as unschedulable and drain) 2 worker/master nodes
        - Mark the nodes as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 2 nodes
        typed_nodes = node.get_typed_nodes(node_type=nodes_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {nodes_type} node for the test"

        typed_node_names = [typed_node.name for typed_node in typed_nodes]

        # Maintenance the nodes (unschedule and drain)
        node.drain_nodes(typed_node_names)

        # Mark the nodes back to schedulable
        node.schedule_nodes(typed_node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    @tier2
    @pytest.mark.polarion_id("OCS-1274")
    def test_2_nodes_different_types(self, pvc_factory, pod_factory):
        """
        OCS-1274:
        - Maintenance (mark as unschedulable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node from each type
        nodes = [
            node.get_typed_nodes(
                node_type=node_type, num_of_nodes=1
            )[0] for node_type in ['worker', 'master']
        ]
        assert nodes, f"Failed to find a nodes for the test"

        node_names = [typed_node.name for typed_node in nodes]

        # Maintenance the nodes (unschedule and drain)
        node.drain_nodes(node_names)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the nodes back to schedulable
        node.schedule_nodes(node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
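
The parametrized tests above attach Polarion IDs to individual parameter sets via pytest.param(..., marks=...). The sketch below shows the same mechanism with a made-up case_id mark standing in for polarion_id; custom marks should be registered in pytest.ini to avoid warnings.

import pytest


@pytest.mark.parametrize(
    "node_type",
    [
        pytest.param("worker", marks=pytest.mark.case_id("OCS-1269")),
        pytest.param("master", marks=pytest.mark.case_id("OCS-1272")),
    ],
)
def test_node_type_is_known(node_type):
    assert node_type in ("worker", "master")
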
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes
    """
    @pytest.fixture(autouse=True)
    def teardown(self, request):
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(*['rbd'], marks=pytest.mark.polarion_id("OCS-2100")),
            pytest.param(*['cephfs'],
                         marks=pytest.mark.polarion_id("OCS-2101")),
        ])
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, interface, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == 'rbd' else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        msg = "Common OSD and app running node(s) NOT found"
        assert (len(common_nodes) > 0), msg
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods on the failed node will get automatically created on
        # another running node. Waiting for all dc app pods to reach Running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        # Check all OCS pods' status; they should be in Running state
        all_pod_obj = pod.get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        for pod_obj in all_pod_obj:
            if all(s not in pod_obj.name for s in ('-1-deploy', 'ocs-deviceset')):
                try:
                    helpers.wait_for_resource_state(
                        resource=pod_obj,
                        state=constants.STATUS_RUNNING,
                        timeout=200)
                except ResourceWrongStatusException:
                    # 'rook-ceph-crashcollector' on the failed node gets stuck
                    # in Pending state. BZ 1810014 tracks it.
                    # Ignoring 'rook-ceph-crashcollector' pod health check as
                    # WA and deleting its deployment so that the pod
                    # disappears. Will revert this WA once the BZ is fixed
                    if 'rook-ceph-crashcollector' in pod_obj.name:
                        ocp_obj = ocp.OCP(
                            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                        pod_name = pod_obj.name
                        deployment_name = '-'.join(pod_name.split("-")[:-2])
                        command = f"delete deployment {deployment_name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")
                    else:
                        raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
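
The pod-name filter in the class above is easy to get wrong: an expression like ('-1-deploy' or 'ocs-deviceset') collapses to the first truthy string, so only one substring is ever checked. The explicit all() form checks every substring. A small self-contained demonstration:

name = "ocs-deviceset-0-data-abc"

# The parenthesised `or` evaluates to '-1-deploy', so 'ocs-deviceset' is never checked
broken = ('-1-deploy' or 'ocs-deviceset') not in name
# Explicitly require that none of the substrings appear in the name
fixed = all(s not in name for s in ('-1-deploy', 'ocs-deviceset'))

assert broken is True   # wrongly treats the deviceset pod as a candidate
assert fixed is False   # correctly excludes it
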
Example 7
class TestNodesRestart(ManageTest):
    """
    Test ungraceful cluster shutdown
    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Make sure all nodes are up again

        """
        def finalizer():
            nodes.restart_nodes_by_stop_and_start_teardown()

        request.addfinalizer(finalizer)

    @pytest.mark.parametrize(
        argnames=["force"],
        argvalues=[
            pytest.param(*[True], marks=pytest.mark.polarion_id("OCS-894")),
            pytest.param(*[False],
                         marks=[
                             pytest.mark.polarion_id("OCS-895"),
                             aws_platform_required
                         ])
        ])
    def test_nodes_restart(self, nodes, pvc_factory, pod_factory, force):
        """
        Test nodes restart (from the platform layer, i.e., EC2 instances, VMware VMs)

        """
        ocp_nodes = get_node_objs()
        nodes.restart_nodes_by_stop_and_start(nodes=ocp_nodes, force=force)
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @bugzilla('1754287')
    @pytest.mark.polarion_id("OCS-2015")
    def test_rolling_nodes_restart(self, nodes, pvc_factory, pod_factory):
        """
        Test restart nodes one after the other and check health status in between

        """
        ocp_nodes = get_node_objs()
        for ocp_node in ocp_nodes:
            nodes.restart_nodes(nodes=[ocp_node], wait=False)
            self.sanity_helpers.health_check(cluster_check=False, tries=60)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @pytest.mark.parametrize(
        argnames=["interface", "operation"],
        argvalues=[
            pytest.param(*['rbd', 'create_resources'],
                         marks=pytest.mark.polarion_id("OCS-1138")),
            pytest.param(*['rbd', 'delete_resources'],
                         marks=pytest.mark.polarion_id("OCS-1241")),
            pytest.param(*['cephfs', 'create_resources'],
                         marks=pytest.mark.polarion_id("OCS-1139")),
            pytest.param(*['cephfs', 'delete_resources'],
                         marks=pytest.mark.polarion_id("OCS-1242"))
        ])
    def test_pv_provisioning_under_degraded_state_stop_provisioner_pod_node(
            self, nodes, pvc_factory, pod_factory, interface, operation):
        """
        Test PV provisioning under degraded state -
        stop the node that has the provisioner pod running on it

        OCS-1138:
        - Stop 1 worker node that has the RBD provisioner
          pod running on it
        - Wait for the RBD provisioner pod to come up again and reach Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1241:
        - Stop 1 worker node that has the RBD provisioner
          pod running on it
        - Wait for the RBD provisioner pod to come up again and reach Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by deleting resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1139:
        - Stop 1 worker node that has the CephFS provisioner
          pod running on it
        - Wait for the CephFS provisioner pod to come up again and reach Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1242:
        - Stop 1 worker node that has the CephFS provisioner
          pod running on it
        - Wait for the CephFS provisioner pod to come up again and reach Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by deleting resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        """
        if operation == 'delete_resources':
            # Create resources whose deletion will be tested later
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        provisioner_pods = None
        # Get the provisioner pod according to the interface
        if interface == 'rbd':
            provisioner_pods = pod.get_rbdfsplugin_provisioner_pods()
        elif interface == 'cephfs':
            provisioner_pods = pod.get_cephfsplugin_provisioner_pods()
        provisioner_pod = provisioner_pods[0]

        # Making sure that the node is not running the rook operator pod:
        provisioner_node = pod.get_pod_node(provisioner_pod)
        rook_operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(rook_operator_pod)
        if operator_node.get().get('metadata').get(
                'name') == provisioner_node.get().get('metadata').get('name'):
            provisioner_pod = provisioner_pods[1]

        provisioner_pod_name = provisioner_pod.name
        logger.info(
            f"{interface} provisioner pod found: {provisioner_pod_name}")

        # Get the node name that has the provisioner pod running on it
        provisioner_node = pod.get_pod_node(provisioner_pod)
        provisioner_node_name = provisioner_node.get().get('metadata').get(
            'name')
        logger.info(
            f"{interface} provisioner pod is running on node {provisioner_node_name}"
        )

        # Stopping the nodes
        nodes.stop_nodes(nodes=[provisioner_node])

        # Wait for the provisioner pod to get to running status
        selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL if (
            interface
            == 'rbd') else constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL

        # Wait for the provisioner pod to reach Terminating status
        logger.info(
            f"Waiting for pod {provisioner_pod_name} to reach status Terminating"
        )
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=provisioner_pod.name,
            condition=constants.STATUS_TERMINATING
        ), f"{interface} provisioner pod failed to reach status Terminating"
        logger.info(
            f"Pod {provisioner_pod_name} has reached status Terminating")

        # Wait for the provisioner pod to be started and reach running status
        logger.info(
            f"Waiting for {interface} provisioner pod to reach status Running")
        # After this change https://github.com/rook/rook/pull/3642/, there are
        # 2 provisioners for each interface
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=2
        ), f"{interface} provisioner pod failed to reach status Running"

        logger.info(f"{interface} provisioner pod has reached status Running")
        if operation == 'create_resources':
            # Cluster validation (resources creation and IO running)
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        elif operation == 'delete_resources':
            # Cluster validation (resources deletion)
            self.sanity_helpers.delete_resources()

        # Starting the nodes
        nodes.start_nodes(nodes=[provisioner_node])

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()

    @pytest.mark.parametrize(
        argnames=["operation"],
        argvalues=[
            pytest.param(*['create_resources'],
                         marks=[pytest.mark.polarion_id("OCS-2016")]),
            pytest.param(*['delete_resources'],
                         marks=[pytest.mark.polarion_id("OCS-2017")]),
        ])
    def test_pv_provisioning_under_degraded_state_stop_rook_operator_pod_node(
            self, nodes, pvc_factory, pod_factory, operation):
        """
        Test PV provisioning under degraded state -
        stop the node that has the rook operator pod running on it

        OCS-2016:
        - Stop 1 worker node that has the rook ceph operator pod running on it
        - Wait for the rook ceph operator pod to come up again and reach Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-2017:
        - Stop 1 worker node that has the rook ceph operator pod running on it
        - Wait for the rook ceph operator pod to come up again and reach Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by deleting resources
        - Start the worker node
        - Check cluster and Ceph health
        """
        if operation == 'delete_resources':
            # Create resources whose deletion will be tested later
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        rook_operator_pods = pod.get_operator_pods()
        rook_operator_pod = rook_operator_pods[0]

        rook_operator_pod_name = rook_operator_pod.name
        logger.info(f"rook operator pod found: {rook_operator_pod_name}")

        # Get the node name that has the rook operator pod running on it
        operator_node = pod.get_pod_node(rook_operator_pod)
        operator_node_name = operator_node.get().get('metadata').get('name')
        logger.info(
            f"{rook_operator_pod_name} pod is running on node {operator_node_name}"
        )

        # Stopping the node
        nodes.stop_nodes(nodes=[operator_node])

        # Wait for the rook operator pod to get to running status
        selector = constants.OPERATOR_LABEL

        # Wait for the rook operator pod to reach Terminating status
        logger.info(
            f"Waiting for pod {rook_operator_pod_name} to reach status Terminating"
        )
        assert rook_operator_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=rook_operator_pod_name,
            condition=constants.STATUS_TERMINATING
        ), "rook operator pod failed to reach status Terminating"
        logger.info(
            f"Pod {rook_operator_pod_name} has reached status Terminating")

        # Wait for the rook operator pod to be started and reach running status
        logger.info(
            f"Waiting for pod {rook_operator_pod_name} to reach status Running"
        )

        assert rook_operator_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=1
        ), "rook operator pod failed to reach status Running"
        logger.info("rook operator pod has reached status Running")

        assert wait_for_ct_pod_recovery(
        ), "Ceph tools pod failed to come up on another node"

        if operation == 'create_resources':
            # Cluster validation (resources creation and IO running)
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        elif operation == 'delete_resources':
            # Cluster validation (resources deletion)
            self.sanity_helpers.delete_resources()

        # Starting the nodes
        nodes.start_nodes(nodes=[operator_node])

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()
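
The provisioner selection above avoids picking a provisioner pod that runs on the same node as the rook operator, falling back to the second provisioner when they are co-located. A generic sketch of that co-location check, using illustrative pod/node data instead of the real cluster queries:

provisioners = [
    {"name": "csi-rbdplugin-provisioner-a", "node": "worker-1"},
    {"name": "csi-rbdplugin-provisioner-b", "node": "worker-2"},
]
operator_node = "worker-1"

# Pick the first provisioner that is not co-located with the operator pod
chosen = next(p for p in provisioners if p["node"] != operator_node)
assert chosen["name"] == "csi-rbdplugin-provisioner-b"
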
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes - Reactive
    """
    threads = []

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")
            for thread in self.threads:
                thread.join()
            ceph_health_check()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(*['rbd', 'shutdown'],
                         marks=[
                             pytest.mark.polarion_id("OCS-2102"),
                             pytest.mark.bugzilla("1830015")
                         ]),
            pytest.param(*['rbd', 'terminate'],
                         marks=pytest.mark.polarion_id("OCS-2103")),
            pytest.param(*['cephfs', 'shutdown'],
                         marks=[
                             pytest.mark.polarion_id("OCS-2104"),
                             pytest.mark.bugzilla("1830015")
                         ]),
            pytest.param(*['cephfs', 'terminate'],
                         marks=pytest.mark.polarion_id("OCS-2105")),
        ])
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
            self, nodes, pvc_factory, pod_factory, failure, dc_pod_factory,
            interface):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)
        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: "
                     f"{failure_node_obj[0].name}")
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully terminated node : "
                     f"{failure_node_obj[0].name} instance")

        try:
            # DC app pods on the failed node will get automatically created on
            # another running node. Waiting for all dc app pods to reach Running state
            pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                            timeout=720)
            log.info("All the dc pods reached running state")

            # Check all OCS pods' status; they should be in Running state
            all_pod_obj = pod.get_all_pods(
                namespace=defaults.ROOK_CLUSTER_NAMESPACE)
            for pod_obj in all_pod_obj:
                if all(s not in pod_obj.name for s in ('-1-deploy', 'ocs-deviceset')):
                    # 'rook-ceph-crashcollector' on the failed node gets stuck
                    # in Pending state. BZ 1810014 tracks it.
                    # Ignoring 'rook-ceph-crashcollector' pod health check as
                    # WA and deleting its deployment so that the pod
                    # disappears. Will revert this WA once the BZ is fixed
                    if 'rook-ceph-crashcollector' in pod_obj.name:
                        ocp_obj = ocp.OCP(
                            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                        pod_name = pod_obj.name
                        deployment_name = '-'.join(pod_name.split("-")[:-2])
                        command = f"delete deployment {deployment_name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")
                        continue

                    helpers.wait_for_resource_state(
                        resource=pod_obj,
                        state=constants.STATUS_RUNNING,
                        timeout=240)
        except ResourceWrongStatusException:
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(f"Successfully terminated node : "
                         f"{failure_node_obj[0].name} instance")
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
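
The reactive-recovery class above keeps the objects returned by run_io_in_bg() in self.threads and joins them in the teardown finalizer so no background IO is left running after the test. A self-contained sketch of that bookkeeping, with a trivial worker function standing in for the IO load:

import threading
import time

threads = []
stop_io = threading.Event()


def fake_io_worker():
    # Stand-in for the background IO issued by run_io_in_bg()
    while not stop_io.is_set():
        time.sleep(0.1)


io_thread = threading.Thread(target=fake_io_worker)
io_thread.start()
threads.append(io_thread)

# ... the test body would run here ...

stop_io.set()            # ask the background worker to finish
for thread in threads:
    thread.join()        # same join step as the teardown finalizer above
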
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes
    """
    @pytest.fixture(autouse=True)
    def teardown(self, request):
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(*['rbd'], marks=pytest.mark.polarion_id("OCS-2100")),
            pytest.param(*['cephfs'],
                         marks=pytest.mark.polarion_id("OCS-2101")),
        ])
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, interface, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == 'rbd' else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        msg = "Common OSD and app running node(s) NOT found"
        assert (len(common_nodes) > 0), msg
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods on the failed node will get automatically created on
        # another running node. Waiting for all dc app pods to reach Running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        pod.wait_for_storage_pods()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example 10
class TestDetachAttachWorkerVolumeAWS(ManageTest):
    """
    Test class for detach and attach worker volume

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.polarion_id("OCS-1085")
    def test_detach_attach_worker_volume(self, aws_obj, pvc_factory, pod_factory):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be unhealthy)
          by creating resources and running IO
        - Attach back the volume to the node
        - Restart the node so the volume will get re-mounted

        """
        # Requesting 1 worker node for the test as this case includes detach and
        # attach of data volume of 1 worker node
        worker = node.get_typed_nodes(num_of_nodes=1)
        assert worker, "Failed to find a worker node for the test"
        worker = worker[0]

        # Get the worker node's ec2 instance ID and name
        instance = aws.get_instances_ids_and_names([worker])
        assert instance, f"Failed to get ec2 instances for node {worker.name}"

        instance_id = [*instance][0]

        # Get the ec2 instance data volume Volume instance
        ec2_volume = aws.get_data_volumes(instance_id)[0]

        # Detach volume (logging is done inside the function)
        aws_obj.detach_volume(ec2_volume)

        # Validate cluster is still functional
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Attach volume (logging is done inside the function)
        aws_obj.attach_volume(ec2_volume, instance_id)

        # Restart the instance so the volume will get re-mounted
        aws_obj.restart_ec2_instances(instances=instance, wait=True)

        # Cluster health check
        self.sanity_helpers.health_check()

    @pytest.mark.polarion_id("OCS-1086")
    def test_detach_attach_2_workers_volumes(self, aws_obj, pvc_factory, pod_factory):
        """
        Detach and attach disk from 2 worker nodes

        - Detach the data volume from 2 of the worker nodes
        - Attach back the volume to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Requesting 2 worker nodes for the test as this case includes
        # detach and attach of data volumes of 2 worker nodes
        workers = node.get_typed_nodes(num_of_nodes=2)
        assert workers, "Failed to find worker nodes for the test"

        # Get the worker nodes ec2 instance IDs and names
        instances = aws.get_instances_ids_and_names(workers)
        assert instances, (
            f"Failed to get ec2 instances for node {[w.name for w in workers]}"
        )

        for instance_id in instances:

            # Get the ec2 instance data volume Volume instance
            ec2_volume = aws.get_data_volumes(instance_id)[0]

            # Detach volume (logging is done inside the function)
            aws_obj.detach_volume(ec2_volume)

            # Attach volume (logging is done inside the function)
            aws_obj.attach_volume(ec2_volume, instance_id)

        # Restart the instances so the volume will get re-mounted
        aws_obj.restart_ec2_instances(instances=instances, wait=True)

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
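
The instance-id extraction in the class above uses [*mapping][0], which unpacks the dict keys into a list and takes the first one; next(iter(mapping)) achieves the same without building a list. A short illustration, using a made-up mapping in the shape the AWS helper appears to return (instance id -> node name):

instance = {"i-0abc123": "worker-node-1"}  # illustrative id -> name mapping

instance_id_unpacked = [*instance][0]
instance_id_iter = next(iter(instance))

assert instance_id_unpacked == instance_id_iter == "i-0abc123"
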
Example 11
class TestNodeReplacement(ManageTest):
    """
    Knip-894 Node replacement - AWS-IPI-Reactive
    """
    @pytest.fixture(autouse=True)
    def teardown(self, request):

        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(
                *['rbd', 'power off'],
                marks=pytest.mark.polarion_id("OCS-2118")
            ),
            pytest.param(
                *['rbd', 'network failure'],
                marks=pytest.mark.polarion_id("OCS-2120")
            ),
            pytest.param(
                *['cephfs', 'power off'],
                marks=pytest.mark.polarion_id("OCS-2119")
            ),
            pytest.param(
                *['cephfs', 'network failure'],
                marks=pytest.mark.polarion_id("OCS-2121")
            ),
        ]
    )
    def test_node_replacement_reactive_aws_ipi(
        self, nodes, pvc_factory, pod_factory, dc_pod_factory,
        failure, interface
    ):
        """
        Knip-894 Node replacement - AWS-IPI-Reactive

        """
        # Get worker nodes
        initial_nodes = get_worker_nodes()

        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(
                interface=interface, node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}"
        )

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "power off":
            # Power off AWS worker node instance
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: {failure_node_obj[0].name}")
        elif failure == "network failure":
            # Induce Network failure
            node_network_failure([failure_node_obj[0].name])

        # Add annotation to the failed node
        annotation = "machine.openshift.io/exclude-node-draining=''"
        machine.add_annotation_to_machine(
            annotation=annotation, machine_name=machine_name
        )

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Wait for the new machine to spin
        log.info("Waiting for the new node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)

        # Get the node name of new spun node
        nodes_after_new_spun_node = get_worker_nodes()
        new_spun_node = list(
            set(nodes_after_new_spun_node) - set(initial_nodes)
        )
        log.info(f"New spun node is {new_spun_node}")

        # Label it
        node_obj = ocp.OCP(kind='node')
        node_obj.add_label(
            resource_name=new_spun_node[0],
            label=constants.OPERATOR_NODE_LABEL
        )
        log.info(
            f"Successfully labeled {new_spun_node} with OCS storage label"
        )

        # DC app pods on the failed node will get automatically created on
        # another running node. Waiting for all dc app pods to reach Running state
        pod.wait_for_dc_app_pods_to_reach_running_state(
            dc_pod_obj, timeout=1200
        )
        log.info("All the dc pods reached running state")

        # Check all OCS pods' status; they should be in Running state
        all_pod_obj = pod.get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE
        )
        for pod_obj in all_pod_obj:
            if all(s not in pod_obj.name for s in ('-1-deploy', 'ocs-deviceset')):
                try:
                    helpers.wait_for_resource_state(
                        resource=pod_obj, state=constants.STATUS_RUNNING,
                        timeout=1800
                    )
                except ResourceWrongStatusException:
                    # 'rook-ceph-crashcollector' on the failed node gets stuck
                    # in Pending state. BZ 1810014 tracks it.
                    # Ignoring 'rook-ceph-crashcollector' pod health check as
                    # WA and deleting its deployment so that the pod
                    # disappears. Will revert this WA once the BZ is fixed
                    if 'rook-ceph-crashcollector' in pod_obj.name:
                        ocp_obj = ocp.OCP(
                            namespace=defaults.ROOK_CLUSTER_NAMESPACE
                        )
                        pod_name = pod_obj.name
                        deployment_name = '-'.join(pod_name.split("-")[:-2])
                        command = f"delete deployment {deployment_name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
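
The newly provisioned worker in the test above is identified by a set difference between the worker list captured before the failure and the list captured after the machine is replaced. A minimal illustration with made-up node names:

initial_nodes = ["worker-1", "worker-2", "worker-3"]
nodes_after_new_spun_node = ["worker-1", "worker-2", "worker-3", "worker-4"]

new_spun_node = list(set(nodes_after_new_spun_node) - set(initial_nodes))
assert new_spun_node == ["worker-4"]
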
Example 12
class TestNodesRestart(ManageTest):
    """
    Test ungraceful cluster shutdown
    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["force"],
        argvalues=[
            pytest.param(*[True], marks=pytest.mark.polarion_id("OCS-894")),
            pytest.param(*[False], marks=pytest.mark.polarion_id("OCS-895"))
        ])
    def test_nodes_restart_aws(self, ec2_instances, aws_obj, pvc_factory,
                               pod_factory, force):
        """
        Test ungraceful cluster shutdown - AWS
        """
        aws_obj.restart_ec2_instances(instances=ec2_instances,
                                      wait=True,
                                      force=force)
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @pytest.mark.parametrize(
        argnames=["interface", "operation"],
        argvalues=[
            pytest.param(*['rbd', 'create_resources'],
                         marks=pytest.mark.polarion_id("OCS-1138")),
            pytest.param(*['rbd', 'delete_resources'],
                         marks=pytest.mark.polarion_id("OCS-1241")),
            pytest.param(*['cephfs', 'create_resources'],
                         marks=pytest.mark.polarion_id("OCS-1139")),
            pytest.param(*['cephfs', 'delete_resources'],
                         marks=pytest.mark.polarion_id("OCS-1242"))
        ])
    def test_pv_provisioning_under_degraded_state(self, ec2_instances, aws_obj,
                                                  pvc_factory, pod_factory,
                                                  interface, operation):
        """
        Test PV provisioning under degraded state

        OCS-1138:
        - Stop the ec2 instance of 1 worker node that has the RBD provisioner
          pod running on it
        - Wait for the RBD provisioner pod to come up again and reach Running
          status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by creating resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        OCS-1241:
        - Stop the ec2 instance of 1 worker node that has the RBD provisioner
          pod running on it
        - Wait for the RBD provisioner pod to come up again and reach Running
          status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by deleting resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        OCS-1139:
        - Stop the ec2 instance of 1 worker node that has the CephFS
          provisioner pod running on it
        - Wait for the CephFS provisioner pod to come up again and reach
          Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by creating resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        OCS-1242:
        - Stop the ec2 instance of 1 worker node that has the CephFS
          provisioner pod running on it
        - Wait for the CephFS provisioner pod to come up again and reach
          Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by deleting resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        """
        if operation == 'delete_resources':
            # Create resources that their deletion will be tested later
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        provisioner_pod = None

        # Get the provisioner pod according to the interface
        if interface == 'rbd':
            provisioner_pod = pod.get_rbdfsplugin_provisioner_pods()[0]
        elif interface == 'cephfs':
            provisioner_pod = pod.get_cephfsplugin_provisioner_pods()[0]
        provisioner_pod_name = provisioner_pod.name
        logger.info(
            f"{interface} provisioner pod found: {provisioner_pod_name}")

        # Get the node name that has the provisioner pod running on
        provisioner_node = pod.get_pod_node(provisioner_pod)
        provisioner_node_name = provisioner_node.get('metadata').get('name')
        logger.info(
            f"{interface} provisioner pod is running on node {provisioner_node_name}"
        )

        # Get the ec2 instance of the node
        instances = aws.get_instances_ids_and_names([provisioner_node])
        assert instances, (
            f"Failed to get ec2 instances for node {provisioner_node_name}")

        # Stopping the nodes
        aws_obj.stop_ec2_instances(instances=instances, wait=True)

        # Wait for the provisioner pod to get to running status
        selector = (
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL if interface == 'rbd'
            else constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
        )

        # Wait for the provisioner pod to reach Terminating status
        logger.info(
            f"Waiting for pod {provisioner_pod_name} to reach status Terminating"
        )
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=provisioner_pod.name,
            condition=constants.STATUS_TERMINATING
        ), f"{interface} provisioner pod failed to reach status Terminating"
        logger.info(
            f"Pod {provisioner_pod_name} has reached status Terminating")

        # Wait for the provisioner pods to be started and reach Running status
        logger.info(
            f"Waiting for the {interface} provisioner pods to reach status Running")

        # After this change https://github.com/rook/rook/pull/3642/, there are
        # 2 provisioners for each interface
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=2
        ), f"{interface} provisioner pods failed to reach status Running"
        logger.info(f"{interface} provisioner pods have reached status Running")

        if operation == 'create_resources':
            # Cluster validation (resources creation and IO running)
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        elif operation == 'delete_resources':
            # Cluster validation (resources deletion)
            self.sanity_helpers.delete_resources()

        # Starting the nodes
        aws_obj.start_ec2_instances(instances=instances, wait=True)

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()
Example 13
class TestNodesMaintenance(ManageTest):
    """
    Test basic flows of maintenance (unschedule and drain) and
    activate operations, followed by cluster functionality and health checks

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @tier1
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'],
                         marks=pytest.mark.polarion_id("OCS-1269")),
            pytest.param(*['master'],
                         marks=pytest.mark.polarion_id("OCS-1272"))
        ])
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain)
        drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier4
    @tier4b
    @aws_platform_required
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'],
                         marks=pytest.mark.polarion_id("OCS-1292")),
            pytest.param(*['master'],
                         marks=[
                             pytest.mark.polarion_id("OCS-1293"),
                             bugzilla('1754287')
                         ])
        ])
    def test_node_maintenance_restart_activate(self, nodes, pvc_factory,
                                               pod_factory, node_type):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain). The function contains logging
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=True)

        wait_for_nodes_status(node_names=[typed_node_name],
                              status=constants.NODE_READY_SCHEDULING_DISABLED)
        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    @tier3
    @pytest.mark.parametrize(
        argnames=["nodes_type"],
        argvalues=[
            pytest.param(*['worker'],
                         marks=pytest.mark.polarion_id("OCS-1273")),
            pytest.param(*['master'],
                         marks=pytest.mark.polarion_id("OCS-1271"))
        ])
    def test_2_nodes_maintenance_same_type(self, nodes_type):
        """
        OCS-1273/OCS-1271:
        - Try draining 2 nodes of the same type - should fail
        - Check cluster and Ceph health

        """
        # Get 2 nodes
        typed_nodes = get_typed_nodes(node_type=nodes_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {nodes_type} node for the test"

        typed_node_names = [typed_node.name for typed_node in typed_nodes]

        # Try draining 2 nodes - should fail
        try:
            drain_nodes(typed_node_names)
        except TimeoutExpired:
            logger.info(
                f"Draining of nodes {typed_node_names} failed as expected")

        schedule_nodes(typed_node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier2
    @pytest.mark.polarion_id("OCS-1274")
    def test_2_nodes_different_types(self, pvc_factory, pod_factory):
        """
        OCS-1274:
        - Maintenance (mark as unschedulable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node from each type
        nodes = [
            get_typed_nodes(node_type=node_type, num_of_nodes=1)[0]
            for node_type in ['worker', 'master']
        ]
        assert nodes, f"Failed to find a nodes for the test"

        node_names = [typed_node.name for typed_node in nodes]

        # Maintenance the nodes (unschedule and drain)
        drain_nodes(node_names)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the nodes back to schedulable
        schedule_nodes(node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example 14
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes
    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, pvc_factory, pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get the osd associated node name
        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"{osd_node_name} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(f"{osd_node_name} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Check the pods should be in running state
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
            if ('-1-deploy' not in pod_obj.name
                    and 'ocs-deviceset' not in pod_obj.name):
                try:
                    wait_for_resource_state(resource=pod_obj,
                                            state=constants.STATUS_RUNNING,
                                            timeout=200)
                # The 'rook-ceph-crashcollector' pod on the failed node gets
                # stuck in Pending state. BZ 1810014 tracks it.
                # As a workaround, ignore the 'rook-ceph-crashcollector' pod
                # health check and delete its deployment so that the pod
                # disappears. Revert this WA once the BZ is fixed
                except ResourceWrongStatusException:
                    if 'rook-ceph-crashcollector' in pod_obj.name:
                        ocp_obj = ocp.OCP()
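                        # The pod name ends with '-<replicaset-hash>-<suffix>';
                        # trimming the last 17 characters is assumed here to be
                        # enough to recover the crashcollector deployment name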
                        name = pod_obj.name[:-17]
                        command = f"delete deployment {name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example 15
class TestDiskFailures(ManageTest):
    """
    Test class for detach and attach worker volume

    """
    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in status NotReady, for situations in
        which the test failed before restarting the node after detach volume,
        which leaves nodes in NotReady

        """
        def finalizer():
            not_ready_nodes = [
                n for n in node.get_node_objs() if n
                .ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
            ]
            logger.warning(
                f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
            )
            if not_ready_nodes:
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()
        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @aws_platform_required
    @pytest.mark.polarion_id("OCS-1085")
    def test_detach_attach_worker_volume(self, nodes, pvc_factory, pod_factory):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be unhealthy)
          by creating resources and running IO
        - Wait for the volumes to be re-attached back to the worker node
        - Restart the node so the volume will get re-mounted

        """
        # Get a data volume
        data_volume = nodes.get_data_volumes()[0]
        # Get the worker node according to the volume attachment
        worker = nodes.get_node_by_attached_volume(data_volume)

        # Detach volume (logging is done inside the function)
        nodes.detach_volume(data_volume, worker)

        # Validate cluster is still functional
        # In case the node whose volume was detached is the one running the
        # ceph tools pod, we'll need to wait for a new ct pod to start.
        # For that, a function that connects to the ct pod is used to check
        # if it's alive
        assert wait_for_ct_pod_recovery(), "Ceph tools pod failed to come up on another node"

        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Wait for worker volume to be re-attached automatically to the node
        assert nodes.wait_for_volume_attach(data_volume), (
            "Volume failed to be re-attached to a worker node"
        )

        # Restart the instance so the volume will get re-mounted
        nodes.restart_nodes([worker])

        # Cluster health check
        # W/A: For the investigation of BZ 1825675, timeout is increased to see if cluster
        # becomes healthy eventually
        # TODO: Remove 'tries=100'
        self.sanity_helpers.health_check(tries=100)

    @aws_platform_required
    @pytest.mark.polarion_id("OCS-1086")
    def test_detach_attach_2_data_volumes(self, nodes, pvc_factory, pod_factory):
        """
        Detach and attach disk from 2 worker nodes

        - Detach 2 of the data volumes from their worker nodes
        - Wait for the volumes to be re-attached back to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Get 2 data volumes
        data_volumes = nodes.get_data_volumes()[:2]
        workers_and_volumes = [
            {'worker': nodes.get_node_by_attached_volume(vol), 'volume': vol}
            for vol in data_volumes
        ]

        for worker_and_volume in workers_and_volumes:
            # Detach the volume (logging is done inside the function)
            nodes.detach_volume(
                worker_and_volume['volume'], worker_and_volume['worker']
            )

        for worker_and_volume in workers_and_volumes:
            # Wait for worker volume to be re-attached automatically to the node
            assert nodes.wait_for_volume_attach(worker_and_volume['volume']), (
                f"Volume {worker_and_volume['volume']} "
                f"failed to be re-attached to a worker node"
            )

        # Restart the instances so the volume will get re-mounted
        nodes.restart_nodes(
            [worker_and_volume['worker'] for worker_and_volume in workers_and_volumes]
        )

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @bugzilla('1830702')
    @vsphere_platform_required
    @pytest.mark.polarion_id("OCS-2172")
    def test_recovery_from_volume_deletion(self, nodes, pvc_factory, pod_factory):
        """
        Test cluster recovery from disk deletion from the platform side.
        Based on documented procedure detailed in
        https://bugzilla.redhat.com/show_bug.cgi?id=1787236#c16

        """
        logger.info("Picking a PV which will be deleted from the platform side")
        osd_pvs = get_deviceset_pvs()
        osd_pv = random.choice(osd_pvs)
        osd_pv_name = osd_pv.name
        # get the claim name
        logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
        claim_name = osd_pv.get().get('spec').get('claimRef').get('name')
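        # The PV's spec.claimRef holds the name of the deviceset PVC bound to
        # it; that claim name is used below to locate the matching OSD pod,
        # prepare job and deployment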

        # Get the backing volume name
        logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
        backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

        # Get the corresponding PVC
        logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
        osd_pvcs = get_deviceset_pvcs()
        osd_pvcs_count = len(osd_pvcs)
        osd_pvc = [ds for ds in osd_pvcs if ds.get().get('metadata').get('name') == claim_name][0]

        # Get the corresponding OSD pod
        logger.info(f"Getting the corresponding OSD pod of PVC {osd_pvc.name}")
        osd_pods = get_osd_pods()
        osd_pods_count = len(osd_pods)
        osd_pod = [
            osd_pod for osd_pod in osd_pods if osd_pod.get()
            .get('metadata').get('labels').get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]

        # Get the node that has the OSD pod running on
        logger.info(f"Getting the node that has the OSD pod {osd_pod.name} running on")
        osd_node = get_pod_node(osd_pod)
        volume_size = osd_pvc.size
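        # The PVC size captured above is used later to create the replacement
        # volume with the same capacity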
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            pod for pod in osd_prepare_pods if pod.get().get('metadata')
            .get('labels').get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        osd_prepare_job_name = osd_prepare_pod.get().get('metadata').get('labels').get('job-name')
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

        # Get the corresponding OSD deployment
        logger.info(f"Getting the corresponding OSD deployment for OSD PVC {claim_name}")
        osd_deployment = [
            deployment for deployment in get_osd_deployments()
            if deployment.get().get('metadata').get('labels').get(
                constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]

        # Delete the volume from the platform side
        logger.info(f"Deleting volume {backing_volume} from the platform side")
        nodes.detach_volume(backing_volume, osd_node)

        # Delete the OSD deployment
        osd_deployment_name = osd_deployment.name
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(resource_name=osd_deployment_name, timeout=120)

        # Delete the OSD prepare job
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(resource_name=osd_prepare_job_name, timeout=120)

        # Delete the OSD PVC
        osd_pvc_name = osd_pvc.name
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Recreate a volume from the platform side
        logger.info("Creating a replacing volume from the platform side")
        nodes.create_and_attach_volume(osd_node, volume_size)

        # Delete the rook ceph operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
        rook_operator_pod.delete()
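        # Once restarted, the operator is expected to reconcile the
        # CephCluster and recreate the OSD PVC, prepare job and OSD pod
        # against the newly attached disk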

        timeout = 600
        # Wait for OSD PVC to get created and reach Bound state
        logger.info("Waiting for a new OSD PVC to get created and reach Bound state")
        assert osd_pvc.ocp.wait_for_resource(
            timeout=timeout, condition=constants.STATUS_BOUND, selector=constants.OSD_PVC_GENERIC_LABEL,
            resource_count=osd_pvcs_count
        ), (
            f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
            f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
        )
        # Wait for OSD pod to get created and reach Running state
        logger.info("Waiting for a new OSD pod to get created and reach Running state")
        assert osd_pod.ocp.wait_for_resource(
            timeout=timeout, condition=constants.STATUS_RUNNING, selector=constants.OSD_APP_LABEL,
            resource_count=osd_pods_count
        ), (
            f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
            f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
        )

        # Validate cluster is still functional
        self.sanity_helpers.health_check(tries=80)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
Example 16
class TestNodesMaintenance(ManageTest):
    """
    Test basic flows of maintenance (unschedule and drain) and
    activate operations, followed by cluster functionality and health checks

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def health_checker(self):
        """
        Check Ceph health

        """
        try:
            status = ceph_health_check_base()
            if status:
                log.info("Health check passed")
        except CephHealthException as e:
            # skip because ceph is not in good health
            pytest.skip(str(e))

    @tier1
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'],
                         marks=pytest.mark.polarion_id("OCS-1269")),
            pytest.param(*['master'],
                         marks=pytest.mark.polarion_id("OCS-1272"))
        ])
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain)
        drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier4
    @tier4b
    @aws_platform_required
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'],
                         marks=pytest.mark.polarion_id("OCS-1292")),
            pytest.param(*['master'],
                         marks=[
                             pytest.mark.polarion_id("OCS-1293"),
                             bugzilla('1754287')
                         ])
        ])
    def test_node_maintenance_restart_activate(self, nodes, pvc_factory,
                                               pod_factory, node_type):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain). The function contains logging
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=True)

        wait_for_nodes_status(node_names=[typed_node_name],
                              status=constants.NODE_READY_SCHEDULING_DISABLED)
        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    @tier3
    @pytest.mark.parametrize(
        argnames=["nodes_type"],
        argvalues=[
            pytest.param(*['worker'],
                         marks=pytest.mark.polarion_id("OCS-1273")),
            pytest.param(*['master'],
                         marks=pytest.mark.polarion_id("OCS-1271"))
        ])
    def test_2_nodes_maintenance_same_type(self, nodes_type):
        """
        OCS-1273/OCS-1271:
        - Try draining 2 nodes of the same type - should fail
        - Check cluster and Ceph health

        """
        # Get 2 nodes
        typed_nodes = get_typed_nodes(node_type=nodes_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {nodes_type} node for the test"

        typed_node_names = [typed_node.name for typed_node in typed_nodes]

        # Try draining 2 nodes - should fail
        try:
            drain_nodes(typed_node_names)
        except TimeoutExpired:
            log.info(
                f"Draining of nodes {typed_node_names} failed as expected")

        schedule_nodes(typed_node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier2
    @pytest.mark.polarion_id("OCS-1274")
    def test_2_nodes_different_types(self, pvc_factory, pod_factory):
        """
        OCS-1274:
        - Maintenance (mark as unschedulable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node from each type
        nodes = [
            get_typed_nodes(node_type=node_type, num_of_nodes=1)[0]
            for node_type in ['worker', 'master']
        ]
        assert nodes, f"Failed to find a nodes for the test"

        node_names = [typed_node.name for typed_node in nodes]

        # Maintenance the nodes (unschedule and drain)
        drain_nodes(node_names)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the nodes back to schedulable
        schedule_nodes(node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier4
    @tier4b
    @aws_platform_required
    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(*['rbd'], marks=pytest.mark.polarion_id("OCS-2128")),
            pytest.param(*['cephfs'],
                         marks=pytest.mark.polarion_id("OCS-2129")),
        ])
    def test_simultaneous_drain_of_two_ocs_nodes(self, pvc_factory,
                                                 pod_factory, dc_pod_factory,
                                                 interface):
        """
        OCS-2128/OCS-2129:
        - Create PVCs and start IO on DC based app pods
        - Add one extra node in two of the AZs and label the nodes
          with OCS storage label
        - Maintenance (mark as unschedulable and drain) 2 worker nodes
          simultaneously
        - Confirm that OCS and DC pods are in running state
        - Remove unscheduled nodes
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Check cluster and Ceph health

        """
        # Get OSD running nodes
        osd_running_worker_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_worker_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_worker_nodes,
                          label_key='dc',
                          label_value='fedora')
        log.info("Successfully labeled worker nodes with {dc:fedora}")

        # Create DC app pods
        log.info("Creating DC based app pods and starting IO in background")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == 'rbd' else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get the machine name using the node name
        machine_names = [
            machine.get_machine_from_node_name(osd_running_worker_node)
            for osd_running_worker_node in osd_running_worker_nodes[:2]
        ]
        log.info(f"{osd_running_worker_nodes} associated "
                 f"machine are {machine_names}")

        # Get the machineset name using machine name
        machineset_names = [
            machine.get_machineset_from_machine_name(machine_name)
            for machine_name in machine_names
        ]
        log.info(f"{osd_running_worker_nodes} associated machineset "
                 f"is {machineset_names}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_names[0])
        add_new_node_and_label_it(machineset_names[1])

        # Drain 2 nodes
        drain_nodes(osd_running_worker_nodes[:2])

        # Check the pods should be in running state
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
            if ('-1-deploy' not in pod_obj.name
                    and 'ocs-deviceset' not in pod_obj.name):
                try:
                    helpers.wait_for_resource_state(
                        resource=pod_obj,
                        state=constants.STATUS_RUNNING,
                        timeout=200)
                except ResourceWrongStatusException:
                    # The 'rook-ceph-crashcollector' pod on the failed node
                    # gets stuck in Pending state. BZ 1810014 tracks it.
                    # As a workaround, ignore the 'rook-ceph-crashcollector'
                    # pod health check and delete its deployment so that the
                    # pod disappears. Revert this WA once the BZ is fixed
                    if 'rook-ceph-crashcollector' in pod_obj.name:
                        ocp_obj = ocp.OCP(
                            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                        pod_name = pod_obj.name
                        deployment_name = '-'.join(pod_name.split("-")[:-2])
                        command = f"delete deployment {deployment_name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")

        # DC app pods on the drained node will get automatically created on other
        # running node in same AZ. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        # Remove unscheduled nodes
        # In scenarios where the drain is attempted on >3 worker setup,
        # post completion of drain we are removing the unscheduled nodes so
        # that we maintain 3 worker nodes.
        log.info(f"Removing scheduled nodes {osd_running_worker_nodes[:2]}")
        remove_node_objs = get_node_objs(osd_running_worker_nodes[:2])
        remove_nodes(remove_node_objs)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()


class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes - Reactive
    """
    threads = []

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")
            for thread in self.threads:
                thread.join()
            ceph_health_check()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(*['rbd', 'shutdown'],
                         marks=[
                             pytest.mark.polarion_id("OCS-2102"),
                             pytest.mark.bugzilla("1830015")
                         ]),
            pytest.param(*['rbd', 'terminate'],
                         marks=pytest.mark.polarion_id("OCS-2103")),
            pytest.param(*['cephfs', 'shutdown'],
                         marks=[
                             pytest.mark.polarion_id("OCS-2104"),
                             pytest.mark.bugzilla("1830015")
                         ]),
            pytest.param(*['cephfs', 'terminate'],
                         marks=pytest.mark.polarion_id("OCS-2105")),
        ])
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
            self, nodes, pvc_factory, pod_factory, failure, dc_pod_factory,
            interface):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)
        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: "
                     f"{failure_node_obj[0].name}")
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully terminated node : "
                     f"{failure_node_obj[0].name} instance")

        try:
            # DC app pods on the failed node will get automatically created on other
            # running node. Waiting for all dc app pod to reach running state
            pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                            timeout=720)
            log.info("All the dc pods reached running state")
            pod.wait_for_storage_pods()

        except ResourceWrongStatusException:
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(f"Successfully terminated node : "
                         f"{failure_node_obj[0].name} instance")
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example 18
class TestNodeReplacement(ManageTest):
    """
    Knip-894 Node replacement - AWS-IPI-Proactive

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive(self, pvc_factory, pod_factory,
                                       dc_pod_factory):
        """
        Knip-894 Node Replacement proactive

        """

        # Get worker nodes
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        log.info("Creating dc pod backed with rbd pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                rbd_dc_pod = dc_pod_factory(interface=constants.CEPHBLOCKPOOL,
                                            node_name=worker_node,
                                            size=20)
                pod.run_io_in_bg(rbd_dc_pod,
                                 expect_to_fail=False,
                                 fedora_dc=True)

        log.info("Creating dc pod backed with cephfs pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                cephfs_dc_pod = dc_pod_factory(
                    interface=constants.CEPHFILESYSTEM,
                    node_name=worker_node,
                    size=20)
                pod.run_io_in_bg(cephfs_dc_pod,
                                 expect_to_fail=False,
                                 fedora_dc=True)

        # Unscheduling node
        node.unschedule_nodes([osd_node_name])
        # Draining Node
        node.drain_nodes([osd_node_name])
        log.info("Getting machine name from specified node name")
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"Node {osd_node_name} associated machine is {machine_name}")
        log.info(
            f"Deleting machine {machine_name} and waiting for new machine to come up"
        )
        machine.delete_machine_and_check_state_of_new_spinned_machine(
            machine_name)
        new_machine_list = machine.get_machines()
        for machines in new_machine_list:
            # Trimming is done to get just machine name
            # eg:- machine_name:- prsurve-40-ocs-43-kbrvf-worker-us-east-2b-nlgkr
            # After trimming:- prsurve-40-ocs-43-kbrvf-worker-us-east-2b
            if re.match(machines.name[:-6], machine_name):
                new_machine_name = machines.name
        machineset_name = machine.get_machineset_from_machine_name(
            new_machine_name)
        log.info("Waiting for new worker node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)
        new_node_name = node.get_node_from_machine_name(new_machine_name)
        log.info("Adding ocs label to newly created worker node")
        node_obj = ocp.OCP(kind='node')
        node_obj.add_label(resource_name=new_node_name,
                           label=constants.OPERATOR_NODE_LABEL)
        log.info(
            f"Successfully labeled {new_node_name} with OCS storage label")
        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()
        # Verify everything running fine
        log.info(
            "Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check()
Example 19
class TestNodeReplacement(ManageTest):
    """
    Knip-894 Node replacement - AWS-IPI-Proactive

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive(self, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-894 Node Replacement proactive

        """

        # Get worker nodes
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        log.info("Creating dc pod backed with rbd pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                rbd_dc_pod = dc_pod_factory(interface=constants.CEPHBLOCKPOOL, node_name=worker_node, size=20)
                pod.run_io_in_bg(rbd_dc_pod, expect_to_fail=False, fedora_dc=True)

        log.info("Creating dc pod backed with cephfs pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                cephfs_dc_pod = dc_pod_factory(interface=constants.CEPHFILESYSTEM, node_name=worker_node, size=20)
                pod.run_io_in_bg(cephfs_dc_pod, expect_to_fail=False, fedora_dc=True)

        if config.ENV_DATA['platform'].lower() == constants.AWS_PLATFORM:
            if config.ENV_DATA['deployment_type'] == 'ipi':
                node.delete_and_create_osd_node_aws_ipi(osd_node_name)

            elif config.ENV_DATA['deployment_type'] == 'upi':
                node.delete_and_create_osd_node_aws_upi(osd_node_name)
            else:
                pytest.fail(
                    f"ocs-ci config 'deployment_type' value '{config.ENV_DATA['deployment_type']}' is not valid, "
                    f"results of this test run are all invalid.")

        elif config.ENV_DATA['platform'].lower() == constants.VSPHERE_PLATFORM:
            pytest.skip("Skipping add node in Vmware platform due to "
                        "https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
                        )

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()
        # Verify everything running fine
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=30)
Example 20
class TestDiskFailures(ManageTest):
    """
    Test class for detach and attach worker volume

    """
    def detach_volume_and_wait_for_attach(self, nodes, data_volume,
                                          worker_node):
        """
         Detach an EBS volume from an AWS instance and wait for the volume
         to be re-attached

         Args:
             node (OCS): The OCS object representing the node
             data_volume (Volume): The ec2 volume to delete
             worker_node (OCS): The OCS object of the EC2 instance

         """
        try:
            # Detach volume (logging is done inside the function)
            nodes.detach_volume(data_volume, worker_node)
        except AWSTimeoutException as e:
            if "Volume state: in-use" in e:
                logger.info(
                    f"Volume {data_volume} re-attached successfully to worker"
                    f" node {worker_node}")
            else:
                raise
        else:
            """
            Wait for worker volume to be re-attached automatically
            to the node
            """
            assert nodes.wait_for_volume_attach(data_volume), (
                f"Volume {data_volume} failed to be re-attached to worker "
                f"node {worker_node}")

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in status NotReady, for situations in
        which the test failed before restarting the node after detach volume,
        which leaves nodes in NotReady

        """
        def finalizer():
            not_ready_nodes = [
                n for n in node.get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_NOT_READY
            ]
            logger.warning(
                f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
            )
            if not_ready_nodes:
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

            # Restart the node if an OSD pod is stuck in CrashLoopBackOff.
            # The container 'state' field is a dict; a pod in CLBO has a
            # 'waiting' entry whose reason matches constants.STATUS_CLBO
            osd_pods_obj_list = get_osd_pods()
            for osd_pod in osd_pods_obj_list:
                container_state = osd_pod.get().get('status').get(
                    'containerStatuses')[0].get('state', {})
                if container_state.get('waiting', {}).get(
                        'reason') == constants.STATUS_CLBO:
                    node_obj = get_pod_node(osd_pod)
                    nodes.restart_nodes([node_obj])
                    node.wait_for_nodes_status([node_obj.name])

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @aws_platform_required
    @pytest.mark.polarion_id("OCS-1085")
    @bugzilla('1825675')
    def test_detach_attach_worker_volume(self, nodes, pvc_factory,
                                         pod_factory):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Wait for the volumes to be re-attached back to the worker node
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be
          unhealthy) by creating resources and running IO
        - Restart the node so the volume will get re-mounted

        """
        # Get a data volume
        data_volume = nodes.get_data_volumes()[0]
        # Get the worker node according to the volume attachment
        worker = nodes.get_node_by_attached_volume(data_volume)

        # Detach volume and wait for the volume to attach
        self.detach_volume_and_wait_for_attach(nodes, data_volume, worker)

        # Validate cluster is still functional
        # In case the node whose volume was detached is the one running the
        # ceph tools pod, we'll need to wait for a new ct pod to start.
        # For that, a function that connects to the ct pod is used to check
        # if it's alive
        assert wait_for_ct_pod_recovery(), (
            "Ceph tools pod failed to come up on another node")

        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Restart the instance so the volume will get re-mounted
        nodes.restart_nodes([worker])

        # Cluster health check
        # W/A: For the investigation of BZ 1825675, timeout is increased to see if cluster
        # becomes healthy eventually
        # TODO: Remove 'tries=100'
        self.sanity_helpers.health_check(tries=100)

    @aws_platform_required
    @pytest.mark.polarion_id("OCS-1086")
    def test_detach_attach_2_data_volumes(self, nodes, pvc_factory,
                                          pod_factory):
        """
        Detach and attach disk from 2 worker nodes

        - Detach 2 of the data volumes from their worker nodes
        - Wait for the volumes to be re-attached back to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Get 2 data volumes
        data_volumes = nodes.get_data_volumes()[:2]
        workers_and_volumes = [{
            'worker': nodes.get_node_by_attached_volume(vol),
            'volume': vol
        } for vol in data_volumes]
        for worker_and_volume in workers_and_volumes:
            # Detach volume and wait for the volume to attach
            self.detach_volume_and_wait_for_attach(nodes,
                                                   worker_and_volume['volume'],
                                                   worker_and_volume['worker'])
        # Restart the instances so the volume will get re-mounted
        nodes.restart_nodes([
            worker_and_volume['worker']
            for worker_and_volume in workers_and_volumes
        ])

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @bugzilla('1830702')
    @vsphere_platform_required
    @pytest.mark.polarion_id("OCS-2172")
    def test_recovery_from_volume_deletion(self, nodes, pvc_factory,
                                           pod_factory):
        """
        Test cluster recovery from disk deletion from the platform side.
        Based on documented procedure detailed in
        https://bugzilla.redhat.com/show_bug.cgi?id=1823183

        """
        logger.info("Picking a PV which to be deleted from the platform side")
        osd_pvs = get_deviceset_pvs()
        osd_pv = random.choice(osd_pvs)
        osd_pv_name = osd_pv.name
        # get the claim name
        logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
        claim_name = osd_pv.get().get('spec').get('claimRef').get('name')

        # Get the backing volume name
        logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
        backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

        # Get the corresponding PVC
        logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
        osd_pvcs = get_deviceset_pvcs()
        osd_pvcs_count = len(osd_pvcs)
        osd_pvc = [
            ds for ds in osd_pvcs
            if ds.get().get('metadata').get('name') == claim_name
        ][0]

        # Get the corresponding OSD pod and ID
        logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
        osd_pods = get_osd_pods()
        osd_pods_count = len(osd_pods)
        osd_pod = [
            osd_pod for osd_pod in osd_pods
            if osd_pod.get().get('metadata').get('labels').get(
                constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        logger.info(f"OSD_POD {osd_pod.name}")
        osd_id = osd_pod.get().get('metadata').get('labels').get('ceph-osd-id')

        # Get the node that the OSD pod is running on
        logger.info(
            f"Getting the node that the OSD pod {osd_pod.name} is running on")
        osd_node = get_pod_node(osd_pod)
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            pod for pod in osd_prepare_pods if pod.get().get('metadata').get(
                'labels').get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        osd_prepare_job_name = osd_prepare_pod.get().get('metadata').get(
            'labels').get('job-name')
        osd_prepare_job = get_job_obj(osd_prepare_job_name)
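        # The prepare job is resolved through its pod's 'job-name' label so it
        # can be deleted later as part of the recovery flow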

        # Get the corresponding OSD deployment
        logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
        osd_deployment = [
            deployment for deployment in get_osd_deployments()
            if deployment.get().get('metadata').get('labels').get(
                constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        osd_deployment_name = osd_deployment.name

        # Delete the volume from the platform side
        logger.info(f"Deleting {backing_volume} from the platform side")
        nodes.detach_volume(backing_volume, osd_node)

        # Scale down OSD deployment
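        # Scaling to 0 stops the OSD pod and prevents the deployment from
        # recreating it while the failed OSD is removed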
        logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
        ocp.OCP().exec_oc_cmd(
            f"scale --replicas=0 deployment/{osd_deployment_name}")

        # Force delete OSD pod if necessary
        osd_pod_name = osd_pod.name
        logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
        try:
            osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
        except TimeoutError:
            osd_pod.delete(force=True)
            osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

        # Run ocs-osd-removal job
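        # The ocs-osd-removal template is processed with the failed OSD ID and
        # the resulting Job purges that OSD from the Ceph cluster; the purge is
        # verified below through the job pod's logs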
        logger.info(f"Executing OSD removal job on OSD-{osd_id}")
        osd_removal_job_yaml = ocp.OCP(
            namespace=config.ENV_DATA['cluster_namespace']).exec_oc_cmd(
                f"process ocs-osd-removal"
                f" -p FAILED_OSD_ID={osd_id} -o yaml")
        osd_removal_job = OCS(**osd_removal_job_yaml)
        osd_removal_job.create(do_reload=False)

        # Get ocs-osd-removal pod name
        logger.info("Getting the ocs-osd-removal pod name")
        osd_removal_pod_name = get_pod_name_by_pattern(
            f"ocs-osd-removal-{osd_id}")[0]
        osd_removal_pod_obj = get_pod_obj(osd_removal_pod_name,
                                          namespace='openshift-storage')
        osd_removal_pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_COMPLETED,
            resource_name=osd_removal_pod_name)

        # Verify OSD removal from the ocs-osd-removal pod logs
        logger.info(
            f"Verifying removal of OSD from {osd_removal_pod_name} pod logs")
        logs = get_pod_logs(osd_removal_pod_name)
        pattern = f"purged osd.{osd_id}"
        assert re.search(pattern, logs)

        # Delete the OSD prepare job
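        # Per the recovery procedure referenced in the docstring, the old
        # prepare job (and, below, the old PVC and deployment) must be removed
        # so that the operator can create fresh ones for the replacement disk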
        logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(resource_name=osd_prepare_job_name,
                                            timeout=120)

        # Delete the OSD PVC
        osd_pvc_name = osd_pvc.name
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Delete the OSD deployment
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(resource_name=osd_deployment_name,
                                           timeout=120)

        # Delete PV
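        # The PV is expected to disappear on its own once its PVC is gone; if
        # it is still present after the default timeout, delete it explicitly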
        logger.info(f"Verifying deletion of PV {osd_pv_name}")
        try:
            osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
        except TimeoutError:
            osd_pv.delete()
            osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

        # Delete the rook ceph operator pod to trigger reconciliation
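        # On restart the operator reconciles the cluster and is expected to
        # provision a replacement OSD (new PVC, prepare job, deployment and
        # pod), which the waits at the end of the test verify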
        rook_operator_pod = get_operator_pods()[0]
        logger.info(
            f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
        rook_operator_pod.delete()

        # Delete the OSD removal job
        logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
        osd_removal_job = get_job_obj(f"ocs-osd-removal-{osd_id}")
        osd_removal_job.delete()
        osd_removal_job.ocp.wait_for_delete(
            resource_name=f"ocs-osd-removal-{osd_id}")

        timeout = 600
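        # Allow up to 10 minutes for the replacement OSD PVC and pod to appear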
        # Wait for OSD PVC to get created and reach Bound state
        logger.info(
            "Waiting for a new OSD PVC to get created and reach Bound state")
        assert osd_pvc.ocp.wait_for_resource(
            timeout=timeout,
            condition=constants.STATUS_BOUND,
            selector=constants.OSD_PVC_GENERIC_LABEL,
            resource_count=osd_pvcs_count
        ), (f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
            f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
            )
        # Wait for OSD pod to get created and reach Running state
        logger.info(
            "Waiting for a new OSD pod to get created and reach Running state")
        assert osd_pod.ocp.wait_for_resource(
            timeout=timeout,
            condition=constants.STATUS_RUNNING,
            selector=constants.OSD_APP_LABEL,
            resource_count=osd_pods_count
        ), (f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
            f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
            )

        # Validate cluster is still functional
        self.sanity_helpers.health_check(tries=80)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)