Example no. 1
    def validate_cluster(
        self,
        cluster_check=False,
        node_status=False,
        pod_status=False,
        operation_name="",
    ):
        """
        Validates various ceph and ocs cluster checks

        Args:
            node_status (bool): Verifies node is Ready
            pod_status (bool): Verifies StorageCluster pods in expected state
            operation_name (str): Name of the operation, to Tag

        """
        logger.info(f"{operation_name}: Verifying cluster health")
        assert ceph_health_check(
            defaults.ROOK_CLUSTER_NAMESPACE, tries=100
        ), "Entry criteria FAILED: Cluster is Unhealthy"
        if cluster_check:
            self.sanity_helpers.health_check(tries=100)
        if node_status:
            logger.info(f"{operation_name}: Verifying whether node is ready")
            wait_for_nodes_status(status=constants.NODE_READY, timeout=300)
        if pod_status:
            logger.info(
                f"{operation_name}: Verifying StorageCluster pods are in running/completed state"
            )
            wait_for_storage_pods()
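
    # A hedged usage sketch (not part of the original class): one way validate_cluster
    # above might be wrapped around a disruptive operation. The method name and the
    # restart helper call are illustrative assumptions based on the tests below.
    def _example_validate_after_reboot(self, nodes, node_list):
        nodes.restart_nodes(node_list, wait=True)
        self.validate_cluster(
            cluster_check=True,
            node_status=True,
            pod_status=True,
            operation_name="Node reboot",
        )
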
    def test_registry_rolling_reboot_node(self, node_type, nodes):
        """
        Test registry workload when backed by OCS and reboot node one by one
        """

        # Get the node list
        node_list = get_nodes(node_type)

        # Pull and push images to registries
        log.info("Pull and push images to registries")
        image_pull_and_push(project_name=self.project_name)

        # Validate image exists in registries path
        validate_image_exists()

        for node in node_list:

            # Reboot node
            log.info(f"Rebooting node {node.name}")
            nodes.restart_nodes([node], wait=False)

            # Wait some time after rebooting node
            waiting_time = 40
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate all nodes and services are in READY state and up
            retry(
                (
                    CommandFailed,
                    TimeoutError,
                    AssertionError,
                    ResourceWrongStatusException,
                ),
                tries=60,
                delay=15,
            )(wait_for_cluster_connectivity)(tries=400)
            retry(
                (
                    CommandFailed,
                    TimeoutError,
                    AssertionError,
                    ResourceWrongStatusException,
                ),
                tries=60,
                delay=15,
            )(wait_for_nodes_status)(timeout=900)

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check(tries=40)

        # Validate storage pods are running
        wait_for_storage_pods()

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate image exists in registries path
        validate_image_exists()
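
# Illustrative sketch only: a minimal stand-in for the retry() factory used in the
# tests above and below, to clarify the call pattern
# retry(exceptions, tries, delay)(func)(*args, **kwargs). This is an assumption for
# readability, not the real retry helper these tests import.
import time


def simple_retry(exceptions, tries=3, delay=1):
    def decorator(func):
        def wrapper(*args, **kwargs):
            last_err = None
            for _ in range(tries):
                try:
                    return func(*args, **kwargs)
                except exceptions as err:
                    last_err = err
                    time.sleep(delay)
            raise last_err

        return wrapper

    return decorator


# Usage mirroring the tests, e.g.:
# simple_retry((CommandFailed, TimeoutError), tries=60, delay=15)(wait_for_nodes_status)(timeout=900)
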
    def test_rgw_host_node_failure(
        self, nodes, node_restart_teardown, mcg_obj, bucket_factory
    ):
        """
        Test case to fail node where RGW and Noobaa-db-0 hosting
        and verify new pod spuns on healthy node

        """
        # Get rgw pods
        rgw_pod_obj = get_rgw_pods()

        # Get noobaa pods
        noobaa_pod_obj = get_noobaa_pods()

        # Get the node where noobaa-db is hosted
        for noobaa_pod in noobaa_pod_obj:
            if noobaa_pod.name == "noobaa-db-0":
                noobaa_pod_node = get_pod_node(noobaa_pod)

        for rgw_pod in rgw_pod_obj:
            pod_node = rgw_pod.get().get("spec").get("nodeName")
            if pod_node == noobaa_pod_node.name:
                # Stop the node
                log.info(
                    f"Stopping node {pod_node} where"
                    f" rgw pod {rgw_pod.name} and noobaa-db-0 hosted"
                )
                node_obj = get_node_objs(node_names=[pod_node])
                nodes.stop_nodes(node_obj)

                # Validate the old rgw pod went into Terminating state
                wait_for_resource_state(
                    resource=rgw_pod, state=constants.STATUS_TERMINATING, timeout=720
                )

                # Validate a new rgw pod spun up
                ocp_obj = OCP(
                    kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
                )
                ocp_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    resource_count=len(rgw_pod_obj),
                    selector=constants.RGW_APP_LABEL,
                )

                # Create an OBC and verify read and write
                # (see the create_obc_creation sketch after this test)
                self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-1")

                # Start the node
                nodes.start_nodes(node_obj)

                # Create an OBC and verify read and write
                self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-2")

        # Verify cluster health
        self.sanity_helpers.health_check()

        # Verify all storage pods are running
        wait_for_storage_pods()
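
    # Hypothetical sketch of what a helper like create_obc_creation() used above might
    # do: create an OBC-backed bucket and verify basic S3 I/O against it. The body and
    # the s3_put_object/s3_get_object calls follow usage seen elsewhere in these tests;
    # treat this as an assumption, not the original implementation.
    def create_obc_creation(self, bucket_factory, mcg_obj, key):
        # Create one OBC-backed bucket
        obc_bucket = bucket_factory(amount=1, interface="OC")[0].name

        # Write a small object and read it back to confirm the bucket is usable
        assert s3_put_object(
            mcg_obj, obc_bucket, key, "Sample object data"
        ), "Failed: PutObject"
        assert s3_get_object(mcg_obj, obc_bucket, key), "Failed: GetObject"
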
Example no. 4
    def test_registry_shutdown_and_recovery_node(self, nodes):
        """
        Test registry workload when backed by OCS and
        its impact when node is shutdown and recovered

        """

        # Pull and push images to registries
        log.info("Pull and push images to registries")
        image_pull_and_push(
            project_name=self.project_name,
            template="eap-cd-basic-s2i",
            image="registry.redhat.io/jboss-eap-7-tech-preview/eap-cd-openshift-rhel8:latest",
            pattern="eap-app",
        )

        # Get the node list
        node_list = get_nodes(node_type="worker")

        for node in node_list:

            # Stop node
            nodes.stop_nodes(nodes=[node])

            # Validate node reached NotReady state
            wait_for_nodes_status(node_names=[node.name],
                                  status=constants.NODE_NOT_READY)

            # Start node
            nodes.start_nodes(nodes=[node])

            # Validate all nodes are in READY state and up
            retry(
                (
                    CommandFailed,
                    TimeoutError,
                    AssertionError,
                    ResourceWrongStatusException,
                ),
                tries=60,
                delay=15,
            )(wait_for_nodes_status)(timeout=900)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check(tries=40)

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate image exists in registries path
        validate_image_exists(namespace=self.project_name)
Example no. 5
def test_storage_pods_running(multiregion_mirror_setup_session):
    """
    Test that all pods from openshift-storage namespace have status Running
    or Completed after upgrade is completed.

    multiregion_mirror_setup_session fixture is present during this test to
    make sure that NooBaa backing stores from other upgrade tests were
    not yet deleted. This is done to test scenario from BZ 1823775.

    """
    # wait_for_storage_pods fails the test if any pod is not in the expected state
    wait_for_storage_pods(timeout=10)
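
# Illustrative, framework-independent sketch (an assumption, not the real
# wait_for_storage_pods implementation): the check above essentially verifies that
# every pod in the openshift-storage namespace reports phase Running or Succeeded
# (Completed). This version uses the standard kubernetes Python client instead of
# the ocs-ci helpers.
from kubernetes import client, config


def storage_pods_healthy(namespace="openshift-storage"):
    config.load_kube_config()
    pods = client.CoreV1Api().list_namespaced_pod(namespace).items
    return all(p.status.phase in ("Running", "Succeeded") for p in pods)
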
Example no. 6
    def test_rolling_reboot_node(self, node_type):
        """
        Test to rolling reboot of nodes
        """

        # Get info from SCALE_DATA_FILE for validation
        if os.path.exists(SCALE_DATA_FILE):
            file_data = templating.load_yaml(SCALE_DATA_FILE)
            namespace = file_data.get("NAMESPACE")
            pod_scale_list = file_data.get("POD_SCALE_LIST")
            pvc_scale_list = file_data.get("PVC_SCALE_LIST")
        else:
            raise FileNotFoundError(f"Scale data file {SCALE_DATA_FILE} not found")

        node_list = list()

        # Rolling reboot nodes
        if node_type == constants.WORKER_MACHINE:
            tmp_list = get_nodes(node_type=node_type)
            ocs_node_list = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
            for tmp in tmp_list:
                if tmp.name in ocs_node_list:
                    node_list.append(tmp)
        else:
            node_list = get_nodes(node_type=node_type)

        factory = platform_nodes.PlatformNodesFactory()
        nodes = factory.get_nodes_platform()

        for node in node_list:
            nodes.restart_nodes(nodes=[node])
            scale_lib.validate_node_and_oc_services_are_up_after_reboot()

        # Validate storage pods are running
        wait_for_storage_pods()

        # Validate cluster health ok and all pods are running
        assert utils.ceph_health_check(
            delay=180
        ), "Ceph health in bad state after node reboots"

        # Validate all PVCs from namespace are in Bound state
        assert scale_lib.validate_all_pvcs_and_check_state(
            namespace=namespace, pvc_scale_list=pvc_scale_list
        )

        # Validate all PODs from namespace are up and running
        assert scale_lib.validate_all_pods_and_check_state(
            namespace=namespace, pod_scale_list=pod_scale_list
        )
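
# Assumed shape of the SCALE_DATA_FILE consumed above (illustrative only; the real
# file is produced by the scale setup and may carry additional keys):
#
#   NAMESPACE: scale-namespace
#   POD_SCALE_LIST: [pod-1, pod-2]
#   PVC_SCALE_LIST: [pvc-1, pvc-2]
#
# A minimal way to produce such a file for local experiments:
import yaml


def write_scale_data_file(path, namespace, pod_list, pvc_list):
    data = {
        "NAMESPACE": namespace,
        "POD_SCALE_LIST": pod_list,
        "PVC_SCALE_LIST": pvc_list,
    }
    with open(path, "w") as f:
        yaml.safe_dump(data, f)
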
    def test_registry_reboot_node(self, node_type, nodes):
        """
        Test registry workload when backed by OCS and reboot node
        """

        # Get the node list
        node = get_nodes(node_type, num_of_nodes=1)

        # Pull and push images to registries
        log.info("Pull and push images to registries")
        image_pull_and_push(
            project_name=self.project_name,
            template="eap-cd-basic-s2i",
            image="registry.redhat.io/jboss-eap-7-tech-preview/eap-cd-openshift-rhel8:latest",
            pattern="eap-app",
        )

        # Validate image exists in registries path
        validate_image_exists(namespace=self.project_name)

        # Reboot one node
        nodes.restart_nodes(node, wait=False)

        # Validate all nodes and services are in READY state and up
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_cluster_connectivity)(tries=400)
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_nodes_status)(timeout=900)

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check(tries=40)

        # Validate storage pods are running
        wait_for_storage_pods()

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate image exists in registries path
        validate_image_exists(namespace=self.project_name)

    def test_restore_ceph_mon_quorum(
        self, pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    ):
        """
        Test Procedure:

        1) With all the preconditions met, take mons out of quorum
        2) Follow the procedure mentioned in
           https://access.redhat.com/solutions/5898541 to recover the quorum
        3) Check for data access and integrity of block,
           object and file based data - No DU/DL/DC
        4) Make sure basic functionality working fine

        """

        # Take mons out of the quorum and confirm it
        (
            self.mon_pod_obj_list,
            mon_pod_running,
            ceph_mon_daemon_id,
        ) = induce_mon_quorum_loss()

        # Recover mon quorum
        recover_mon_quorum(self.mon_pod_obj_list, mon_pod_running, ceph_mon_daemon_id)

        # Validate storage pods are running
        wait_for_storage_pods()

        # Remove crash list from ceph health
        log.info("Silence the ceph warnings by “archiving” the crash")
        tool_pod = get_ceph_tools_pod()
        tool_pod.exec_ceph_cmd(ceph_cmd="ceph crash archive-all", format=None)
        log.info("Removed ceph crash warnings. Check for ceph and cluster health")

        # Validate cluster health
        self.sanity_helpers.health_check(tries=40)

        # ToDo: Common system test case validation: Check for data integrity and corruption after mon recovery

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        # Deleting Resources
        self.sanity_helpers.delete_resources()
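
# Optional follow-up check (a sketch based on the exec_ceph_cmd usage above, not part
# of the original test): after recovery, the tools pod can confirm that all monitors
# rejoined the quorum. The expectation of 3 mons is an assumption for a default
# deployment.
def verify_mon_quorum_size(expected_mons=3):
    tool_pod = get_ceph_tools_pod()
    # "ceph quorum_status" returns JSON that includes the list of mon ranks in quorum
    quorum = tool_pod.exec_ceph_cmd(ceph_cmd="ceph quorum_status")
    assert len(quorum["quorum"]) == expected_mons, (
        f"Expected {expected_mons} mons in quorum, got: {quorum.get('quorum_names')}"
    )
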
Example no. 9
    def test_delete_local_volume_sym_link(self):
        """
        Delete sym link on LSO Cluster
        """
        # Get rook-ceph-crashcollector pod objects
        crashcollector_pods = get_pod_name_by_pattern(
            pattern="rook-ceph-crashcollector",
            namespace=ROOK_CLUSTER_NAMESPACE)
        crashcollector_pods_objs = []
        for crashcollector_pod in crashcollector_pods:
            crashcollector_pods_objs.append(
                get_pod_obj(name=crashcollector_pod,
                            namespace=ROOK_CLUSTER_NAMESPACE))

        # Get Node object
        node_obj = get_pod_node(pod_obj=crashcollector_pods_objs[0])

        # Get Sym link
        osd_pvcs = get_deviceset_pvcs()
        pv_name = osd_pvcs[0].data["spec"]["volumeName"]
        ocp_obj = ocp.OCP(namespace=ROOK_CLUSTER_NAMESPACE, kind=constants.PV)
        pv_obj = ocp_obj.get(resource_name=pv_name)
        path = pv_obj["spec"]["local"]["path"]

        log.info("Delete sym link")
        oc_cmd = ocp.OCP(namespace=ROOK_CLUSTER_NAMESPACE)
        cmd = f"rm -rfv {path}"
        oc_cmd.exec_oc_debug_cmd(node=node_obj.name, cmd_list=[cmd])

        log.info(
            "Waiting for rook-ceph-crashcollector pods to reach Running state"
        )
        for crashcollector_pods_obj in crashcollector_pods_objs:
            wait_for_resource_state(resource=crashcollector_pods_obj,
                                    state=constants.STATUS_RUNNING)

        # Check all OCS pods status, they should be in Running or Completed state
        wait_for_storage_pods()

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
Example no. 10
    def test_noobaa_db_backup_and_recovery(
        self,
        pvc_factory,
        pod_factory,
        snapshot_factory,
        bucket_factory,
        rgw_bucket_factory,
        noobaa_db_backup_and_recovery,
    ):
        """
        Test case to verify noobaa backup and recovery

        1. Take a snapshot of the db-noobaa-db-0 PVC and restore it to a new PVC
        2. Scale down the noobaa-db statefulset
        3. Get the yaml of the current PVC, db-noobaa-db-0, and
           change the persistentVolumeReclaimPolicy of the restored PV to Retain
        4. Delete both PVCs; the PV for the original claim db-noobaa-db-0 will be removed.
           The PV for claim db-noobaa-db-0-snapshot-restore will move to 'Released'
        5. Edit the restored PV again and remove the claimRef section.
           The volume will transition to Available.
        6. Edit the yaml db-noobaa-db-0.yaml and change the volumeName setting to the restored PV.
        7. Scale up the statefulset again; the pod should be running

        """
        noobaa_db_backup_and_recovery(snapshot_factory=snapshot_factory)

        # Verify all storage pods are running
        wait_for_storage_pods()

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        # Deleting Resources
        self.sanity_helpers.delete_resources()

        # Verify everything running fine
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)

    def test_automated_recovery_from_failed_nodes_IPI_reactive(
            self, nodes, pvc_factory, pod_factory, failure, dc_pod_factory,
            interface):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)
        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: "
                     f"{failure_node_obj[0].name}")
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully terminated node : "
                     f"{failure_node_obj[0].name} instance")

        try:
            # DC app pods on the failed node will get automatically created on another
            # running node. Waiting for all dc app pods to reach running state
            pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                            timeout=720)
            log.info("All the dc pods reached running state")
            pod.wait_for_storage_pods()

        except ResourceWrongStatusException:
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(f"Successfully terminated node : "
                         f"{failure_node_obj[0].name} instance")
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example no. 12
    def test_monitor_recovery(
        self,
        dc_pod_factory,
        mcg_obj,
        bucket_factory,
    ):
        """
        Verifies Monitor recovery procedure as per:
        https://access.redhat.com/documentation/en-us/red_hat_openshift_container_storage/4.8/html/troubleshooting_openshift_container_storage/restoring-the-monitor-pods-in-openshift-container-storage_rhocs

        """
        # Initialize mon recovery class
        mon_recovery = MonitorRecovery()

        logger.info("Corrupting ceph monitors by deleting store.db")
        corrupt_ceph_monitors()

        logger.info("Backing up all the deployments")
        mon_recovery.backup_deployments()
        dep_revert, mds_revert = mon_recovery.deployments_to_revert()

        logger.info("Starting the monitor recovery procedure")
        logger.info("Scaling down rook and ocs operators")
        mon_recovery.scale_rook_ocs_operators(replica=0)

        logger.info(
            "Preparing script and patching OSDs to remove LivenessProbe and sleep to infinity"
        )
        mon_recovery.prepare_monstore_script()
        mon_recovery.patch_sleep_on_osds()
        switch_to_project(constants.OPENSHIFT_STORAGE_NAMESPACE)

        logger.info("Getting mon-store from OSDs")
        mon_recovery.run_mon_store()

        logger.info("Patching MONs to sleep infinitely")
        mon_recovery.patch_sleep_on_mon()

        logger.info("Updating initial delay on all monitors")
        update_mon_initial_delay()

        logger.info("Generating monitor map command using the IPs")
        mon_map_cmd = generate_monmap_cmd()

        logger.info("Getting ceph keyring from ocs secrets")
        mon_recovery.get_ceph_keyrings()

        logger.info("Rebuilding Monitors to recover store db")
        mon_recovery.monitor_rebuild(mon_map_cmd)

        logger.info("Reverting mon, osd and mgr deployments")
        mon_recovery.revert_patches(dep_revert)

        logger.info("Scaling back rook and ocs operators")
        mon_recovery.scale_rook_ocs_operators(replica=1)

        logger.info("Recovering CephFS")
        mon_recovery.scale_rook_ocs_operators(replica=0)
        logger.info(
            "Patching MDSs to remove LivenessProbe and setting sleep to infinity"
        )
        mon_recovery.patch_sleep_on_mds()
        logger.info("Resetting the fs")
        ceph_fs_recovery()
        logger.info("Reverting MDS deployments")
        mon_recovery.revert_patches(mds_revert)
        logger.info("Scaling back rook and ocs operators")
        mon_recovery.scale_rook_ocs_operators(replica=1)
        logger.info("Recovering mcg by re-spinning the pods")
        recover_mcg()
        remove_global_id_reclaim()
        for pod_obj in self.dc_pods:
            pod_obj.delete(force=True)
        new_md5_sum = []
        logger.info("Verifying md5sum of files after recovery")
        for pod_obj in get_spun_dc_pods(self.dc_pods):
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=600,
                sleep=10,
            )
            new_md5_sum.append(pod.cal_md5sum(pod_obj, self.filename))
        logger.info(f"Md5sum calculated after recovery: {new_md5_sum}")
        if collections.Counter(new_md5_sum) == collections.Counter(self.md5sum):
            logger.info(
                f"Verified: md5sum of {self.filename} on pods matches with the original md5sum"
            )
        else:
            assert False, f"Data corruption found {new_md5_sum} and {self.md5sum}"
        logger.info("Getting object after recovery")
        assert bucket_utils.s3_get_object(
            s3_obj=mcg_obj,
            bucketname=self.bucket_name,
            object_key=self.object_key,
        ), "Failed: GetObject"

        # New pvc, dc pods, obcs
        new_dc_pods = [
            dc_pod_factory(
                interface=constants.CEPHBLOCKPOOL,
            ),
            dc_pod_factory(
                interface=constants.CEPHFILESYSTEM,
            ),
        ]
        for pod_obj in new_dc_pods:
            pod_obj.exec_cmd_on_pod(command=self.dd_cmd)
        logger.info("Creating new bucket and write object")
        new_bucket = bucket_factory(interface="OC")[0].name
        assert bucket_utils.s3_put_object(
            s3_obj=mcg_obj,
            bucketname=new_bucket,
            object_key=self.object_key,
            data=self.object_data,
        ), "Failed: PutObject"
        wait_for_storage_pods()
        logger.info("Archiving the ceph crash warnings")
        tool_pod = get_ceph_tools_pod()
        tool_pod.exec_ceph_cmd(ceph_cmd="ceph crash archive-all", format=None)
        self.sanity_helpers.health_check(tries=10)

    def test_node_replacement_reactive_aws_ipi(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        dc_pod_factory,
        failure,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Knip-894 Node replacement - AWS-IPI-Reactive

        """
        # Get worker nodes
        initial_nodes = get_worker_nodes()

        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key="dc",
                          label_value="fedora")

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == "rbd":
            interface = constants.CEPHBLOCKPOOL
        elif interface == "cephfs":
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={"dc": "fedora"})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "power off":
            # Power off AWS worker node instance
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully powered off node: {failure_node_obj[0].name}")
        elif failure == "network failure":
            # Induce Network failure
            node_network_failure([failure_node_obj[0].name])

        # Add annotation to the failed node
        annotation = "machine.openshift.io/exclude-node-draining=''"
        machine.add_annotation_to_machine(annotation=annotation,
                                          machine_name=machine_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Wait for the new machine to spin
        log.info("Waiting for the new node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)

        # Get the node name of new spun node
        nodes_after_new_spun_node = get_worker_nodes()
        new_spun_node = list(
            set(nodes_after_new_spun_node) - set(initial_nodes))
        log.info(f"New spun node is {new_spun_node}")

        # Label it
        node_obj = ocp.OCP(kind="node")
        node_obj.add_label(resource_name=new_spun_node[0],
                           label=constants.OPERATOR_NODE_LABEL)
        log.info(
            f"Successfully labeled {new_spun_node} with OCS storage label")

        # DC app pods on the failed node will get automatically created on another
        # running node. Waiting for all dc app pods to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                        timeout=1200)
        log.info("All the dc pods reached running state")

        pod.wait_for_storage_pods()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    def test_check_pod_status_after_two_nodes_shutdown_recovery(
            self, nodes, node_restart_teardown):
        """
        Test case to check MDS pods rbd and cephfs plugin Provisioner
        pods not running on same node post shutdown and recovery node

        """

        # Get MDS, rbd, cephfs plugin provisioner pods running nodes
        # before shutdown

        log.info("Check pod nodes before nodes shutdown")
        list_of_nodes_running_pods(selector="rook-ceph-mds")

        list_of_nodes_running_pods(selector="csi-rbdplugin-provisioner")

        list_of_nodes_running_pods(selector="csi-cephfsplugin-provisioner")

        # Get the node list
        node = get_nodes(node_type="worker", num_of_nodes=2)

        # Shutdown 2 worker nodes for 10 mins
        nodes.stop_nodes(nodes=node)

        waiting_time = 600
        log.info(f"Waiting for {waiting_time} seconds")
        time.sleep(waiting_time)

        nodes.start_nodes(nodes=node)

        # Validate all nodes are in READY state and up
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=30,
            delay=15,
        )(wait_for_nodes_status)(timeout=1800)

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check()
        wait_for_storage_pods()

        # Get MDS, rbd & cephfs plugin provisioner pods running
        # nodes post-recovery
        mds_running_nodes_after_recovery = list_of_nodes_running_pods(
            selector="rook-ceph-mds")

        rbd_provisioner_running_nodes_after_recovery = list_of_nodes_running_pods(
            selector="csi-rbdplugin-provisioner")

        cephfs_provisioner_running_nodes_after_recovery = list_of_nodes_running_pods(
            selector="csi-cephfsplugin-provisioner")

        assert len(set(mds_running_nodes_after_recovery)) == len(
            mds_running_nodes_after_recovery
        ), "MDS running on same node, Not expected!!!"
        log.info("MDS pods not running on same node")

        assert len(set(rbd_provisioner_running_nodes_after_recovery)) == len(
            rbd_provisioner_running_nodes_after_recovery
        ), "rbd plugin provisioner pods running on Same node, Not expected"
        log.info("RBD plugin provisioner pods not running on same node")

        assert len(
            set(cephfs_provisioner_running_nodes_after_recovery)
        ) == len(
            cephfs_provisioner_running_nodes_after_recovery
        ), "cephfs plugin provisioner pods running on Same node, Not expected"
        log.info("CEPHFS plugin provisioner pods not running on same node")
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        failure,
        dc_pod_factory,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key="dc",
                          label_value="fedora")

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == "rbd":
            interface = constants.CEPHBLOCKPOOL
        elif interface == "cephfs":
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={"dc": "fedora"})
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        new_ocs_node_names = add_new_node_and_label_it(machineset_name)
        failure_domain = get_failure_domain()
        log.info("Wait for the nodes racks or zones to appear...")
        wait_for_nodes_racks_or_zones(failure_domain, new_ocs_node_names)

        new_ocs_node = get_node_objs(new_ocs_node_names)[0]
        osd_node_in_same_rack_or_zone = get_another_osd_node_in_same_rack_or_zone(
            failure_domain, new_ocs_node, common_nodes)
        # Get the failure node obj
        failure_node_obj = get_node_objs([osd_node_in_same_rack_or_zone.name])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: "
                     f"{failure_node_obj[0].name}")
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully terminated node : "
                     f"{failure_node_obj[0].name} instance")

        try:
            # DC app pods on the failed node will get automatically created on another
            # running node. Waiting for all dc app pods to reach running state
            pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                            timeout=720)
            log.info("All the dc pods reached running state")
            pod.wait_for_storage_pods(timeout=300)

        except ResourceWrongStatusException:
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(f"Successfully terminated node : "
                         f"{failure_node_obj[0].name} instance")
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        if config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
            tries = 200
        else:
            tries = 40

        self.sanity_helpers.health_check(tries=tries)
Example no. 16
    def test_mcg_namespace_disruptions_crd(
        self,
        mcg_obj,
        cld_mgr,
        awscli_pod,
        bucketclass_dict,
        bucket_factory,
        node_drain_teardown,
    ):
        """
        Test MCG namespace disruption flow

        1. Create NS resources with CRDs
        2. Create NS bucket with CRDs
        3. Upload to NS bucket
        4. Delete noobaa related pods and verify integrity of objects
        5. Create public access policy on NS bucket and verify Get op
        6. Drain nodes containing noobaa pods and verify integrity of objects
        7. Perform put operation to validate public access denial
        8. Edit/verify and remove objects on NS bucket

        """
        data = "Sample string content to write to a S3 object"
        object_key = "ObjKey-" + str(uuid.uuid4().hex)
        awscli_node_name = awscli_pod.get()["spec"]["nodeName"]

        aws_s3_creds = {
            "access_key_id": cld_mgr.aws_client.access_key,
            "access_key": cld_mgr.aws_client.secret_key,
            "endpoint": constants.MCG_NS_AWS_ENDPOINT,
            "region": config.ENV_DATA["region"],
        }

        # S3 account details
        user_name = "nb-user" + str(uuid.uuid4().hex)
        email = user_name + "@mail.com"

        logger.info("Setting up test files for upload, to the bucket/resources")
        setup_base_objects(awscli_pod, MCG_NS_ORIGINAL_DIR, MCG_NS_RESULT_DIR, amount=3)

        # Create the namespace resource and verify health
        ns_buc = bucket_factory(
            amount=1,
            interface=bucketclass_dict["interface"],
            bucketclass=bucketclass_dict,
        )[0]
        ns_bucket = ns_buc.name

        aws_target_bucket = ns_buc.bucketclass.namespacestores[0].uls_name

        logger.info(f"Namespace bucket: {ns_bucket} created")

        logger.info(f"Uploading objects to ns bucket: {ns_bucket}")
        sync_object_directory(
            awscli_pod,
            src=MCG_NS_ORIGINAL_DIR,
            target=f"s3://{ns_bucket}",
            s3_obj=mcg_obj,
        )

        for pod_to_respin in self.labels_map:
            logger.info(f"Re-spinning mcg resource: {self.labels_map[pod_to_respin]}")
            pod_obj = pod.Pod(
                **pod.get_pods_having_label(
                    label=self.labels_map[pod_to_respin],
                    namespace=defaults.ROOK_CLUSTER_NAMESPACE,
                )[0]
            )

            pod_obj.delete(force=True)

            assert pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=self.labels_map[pod_to_respin],
                resource_count=1,
                timeout=300,
            )

            logger.info(
                f"Downloading objects from ns bucket: {ns_bucket} "
                f"after re-spinning: {self.labels_map[pod_to_respin]}"
            )
            sync_object_directory(
                awscli_pod,
                src=f"s3://{ns_bucket}",
                target=MCG_NS_RESULT_DIR,
                s3_obj=mcg_obj,
            )

            logger.info(
                f"Verifying integrity of objects "
                f"after re-spinning: {self.labels_map[pod_to_respin]}"
            )
            compare_directory(
                awscli_pod, MCG_NS_ORIGINAL_DIR, MCG_NS_RESULT_DIR, amount=3
            )

        # S3 account
        user = NoobaaAccount(mcg_obj, name=user_name, email=email, buckets=[ns_bucket])
        logger.info(f"Noobaa account: {user.email_id} with S3 access created")

        # Admin sets Public access policy(*)
        bucket_policy_generated = gen_bucket_policy(
            user_list=["*"],
            actions_list=["GetObject"],
            resources_list=[f'{ns_bucket}/{"*"}'],
        )
        bucket_policy = json.dumps(bucket_policy_generated)

        logger.info(
            f"Creating bucket policy on bucket: {ns_bucket} with wildcard (*) Principal"
        )
        put_policy = put_bucket_policy(mcg_obj, ns_bucket, bucket_policy)
        logger.info(f"Put bucket policy response from Admin: {put_policy}")

        logger.info(f"Getting bucket policy on bucket: {ns_bucket}")
        get_policy = get_bucket_policy(mcg_obj, ns_bucket)
        logger.info(f"Got bucket policy: {get_policy['Policy']}")

        # MCG admin writes an object to bucket
        logger.info(f"Writing object on bucket: {ns_bucket} by admin")
        assert s3_put_object(mcg_obj, ns_bucket, object_key, data), "Failed: PutObject"

        # Verifying whether Get operation is allowed to any S3 user
        logger.info(
            f"Get object action on namespace bucket: {ns_bucket} "
            f"with user: {user.email_id}"
        )
        assert s3_get_object(user, ns_bucket, object_key), "Failed: GetObject"

        # Upload files to NS target
        logger.info(
            f"Uploading objects directly to ns resource target: {aws_target_bucket}"
        )
        sync_object_directory(
            awscli_pod,
            src=MCG_NS_ORIGINAL_DIR,
            target=f"s3://{aws_target_bucket}",
            signed_request_creds=aws_s3_creds,
        )

        for pod_to_drain in self.labels_map:
            pod_obj = pod.Pod(
                **pod.get_pods_having_label(
                    label=self.labels_map[pod_to_drain],
                    namespace=defaults.ROOK_CLUSTER_NAMESPACE,
                )[0]
            )

            # Retrieve the node name on which the pod resides
            node_name = pod_obj.get()["spec"]["nodeName"]

            if awscli_node_name == node_name:
                logger.info(
                    f"Skipping node drain since aws cli pod node: "
                    f"{awscli_node_name} is same as {pod_to_drain} "
                    f"pod node: {node_name}"
                )
                continue

            # Drain the node
            drain_nodes([node_name])
            wait_for_nodes_status(
                [node_name], status=constants.NODE_READY_SCHEDULING_DISABLED
            )
            schedule_nodes([node_name])
            wait_for_nodes_status(timeout=300)

            # Retrieve the new pod
            pod_obj = pod.Pod(
                **pod.get_pods_having_label(
                    label=self.labels_map[pod_to_drain],
                    namespace=defaults.ROOK_CLUSTER_NAMESPACE,
                )[0]
            )
            wait_for_resource_state(pod_obj, constants.STATUS_RUNNING, timeout=120)

            # Verify all storage pods are running
            wait_for_storage_pods()

            logger.info(
                f"Downloading objects from ns bucket: {ns_bucket} "
                f"after draining node: {node_name} with pod {pod_to_drain}"
            )
            sync_object_directory(
                awscli_pod,
                src=f"s3://{ns_bucket}",
                target=MCG_NS_RESULT_DIR,
                s3_obj=mcg_obj,
            )

            logger.info(
                f"Verifying integrity of objects "
                f"after draining node with pod: {pod_to_drain}"
            )
            compare_directory(
                awscli_pod, MCG_NS_ORIGINAL_DIR, MCG_NS_RESULT_DIR, amount=3
            )

        logger.info(f"Editing the namespace resource bucket: {ns_bucket}")
        namespace_bucket_update(
            mcg_obj,
            bucket_name=ns_bucket,
            read_resource=[aws_target_bucket],
            write_resource=aws_target_bucket,
        )

        logger.info(f"Verifying object download after edit on ns bucket: {ns_bucket}")
        sync_object_directory(
            awscli_pod,
            src=f"s3://{ns_bucket}",
            target=MCG_NS_RESULT_DIR,
            s3_obj=mcg_obj,
        )

        # Verifying whether Put object action is denied
        logger.info(
            f"Verifying whether user: {user.email_id} has only public read access"
        )

        logger.info(f"Removing objects from ns bucket: {ns_bucket}")
        rm_object_recursive(awscli_pod, target=ns_bucket, mcg_obj=mcg_obj)
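
# Sketch of the kind of policy document gen_bucket_policy() above is expected to
# produce for a wildcard-Principal GetObject rule. The exact field layout is an
# assumption based on standard S3 bucket policies, not the verified ocs-ci output;
# the bucket name is a placeholder.
example_public_read_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {"AWS": ["*"]},
            "Action": ["s3:GetObject"],
            "Resource": ["example-ns-bucket/*"],
        }
    ],
}
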
Example no. 17
    def test_noobaa_db_backup_and_recovery(
        self,
        pvc_factory,
        pod_factory,
        snapshot_factory,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Test case to verify noobaa backup and recovery

        1. Take a snapshot of the db-noobaa-db-0 PVC and restore it to a new PVC
        2. Scale down the noobaa-db statefulset
        3. Get the yaml of the current PVC, db-noobaa-db-0, and
           change the persistentVolumeReclaimPolicy of the restored PV to Retain
        4. Delete both PVCs; the PV for the original claim db-noobaa-db-0 will be removed.
           The PV for claim db-noobaa-db-0-snapshot-restore will move to 'Released'
        5. Edit the restored PV again and remove the claimRef section.
           The volume will transition to Available.
        6. Edit the yaml db-noobaa-db-0.yaml and change the volumeName setting to the restored PV.
        7. Scale up the statefulset again; the pod should be running

        """

        # Initialise variable
        self.noobaa_db_sst_name = "noobaa-db-pg"

        # Get noobaa pods before execution
        noobaa_pods = get_noobaa_pods()

        # Get noobaa PVC before execution
        noobaa_pvc_obj = get_pvc_objs(pvc_names=["db-noobaa-db-pg-0"])
        noobaa_pv_name = noobaa_pvc_obj[0].get("spec").get("spec").get(
            "volumeName")

        # Take snapshot db-noobaa-db-0 PVC
        log.info(f"Creating snapshot of the {noobaa_pvc_obj[0].name} PVC")
        snap_obj = snapshot_factory(
            pvc_obj=noobaa_pvc_obj[0],
            wait=True,
            snapshot_name=f"{noobaa_pvc_obj[0].name}-snapshot",
        )
        log.info(
            f"Successfully created snapshot {snap_obj.name}, which is in Ready state"
        )

        # Restore it to PVC
        log.info(f"Restoring snapshot {snap_obj.name} to create new PVC")
        sc_name = noobaa_pvc_obj[0].get().get("spec").get("storageClassName")
        pvc_size = (noobaa_pvc_obj[0].get().get("spec").get("resources").get(
            "requests").get("storage"))
        self.restore_pvc_obj = create_restore_pvc(
            sc_name=sc_name,
            snap_name=snap_obj.name,
            namespace=snap_obj.namespace,
            size=pvc_size,
            pvc_name=f"{snap_obj.name}-restore",
            volume_mode=snap_obj.parent_volume_mode,
            access_mode=snap_obj.parent_access_mode,
        )
        wait_for_resource_state(self.restore_pvc_obj, constants.STATUS_BOUND)
        self.restore_pvc_obj.reload()
        log.info(f"Succeesfuly created PVC {self.restore_pvc_obj.name} "
                 f"from snapshot {snap_obj.name}")

        # Scale down the statefulset noobaa-db
        assert modify_statefulset_replica_count(
            statefulset_name=self.noobaa_db_sst_name, replica_count=0
        ), f"Failed to scale down the statefulset {self.noobaa_db_sst_name}"

        # Get the noobaa-db PVC
        pvc_obj = OCP(kind=constants.PVC,
                      namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        noobaa_pvc_yaml = pvc_obj.get(resource_name=noobaa_pvc_obj[0].name)

        # Get the restored noobaa PVC and
        # change the parameter persistentVolumeReclaimPolicy to Retain
        restored_noobaa_pvc_obj = get_pvc_objs(
            pvc_names=[f"{snap_obj.name}-restore"])
        restored_noobaa_pv_name = (restored_noobaa_pvc_obj[0].get("spec").get(
            "spec").get("volumeName"))
        pv_obj = OCP(kind=constants.PV,
                     namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        params = '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}'
        assert pv_obj.patch(
            resource_name=restored_noobaa_pv_name, params=params), (
                "Failed to change the parameter persistentVolumeReclaimPolicy"
                f" to Retain {restored_noobaa_pv_name}")

        # Delete both PVCs
        delete_pvcs(pvc_objs=[noobaa_pvc_obj[0], restored_noobaa_pvc_obj[0]])

        # Validate the PV of the original claim was removed
        assert validate_pv_delete(
            pv_name=noobaa_pv_name
        ), f"PV not deleted, still exist {noobaa_pv_name}"

        # Validate PV for claim db-noobaa-db-0-snapshot-restore is in Released state
        pv_obj.wait_for_resource(condition=constants.STATUS_RELEASED,
                                 resource_name=restored_noobaa_pv_name)

        # Edit the restored PV again and remove the claimRef section
        log.info(
            f"Removing the claimRef section from PV {restored_noobaa_pv_name}")
        params = '[{"op": "remove", "path": "/spec/claimRef"}]'
        pv_obj.patch(resource_name=restored_noobaa_pv_name,
                     params=params,
                     format_type="json")
        log.info(
            f"Successfully removed claimRef section from PV {restored_noobaa_pv_name}"
        )

        # Validate PV is in Available state
        pv_obj.wait_for_resource(condition=constants.STATUS_AVAILABLE,
                                 resource_name=restored_noobaa_pv_name)

        # Edit the yaml db-noobaa-db-0.yaml and change the
        # volumeName setting to the restored PV
        noobaa_pvc_yaml["spec"]["volumeName"] = restored_noobaa_pv_name
        noobaa_pvc_yaml = OCS(**noobaa_pvc_yaml)
        noobaa_pvc_yaml.create()

        # Validate noobaa PVC is in bound state
        pvc_obj.wait_for_resource(
            condition=constants.STATUS_BOUND,
            resource_name=noobaa_pvc_obj[0].name,
            timeout=120,
        )

        # Scale up the statefulset again
        assert modify_statefulset_replica_count(
            statefulset_name=self.noobaa_db_sst_name, replica_count=1
        ), f"Failed to scale up the statefulset {self.noobaa_db_sst_name}"

        # Validate noobaa pod is up and running
        pod_obj = OCP(kind=constants.POD,
                      namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_count=len(noobaa_pods),
            selector=constants.NOOBAA_APP_LABEL,
        )

        # Change the parameter persistentVolumeReclaimPolicy to Delete again
        params = '{"spec":{"persistentVolumeReclaimPolicy":"Delete"}}'
        assert pv_obj.patch(
            resource_name=restored_noobaa_pv_name, params=params), (
                "Failed to change the parameter persistentVolumeReclaimPolicy"
                f" to Delete {restored_noobaa_pv_name}")
        log.info(
            "Changed the parameter persistentVolumeReclaimPolicy to Delete again"
        )

        # Verify all storage pods are running
        wait_for_storage_pods()

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()

        # Verify everything running fine
        log.info(
            "Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)

    def test_del_mon_svc(
        self, multi_pvc_factory, validate_all_mon_svc_are_up_at_teardown
    ):
        """
        Test to verify same mon comes up and running
        after deleting mon services manually and joins the quorum

        1. Delete the mon services
        2. Restart the rook operator
        3. Make sure all mon pods are running,
        and same service or endpoints are running
        4. Make sure ceph health Ok and storage pods are running
        5. Create PVC, should succeeded.

        """

        self.sanity_helpers = Sanity()

        # Get all mon services
        mon_svc_before = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )

        # Get all mon pods
        mon_pods = get_mon_pods()

        # Delete the mon services one by one
        svc_obj = OCP(
            kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        mon_svc_ip_before = []
        for svc in mon_svc_before:
            svc_name = svc["metadata"]["name"]
            mon_svc_ip_before.append(svc["spec"]["clusterIP"])
            log.info(f"Delete mon service {svc_name}")
            svc_obj.delete(resource_name=svc_name)
            # Verify mon services deleted
            svc_obj.wait_for_delete(resource_name=svc_name)

        # Restart the rook-operator pod
        operator_pod_obj = get_operator_pods()
        delete_pods(pod_objs=operator_pod_obj)
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING, selector=constants.OPERATOR_LABEL
        )

        # Verify same mon services are created again
        for svc in mon_svc_before:
            svc_name = svc["metadata"]["name"]
            svc_obj.check_resource_existence(
                should_exist=True, timeout=300, resource_name=svc_name
            )
        log.info("Same old mon services are recreated")

        # Validate all mons are running
        log.info("Validate all mons are up and running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=len(mon_pods),
            timeout=600,
            sleep=3,
        )

        # Validate same mon services are running
        log.info("Validate same mon services are running")
        mon_svc_after = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        mon_svc_ip_after = [svc["spec"]["clusterIP"] for svc in mon_svc_after]
        assert len(set(mon_svc_ip_after) ^ set(mon_svc_ip_before)) == 0, (
            "Different mon services are running. "
            f"Before mon services list: {mon_svc_ip_before}, "
            f"After mon services list: {mon_svc_ip_after}"
        )
        log.info("Same old mon services are running and all mons are in running state")

        # Verify everything running fine
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Create and delete resources
        self.sanity_helpers.create_pvc_delete(multi_pvc_factory=multi_pvc_factory)
Example no. 19
    def test_pdb_check_simultaneous_node_drains(
        self,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
        node_drain_teardown,
    ):
        """
        - Check for OSD PDBs before drain
        - Maintenance (mark as unschedulable and drain) 2 worker nodes with a delay of 30 secs
        - Drain will be completed on worker node A
        - Drain will be pending on worker node B due to blocking PDBs
        - Check the OSD PDBs
        - Mark the node A as schedulable
        - Let drain finish on Node B
        - Mark the node B as schedulable
        - Check cluster and Ceph health

        """

        # Validate OSD PDBs before drain operation
        assert (not validate_existence_of_blocking_pdb()
                ), "Blocking PDBs exist, Can't perform drain"

        # Get 2 worker nodes to drain
        typed_nodes = get_nodes(num_of_nodes=2)
        assert len(
            typed_nodes) == 2, "Failed to find worker nodes for the test"
        node_A = typed_nodes[0].name
        node_B = typed_nodes[1].name

        # Drain Node A and validate blocking PDBs
        drain_nodes([node_A])
        assert (validate_existence_of_blocking_pdb()
                ), "Blocking PDBs not created post drain"

        # Inducing delay between 2 drains
        # Node-B drain expected to be in pending due to blocking PDBs
        time.sleep(30)
        try:
            drain_nodes([node_B])
        except TimeoutExpired:
            # Mark the node-A back to schedulable and let drain finish in Node-B
            schedule_nodes([node_A])

        time.sleep(40)

        # Validate OSD PDBs
        assert (validate_existence_of_blocking_pdb()
                ), "Blocking PDBs not created post second drain"

        # Mark the node-B back to schedulable and recover the cluster
        schedule_nodes([node_B])

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=validate_existence_of_blocking_pdb,
        )
        if not sample.wait_for_func_status(result=False):
            log.error("Blocking PDBs still exist")

        # wait for storage pods
        pod.wait_for_storage_pods()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=50)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()
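The waits in the drain test above go through ocs-ci's TimeoutSampler, which repeatedly calls a function until it returns the desired status or the timeout expires. A simplified, standard-library stand-in for that polling pattern is sketched below; the helper name and the commented usage are illustrative assumptions, not ocs-ci's implementation.

import time


def wait_for_func_status(func, expected, timeout=100, sleep=10):
    """Poll ``func`` every ``sleep`` seconds until it returns ``expected``
    or ``timeout`` seconds have elapsed. Returns True on success."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if func() == expected:
            return True
        time.sleep(sleep)
    return False


# Hypothetical usage mirroring the blocking-PDB wait above:
# wait_for_func_status(validate_existence_of_blocking_pdb, expected=False)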
    def test_all_worker_nodes_short_network_failure(self, nodes, setup,
                                                    node_restart_teardown):
        """
        OCS-1432/OCS-1433:
        - Start DeploymentConfig based app pods
        - Make all the worker nodes unresponsive by inducing an abrupt network failure
        - Reboot the unresponsive nodes after a short duration of ~300 seconds
        - When the unresponsive nodes recover, app pods and the Ceph cluster should recover
        - Run IOs from the app pods again
        """
        pod_objs = setup
        worker_nodes = node.get_worker_nodes()

        # Run IO on pods
        logger.info(f"Starting IO on {len(pod_objs)} app pods")
        with ThreadPoolExecutor() as executor:
            for pod_obj in pod_objs:
                logger.info(f"Starting IO on pod {pod_obj.name}")
                storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
                executor.submit(
                    pod_obj.run_io,
                    storage_type=storage_type,
                    size="2G",
                    runtime=30,
                    fio_filename=f"{pod_obj.name}_io_f1",
                )

        logger.info(f"IO started on all {len(pod_objs)} app pods")

        # Wait for IO results
        for pod_obj in pod_objs:
            pod.get_fio_rw_iops(pod_obj)

        # Induce network failure on all worker nodes
        with ThreadPoolExecutor() as executor:
            for node_name in worker_nodes:
                executor.submit(node.node_network_failure, node_name, False)

        node.wait_for_nodes_status(node_names=worker_nodes,
                                   status=constants.NODE_NOT_READY)

        logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
        sleep(self.short_nw_fail_time)

        # Reboot the worker nodes
        logger.info(f"Stop and start the worker nodes: {worker_nodes}")
        nodes.restart_nodes_by_stop_and_start(node.get_node_objs(worker_nodes))

        try:
            node.wait_for_nodes_status(node_names=worker_nodes,
                                       status=constants.NODE_READY)
            logger.info(
                "Verifying StorageCluster pods are in running/completed state")
            pod.wait_for_storage_pods(timeout=720)
        except ResourceWrongStatusException:
            # Restart nodes
            nodes.restart_nodes(node.get_node_objs(worker_nodes))

        assert ceph_health_check(tries=80), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

        # Get current info of app pods
        new_pod_objs = list()
        for pod_obj in pod_objs:
            pod_label = pod_obj.labels.get("deploymentconfig")
            pods_data = pod.get_pods_having_label(
                f"deploymentconfig={pod_label}", pod_obj.namespace)
            current_pods = [
                pod_data.get("metadata").get("name") for pod_data in pods_data
                if "-deploy" not in pod_data.get("metadata").get("name")
            ]
            logger.info(f"Pods with label {pod_label}: {current_pods}")

            # Remove the older pod from the list if pod is rescheduled
            if len(current_pods) > 1:
                current_pods.remove(pod_obj.name)

            new_pod_obj = pod.get_pod_obj(current_pods.pop(),
                                          pod_obj.namespace)
            new_pod_obj.pvc = pod_obj.pvc
            new_pod_objs.append(new_pod_obj)

        logger.info("Wait for app pods are in running state")
        for pod_obj in new_pod_objs:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=720,
                sleep=20,
            )
        logger.info("All the app pods reached running state")

        # Run more IOs on app pods
        with ThreadPoolExecutor() as executor:
            for pod_obj in new_pod_objs:
                logger.info(f"Starting IO on pod {pod_obj.name}")
                pod_obj.wl_setup_done = False
                storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
                executor.submit(
                    pod_obj.run_io,
                    storage_type=storage_type,
                    size="1G",
                    runtime=30,
                    fio_filename=f"{pod_obj.name}_io_f2",
                )

        for pod_obj in new_pod_objs:
            pod.get_fio_rw_iops(pod_obj)
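The pod re-resolution loop above finds the rescheduled replica by listing pods with the DeploymentConfig label, dropping the deployer pods, and removing the old pod name when more than one replica is present. A self-contained sketch of that filtering with made-up pod metadata:

# Pods carrying the label deploymentconfig=app-1 (names are made up).
pods_data = [
    {"metadata": {"name": "app-1-1-deploy"}},  # deployer pod, ignored
    {"metadata": {"name": "app-1-1-abcde"}},   # old replica
    {"metadata": {"name": "app-1-2-fghij"}},   # rescheduled replica
]
old_pod_name = "app-1-1-abcde"

# Keep only application pods (exclude the "-deploy" pods).
current_pods = [
    pod_data["metadata"]["name"]
    for pod_data in pods_data
    if "-deploy" not in pod_data["metadata"]["name"]
]

# If a new replica was scheduled, drop the old one from the list.
if len(current_pods) > 1:
    current_pods.remove(old_pod_name)

assert current_pods.pop() == "app-1-2-fghij"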
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, interface, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == 'rbd' else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        msg = "Common OSD and app running node(s) NOT found"
        assert (len(common_nodes) > 0), msg
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods on the failed node will automatically be recreated on
        # another running node. Wait for all DC app pods to reach running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        pod.wait_for_storage_pods()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
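The labelling step in the example above (label_worker_node with dc=fedora) is what lets the DC app pods land on the OSD nodes through a matching nodeSelector. Outside of the ocs-ci helpers, the step amounts to a plain `oc label` call; a hedged sketch assuming an authenticated `oc` client, with hypothetical node names:

import subprocess


def label_nodes(node_names, label_key="dc", label_value="fedora"):
    """Apply ``label_key=label_value`` to each node so that pods with a
    matching nodeSelector are scheduled onto them."""
    for name in node_names:
        subprocess.run(
            ["oc", "label", "node", name, f"{label_key}={label_value}", "--overwrite"],
            check=True,
        )


# Hypothetical usage: label_nodes(["worker-0", "worker-1"])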
    def test_pvc_creation_after_del_mon_services(self, interface, pod_factory):
        """
        1. Delete one mon service
        2. Edit the configmap rook-ceph-mon-endpoints and
           remove all the deleted mon service entries
        3. Delete deployment, pvc of deleted mon service
        4. Restart rook-ceph-operator
        5. Make sure all mon pods are running
        6. Make sure ceph health is OK and storage pods are running
        7. Sleep for 300 seconds before deleting another mon
        8. Repeat the above steps for all mons; at the
           end each mon should have a different endpoint
        9. Create PVC, should succeed.

        """

        pod_obj = pod_factory(interface=interface)
        run_io_in_bg(pod_obj)

        # Get all mon services
        mon_svc = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )

        # Get all mon pods
        mon_pods = get_mon_pods()
        mon_count = len(mon_pods)

        list_old_svc = []
        for svc in mon_svc:

            # Get rook-ceph-operator pod obj
            operator_pod_obj = get_operator_pods()
            operator_name = operator_pod_obj[0].name

            # Scale down rook-ceph-operator
            log.info("Scale down rook-ceph-operator")
            assert modify_deployment_replica_count(
                deployment_name="rook-ceph-operator", replica_count=0
            ), "Failed to scale down rook-ceph-operator to 0"
            log.info("Successfully scaled down rook-ceph-operator to 0")

            # Validate rook-ceph-operator pod not running
            POD_OBJ.wait_for_delete(resource_name=operator_name)

            svc_name = svc["metadata"]["name"]
            cluster_ip = svc["spec"]["clusterIP"]
            port = svc["spec"]["ports"][0]["port"]
            mon_endpoint = f"{cluster_ip}:{port}"
            mon_id = svc["spec"]["selector"]["mon"]
            list_old_svc.append(cluster_ip)

            # Delete deployment
            log.info("Delete mon deployments")
            del_obj = OCP(
                kind=constants.DEPLOYMENT,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            mon_info = del_obj.get(resource_name=svc_name)
            del_obj.delete(resource_name=svc_name)

            # Delete pvc
            if is_lso_cluster():
                mon_data_path = f"/var/lib/rook/mon-{mon_id}"
                mon_node = mon_info["spec"]["template"]["spec"]["nodeSelector"][
                    "kubernetes.io/hostname"
                ]
                log.info(f"Delete the directory `{mon_data_path}` from {mon_node}")
                cmd = f"rm -rf {mon_data_path}"
                ocp_obj = OCP(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
                ocp_obj.exec_oc_debug_cmd(node=mon_node, cmd_list=[cmd])
            else:
                log.info("Delete mon PVC")
                pvc_name = svc["metadata"]["labels"]["pvc_name"]
                pvc_obj = OCP(
                    kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
                )
                pvc_obj.delete(resource_name=pvc_name)

            # Delete the mon service
            log.info("Delete mon service")
            svc_obj = OCP(
                kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            svc_obj.delete(resource_name=svc_name)

            # Edit the cm
            log.info(f"Edit the configmap {constants.ROOK_CEPH_MON_ENDPOINTS}")
            configmap_obj = OCP(
                kind=constants.CONFIGMAP,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            output_get = configmap_obj.get(
                resource_name=constants.ROOK_CEPH_MON_ENDPOINTS
            )
            new_data = output_get["data"]
            new_data["csi-cluster-config-json"] = (
                new_data["csi-cluster-config-json"].replace(f'"{mon_endpoint}",', "")
                if new_data["csi-cluster-config-json"].find(f'"{mon_endpoint}",') != 1
                else new_data["csi-cluster-config-json"].replace(
                    f',"{mon_endpoint}"', ""
                )
            )
            new_data["data"] = ",".join(
                [
                    value
                    for value in new_data["data"].split(",")
                    if f"{mon_id}=" not in value
                ]
            )
            new_data["mapping"] = (
                new_data["mapping"].replace(f'"{mon_id}":null,', "")
                if new_data["mapping"].find(f'"{mon_id}":null,') != -1
                else new_data["mapping"].replace(f',"{mon_id}":null', "")
            )
            params = f'{{"data": {json.dumps(new_data)}}}'
            log.info(f"Removing {mon_id} entries from configmap")
            configmap_obj.patch(
                resource_name=constants.ROOK_CEPH_MON_ENDPOINTS,
                params=params,
                format_type="strategic",
            )
            log.info(
                f"Configmap {constants.ROOK_CEPH_MON_ENDPOINTS} edited successfully"
            )

            # Scale up rook-ceph-operator
            log.info("Scale up rook-ceph-operator")
            assert modify_deployment_replica_count(
                deployment_name="rook-ceph-operator", replica_count=1
            ), "Failed to scale up rook-ceph-operator to 1"
            log.info("Successfully scaled up rook-ceph-operator to 1")
            log.info("Validate rook-ceph-operator pod is running")
            POD_OBJ.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.OPERATOR_LABEL,
                resource_count=1,
                timeout=600,
                sleep=5,
            )

            # Validate all mons are running
            log.info("Validate all mons are up and running")
            POD_OBJ.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.MON_APP_LABEL,
                resource_count=mon_count,
                timeout=1200,
                sleep=5,
            )
            log.info("All mons are up and running")

            # Check the ceph health OK
            ceph_health_check(tries=90, delay=15)

            # Validate all storage pods are running
            wait_for_storage_pods()

            # Sleep for some seconds before deleting another mon
            sleep_time = 300
            log.info(f"Waiting for {sleep_time} seconds before deleting another mon")
            time.sleep(sleep_time)

        # Check the endpoints are different
        log.info("Validate the mon endpoints are changed")
        new_mon_svc = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        list_new_svc = []
        for new_svc in new_mon_svc:
            cluster_ip = new_svc["spec"]["clusterIP"]
            list_new_svc.append(cluster_ip)
        diff = set(list_new_svc) ^ set(list_old_svc)
        assert len(diff) == len(list_old_svc + list_new_svc), (
            f"Not all endpoints are changed. Set of old "
            f"endpoints {list_old_svc} and new endpoints {list_new_svc}"
        )
        log.info(f"All new mon endpoints are created {list_new_svc}")

        # Create PVC and pods
        log.info(f"Create {interface} PVC")
        pod_obj = pod_factory(interface=interface)
        pod_obj.run_io(storage_type="fs", size="500M")
Example n. 23
0
    def test_rgw_host_node_failure(self, nodes, node_restart_teardown,
                                   node_drain_teardown, mcg_obj,
                                   bucket_factory):
        """
        Test case to fail the node where RGW and the NooBaa DB are hosted
        and verify the new pods spin up on a healthy node

        """

        # Get noobaa pods
        noobaa_pod_obj = get_noobaa_pods()

        # Get the node where noobaa-db is hosted
        noobaa_pod_node = None
        for noobaa_pod in noobaa_pod_obj:
            if noobaa_pod.name in [
                    constants.NB_DB_NAME_46_AND_BELOW,
                    constants.NB_DB_NAME_47_AND_ABOVE,
            ]:
                noobaa_pod_node = get_pod_node(noobaa_pod)
        if noobaa_pod_node is None:
            assert False, "Could not find the NooBaa DB pod"

        # Validate if the RGW pod and noobaa-db are hosted on the same node
        # If not, make sure both pods are hosted on the same node
        log.info("Validate if the RGW pod and noobaa-db are hosted on the same node")
        rgw_pod_obj = get_rgw_pods()
        rgw_pod_node_list = [
            rgw_pod.get().get("spec").get("nodeName")
            for rgw_pod in rgw_pod_obj
        ]
        if not list(
                set(rgw_pod_node_list).intersection(
                    noobaa_pod_node.name.split())):
            log.info("Unschedule other two nodes such that RGW "
                     "pod moves to node where NooBaa DB pod hosted")
            worker_node_list = get_worker_nodes()
            node_names = list(
                set(worker_node_list) - set(noobaa_pod_node.name.split()))
            unschedule_nodes(node_names=node_names)
            ocp_obj = OCP(kind=constants.POD,
                          namespace=defaults.ROOK_CLUSTER_NAMESPACE)
            rgw_pod_obj[0].delete()
            ocp_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_count=len(rgw_pod_obj),
                selector=constants.RGW_APP_LABEL,
                timeout=300,
                sleep=5,
            )
            log.info("Schedule those nodes again")
            schedule_nodes(node_names=node_names)

            # Check the ceph health OK
            ceph_health_check(tries=90, delay=15)

            # Verify all storage pods are running
            wait_for_storage_pods()

            # Check again that the RGW pod moved to the node where the NooBaa DB pod is hosted
            rgw_pod_obj_list = get_rgw_pods()
            rgw_pod_node_list = [
                get_pod_node(rgw_pod_obj) for rgw_pod_obj in rgw_pod_obj_list
            ]
            value = [
                rgw_pod_node.name == noobaa_pod_node.name
                for rgw_pod_node in rgw_pod_node_list
            ]
            assert all(value), (
                "RGW pod didn't move to the node where the NooBaa DB pod is"
                " hosted even after cordoning and uncordoning nodes. "
                f"RGW pods hosted on: {rgw_pod_node_list}. "
                f"NooBaa DB pod hosted on: {noobaa_pod_node.name}"
            )

        log.info(
            "RGW and noobaa-db are hosted on the same node; starting the test execution"
        )
        rgw_pod_obj = get_rgw_pods()
        for rgw_pod in rgw_pod_obj:
            pod_node = rgw_pod.get().get("spec").get("nodeName")
            if pod_node == noobaa_pod_node.name:
                # Stop the node
                log.info(f"Stopping node {pod_node} where"
                         f" rgw pod {rgw_pod.name} and NooBaa DB are hosted")
                node_obj = get_node_objs(node_names=[pod_node])
                nodes.stop_nodes(node_obj)

                # Validate the old RGW pod went into Terminating state
                wait_for_resource_state(resource=rgw_pod,
                                        state=constants.STATUS_TERMINATING,
                                        timeout=720)

                # Validate a new RGW pod spun up
                ocp_obj = OCP(kind=constants.POD,
                              namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                ocp_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    resource_count=len(rgw_pod_obj),
                    selector=constants.RGW_APP_LABEL,
                )

                # Start the node
                nodes.start_nodes(node_obj)

                # Check the ceph health OK
                ceph_health_check(tries=90, delay=15)

                # Verify all storage pods are running
                wait_for_storage_pods()

                # Create OBC and read and write
                self.create_obc_creation(bucket_factory, mcg_obj,
                                         "Object-key-2")

        # Verify cluster health
        self.sanity_helpers.health_check()
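The cordon/uncordon sequence in the example above forces the recreated RGW pod onto the node that hosts the NooBaa DB. Stripped of the ocs-ci helpers, those two steps boil down to `oc adm cordon` and `oc adm uncordon`; a hedged sketch assuming an authenticated `oc` client and hypothetical node names:

import subprocess


def cordon_nodes(node_names):
    """Mark the given nodes unschedulable so that a deleted pod can only be
    rescheduled onto the remaining schedulable node."""
    for name in node_names:
        subprocess.run(["oc", "adm", "cordon", name], check=True)


def uncordon_nodes(node_names):
    """Mark the given nodes schedulable again once the pod has moved."""
    for name in node_names:
        subprocess.run(["oc", "adm", "uncordon", name], check=True)


# Hypothetical usage:
# cordon_nodes(["worker-1", "worker-2"]); ...; uncordon_nodes(["worker-1", "worker-2"])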
Example n. 24
0
    def test_pdb_check_simultaneous_node_drains(
        self,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
        node_drain_teardown,
    ):
        """
        - Check for OSD PDBs before drain
        - Maintenance (mark as unschedulable and drain) 2 worker nodes with a delay of 30 secs
        - Drain will be completed on worker node A
        - Drain will be pending on worker node B due to blocking PDBs
        - Check mon failover in the first 10 mins, then at 15 and 20 mins
        - Check the OSD PDBs
        - Mark the node A as schedulable
        - Let drain finish on Node B
        - Again check mon failover in the first 10 mins and then at intervals
        - Mark the node B as schedulable
        - Check cluster and Ceph health

        """

        # Validate OSD PDBs before drain operation
        assert (not validate_existence_of_blocking_pdb()
                ), "Blocking PDBs exist, Can't perform drain"

        # Get 2 worker nodes to drain
        typed_nodes = get_nodes(num_of_nodes=2)
        assert len(
            typed_nodes) == 2, "Failed to find worker nodes for the test"
        node_A = typed_nodes[0].name
        node_B = typed_nodes[1].name

        # Drain Node A and validate blocking PDBs
        drain_nodes([node_A])
        assert (validate_existence_of_blocking_pdb()
                ), "Blocking PDBs not created post drain"

        # Inducing delay between 2 drains
        # Node-B drain expected to be in pending due to blocking PDBs
        time.sleep(30)
        try:
            drain_nodes([node_B])
            # After the drain, check mon failover at the 10th, 15th and 20th minute
            timeout = [600, 300, 300]
            for failover in timeout:
                sample = TimeoutSampler(
                    timeout=failover,
                    sleep=10,
                    func=helpers.check_number_of_mon_pods,
                )
                if not sample.wait_for_func_status(result=True):
                    assert False, "Number of mon pods not equal to expected_mon_count=3"
        except TimeoutExpired:
            # Mark the node-A back to schedulable and let drain finish in Node-B
            schedule_nodes([node_A])

        time.sleep(40)

        # Validate OSD PDBs
        assert (validate_existence_of_blocking_pdb()
                ), "Blocking PDBs not created post second drain"

        # Mark the node-B back to schedulable and recover the cluster
        schedule_nodes([node_B])

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=validate_existence_of_blocking_pdb,
        )
        if not sample.wait_for_func_status(result=False):
            log.error("Blocking PDBs still exist")

        # After the drain, check mon failover at the 10th, 15th and 20th minute
        timeout = [600, 300, 300]
        for failover in timeout:
            sample = TimeoutSampler(
                timeout=failover,
                sleep=10,
                func=helpers.check_number_of_mon_pods,
            )
            if not sample.wait_for_func_status(result=True):
                assert False, "Number of mon pods not equal to expected_mon_count=3"

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=verify_pdb_mon,
            disruptions_allowed=1,
            max_unavailable_mon=1,
        )
        if not sample.wait_for_func_status(result=True):
            assert False, "The expected mon PDB does not match the actual mon PDB"

        # wait for storage pods
        pod.wait_for_storage_pods()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=50)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()
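The mon PDB verification at the end of the example above (verify_pdb_mon with disruptions_allowed=1 and max_unavailable_mon=1) can be pictured as reading the PodDisruptionBudget object and comparing its status and spec with the expected values. A simplified sketch using `oc`; the PDB name rook-ceph-mon-pdb and the namespace are assumptions for illustration:

import json
import subprocess


def mon_pdb_as_expected(namespace="openshift-storage"):
    """Return True when the mon PDB currently allows exactly one disruption
    and its maxUnavailable is 1."""
    out = subprocess.run(
        ["oc", "get", "pdb", "rook-ceph-mon-pdb", "-n", namespace, "-o", "json"],
        check=True,
        capture_output=True,
        text=True,
    ).stdout
    pdb = json.loads(out)
    return (
        pdb["status"]["disruptionsAllowed"] == 1
        and pdb["spec"]["maxUnavailable"] == 1
    )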