Example no. 1
    def test_pvc_disruptive(
        self,
        interface,
        operation_to_disrupt,
        resource_to_delete,
        multi_pvc_factory,
        pod_factory,
    ):
        """
        Base function for PVC disruptive tests.
        Deletion of 'resource_to_delete' will be introduced while
        'operation_to_disrupt' is progressing.
        """
        pod_functions = {
            "mds": partial(pod.get_mds_pods),
            "mon": partial(pod.get_mon_pods),
            "mgr": partial(pod.get_mgr_pods),
            "osd": partial(pod.get_osd_pods),
            "rbdplugin": partial(pod.get_plugin_pods, interface=interface),
            "cephfsplugin": partial(pod.get_plugin_pods, interface=interface),
            "cephfsplugin_provisioner": partial(
                pod.get_cephfsplugin_provisioner_pods
            ),
            "rbdplugin_provisioner": partial(
                pod.get_rbdfsplugin_provisioner_pods
            ),
            "operator": partial(pod.get_operator_pods),
        }

        # Get number of pods of type 'resource_to_delete'
        num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

        num_of_pvc = 12
        namespace = self.proj_obj.namespace

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(pod.get_all_pods(namespace=namespace))
        initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

        executor = ThreadPoolExecutor(max_workers=(2 * num_of_pvc))
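        # Two workers per PVC: enough for the bulk PVC/pod creation futures
        # plus the per-pod IO setup tasks submitted later.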

        DISRUPTION_OPS.set_resource(resource=resource_to_delete)
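        # The selected pod is deleted later, once the operation named by
        # 'operation_to_disrupt' is confirmed to be in progress.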

        access_modes = [constants.ACCESS_MODE_RWO]
        if interface == constants.CEPHFILESYSTEM:
            access_modes.append(constants.ACCESS_MODE_RWX)

        # Extend access_modes to also create RBD volumes with volumeMode
        # 'Block' and RWX access mode. RWX is not supported for non-block
        # RBD volumes.
        if interface == constants.CEPHBLOCKPOOL:
            access_modes.extend([
                f"{constants.ACCESS_MODE_RWO}-Block",
                f"{constants.ACCESS_MODE_RWX}-Block",
            ])
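        # For RBD the list is now e.g. [RWO, RWO-Block, RWX-Block]; the
        # "-Block" suffix marks PVCs that should be created with
        # volumeMode=Block.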

        # Start creation of PVCs
        bulk_pvc_create = executor.submit(
            multi_pvc_factory,
            interface=interface,
            project=self.proj_obj,
            size=5,
            access_modes=access_modes,
            access_modes_selection="distribute_random",
            status=constants.STATUS_BOUND,
            num_of_pvc=num_of_pvc,
            wait_each=False,
            timeout=90,
        )

        if operation_to_disrupt == "create_pvc":
            # Ensure PVCs are being created before deleting the resource
            ret = helpers.wait_for_resource_count_change(
                get_all_pvcs, initial_num_of_pvc, namespace, "increase")
            assert ret, "Wait timeout: PVCs are not being created."
            logger.info("PVCs creation has started.")
            DISRUPTION_OPS.delete_resource()

        pvc_objs = bulk_pvc_create.result()

        # Confirm that PVCs are Bound
        for pvc_obj in pvc_objs:
            helpers.wait_for_resource_state(resource=pvc_obj,
                                            state=constants.STATUS_BOUND,
                                            timeout=120)
            pvc_obj.reload()
        logger.info("Verified: PVCs are Bound.")

        # Start creating pods
        bulk_pod_create = executor.submit(helpers.create_pods, pvc_objs,
                                          pod_factory, interface, 2)

        if operation_to_disrupt == "create_pod":
            # Ensure that pods are being created before deleting the resource
            ret = helpers.wait_for_resource_count_change(
                pod.get_all_pods, initial_num_of_pods, namespace, "increase")
            assert ret, "Wait timeout: Pods are not being created."
            logger.info("Pods creation has started.")
            DISRUPTION_OPS.delete_resource()

        pod_objs = bulk_pod_create.result()

        # Verify pods are Running
        for pod_obj in pod_objs:
            helpers.wait_for_resource_state(resource=pod_obj,
                                            state=constants.STATUS_RUNNING)
            pod_obj.reload()
        logger.info("Verified: All pods are Running.")

        # Do setup on pods for running IO
        logger.info("Setting up pods for running IO.")
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            executor.submit(pod_obj.workload_setup, storage_type=storage_type)

        # Wait for setup on pods to complete
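        # (workload_setup runs in the executor threads; each pod object's
        # "wl_setup_done" attribute is polled below until it becomes truthy)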
        for pod_obj in pod_objs:
            logger.info(
                f"Waiting for IO setup to complete on pod {pod_obj.name}")
            for sample in TimeoutSampler(180, 2, getattr, pod_obj,
                                         "wl_setup_done"):
                if sample:
                    logger.info(f"Setup for running IO is completed on pod "
                                f"{pod_obj.name}.")
                    break
        logger.info("Setup for running IO is completed on all pods.")

        # Start IO on each pod
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            pod_obj.run_io(
                storage_type=storage_type,
                size="1G",
                runtime=10,
                fio_filename=f"{pod_obj.name}_io_file1",
            )
        logger.info("FIO started on all pods.")

        if operation_to_disrupt == "run_io":
            DISRUPTION_OPS.delete_resource()

        logger.info("Fetching FIO results.")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        logger.info("Verified FIO result on pods.")

        # Delete pods
        for pod_obj in pod_objs:
            pod_obj.delete(wait=True)
        for pod_obj in pod_objs:
            pod_obj.ocp.wait_for_delete(pod_obj.name)

        # Verify that PVCs are reusable by creating new pods
        pod_objs = helpers.create_pods(pvc_objs, pod_factory, interface, 2)

        # Verify new pods are Running
        for pod_obj in pod_objs:
            helpers.wait_for_resource_state(resource=pod_obj,
                                            state=constants.STATUS_RUNNING)
            pod_obj.reload()
        logging.info("Verified: All new pods are Running.")

        # Run IO on each of the new pods
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            pod_obj.run_io(
                storage_type=storage_type,
                size="1G",
                runtime=10,
                fio_filename=f"{pod_obj.name}_io_file2",
            )

        logger.info("Fetching FIO results from new pods")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        logger.info("Verified FIO result on new pods.")

        # Verify number of pods of type 'resource_to_delete'
        final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
        assert final_num_resource_to_delete == num_of_resource_to_delete, (
            f"Total number of {resource_to_delete} pods does not match the "
            f"initial value. Number of pods before deleting a pod: "
            f"{num_of_resource_to_delete}. Number of pods present now: "
            f"{final_num_resource_to_delete}")

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
        logger.info("Ceph cluster health is OK")
    def test_delete_rbd_pvc_while_thick_provisioning(
        self,
        resource_to_delete,
        pvc_factory,
        pod_factory,
    ):
        """
        Test to delete an RBD PVC while thick provisioning is in progress and
        verify that no stale image is left behind. Depending on the value of
        'resource_to_delete', the provisioner pod will also be deleted.
        """
        pvc_size = 15
        executor = ThreadPoolExecutor(max_workers=1)

        if resource_to_delete:
            DISRUPTION_OPS.set_resource(resource=resource_to_delete,
                                        leader_type="provisioner")

        ct_pod = get_ceph_tools_pod()

        # Collect the list of RBD images
        image_list_out_initial = ct_pod.exec_ceph_cmd(
            ceph_cmd=f"rbd ls -p {constants.DEFAULT_BLOCKPOOL}", format="")
        image_list_initial = image_list_out_initial.strip().split()
        log.info(
            f"List of RBD images before creating the PVC: {image_list_initial}"
        )

        # Start creation of PVC
        pvc_obj = pvc_factory(
            interface=constants.CEPHBLOCKPOOL,
            project=self.proj_obj,
            storageclass=default_thick_storage_class(),
            size=pvc_size,
            access_mode=constants.ACCESS_MODE_RWO,
            status="",
        )

        # Ensure that the PVC is being created
        ret = wait_for_resource_count_change(get_all_pvcs, 0,
                                             self.proj_obj.namespace,
                                             "increase")
        assert ret, "Wait timeout: PVC is not being created."
        log.info("PVC creation has started.")

        if resource_to_delete:
            log.info(f"Deleting {resource_to_delete} pod.")
            delete_provisioner = executor.submit(
                DISRUPTION_OPS.delete_resource)

        # Delete PVC
        log.info(f"Deleting PVC {pvc_obj.name}")
        pvc_obj.delete()
        pvc_obj.ocp.wait_for_delete(pvc_obj.name)
        log.info(f"Verified: PVC {pvc_obj.name} is deleted.")

        if resource_to_delete:
            delete_provisioner.result()

        # Collect the list of RBD images
        image_list_out_final = ct_pod.exec_ceph_cmd(
            ceph_cmd=f"rbd ls -p {default_ceph_block_pool()}", format="")
        image_list_final = image_list_out_final.strip().split()
        log.info(
            f"List of RBD images after deleting the PVC: {image_list_final}"
        )

        stale_images = [
            image for image in image_list_final
            if image not in image_list_initial
        ]
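        # Any image present now but missing from the initial listing must
        # belong to the PVC that was just deleted.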

        # Check whether more than one new image is present
        if len(stale_images) > 1:
            raise UnexpectedBehaviour(
                f"Could not verify the test result. Found more than one new rbd image - {stale_images}."
            )

        if stale_images:
            stale_image = stale_images[0].strip()
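            # CSI names RBD images "csi-vol-<uuid>"; the uuid portion after
            # the prefix is what the backend verification expects.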
            # Wait for the image to get deleted
            image_deleted = verify_volume_deleted_in_backend(
                constants.CEPHBLOCKPOOL,
                image_uuid=stale_image.split("csi-vol-")[1],
                pool_name=default_ceph_block_pool(),
                timeout=300,
            )
            if not image_deleted:
                du_out = ct_pod.exec_ceph_cmd(
                    ceph_cmd=(
                        f"rbd du -p {default_ceph_block_pool()} {stale_image}"
                    ),
                    format="",
                )
            assert image_deleted, (
                f"Wait timeout: RBD image {stale_image} is not deleted. Check the logs to ensure that"
                f" this is the stale image of the deleted PVC. rbd du output of the image : {du_out}"
            )
            log.info(
                f"Image {stale_image} deleted within the wait time period")
        else:
            log.info("No stale image found")
    def test_delete_provisioner_pod_while_thick_provisioning(
        self,
        pvc_factory,
        pod_factory,
    ):
        """
        Test to delete the RBD provisioner leader pod while a PVC is being
        created using a thick-provision-enabled storage class.
        """
        pvc_size = 20
        pool_name = default_ceph_block_pool()
        executor = ThreadPoolExecutor(max_workers=1)
        DISRUPTION_OPS.set_resource(resource="rbdplugin_provisioner",
                                    leader_type="provisioner")

        # Start creation of PVC
        pvc_create = executor.submit(
            pvc_factory,
            interface=constants.CEPHBLOCKPOOL,
            project=self.proj_obj,
            storageclass=default_thick_storage_class(),
            size=pvc_size,
            access_mode=constants.ACCESS_MODE_RWO,
            status="",
        )

        # Ensure that the PVC is being created before deleting the rbd provisioner pod
        ret = helpers.wait_for_resource_count_change(get_all_pvcs, 0,
                                                     self.proj_obj.namespace,
                                                     "increase")
        assert ret, "Wait timeout: PVC is not being created."
        logger.info("PVC creation has started.")
        DISRUPTION_OPS.delete_resource()
        logger.info("Deleted RBD provisioner leader pod.")

        pvc_obj = pvc_create.result()

        # Confirm that the PVC is Bound
        helpers.wait_for_resource_state(resource=pvc_obj,
                                        state=constants.STATUS_BOUND,
                                        timeout=600)
        pvc_obj.reload()
        logger.info(f"Verified: PVC {pvc_obj.name} reached Bound state.")
        image_name = pvc_obj.get_rbd_image_name
        pv_obj = pvc_obj.backed_pv_obj

        # Verify thick provision by checking the image used size
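        # (a thick-provisioned image is fully allocated at creation, so its
        # used size should already equal the requested PVC size)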
        assert check_rbd_image_used_size(
            pvc_objs=[pvc_obj],
            usage_to_compare=f"{pvc_size}GiB",
            rbd_pool=pool_name,
            expect_match=True,
        ), f"PVC {pvc_obj.name} is not thick provisioned.\n PV describe :\n {pv_obj.describe()}"
        logger.info("Verified: The PVC is thick provisioned")

        # Create pod and run IO
        pod_obj = pod_factory(
            interface=constants.CEPHBLOCKPOOL,
            pvc=pvc_obj,
            status=constants.STATUS_RUNNING,
        )
        pod_obj.run_io(
            storage_type="fs",
            size=f"{pvc_size-1}G",
            fio_filename=f"{pod_obj.name}_io",
            end_fsync=1,
        )

        # Get IO result
        get_fio_rw_iops(pod_obj)

        logger.info(f"Deleting pod {pod_obj.name}")
        pod_obj.delete()
        assert pod_obj.ocp.wait_for_delete(
            pod_obj.name, 180), f"Pod {pod_obj.name} is not deleted"

        # Fetch image id for verification
        image_uid = pvc_obj.image_uuid

        logger.info(f"Deleting PVC {pvc_obj.name}")
        pvc_obj.delete()
        assert pvc_obj.ocp.wait_for_delete(
            pvc_obj.name), f"PVC {pvc_obj.name} is not deleted"
        logger.info(f"Verified: PVC {pvc_obj.name} is deleted.")
        assert pv_obj.ocp.wait_for_delete(
            pv_obj.name), f"PV {pv_obj.name} is not deleted"
        logger.info(f"Verified: PV {pv_obj.name} is deleted.")

        # Verify the rbd image is deleted
        logger.info(f"Wait for the RBD image {image_name} to get deleted")
        assert verify_volume_deleted_in_backend(
            interface=constants.CEPHBLOCKPOOL,
            image_uuid=image_uid,
            pool_name=pool_name,
            timeout=300,
        ), f"Wait timeout - RBD image {image_name} is not deleted"
        logger.info(f"Verified: RBD image {image_name} is deleted")
    def disruptive_base(self, interface, operation_to_disrupt,
                        resource_to_delete):
        """
        Base function for disruptive tests.
        Deletion of 'resource_to_delete' will be introduced while
        'operation_to_disrupt' is progressing.
        """
        pod_functions = {
            "mds": partial(get_mds_pods),
            "mon": partial(get_mon_pods),
            "mgr": partial(get_mgr_pods),
            "osd": partial(get_osd_pods),
            "rbdplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin_provisioner":
            partial(get_cephfsplugin_provisioner_pods),
            "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
            "operator": partial(get_operator_pods),
        }
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        executor = ThreadPoolExecutor(max_workers=1)

        # Get number of pods of type 'resource_to_delete'
        num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(get_all_pods(namespace=self.namespace))
        initial_num_of_pvc = len(
            get_all_pvcs(namespace=self.namespace)["items"])

        # Fetch PV names
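        # (captured before PVC deletion so that PV removal can still be
        # verified afterwards)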
        pv_objs = []
        for pvc_obj in self.pvc_objs:
            pvc_obj.reload()
            pv_objs.append(pvc_obj.backed_pv_obj)

        # Fetch volume details from pods for the purpose of verification
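        # node_pv_dict maps each node to the PVs mounted on it; it is checked
        # after pod deletion to confirm the mount points are gone.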
        node_pv_dict = {}
        for pod_obj in self.pod_objs:
            pod_info = pod_obj.get()
            node = pod_info["spec"]["nodeName"]
            pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"][
                "claimName"]
            for pvc_obj in self.pvc_objs:
                if pvc_obj.name == pvc:
                    pvc_obj.reload()
                    pv = pvc_obj.backed_pv
                    break
            if node in node_pv_dict:
                node_pv_dict[node].append(pv)
            else:
                node_pv_dict[node] = [pv]

        # Do setup for running IO on pods
        log.info("Setting up pods for running IO")
        for pod_obj in self.pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                pod_obj.pvc.storage_type = "block"
            else:
                pod_obj.pvc.storage_type = "fs"
            pod_obj.workload_setup(storage_type=pod_obj.pvc.storage_type)
        log.info("Setup for running IO is completed on pods")

        # Start IO on each pod. An RWX PVC is used by two pods, so split the
        # IO size accordingly.
        log.info("Starting IO on pods")
        for pod_obj in self.pod_objs:
            if pod_obj.pvc.access_mode == constants.ACCESS_MODE_RWX:
                io_size = int((self.pvc_size - 1) / 2)
            else:
                io_size = self.pvc_size - 1
            pod_obj.run_io(
                storage_type=pod_obj.pvc.storage_type,
                size=f"{io_size}G",
                fio_filename=f"{pod_obj.name}_io",
                end_fsync=1,
            )
        log.info("IO started on all pods.")

        # Start deleting pods
        pod_bulk_delete = executor.submit(delete_pods,
                                          self.pod_objs,
                                          wait=False)

        if operation_to_disrupt == "delete_pods":
            ret = wait_for_resource_count_change(
                get_all_pods,
                initial_num_of_pods,
                self.namespace,
                "decrease",
                timeout=50,
            )
            assert ret, "Wait timeout: Pods are not being deleted."
            log.info("Pods deletion has started.")
            disruption.delete_resource()

        pod_bulk_delete.result()

        # Verify pods are deleted
        for pod_obj in self.pod_objs:
            assert pod_obj.ocp.wait_for_delete(
                pod_obj.name, 180), f"Pod {pod_obj.name} is not deleted"
        log.info("Verified: Pods are deleted.")

        # Verify that the mount point is removed from nodes after deleting pod
        for node, pvs in node_pv_dict.items():
            cmd = f"oc debug nodes/{node} -- df"
            df_on_node = run_cmd(cmd)
            for pv in pvs:
                assert pv not in df_on_node, (
                    f"{pv} is still present on node {node} after "
                    f"deleting the pods.")
        log.info(
            "Verified: mount points are removed from nodes after deleting "
            "the pods")

        # Fetch image uuid associated with PVCs
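        # (needed to verify in the Ceph backend that the corresponding
        # images/subvolumes are removed along with the PVCs)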
        pvc_uuid_map = {}
        for pvc_obj in self.pvc_objs:
            pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
        log.info("Fetched image uuid associated with each PVC")

        # Start deleting PVCs
        pvc_bulk_delete = executor.submit(delete_pvcs, self.pvc_objs)

        if operation_to_disrupt == "delete_pvcs":
            ret = wait_for_resource_count_change(get_all_pvcs,
                                                 initial_num_of_pvc,
                                                 self.namespace,
                                                 "decrease",
                                                 timeout=50)
            assert ret, "Wait timeout: PVCs are not being deleted."
            log.info("PVCs deletion has started.")
            disruption.delete_resource()

        pvcs_deleted = pvc_bulk_delete.result()

        assert pvcs_deleted, "Deletion of PVCs failed."

        # Verify PVCs are deleted
        for pvc_obj in self.pvc_objs:
            assert pvc_obj.ocp.wait_for_delete(
                pvc_obj.name), f"PVC {pvc_obj.name} is not deleted"
        log.info("Verified: PVCs are deleted.")

        # Verify PVs are deleted
        for pv_obj in pv_objs:
            assert pv_obj.ocp.wait_for_delete(
                pv_obj.name, 120), f"PV {pv_obj.name} is not deleted"
        log.info("Verified: PVs are deleted.")

        # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
        pool_name = default_ceph_block_pool()
        for pvc_name, uuid in pvc_uuid_map.items():
            if interface == constants.CEPHBLOCKPOOL:
                ret = verify_volume_deleted_in_backend(interface=interface,
                                                       image_uuid=uuid,
                                                       pool_name=pool_name)
            elif interface == constants.CEPHFILESYSTEM:
                ret = verify_volume_deleted_in_backend(interface=interface,
                                                       image_uuid=uuid)
            assert ret, (f"Volume associated with PVC {pvc_name} still exists "
                         f"in backend")

        # Verify number of pods of type 'resource_to_delete'
        final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
        assert final_num_resource_to_delete == num_of_resource_to_delete, (
            f"Total number of {resource_to_delete} pods does not match the "
            f"initial value. Number of pods before deleting a pod: "
            f"{num_of_resource_to_delete}. Number of pods present now: "
            f"{final_num_resource_to_delete}")

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
        log.info("Ceph cluster health is OK")