def test_pvc_disruptive(
    self,
    interface,
    operation_to_disrupt,
    resource_to_delete,
    multi_pvc_factory,
    pod_factory,
):
    """
    Base function for PVC disruptive tests.
    Deletion of 'resource_to_delete' will be introduced while
    'operation_to_disrupt' is progressing.
    """
    pod_functions = {
        "mds": partial(pod.get_mds_pods),
        "mon": partial(pod.get_mon_pods),
        "mgr": partial(pod.get_mgr_pods),
        "osd": partial(pod.get_osd_pods),
        "rbdplugin": partial(pod.get_plugin_pods, interface=interface),
        "cephfsplugin": partial(pod.get_plugin_pods, interface=interface),
        "cephfsplugin_provisioner": partial(pod.get_cephfsplugin_provisioner_pods),
        "rbdplugin_provisioner": partial(pod.get_rbdfsplugin_provisioner_pods),
        "operator": partial(pod.get_operator_pods),
    }

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

    num_of_pvc = 12
    namespace = self.proj_obj.namespace

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(pod.get_all_pods(namespace=namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

    executor = ThreadPoolExecutor(max_workers=(2 * num_of_pvc))

    DISRUPTION_OPS.set_resource(resource=resource_to_delete)

    access_modes = [constants.ACCESS_MODE_RWO]
    if interface == constants.CEPHFILESYSTEM:
        access_modes.append(constants.ACCESS_MODE_RWX)

    # Modify access_modes list to create rbd `block` type volume with
    # RWX access mode. RWX is not supported in non-block type rbd
    if interface == constants.CEPHBLOCKPOOL:
        access_modes.extend(
            [
                f"{constants.ACCESS_MODE_RWO}-Block",
                f"{constants.ACCESS_MODE_RWX}-Block",
            ]
        )

    # Start creation of PVCs
    bulk_pvc_create = executor.submit(
        multi_pvc_factory,
        interface=interface,
        project=self.proj_obj,
        size=5,
        access_modes=access_modes,
        access_modes_selection="distribute_random",
        status=constants.STATUS_BOUND,
        num_of_pvc=num_of_pvc,
        wait_each=False,
        timeout=90,
    )

    if operation_to_disrupt == "create_pvc":
        # Ensure PVCs are being created before deleting the resource
        ret = helpers.wait_for_resource_count_change(
            get_all_pvcs, initial_num_of_pvc, namespace, "increase"
        )
        assert ret, "Wait timeout: PVCs are not being created."
        logger.info("PVCs creation has started.")
        DISRUPTION_OPS.delete_resource()

    pvc_objs = bulk_pvc_create.result()

    # Confirm that PVCs are Bound
    for pvc_obj in pvc_objs:
        helpers.wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=120
        )
        pvc_obj.reload()
    logger.info("Verified: PVCs are Bound.")

    # Start creating pods
    bulk_pod_create = executor.submit(
        helpers.create_pods, pvc_objs, pod_factory, interface, 2
    )

    if operation_to_disrupt == "create_pod":
        # Ensure that pods are being created before deleting the resource
        ret = helpers.wait_for_resource_count_change(
            pod.get_all_pods, initial_num_of_pods, namespace, "increase"
        )
        assert ret, "Wait timeout: Pods are not being created."
logger.info("Pods creation has started.") DISRUPTION_OPS.delete_resource() pod_objs = bulk_pod_create.result() # Verify pods are Running for pod_obj in pod_objs: helpers.wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING) pod_obj.reload() logger.info("Verified: All pods are Running.") # Do setup on pods for running IO logger.info("Setting up pods for running IO.") for pod_obj in pod_objs: pvc_info = pod_obj.pvc.get() if pvc_info["spec"]["volumeMode"] == "Block": storage_type = "block" else: storage_type = "fs" executor.submit(pod_obj.workload_setup, storage_type=storage_type) # Wait for setup on pods to complete for pod_obj in pod_objs: logger.info( f"Waiting for IO setup to complete on pod {pod_obj.name}") for sample in TimeoutSampler(180, 2, getattr, pod_obj, "wl_setup_done"): if sample: logger.info(f"Setup for running IO is completed on pod " f"{pod_obj.name}.") break logger.info("Setup for running IO is completed on all pods.") # Start IO on each pod for pod_obj in pod_objs: pvc_info = pod_obj.pvc.get() if pvc_info["spec"]["volumeMode"] == "Block": storage_type = "block" else: storage_type = "fs" pod_obj.run_io( storage_type=storage_type, size="1G", runtime=10, fio_filename=f"{pod_obj.name}_io_file1", ) logger.info("FIO started on all pods.") if operation_to_disrupt == "run_io": DISRUPTION_OPS.delete_resource() logger.info("Fetching FIO results.") for pod_obj in pod_objs: fio_result = pod_obj.get_fio_results() err_count = fio_result.get("jobs")[0].get("error") assert ( err_count == 0 ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}" logger.info("Verified FIO result on pods.") # Delete pods for pod_obj in pod_objs: pod_obj.delete(wait=True) for pod_obj in pod_objs: pod_obj.ocp.wait_for_delete(pod_obj.name) # Verify that PVCs are reusable by creating new pods pod_objs = helpers.create_pods(pvc_objs, pod_factory, interface, 2) # Verify new pods are Running for pod_obj in pod_objs: helpers.wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING) pod_obj.reload() logging.info("Verified: All new pods are Running.") # Run IO on each of the new pods for pod_obj in pod_objs: pvc_info = pod_obj.pvc.get() if pvc_info["spec"]["volumeMode"] == "Block": storage_type = "block" else: storage_type = "fs" pod_obj.run_io( storage_type=storage_type, size="1G", runtime=10, fio_filename=f"{pod_obj.name}_io_file2", ) logger.info("Fetching FIO results from new pods") for pod_obj in pod_objs: fio_result = pod_obj.get_fio_results() err_count = fio_result.get("jobs")[0].get("error") assert ( err_count == 0 ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}" logger.info("Verified FIO result on new pods.") # Verify number of pods of type 'resource_to_delete' final_num_resource_to_delete = len(pod_functions[resource_to_delete]()) assert final_num_resource_to_delete == num_of_resource_to_delete, ( f"Total number of {resource_to_delete} pods is not matching with " f"initial value. Total number of pods before deleting a pod: " f"{num_of_resource_to_delete}. Total number of pods present now: " f"{final_num_resource_to_delete}") # Check ceph status ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"]) logger.info("Ceph cluster health is OK")
def test_delete_rbd_pvc_while_thick_provisioning(
    self,
    resource_to_delete,
    pvc_factory,
    pod_factory,
):
    """
    Test to delete an RBD PVC while thick provisioning is progressing and
    verify that no stale image is present.
    Based on the value of 'resource_to_delete', the provisioner pod will also
    be deleted.
    """
    pvc_size = 15
    executor = ThreadPoolExecutor(max_workers=1)

    if resource_to_delete:
        DISRUPTION_OPS.set_resource(
            resource=resource_to_delete, leader_type="provisioner"
        )

    ct_pod = get_ceph_tools_pod()

    # Collect the list of RBD images
    image_list_out_initial = ct_pod.exec_ceph_cmd(
        ceph_cmd=f"rbd ls -p {constants.DEFAULT_BLOCKPOOL}", format=""
    )
    image_list_initial = image_list_out_initial.strip().split()
    log.info(f"List of RBD images before creating the PVC: {image_list_initial}")

    # Start creation of PVC
    pvc_obj = pvc_factory(
        interface=constants.CEPHBLOCKPOOL,
        project=self.proj_obj,
        storageclass=default_thick_storage_class(),
        size=pvc_size,
        access_mode=constants.ACCESS_MODE_RWO,
        status="",
    )

    # Ensure that the PVC is being created
    ret = wait_for_resource_count_change(
        get_all_pvcs, 0, self.proj_obj.namespace, "increase"
    )
    assert ret, "Wait timeout: PVC is not being created."
    log.info("PVC creation has started.")

    if resource_to_delete:
        log.info(f"Deleting {resource_to_delete} pod.")
        delete_provisioner = executor.submit(DISRUPTION_OPS.delete_resource)

    # Delete PVC
    log.info(f"Deleting PVC {pvc_obj.name}")
    pvc_obj.delete()
    pvc_obj.ocp.wait_for_delete(pvc_obj.name)
    log.info(f"Verified: PVC {pvc_obj.name} is deleted.")

    if resource_to_delete:
        delete_provisioner.result()

    # Collect the list of RBD images
    image_list_out_final = ct_pod.exec_ceph_cmd(
        ceph_cmd=f"rbd ls -p {default_ceph_block_pool()}", format=""
    )
    image_list_final = image_list_out_final.strip().split()
    log.info(f"List of RBD images after deleting the PVC: {image_list_final}")

    stale_images = [
        image for image in image_list_final if image not in image_list_initial
    ]

    # Check whether more than one new image is present
    if len(stale_images) > 1:
        raise UnexpectedBehaviour(
            f"Could not verify the test result. Found more than one new "
            f"rbd image - {stale_images}."
        )

    if stale_images:
        stale_image = stale_images[0].strip()
        # Wait for the image to get deleted
        image_deleted = verify_volume_deleted_in_backend(
            constants.CEPHBLOCKPOOL,
            image_uuid=stale_image.split("csi-vol-")[1],
            pool_name=default_ceph_block_pool(),
            timeout=300,
        )
        if not image_deleted:
            du_out = ct_pod.exec_ceph_cmd(
                ceph_cmd=f"rbd du -p {default_ceph_block_pool()} {stale_image}",
                format="",
            )
        assert image_deleted, (
            f"Wait timeout: RBD image {stale_image} is not deleted. Check the "
            f"logs to ensure that this is the stale image of the deleted PVC. "
            f"rbd du output of the image: {du_out}"
        )
        log.info(f"Image {stale_image} deleted within the wait time period")
    else:
        log.info("No stale image found")
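# Note (illustration, not part of the original module): 'resource_to_delete'
# in the test above is expected to be parametrized so the scenario runs both
# with and without a provisioner-pod disruption; a falsy value skips
# DISRUPTION_OPS entirely and only the PVC deletion during thick provisioning
# is exercised. The argvalues below are an assumption for illustration.
#
# @pytest.mark.parametrize(
#     argnames=["resource_to_delete"],
#     argvalues=[(None,), ("rbdplugin_provisioner",)],
# )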
def test_delete_provisioner_pod_while_thick_provisioning(
    self,
    pvc_factory,
    pod_factory,
):
    """
    Test to delete the RBD provisioner leader pod while creating a PVC using
    a thick provision enabled storage class.
    """
    pvc_size = 20
    pool_name = default_ceph_block_pool()
    executor = ThreadPoolExecutor(max_workers=1)
    DISRUPTION_OPS.set_resource(
        resource="rbdplugin_provisioner", leader_type="provisioner"
    )

    # Start creation of PVC
    pvc_create = executor.submit(
        pvc_factory,
        interface=constants.CEPHBLOCKPOOL,
        project=self.proj_obj,
        storageclass=default_thick_storage_class(),
        size=pvc_size,
        access_mode=constants.ACCESS_MODE_RWO,
        status="",
    )

    # Ensure that the PVC is being created before deleting the rbd
    # provisioner pod
    ret = helpers.wait_for_resource_count_change(
        get_all_pvcs, 0, self.proj_obj.namespace, "increase"
    )
    assert ret, "Wait timeout: PVC is not being created."
    logger.info("PVC creation has started.")
    DISRUPTION_OPS.delete_resource()
    logger.info("Deleted RBD provisioner leader pod.")

    pvc_obj = pvc_create.result()

    # Confirm that the PVC is Bound
    helpers.wait_for_resource_state(
        resource=pvc_obj, state=constants.STATUS_BOUND, timeout=600
    )
    pvc_obj.reload()
    logger.info(f"Verified: PVC {pvc_obj.name} reached Bound state.")

    image_name = pvc_obj.get_rbd_image_name
    pv_obj = pvc_obj.backed_pv_obj

    # Verify thick provision by checking the image used size
    assert check_rbd_image_used_size(
        pvc_objs=[pvc_obj],
        usage_to_compare=f"{pvc_size}GiB",
        rbd_pool=pool_name,
        expect_match=True,
    ), (
        f"PVC {pvc_obj.name} is not thick provisioned.\n"
        f"PV describe:\n{pv_obj.describe()}"
    )
    logger.info("Verified: The PVC is thick provisioned.")

    # Create pod and run IO
    pod_obj = pod_factory(
        interface=constants.CEPHBLOCKPOOL,
        pvc=pvc_obj,
        status=constants.STATUS_RUNNING,
    )
    pod_obj.run_io(
        storage_type="fs",
        size=f"{pvc_size - 1}G",
        fio_filename=f"{pod_obj.name}_io",
        end_fsync=1,
    )

    # Get IO result
    get_fio_rw_iops(pod_obj)

    logger.info(f"Deleting pod {pod_obj.name}")
    pod_obj.delete()
    assert pod_obj.ocp.wait_for_delete(
        pod_obj.name, 180
    ), f"Pod {pod_obj.name} is not deleted"

    # Fetch image id for verification
    image_uid = pvc_obj.image_uuid

    logger.info(f"Deleting PVC {pvc_obj.name}")
    pvc_obj.delete()
    assert pvc_obj.ocp.wait_for_delete(
        pvc_obj.name
    ), f"PVC {pvc_obj.name} is not deleted"
    logger.info(f"Verified: PVC {pvc_obj.name} is deleted.")
    assert pv_obj.ocp.wait_for_delete(
        pv_obj.name
    ), f"PV {pv_obj.name} is not deleted"
    logger.info(f"Verified: PV {pv_obj.name} is deleted.")

    # Verify the rbd image is deleted
    logger.info(f"Wait for the RBD image {image_name} to get deleted")
    assert verify_volume_deleted_in_backend(
        interface=constants.CEPHBLOCKPOOL,
        image_uuid=image_uid,
        pool_name=pool_name,
        timeout=300,
    ), f"Wait timeout - RBD image {image_name} is not deleted"
    logger.info(f"Verified: RBD image {image_name} is deleted")
def disruptive_base(self, interface, operation_to_disrupt, resource_to_delete):
    """
    Base function for disruptive tests.
    Deletion of 'resource_to_delete' will be introduced while
    'operation_to_disrupt' is progressing.
    """
    pod_functions = {
        "mds": partial(get_mds_pods),
        "mon": partial(get_mon_pods),
        "mgr": partial(get_mgr_pods),
        "osd": partial(get_osd_pods),
        "rbdplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin_provisioner": partial(get_cephfsplugin_provisioner_pods),
        "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
        "operator": partial(get_operator_pods),
    }
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    executor = ThreadPoolExecutor(max_workers=1)

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(get_all_pods(namespace=self.namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=self.namespace)["items"])

    # Fetch PV names
    pv_objs = []
    for pvc_obj in self.pvc_objs:
        pvc_obj.reload()
        pv_objs.append(pvc_obj.backed_pv_obj)

    # Fetch volume details from pods for the purpose of verification
    node_pv_dict = {}
    for pod_obj in self.pod_objs:
        pod_info = pod_obj.get()
        node = pod_info["spec"]["nodeName"]
        pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
        for pvc_obj in self.pvc_objs:
            if pvc_obj.name == pvc:
                pvc_obj.reload()
                pv = pvc_obj.backed_pv
                break
        if node in node_pv_dict:
            node_pv_dict[node].append(pv)
        else:
            node_pv_dict[node] = [pv]

    # Do setup for running IO on pods
    log.info("Setting up pods for running IO")
    for pod_obj in self.pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info["spec"]["volumeMode"] == "Block":
            pod_obj.pvc.storage_type = "block"
        else:
            pod_obj.pvc.storage_type = "fs"
        pod_obj.workload_setup(storage_type=pod_obj.pvc.storage_type)
    log.info("Setup for running IO is completed on pods")

    # Start IO on each pod. RWX PVC will be used on two pods, so split the
    # size accordingly
    log.info("Starting IO on pods")
    for pod_obj in self.pod_objs:
        if pod_obj.pvc.access_mode == constants.ACCESS_MODE_RWX:
            io_size = int((self.pvc_size - 1) / 2)
        else:
            io_size = self.pvc_size - 1
        pod_obj.run_io(
            storage_type=pod_obj.pvc.storage_type,
            size=f"{io_size}G",
            fio_filename=f"{pod_obj.name}_io",
            end_fsync=1,
        )
    log.info("IO started on all pods.")

    # Start deleting pods
    pod_bulk_delete = executor.submit(delete_pods, self.pod_objs, wait=False)

    if operation_to_disrupt == "delete_pods":
        ret = wait_for_resource_count_change(
            get_all_pods,
            initial_num_of_pods,
            self.namespace,
            "decrease",
            timeout=50,
        )
        assert ret, "Wait timeout: Pods are not being deleted."
log.info("Pods deletion has started.") disruption.delete_resource() pod_bulk_delete.result() # Verify pods are deleted for pod_obj in self.pod_objs: assert pod_obj.ocp.wait_for_delete( pod_obj.name, 180), f"Pod {pod_obj.name} is not deleted" log.info("Verified: Pods are deleted.") # Verify that the mount point is removed from nodes after deleting pod for node, pvs in node_pv_dict.items(): cmd = f"oc debug nodes/{node} -- df" df_on_node = run_cmd(cmd) for pv in pvs: assert pv not in df_on_node, ( f"{pv} is still present on node {node} after " f"deleting the pods.") log.info( "Verified: mount points are removed from nodes after deleting " "the pods") # Fetch image uuid associated with PVCs pvc_uuid_map = {} for pvc_obj in self.pvc_objs: pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid log.info("Fetched image uuid associated with each PVC") # Start deleting PVCs pvc_bulk_delete = executor.submit(delete_pvcs, self.pvc_objs) if operation_to_disrupt == "delete_pvcs": ret = wait_for_resource_count_change(get_all_pvcs, initial_num_of_pvc, self.namespace, "decrease", timeout=50) assert ret, "Wait timeout: PVCs are not being deleted." log.info("PVCs deletion has started.") disruption.delete_resource() pvcs_deleted = pvc_bulk_delete.result() assert pvcs_deleted, "Deletion of PVCs failed." # Verify PVCs are deleted for pvc_obj in self.pvc_objs: assert pvc_obj.ocp.wait_for_delete( pvc_obj.name), f"PVC {pvc_obj.name} is not deleted" log.info("Verified: PVCs are deleted.") # Verify PVs are deleted for pv_obj in pv_objs: assert pv_obj.ocp.wait_for_delete( pv_obj.name, 120), f"PV {pv_obj.name} is not deleted" log.info("Verified: PVs are deleted.") # Verify PV using ceph toolbox. Image/Subvolume should be deleted. pool_name = default_ceph_block_pool() for pvc_name, uuid in pvc_uuid_map.items(): if interface == constants.CEPHBLOCKPOOL: ret = verify_volume_deleted_in_backend(interface=interface, image_uuid=uuid, pool_name=pool_name) if interface == constants.CEPHFILESYSTEM: ret = verify_volume_deleted_in_backend(interface=interface, image_uuid=uuid) assert ret, (f"Volume associated with PVC {pvc_name} still exists " f"in backend") # Verify number of pods of type 'resource_to_delete' final_num_resource_to_delete = len(pod_functions[resource_to_delete]()) assert final_num_resource_to_delete == num_of_resource_to_delete, ( f"Total number of {resource_to_delete} pods is not matching with " f"initial value. Total number of pods before deleting a pod: " f"{num_of_resource_to_delete}. Total number of pods present now: " f"{final_num_resource_to_delete}") # Check ceph status ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"]) log.info("Ceph cluster health is OK")