def disruptive_base(self, interface, operation_to_disrupt, resource_to_delete):
    """
    Base function for disruptive tests.
    Deletion of 'resource_to_delete' will be introduced while
    'operation_to_disrupt' is progressing.
    """
    pod_functions = {
        'mds': partial(get_mds_pods),
        'mon': partial(get_mon_pods),
        'mgr': partial(get_mgr_pods),
        'osd': partial(get_osd_pods),
        'rbdplugin': partial(get_plugin_pods, interface=interface),
        'cephfsplugin': partial(get_plugin_pods, interface=interface),
        'cephfsplugin_provisioner': partial(get_cephfsplugin_provisioner_pods),
        'rbdplugin_provisioner': partial(get_rbdfsplugin_provisioner_pods),
        'operator': partial(get_operator_pods)
    }
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    executor = ThreadPoolExecutor(max_workers=1)

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(get_all_pods(namespace=self.namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=self.namespace)['items'])

    # Fetch PV names
    pv_objs = []
    for pvc_obj in self.pvc_objs:
        pvc_obj.reload()
        pv_objs.append(pvc_obj.backed_pv_obj)

    # Fetch volume details from pods for the purpose of verification
    node_pv_dict = {}
    for pod_obj in self.pod_objs:
        pod_info = pod_obj.get()
        node = pod_info['spec']['nodeName']
        pvc = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
            'claimName'
        ]
        for pvc_obj in self.pvc_objs:
            if pvc_obj.name == pvc:
                pvc_obj.reload()
                pv = pvc_obj.backed_pv
                break
        if node in node_pv_dict:
            node_pv_dict[node].append(pv)
        else:
            node_pv_dict[node] = [pv]

    # Do setup for running IO on pods
    log.info("Setting up pods for running IO")
    for pod_obj in self.pod_objs:
        pod_obj.workload_setup(storage_type='fs')
    log.info("Setup for running IO is completed on pods")

    # Start IO on each pod. RWX PVC will be used on two pods. So split the
    # size accordingly
    log.info("Starting IO on pods")
    for pod_obj in self.pod_objs:
        if pod_obj.pvc.access_mode == constants.ACCESS_MODE_RWX:
            io_size = int((self.pvc_size - 1) / 2)
        else:
            io_size = self.pvc_size - 1
        pod_obj.run_io(
            storage_type='fs', size=f'{io_size}G',
            fio_filename=f'{pod_obj.name}_io'
        )
    log.info("IO started on all pods.")

    # Start deleting pods
    pod_bulk_delete = executor.submit(delete_pods, self.pod_objs, wait=False)

    if operation_to_disrupt == 'delete_pods':
        ret = wait_for_resource_count_change(
            get_all_pods, initial_num_of_pods, self.namespace, 'decrease'
        )
        assert ret, "Wait timeout: Pods are not being deleted."
        log.info("Pods deletion has started.")
        disruption.delete_resource()

    pod_bulk_delete.result()

    # Verify pods are deleted
    for pod_obj in self.pod_objs:
        assert pod_obj.ocp.wait_for_delete(pod_obj.name, 180), (
            f"Pod {pod_obj.name} is not deleted"
        )
    log.info("Verified: Pods are deleted.")

    # Verify that the mount point is removed from nodes after deleting pod
    for node, pvs in node_pv_dict.items():
        cmd = f'oc debug nodes/{node} -- df'
        df_on_node = run_cmd(cmd)
        for pv in pvs:
            assert pv not in df_on_node, (
                f"{pv} is still present on node {node} after "
                f"deleting the pods."
            )
    log.info(
        "Verified: mount points are removed from nodes after deleting "
        "the pods"
    )

    # Fetch image uuid associated with PVCs
    pvc_uuid_map = {}
    for pvc_obj in self.pvc_objs:
        pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
    log.info("Fetched image uuid associated with each PVC")

    # Start deleting PVCs
    pvc_bulk_delete = executor.submit(delete_pvcs, self.pvc_objs)

    if operation_to_disrupt == 'delete_pvcs':
        ret = wait_for_resource_count_change(
            get_all_pvcs, initial_num_of_pvc, self.namespace, 'decrease'
        )
        assert ret, "Wait timeout: PVCs are not being deleted."
        log.info("PVCs deletion has started.")
        disruption.delete_resource()

    pvcs_deleted = pvc_bulk_delete.result()
    assert pvcs_deleted, "Deletion of PVCs failed."

    # Verify PVCs are deleted
    for pvc_obj in self.pvc_objs:
        assert pvc_obj.ocp.wait_for_delete(pvc_obj.name), (
            f"PVC {pvc_obj.name} is not deleted"
        )
    log.info("Verified: PVCs are deleted.")

    # Verify PVs are deleted
    for pv_obj in pv_objs:
        assert pv_obj.ocp.wait_for_delete(pv_obj.name, 120), (
            f"PV {pv_obj.name} is not deleted"
        )
    log.info("Verified: PVs are deleted.")

    # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
    pool_name = default_ceph_block_pool()
    for pvc_name, uuid in pvc_uuid_map.items():
        if interface == constants.CEPHBLOCKPOOL:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid, pool_name=pool_name
            )
        if interface == constants.CEPHFILESYSTEM:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid
            )
        assert ret, (
            f"Volume associated with PVC {pvc_name} still exists "
            f"in backend"
        )

    # Verify number of pods of type 'resource_to_delete'
    final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
    assert final_num_resource_to_delete == num_of_resource_to_delete, (
        f"Total number of {resource_to_delete} pods is not matching with "
        f"initial value. Total number of pods before deleting a pod: "
        f"{num_of_resource_to_delete}. Total number of pods present now: "
        f"{final_num_resource_to_delete}"
    )

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'])
    log.info("Ceph cluster health is OK")
def test_disruptive_during_pod_pvc_deletion_and_io(
    self, interface, resource_to_delete, setup_base
):
    """
    Delete ceph/rook pod while PVCs deletion, pods deletion and IO are
    progressing
    """
    pvc_objs, pod_objs, rwx_pod_objs = setup_base
    sc_obj = pvc_objs[0].storageclass
    namespace = pvc_objs[0].project.namespace

    num_of_pods_to_delete = 10
    num_of_io_pods = 5

    # Select pods to be deleted
    pods_to_delete = pod_objs[:num_of_pods_to_delete]
    pods_to_delete.extend(
        [pod for pod in rwx_pod_objs for pod_obj in pods_to_delete if (
            pod_obj.pvc == pod.pvc
        )]
    )

    # Select pods to run IO
    io_pods = pod_objs[
        num_of_pods_to_delete:num_of_pods_to_delete + num_of_io_pods
    ]
    io_pods.extend(
        [pod for pod in rwx_pod_objs for pod_obj in io_pods if (
            pod_obj.pvc == pod.pvc
        )]
    )

    # Select pods whose PVCs will be deleted
    pods_for_pvc = pod_objs[num_of_pods_to_delete + num_of_io_pods:]
    pvcs_to_delete = [pod_obj.pvc for pod_obj in pods_for_pvc]
    pods_for_pvc.extend(
        [pod for pod in rwx_pod_objs for pod_obj in pods_for_pvc if (
            pod_obj.pvc == pod.pvc
        )]
    )

    log.info(
        f"{len(pods_to_delete)} pods selected for deletion, of which "
        f"{len(pods_to_delete) - num_of_pods_to_delete} pairs of pods "
        f"share the same RWX PVC"
    )
    log.info(
        f"{len(io_pods)} pods selected for running IO, of which "
        f"{len(io_pods) - num_of_io_pods} pairs of pods share the same "
        f"RWX PVC"
    )
    no_of_rwx_pvcs_delete = len(pods_for_pvc) - len(pvcs_to_delete)
    log.info(
        f"{len(pvcs_to_delete)} PVCs selected for deletion. "
        f"RWO PVCs: {len(pvcs_to_delete) - no_of_rwx_pvcs_delete}, "
        f"RWX PVCs: {no_of_rwx_pvcs_delete}"
    )

    pod_functions = {
        'mds': partial(get_mds_pods),
        'mon': partial(get_mon_pods),
        'mgr': partial(get_mgr_pods),
        'osd': partial(get_osd_pods),
        'rbdplugin': partial(get_plugin_pods, interface=interface),
        'cephfsplugin': partial(get_plugin_pods, interface=interface),
        'cephfsplugin_provisioner': partial(
            get_cephfsplugin_provisioner_pods
        ),
        'rbdplugin_provisioner': partial(get_rbdfsplugin_provisioner_pods),
        'operator': partial(get_operator_pods)
    }
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    executor = ThreadPoolExecutor(
        max_workers=len(pod_objs) + len(rwx_pod_objs)
    )

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(get_all_pods(namespace=namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)['items'])

    # Fetch PV names to verify after deletion
    pv_objs = []
    for pvc_obj in pvcs_to_delete:
        pvc_obj.reload()
        pv_objs.append(pvc_obj.backed_pv_obj)

    # Fetch volume details from pods for the purpose of verification
    node_pv_dict = {}
    for pod_obj in pods_to_delete:
        pod_info = pod_obj.get()
        node = pod_info['spec']['nodeName']
        pvc = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
            'claimName'
        ]
        for pvc_obj in pvc_objs:
            if pvc_obj.name == pvc:
                pvc_obj.reload()
                pv = pvc_obj.backed_pv
                break
        if node in node_pv_dict:
            node_pv_dict[node].append(pv)
        else:
            node_pv_dict[node] = [pv]

    # Fetch image uuid associated with PVCs to be deleted
    pvc_uuid_map = {}
    for pvc_obj in pvcs_to_delete:
        pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
    log.info("Fetched image uuid associated with each PVC")

    # Do setup on pods for running IO
    log.info("Setting up pods for running IO.")
    for pod_obj in pod_objs + rwx_pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info['spec']['volumeMode'] == 'Block':
            storage_type = 'block'
        else:
            storage_type = 'fs'
        executor.submit(pod_obj.workload_setup, storage_type=storage_type)

    # Wait for setup on pods to complete
    for pod_obj in pod_objs + rwx_pod_objs:
        for sample in TimeoutSampler(
            180, 2, getattr, pod_obj, 'wl_setup_done'
        ):
            if sample:
                log.info(
                    f"Setup for running IO is completed on pod "
                    f"{pod_obj.name}."
                )
                break
    log.info("Setup for running IO is completed on all pods.")

    # Start IO on pods having PVCs to delete to load data
    log.info("Starting IO on pods having PVCs to delete.")
    self.run_io_on_pods(pods_for_pvc)
    log.info("IO started on pods having PVCs to delete.")

    log.info("Fetching IO results from the pods having PVCs to delete.")
    for pod_obj in pods_for_pvc:
        get_fio_rw_iops(pod_obj)
    log.info("Verified IO result on pods having PVCs to delete.")

    # Delete pods having PVCs to delete.
    delete_pods(pods_for_pvc)
    for pod_obj in pods_for_pvc:
        pod_obj.ocp.wait_for_delete(pod_obj.name)
    log.info("Verified: Deleted pods which are having PVCs to delete.")

    # Start IO on pods to be deleted
    log.info("Starting IO on pods to be deleted.")
    self.run_io_on_pods(pods_to_delete)
    log.info("IO started on pods to be deleted.")

    # Start deleting PVCs
    pvc_bulk_delete = executor.submit(delete_pvcs, pvcs_to_delete)
    log.info("Started deleting PVCs")

    # Start deleting pods
    pod_bulk_delete = executor.submit(delete_pods, pods_to_delete, wait=False)
    log.info("Started deleting pods")

    # Start IO on IO pods
    self.run_io_on_pods(io_pods)
    log.info("Started IO on IO pods")

    # Verify PVC deletion has started
    pvc_deleting = executor.submit(
        wait_for_resource_count_change, func_to_use=get_all_pvcs,
        previous_num=initial_num_of_pvc, namespace=namespace,
        change_type='decrease', min_difference=1, timeout=30, interval=0.01
    )

    # Verify pod deletion has started
    pod_deleting = executor.submit(
        wait_for_resource_count_change, func_to_use=get_all_pods,
        previous_num=initial_num_of_pods, namespace=namespace,
        change_type='decrease', min_difference=1, timeout=30, interval=0.01
    )

    assert pvc_deleting.result(), "Wait timeout: PVCs are not being deleted."
    log.info("PVCs deletion has started.")

    assert pod_deleting.result(), "Wait timeout: Pods are not being deleted."
    log.info("Pods deletion has started.")

    # Delete pod of type 'resource_to_delete'
    disruption.delete_resource()

    pod_bulk_delete.result()

    # Verify pods are deleted
    for pod_obj in pods_to_delete:
        pod_obj.ocp.wait_for_delete(pod_obj.name, 300)
    log.info("Verified: Pods are deleted.")

    # Verify that the mount point is removed from nodes after deleting pods
    node_pv_mounted = verify_pv_mounted_on_node(node_pv_dict)
    for node, pvs in node_pv_mounted.items():
        assert not pvs, (
            f"PVs {pvs} are still present on node {node} after "
            f"deleting the pods."
        )
    log.info(
        "Verified: mount points are removed from nodes after deleting "
        "the pods"
    )

    pvcs_deleted = pvc_bulk_delete.result()
    assert pvcs_deleted, "Deletion of PVCs failed."

    # Verify PVCs are deleted
    for pvc_obj in pvcs_to_delete:
        pvc_obj.ocp.wait_for_delete(pvc_obj.name)
    log.info("Verified: PVCs are deleted.")

    # Verify PVs are deleted
    for pv_obj in pv_objs:
        pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name, timeout=300)
    log.info("Verified: PVs are deleted.")

    # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
    for pvc_name, uuid in pvc_uuid_map.items():
        if interface == constants.CEPHBLOCKPOOL:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid,
                pool_name=sc_obj.ceph_pool.name
            )
        if interface == constants.CEPHFILESYSTEM:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid
            )
        assert ret, (
            f"Volume associated with PVC {pvc_name} still exists "
            f"in backend"
        )

    log.info("Fetching IO results from the pods.")
    for pod_obj in io_pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get('jobs')[0].get('error')
        assert err_count == 0, (
            f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        )
    log.info("Verified IO result on pods.")

    # Verify number of pods of type 'resource_to_delete'
    final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
    assert final_num_resource_to_delete == num_of_resource_to_delete, (
        f"Total number of {resource_to_delete} pods is not matching with "
        f"initial value. Total number of pods before deleting a pod: "
        f"{num_of_resource_to_delete}. Total number of pods present now: "
        f"{final_num_resource_to_delete}"
    )

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'])
    log.info("Ceph cluster health is OK")
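
# For context: tests like the one above are typically driven by pytest
# parametrization that supplies `interface` and `resource_to_delete`. The
# block below is an illustrative sketch of that shape, not the repository's
# actual parametrize list; valid `resource_to_delete` values are the keys of
# `pod_functions`.
import pytest


@pytest.mark.parametrize(
    argnames=['interface', 'resource_to_delete'],
    argvalues=[
        (constants.CEPHBLOCKPOOL, 'mgr'),
        (constants.CEPHBLOCKPOOL, 'osd'),
        (constants.CEPHFILESYSTEM, 'mds'),
        (constants.CEPHFILESYSTEM, 'mon'),
    ],
)
def test_parametrization_shape_sketch(interface, resource_to_delete):
    """Placeholder illustrating the parametrization shape only."""
    ...
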
def test_pvc_disruptive(
    self, interface, operation_to_disrupt, resource_to_delete,
    multi_pvc_factory, pod_factory
):
    """
    Base function for PVC disruptive tests.
    Deletion of 'resource_to_delete' will be introduced while
    'operation_to_disrupt' is progressing.
    """
    pod_functions = {
        'mds': partial(pod.get_mds_pods),
        'mon': partial(pod.get_mon_pods),
        'mgr': partial(pod.get_mgr_pods),
        'osd': partial(pod.get_osd_pods),
        'rbdplugin': partial(pod.get_plugin_pods, interface=interface),
        'cephfsplugin': partial(pod.get_plugin_pods, interface=interface),
        'cephfsplugin_provisioner': partial(
            pod.get_cephfsplugin_provisioner_pods
        ),
        'rbdplugin_provisioner': partial(
            pod.get_rbdfsplugin_provisioner_pods
        ),
        'operator': partial(pod.get_operator_pods)
    }

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

    num_of_pvc = 12
    namespace = self.proj_obj.namespace

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(pod.get_all_pods(namespace=namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)['items'])

    executor = ThreadPoolExecutor(max_workers=(2 * num_of_pvc))

    DISRUPTION_OPS.set_resource(resource=resource_to_delete)

    access_modes = [constants.ACCESS_MODE_RWO]
    if interface == constants.CEPHFILESYSTEM:
        access_modes.append(constants.ACCESS_MODE_RWX)

    # Modify access_modes list to create rbd `block` type volume with
    # RWX access mode. RWX is not supported in non-block type rbd
    if interface == constants.CEPHBLOCKPOOL:
        access_modes.extend(
            [
                f'{constants.ACCESS_MODE_RWO}-Block',
                f'{constants.ACCESS_MODE_RWX}-Block'
            ]
        )

    # Start creation of PVCs
    bulk_pvc_create = executor.submit(
        multi_pvc_factory, interface=interface,
        project=self.proj_obj, size=5, access_modes=access_modes,
        access_modes_selection='distribute_random',
        status=constants.STATUS_BOUND, num_of_pvc=num_of_pvc,
        wait_each=False
    )

    if operation_to_disrupt == 'create_pvc':
        # Ensure PVCs are being created before deleting the resource
        ret = helpers.wait_for_resource_count_change(
            get_all_pvcs, initial_num_of_pvc, namespace, 'increase'
        )
        assert ret, "Wait timeout: PVCs are not being created."
        logger.info("PVCs creation has started.")
        DISRUPTION_OPS.delete_resource()

    pvc_objs = bulk_pvc_create.result()

    # Confirm that PVCs are Bound
    for pvc_obj in pvc_objs:
        helpers.wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=120
        )
        pvc_obj.reload()
    logger.info("Verified: PVCs are Bound.")

    # Start creating pods
    bulk_pod_create = executor.submit(
        helpers.create_pods, pvc_objs, pod_factory, interface, 2
    )

    if operation_to_disrupt == 'create_pod':
        # Ensure that pods are being created before deleting the resource
        ret = helpers.wait_for_resource_count_change(
            pod.get_all_pods, initial_num_of_pods, namespace, 'increase'
        )
        assert ret, "Wait timeout: Pods are not being created."
        logger.info("Pods creation has started.")
        DISRUPTION_OPS.delete_resource()

    pod_objs = bulk_pod_create.result()

    # Verify pods are Running
    for pod_obj in pod_objs:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING
        )
        pod_obj.reload()
    logger.info("Verified: All pods are Running.")

    # Do setup on pods for running IO
    logger.info("Setting up pods for running IO.")
    for pod_obj in pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info['spec']['volumeMode'] == 'Block':
            storage_type = 'block'
        else:
            storage_type = 'fs'
        executor.submit(pod_obj.workload_setup, storage_type=storage_type)

    # Wait for setup on pods to complete
    for pod_obj in pod_objs:
        for sample in TimeoutSampler(
            180, 2, getattr, pod_obj, 'wl_setup_done'
        ):
            if sample:
                logger.info(
                    f"Setup for running IO is completed on pod "
                    f"{pod_obj.name}."
                )
                break
    logger.info("Setup for running IO is completed on all pods.")

    # Start IO on each pod
    for pod_obj in pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info['spec']['volumeMode'] == 'Block':
            storage_type = 'block'
        else:
            storage_type = 'fs'
        pod_obj.run_io(
            storage_type=storage_type, size='1G', runtime=10,
            fio_filename=f'{pod_obj.name}_io_file1'
        )
    logger.info("FIO started on all pods.")

    if operation_to_disrupt == 'run_io':
        DISRUPTION_OPS.delete_resource()

    logger.info("Fetching FIO results.")
    for pod_obj in pod_objs:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get('jobs')[0].get('error')
        assert err_count == 0, (
            f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        )
    logger.info("Verified FIO result on pods.")

    # Delete pods
    for pod_obj in pod_objs:
        pod_obj.delete(wait=True)
    for pod_obj in pod_objs:
        pod_obj.ocp.wait_for_delete(pod_obj.name)

    # Verify that PVCs are reusable by creating new pods
    pod_objs = helpers.create_pods(pvc_objs, pod_factory, interface, 2)

    # Verify new pods are Running
    for pod_obj in pod_objs:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING
        )
        pod_obj.reload()
    logger.info("Verified: All new pods are Running.")

    # Run IO on each of the new pods
    for pod_obj in pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info['spec']['volumeMode'] == 'Block':
            storage_type = 'block'
        else:
            storage_type = 'fs'
        pod_obj.run_io(
            storage_type=storage_type, size='1G', runtime=10,
            fio_filename=f'{pod_obj.name}_io_file2'
        )

    logger.info("Fetching FIO results from new pods")
    for pod_obj in pod_objs:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get('jobs')[0].get('error')
        assert err_count == 0, (
            f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        )
    logger.info("Verified FIO result on new pods.")

    # Verify number of pods of type 'resource_to_delete'
    final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
    assert final_num_resource_to_delete == num_of_resource_to_delete, (
        f"Total number of {resource_to_delete} pods is not matching with "
        f"initial value. Total number of pods before deleting a pod: "
        f"{num_of_resource_to_delete}. Total number of pods present now: "
        f"{final_num_resource_to_delete}"
    )

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'])
    logger.info("Ceph cluster health is OK")
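
# The volumeMode if/else above is repeated before every workload_setup and
# run_io call in these tests. A small hypothetical helper like this one could
# replace the repetition; it assumes pod_obj.pvc.get() returns the PVC spec
# exactly as accessed above.
def get_storage_type(pod_obj):
    """Return 'block' for raw-block PVCs, 'fs' for filesystem PVCs."""
    pvc_info = pod_obj.pvc.get()
    return 'block' if pvc_info['spec']['volumeMode'] == 'Block' else 'fs'
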
def test_ceph_daemon_kill_during_resource_creation(
    self,
    interface,
    operation_to_disrupt,
    resource_to_delete,
    multi_pvc_factory,
    pod_factory,
):
    """
    Base function for ceph daemon kill disruptive tests.
    Deletion of 'resource_to_delete' daemon will be introduced while
    'operation_to_disrupt' is progressing.
    """
    disruption = disruption_helpers.Disruptions()
    pod_functions = {
        "mds": partial(pod.get_mds_pods),
        "mon": partial(pod.get_mon_pods),
        "mgr": partial(pod.get_mgr_pods),
        "osd": partial(pod.get_osd_pods),
        "rbdplugin": partial(pod.get_plugin_pods, interface=interface),
        "cephfsplugin": partial(pod.get_plugin_pods, interface=interface),
        "cephfsplugin_provisioner": partial(pod.get_cephfsplugin_provisioner_pods),
        "rbdplugin_provisioner": partial(pod.get_rbdfsplugin_provisioner_pods),
        "operator": partial(pod.get_operator_pods),
    }

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

    namespace = self.proj_obj.namespace

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(pod.get_all_pods(namespace=namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

    disruption.set_resource(resource=resource_to_delete)
    disruption.select_daemon()

    access_modes = [constants.ACCESS_MODE_RWO]
    if interface == constants.CEPHFILESYSTEM:
        access_modes.append(constants.ACCESS_MODE_RWX)
        num_of_pvc = 8
        access_mode_dist_ratio = [6, 2]

    # Modify access_modes list to create rbd `block` type volume with
    # RWX access mode. RWX is not supported in non-block type rbd
    if interface == constants.CEPHBLOCKPOOL:
        access_modes.extend(
            [
                f"{constants.ACCESS_MODE_RWO}-Block",
                f"{constants.ACCESS_MODE_RWX}-Block",
            ]
        )
        num_of_pvc = 9
        access_mode_dist_ratio = [4, 3, 2]

    executor = ThreadPoolExecutor(max_workers=(2 * num_of_pvc))

    # Start creation of PVCs
    bulk_pvc_create = executor.submit(
        multi_pvc_factory,
        interface=interface,
        project=self.proj_obj,
        size=8,
        access_modes=access_modes,
        access_modes_selection="distribute_random",
        access_mode_dist_ratio=access_mode_dist_ratio,
        status=constants.STATUS_BOUND,
        num_of_pvc=num_of_pvc,
        wait_each=False,
        timeout=90,
    )

    if operation_to_disrupt == "create_pvc":
        # Ensure PVCs are being created before deleting the resource
        ret = helpers.wait_for_resource_count_change(
            get_all_pvcs, initial_num_of_pvc, namespace, "increase"
        )
        assert ret, "Wait timeout: PVCs are not being created."
        log.info("PVCs creation has started.")
        disruption.kill_daemon()

    pvc_objs = bulk_pvc_create.result()

    # Confirm that PVCs are Bound
    for pvc_obj in pvc_objs:
        helpers.wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=120
        )
        pvc_obj.reload()
    log.info("Verified: PVCs are Bound.")

    # Start creating pods
    bulk_pod_create = executor.submit(
        helpers.create_pods,
        pvc_objs,
        pod_factory,
        interface,
        2,
        nodes=node.get_worker_nodes(),
    )

    if operation_to_disrupt == "create_pod":
        # Ensure that pods are being created before deleting the resource
        ret = helpers.wait_for_resource_count_change(
            pod.get_all_pods, initial_num_of_pods, namespace, "increase"
        )
        assert ret, "Wait timeout: Pods are not being created."
        log.info("Pods creation has started.")
        disruption.kill_daemon()

    pod_objs = bulk_pod_create.result()

    # Verify pods are Running
    for pod_obj in pod_objs:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180
        )
        pod_obj.reload()
    log.info("Verified: All pods are Running.")

    # Do setup on pods for running IO
    log.info("Setting up pods for running IO.")
    for pod_obj in pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info["spec"]["volumeMode"] == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        executor.submit(pod_obj.workload_setup, storage_type=storage_type)

    # Wait for setup on pods to complete
    for pod_obj in pod_objs:
        log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
        for sample in TimeoutSampler(360, 2, getattr, pod_obj, "wl_setup_done"):
            if sample:
                log.info(
                    f"Setup for running IO is completed on pod "
                    f"{pod_obj.name}."
                )
                break
    log.info("Setup for running IO is completed on all pods.")

    # Start IO on each pod
    for pod_obj in pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info["spec"]["volumeMode"] == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        pod_obj.run_io(
            storage_type=storage_type,
            size="2G",
            runtime=30,
            fio_filename=f"{pod_obj.name}_io_file1",
        )
    log.info("FIO started on all pods.")

    if operation_to_disrupt == "run_io":
        disruption.kill_daemon()

    log.info("Fetching FIO results.")
    for pod_obj in pod_objs:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        log.info(f"FIO succeeded on pod {pod_obj.name}")
    log.info("Verified FIO result on pods.")

    # Delete pods
    for pod_obj in pod_objs:
        pod_obj.delete(wait=True)
    for pod_obj in pod_objs:
        pod_obj.ocp.wait_for_delete(pod_obj.name)

    # Verify that PVCs are reusable by creating new pods
    pod_objs = helpers.create_pods(
        pvc_objs, pod_factory, interface, 2, nodes=node.get_worker_nodes()
    )

    # Verify new pods are Running
    for pod_obj in pod_objs:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING
        )
        pod_obj.reload()
    log.info("Verified: All new pods are Running.")

    # Run IO on each of the new pods
    for pod_obj in pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info["spec"]["volumeMode"] == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=10,
            fio_filename=f"{pod_obj.name}_io_file2",
        )

    log.info("Fetching FIO results from new pods")
    for pod_obj in pod_objs:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        log.info(f"FIO succeeded on pod {pod_obj.name}")
    log.info("Verified FIO result on new pods.")

    # Verify number of pods of type 'resource_to_delete'
    final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
    assert final_num_resource_to_delete == num_of_resource_to_delete, (
        f"Total number of {resource_to_delete} pods is not matching with "
        f"initial value. Total number of pods before deleting a pod: "
        f"{num_of_resource_to_delete}. Total number of pods present now: "
        f"{final_num_resource_to_delete}"
    )

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
    log.info("Ceph cluster health is OK")
def test_disruptive_during_pod_pvc_deletion_and_io(
    self, interface, resource_to_delete, setup_base
):
    """
    Delete ceph/rook pod while PVCs deletion, pods deletion and IO are
    progressing
    """
    # If the platform is Managed Services, the ceph pods will be present in
    # the provider cluster and the consumer cluster will be the primary
    # cluster. Switching to the provider cluster is required to get ceph pods.
    switch_to_provider_needed = (
        config.ENV_DATA["platform"].lower() in constants.MANAGED_SERVICE_PLATFORMS
    ) and (resource_to_delete in ["mds", "mon", "mgr", "osd"])

    pvc_objs, pod_objs, rwx_pod_objs = setup_base
    namespace = pvc_objs[0].project.namespace

    num_of_pods_to_delete = 3
    num_of_io_pods = 1

    # Select pods to be deleted
    pods_to_delete = pod_objs[:num_of_pods_to_delete]
    pods_to_delete.extend(
        [
            pod
            for pod in rwx_pod_objs
            for pod_obj in pods_to_delete
            if (pod_obj.pvc == pod.pvc)
        ]
    )

    # Select pods to run IO
    io_pods = pod_objs[
        num_of_pods_to_delete:num_of_pods_to_delete + num_of_io_pods
    ]
    io_pods.extend(
        [
            pod
            for pod in rwx_pod_objs
            for pod_obj in io_pods
            if (pod_obj.pvc == pod.pvc)
        ]
    )

    # Select pods whose PVCs will be deleted
    pods_for_pvc = pod_objs[num_of_pods_to_delete + num_of_io_pods:]
    pvcs_to_delete = [pod_obj.pvc for pod_obj in pods_for_pvc]
    pods_for_pvc.extend(
        [
            pod
            for pod in rwx_pod_objs
            for pod_obj in pods_for_pvc
            if (pod_obj.pvc == pod.pvc)
        ]
    )

    io_pods = [
        pod_obj
        for pod_obj in io_pods
        if pod_obj.pvc
        in select_unique_pvcs([pod_obj.pvc for pod_obj in io_pods])
    ]

    log.info(
        f"{len(pods_to_delete)} pods selected for deletion, of which "
        f"{len(pods_to_delete) - num_of_pods_to_delete} pairs of pods "
        f"share the same RWX PVC"
    )
    log.info(
        f"{len(io_pods)} pods selected for running IO, of which one "
        f"pair of pods shares the same RWX PVC"
    )
    no_of_rwx_pvcs_delete = len(pods_for_pvc) - len(pvcs_to_delete)
    log.info(
        f"{len(pvcs_to_delete)} PVCs selected for deletion. "
        f"RWO PVCs: {len(pvcs_to_delete) - no_of_rwx_pvcs_delete}, "
        f"RWX PVCs: {no_of_rwx_pvcs_delete}"
    )

    if switch_to_provider_needed:
        # Switch to provider cluster context to get ceph pods
        config.switch_to_provider()

    pod_functions = {
        "mds": partial(get_mds_pods),
        "mon": partial(get_mon_pods),
        "mgr": partial(get_mgr_pods),
        "osd": partial(get_osd_pods),
        "rbdplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin_provisioner": partial(get_cephfsplugin_provisioner_pods),
        "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
        "operator": partial(get_operator_pods),
    }
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    executor = ThreadPoolExecutor(max_workers=len(pod_objs) + len(rwx_pod_objs))

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

    if switch_to_provider_needed:
        # Switch back to consumer cluster context to access PVCs and pods
        config.switch_to_consumer(self.consumer_cluster_index)

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(get_all_pods(namespace=namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

    # Fetch PV names to verify after deletion
    pv_objs = []
    for pvc_obj in pvcs_to_delete:
        pvc_obj.reload()
        pv_objs.append(pvc_obj.backed_pv_obj)

    # Fetch volume details from pods for the purpose of verification
    node_pv_dict = {}
    for pod_obj in pods_to_delete:
        pod_info = pod_obj.get()
        node = pod_info["spec"]["nodeName"]
        pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"][
            "claimName"
        ]
        for pvc_obj in pvc_objs:
            if pvc_obj.name == pvc:
                pvc_obj.reload()
                pv = pvc_obj.backed_pv
                break
        if node in node_pv_dict:
            node_pv_dict[node].append(pv)
        else:
            node_pv_dict[node] = [pv]

    # Fetch image uuid associated with PVCs to be deleted
    pvc_uuid_map = {}
    for pvc_obj in pvcs_to_delete:
        pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
    log.info("Fetched image uuid associated with each PVC")

    # Do setup on pods for running IO
    log.info("Setting up pods for running IO.")
    for pod_obj in pod_objs + rwx_pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info["spec"]["volumeMode"] == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        executor.submit(pod_obj.workload_setup, storage_type=storage_type)

    # Wait for setup on pods to complete
    for pod_obj in pod_objs + rwx_pod_objs:
        log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
        for sample in TimeoutSampler(360, 2, getattr, pod_obj, "wl_setup_done"):
            if sample:
                log.info(
                    f"Setup for running IO is completed on pod "
                    f"{pod_obj.name}."
                )
                break
    log.info("Setup for running IO is completed on all pods.")

    # Start IO on pods having PVCs to delete to load data
    pods_for_pvc_io = [
        pod_obj
        for pod_obj in pods_for_pvc
        if pod_obj.pvc
        in select_unique_pvcs([pod_obj.pvc for pod_obj in pods_for_pvc])
    ]
    log.info("Starting IO on pods having PVCs to delete.")
    self.run_io_on_pods(pods_for_pvc_io)
    log.info("IO started on pods having PVCs to delete.")

    log.info("Fetching IO results from the pods having PVCs to delete.")
    for pod_obj in pods_for_pvc_io:
        get_fio_rw_iops(pod_obj)
    log.info("Verified IO result on pods having PVCs to delete.")

    # Delete pods having PVCs to delete.
    delete_pods(pods_for_pvc)
    for pod_obj in pods_for_pvc:
        pod_obj.ocp.wait_for_delete(pod_obj.name)
    log.info("Verified: Deleted pods which are having PVCs to delete.")

    # Start IO on pods to be deleted
    pods_to_delete_io = [
        pod_obj
        for pod_obj in pods_to_delete
        if pod_obj.pvc
        in select_unique_pvcs([pod_obj.pvc for pod_obj in pods_to_delete])
    ]
    log.info("Starting IO on selected pods to be deleted.")
    self.run_io_on_pods(pods_to_delete_io)
    log.info("IO started on selected pods to be deleted.")

    # Start deleting PVCs
    pvc_bulk_delete = executor.submit(delete_pvcs, pvcs_to_delete)
    log.info("Started deleting PVCs")

    # Start deleting pods
    pod_bulk_delete = executor.submit(delete_pods, pods_to_delete, wait=False)
    log.info("Started deleting pods")

    # Start IO on IO pods
    self.run_io_on_pods(io_pods)
    log.info("Started IO on IO pods")

    # Verify PVC deletion has started
    pvc_deleting = executor.submit(
        wait_for_resource_count_change,
        func_to_use=get_all_pvcs,
        previous_num=initial_num_of_pvc,
        namespace=namespace,
        change_type="decrease",
        min_difference=1,
        timeout=30,
        interval=0.01,
    )

    # Verify pod deletion has started
    pod_deleting = executor.submit(
        wait_for_resource_count_change,
        func_to_use=get_all_pods,
        previous_num=initial_num_of_pods,
        namespace=namespace,
        change_type="decrease",
        min_difference=1,
        timeout=30,
        interval=0.01,
    )

    assert pvc_deleting.result(), "Wait timeout: PVCs are not being deleted."
    log.info("PVCs deletion has started.")

    assert pod_deleting.result(), "Wait timeout: Pods are not being deleted."
    log.info("Pods deletion has started.")

    # Delete pod of type 'resource_to_delete'
    disruption.delete_resource()

    pod_bulk_delete.result()

    # Verify pods are deleted
    for pod_obj in pods_to_delete:
        pod_obj.ocp.wait_for_delete(pod_obj.name, 300)
    log.info("Verified: Pods are deleted.")

    # Verify that the mount point is removed from nodes after deleting pods
    node_pv_mounted = verify_pv_mounted_on_node(node_pv_dict)
    for node, pvs in node_pv_mounted.items():
        assert not pvs, (
            f"PVs {pvs} are still present on node {node} after "
            f"deleting the pods."
        )
    log.info(
        "Verified: mount points are removed from nodes after deleting "
        "the pods"
    )

    log.info("Fetching IO results from the pods.")
    for pod_obj in io_pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
    log.info("Verified IO result on pods.")

    pvcs_deleted = pvc_bulk_delete.result()
    assert pvcs_deleted, "Deletion of PVCs failed."

    # Verify PVCs are deleted
    for pvc_obj in pvcs_to_delete:
        pvc_obj.ocp.wait_for_delete(pvc_obj.name)
    log.info("Verified: PVCs are deleted.")

    # Verify PVs are deleted
    for pv_obj in pv_objs:
        pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name, timeout=300)
    log.info("Verified: PVs are deleted.")

    # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
    pool_name = default_ceph_block_pool()
    for pvc_name, uuid in pvc_uuid_map.items():
        if interface == constants.CEPHBLOCKPOOL:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid, pool_name=pool_name
            )
        if interface == constants.CEPHFILESYSTEM:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid
            )
        assert ret, (
            f"Volume associated with PVC {pvc_name} still exists "
            f"in backend"
        )

    if switch_to_provider_needed:
        # Switch to provider cluster context to get ceph pods
        config.switch_to_provider()

    # Verify number of pods of type 'resource_to_delete'
    final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
    assert final_num_resource_to_delete == num_of_resource_to_delete, (
        f"Total number of {resource_to_delete} pods is not matching with "
        f"initial value. Total number of pods before deleting a pod: "
        f"{num_of_resource_to_delete}. Total number of pods present now: "
        f"{final_num_resource_to_delete}"
    )

    if switch_to_provider_needed:
        # Switch back to consumer cluster context
        config.switch_to_consumer(self.consumer_cluster_index)

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
    log.info("Ceph cluster health is OK")
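
# The provider/consumer switching above leaves the config pointing at the
# provider cluster if an exception fires between switch_to_provider() and
# switch_to_consumer(). A hypothetical context manager built on the same
# config calls would make the switch exception-safe; only the two switch
# calls and `consumer_cluster_index` are taken from the code above.
from contextlib import contextmanager


@contextmanager
def provider_cluster_context(consumer_cluster_index, needed=True):
    """Temporarily switch to the provider cluster context if 'needed'."""
    if needed:
        config.switch_to_provider()
    try:
        yield
    finally:
        if needed:
            config.switch_to_consumer(consumer_cluster_index)


# Usage sketch:
#     with provider_cluster_context(
#         self.consumer_cluster_index, switch_to_provider_needed
#     ):
#         num_of_resource_to_delete = len(pod_functions[resource_to_delete]())
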
def uninstall_cluster_logging():
    """
    Function to uninstall cluster-logging from the cluster.
    Deletes the projects "openshift-logging" and
    "openshift-operators-redhat".
    """
    # Validating the pods before deleting the instance
    pod_list = get_all_pods(namespace=constants.OPENSHIFT_LOGGING_NAMESPACE)
    for pod in pod_list:
        logger.info(
            f"Pods running in the openshift-logging namespace: {pod.name}"
        )

    # Excluding cluster-logging-operator from pod_list and getting pod names
    pod_names_list = [
        pod.name for pod in pod_list
        if not pod.name.startswith('cluster-logging-operator')
    ]

    # Deleting the clusterlogging instance
    clusterlogging_obj = ocp.OCP(
        kind=constants.CLUSTER_LOGGING,
        namespace=constants.OPENSHIFT_LOGGING_NAMESPACE
    )
    assert clusterlogging_obj.delete(resource_name='instance')
    check_pod_vanished(pod_names_list)

    # Deleting the PVCs
    pvc_obj = ocp.OCP(
        kind=constants.PVC, namespace=constants.OPENSHIFT_LOGGING_NAMESPACE
    )
    pvc_list = get_all_pvcs(namespace=constants.OPENSHIFT_LOGGING_NAMESPACE)
    # Delete every PVC in the namespace and wait for each deletion
    for pvc in pvc_list['items']:
        pvc_name = pvc['metadata']['name']
        pvc_obj.delete(resource_name=pvc_name)
        pvc_obj.wait_for_delete(resource_name=pvc_name)

    # Deleting the RBAC permission set
    rbac_role = ocp.OCP(
        kind=constants.ROLE,
        namespace=constants.OPENSHIFT_OPERATORS_REDHAT_NAMESPACE
    )
    rbac_role.delete(yaml_file=constants.EO_RBAC_YAML)

    # Deleting the projects
    openshift_logging_namespace = ocp.OCP(
        kind=constants.NAMESPACES,
        namespace=constants.OPENSHIFT_LOGGING_NAMESPACE
    )
    openshift_operators_redhat_namespace = ocp.OCP(
        kind=constants.NAMESPACES,
        namespace=constants.OPENSHIFT_OPERATORS_REDHAT_NAMESPACE
    )

    if openshift_logging_namespace.get():
        assert openshift_logging_namespace.delete(
            resource_name=constants.OPENSHIFT_LOGGING_NAMESPACE
        )
        logger.info(
            "The namespace openshift-logging got deleted successfully"
        )

    if openshift_operators_redhat_namespace.get():
        assert openshift_operators_redhat_namespace.delete(
            resource_name=constants.OPENSHIFT_OPERATORS_REDHAT_NAMESPACE
        )
        logger.info(
            "The project openshift-operators-redhat got deleted successfully"
        )
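
# `check_pod_vanished` is called above but not defined in this excerpt. A
# minimal sketch of the behavior the call site implies, assuming it polls the
# logging namespace until none of the named pods remain; the real helper and
# its timeouts may differ.
def check_pod_vanished_sketch(pod_names_list, timeout=300):
    """Wait until none of the named pods exist in the logging namespace."""
    for pods in TimeoutSampler(
        timeout, 5, get_all_pods,
        namespace=constants.OPENSHIFT_LOGGING_NAMESPACE
    ):
        remaining = {p.name for p in pods} & set(pod_names_list)
        if not remaining:
            return
        logger.info(f"Waiting for pods to vanish: {remaining}")
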
def disruptive_base(self, operation_to_disrupt, resource_to_delete):
    """
    Base function for PVC disruptive tests.
    Deletion of 'resource_to_delete' will be introduced while
    'operation_to_disrupt' is progressing.
    """
    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(pod.get_all_pods(namespace=self.namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=self.namespace)['items'])

    executor = ThreadPoolExecutor(max_workers=1)

    DISRUPTION_OPS.set_resource(resource=resource_to_delete)

    # Start creation of multiple PVCs. Create 5 PVCs
    bulk_pvc_create = executor.submit(
        helpers.create_multiple_pvcs, sc_name=self.sc_obj.name,
        namespace=self.namespace, number_of_pvc=5
    )

    if operation_to_disrupt == 'create_pvc':
        # Ensure PVCs are being created before deleting the resource
        ret = self.verify_resource_creation(get_all_pvcs, initial_num_of_pvc)
        assert ret, "Wait timeout: PVCs are not being created."
        logging.info("PVCs creation has started.")
        DISRUPTION_OPS.delete_resource()

    pvc_objs = bulk_pvc_create.result()

    # Verify PVCs are Bound
    for pvc_obj in pvc_objs:
        assert pvc_obj.ocp.wait_for_resource(
            condition=constants.STATUS_BOUND, resource_name=pvc_obj.name,
            timeout=120
        ), (
            f"Wait timeout: PVC {pvc_obj.name} is not in 'Bound' status "
            f"even after 120 seconds."
        )
    logging.info("Verified: PVCs are Bound.")

    # Start creating pods
    bulk_pod_create = executor.submit(
        helpers.create_pods, pvc_objs_list=pvc_objs,
        interface_type=self.interface, wait=False, namespace=self.namespace
    )

    if operation_to_disrupt == 'create_pod':
        # Ensure that pods are being created before deleting the resource
        ret = self.verify_resource_creation(
            pod.get_all_pods, initial_num_of_pods
        )
        assert ret, "Wait timeout: Pods are not being created."
        logging.info("Pods creation has started.")
        DISRUPTION_OPS.delete_resource()

    pod_objs = bulk_pod_create.result()

    # Verify pods are Running
    for pod_obj in pod_objs:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING, resource_name=pod_obj.name,
            timeout=120
        ), (
            f"Wait timeout: Pod {pod_obj.name} is not in 'Running' "
            f"state even after 120 seconds."
        )
    logging.info("Verified: All pods are Running.")

    # Start IO on each pod
    for pod_obj in pod_objs:
        pod_obj.run_io(
            storage_type='fs', size='1G', runtime=10,
            fio_filename='fio-file1'
        )
    logging.info("FIO started on all pods.")

    if operation_to_disrupt == 'run_io':
        DISRUPTION_OPS.delete_resource()

    logging.info("Fetching FIO results.")
    for pod_obj in pod_objs:
        fio_result = pod_obj.get_fio_results()
        logging.info(f"IOPs after FIO on pod {pod_obj.name}:")
        logging.info(
            f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}"
        )
        logging.info(
            f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}"
        )
    logging.info("Verified FIO result on pods.")

    # Delete pods
    for pod_obj in pod_objs:
        pod_obj.delete(wait=True)

    # Verify that PVCs are reusable by creating new pods
    create_pods = executor.submit(
        helpers.create_pods, pvc_objs_list=pvc_objs,
        interface_type=self.interface, wait=False, namespace=self.namespace
    )
    pod_objs = create_pods.result()

    # Verify new pods are Running
    for pod_obj in pod_objs:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING, resource_name=pod_obj.name,
            timeout=120
        ), (
            f"Wait timeout: Pod {pod_obj.name} is not in 'Running' "
            f"state even after 120 seconds."
        )
    logging.info("Verified: All new pods are Running.")

    # Run IO on each of the new pods
    for pod_obj in pod_objs:
        pod_obj.run_io(
            storage_type='fs', size='1G', runtime=10,
            fio_filename='fio-file2'
        )

    logging.info("Fetching FIO results from new pods")
    for pod_obj in pod_objs:
        fio_result = pod_obj.get_fio_results()
        logging.info(f"IOPs after FIO on pod {pod_obj.name}:")
        logging.info(
            f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}"
        )
        logging.info(
            f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}"
        )
    logging.info("Verified FIO result on new pods.")

    # Delete new pods
    for pod_obj in pod_objs:
        pod_obj.delete()

    # Delete PVCs
    for pvc_obj in pvc_objs:
        pvc_obj.delete()
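
# The IOPs logging block above appears twice (initial pods and new pods). A
# hypothetical helper to factor it out, assuming the fio JSON layout used
# above (jobs[0]['read'/'write']['iops']).
def log_fio_iops(pod_obj):
    """Log read/write IOPs from a completed fio run on the given pod."""
    fio_result = pod_obj.get_fio_results()
    logging.info(f"IOPs after FIO on pod {pod_obj.name}:")
    logging.info(f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}")
    logging.info(f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}")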