def test_daemon_kill_during_pvc_pod_creation_and_io(
    self, interface, resource_name, setup, multi_pvc_factory, pod_factory
):
    """
    Kill the 'resource_name' daemon while PVC creation, pod creation
    and IO are in progress.
    """
    num_of_new_pvcs = 5
    pvc_objs, io_pods, pvc_objs_new_pods, access_modes = setup
    proj_obj = pvc_objs[0].project
    storageclass = pvc_objs[0].storageclass

    pod_functions = {
        'mds': partial(get_mds_pods),
        'mon': partial(get_mon_pods),
        'mgr': partial(get_mgr_pods),
        'osd': partial(get_osd_pods),
        'rbdplugin': partial(get_plugin_pods, interface=interface),
        'cephfsplugin': partial(get_plugin_pods, interface=interface),
        'cephfsplugin_provisioner': partial(
            get_cephfsplugin_provisioner_pods
        ),
        'rbdplugin_provisioner': partial(
            get_rbdfsplugin_provisioner_pods
        ),
        'operator': partial(get_operator_pods)
    }

    executor = ThreadPoolExecutor(max_workers=len(io_pods))

    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_name)

    # Get number of pods of type 'resource_name'
    resource_pods_num = len(pod_functions[resource_name]())

    # Do setup for running IO on pods
    log.info("Setting up pods for running IO")
    for pod_obj in io_pods:
        if pod_obj.pvc.volume_mode == 'Block':
            storage_type = 'block'
        else:
            storage_type = 'fs'
        executor.submit(pod_obj.workload_setup, storage_type=storage_type)

    # Wait for setup on pods to complete
    for pod_obj in io_pods:
        log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
        for sample in TimeoutSampler(
            180, 2, getattr, pod_obj, 'wl_setup_done'
        ):
            if sample:
                log.info(
                    f"Setup for running IO is completed on pod "
                    f"{pod_obj.name}."
                )
                break
    log.info("Setup for running IO is completed on all pods")

    # Select daemon to be killed
    disruption.select_daemon()

    # Start creating new pods
    log.info("Start creating new pods.")
    bulk_pod_create = executor.submit(
        helpers.create_pods, pvc_objs_new_pods, pod_factory, interface, 2
    )

    # Start creation of new PVCs
    log.info("Start creating new PVCs.")
    bulk_pvc_create = executor.submit(
        multi_pvc_factory, interface=interface,
        project=proj_obj, storageclass=storageclass, size=self.pvc_size,
        access_modes=access_modes,
        access_modes_selection='distribute_random',
        status="", num_of_pvc=num_of_new_pvcs, wait_each=False
    )

    # Start IO on each pod
    log.info("Start IO on pods")
    for pod_obj in io_pods:
        if pod_obj.pvc.volume_mode == 'Block':
            storage_type = 'block'
        else:
            storage_type = 'fs'
        pod_obj.run_io(
            storage_type=storage_type, size='1G', runtime=10,
            fio_filename=f'{pod_obj.name}_io_file1'
        )
    log.info("IO started on all pods.")

    # Kill daemon
    disruption.kill_daemon()

    # Get result of PVC creation as list of PVC objects
    pvc_objs_new = bulk_pvc_create.result()

    # Confirm PVCs are Bound
    for pvc_obj in pvc_objs_new:
        helpers.wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=180
        )
        pvc_obj.reload()
    log.info("Verified: New PVCs are Bound.")

    # Get result of pod creation as list of Pod objects
    pod_objs_new = bulk_pod_create.result()

    # Verify new pods are Running
    for pod_obj in pod_objs_new:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING
        )
        pod_obj.reload()
    log.info("Verified: All new pods are Running.")

    # Verify IO
    log.info("Fetching IO results from IO pods.")
    for pod_obj in io_pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get('jobs')[0].get('error')
        assert err_count == 0, (
            f"FIO error on pod {pod_obj.name}. "
            f"FIO result: {fio_result}"
        )
        log.info(f"IOPs after FIO on pod {pod_obj.name}:")
        log.info(
            f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}"
        )
        log.info(
            f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}"
        )
    log.info("Verified IO result on IO pods.")

    all_pod_objs = io_pods + pod_objs_new

    # Fetch volume details from pods for the purpose of verification
    node_pv_dict = {}
    for pod in all_pod_objs:
        pod_info = pod.get()
        node = pod_info['spec']['nodeName']
        pvc = pod_info['spec']['volumes'][0][
            'persistentVolumeClaim'
        ]['claimName']
        for pvc_obj in pvc_objs:
            if pvc_obj.name == pvc:
                pvc_obj.reload()
                pv = pvc_obj.backed_pv
                break
        if node in node_pv_dict:
            node_pv_dict[node].append(pv)
        else:
            node_pv_dict[node] = [pv]

    # Delete pods
    for pod_obj in all_pod_objs:
        pod_obj.delete(wait=False)

    # Verify pods are deleted
    for pod_obj in all_pod_objs:
        pod_obj.ocp.wait_for_delete(resource_name=pod_obj.name)

    # Verify number of 'resource_name' type pods
    final_resource_pods_num = len(pod_functions[resource_name]())
    assert final_resource_pods_num == resource_pods_num, (
        f"Total number of {resource_name} pods does not match the "
        f"initial value. Total number of pods before daemon kill: "
        f"{resource_pods_num}. Total number of pods present now: "
        f"{final_resource_pods_num}"
    )

    # Verify volumes are unmapped from nodes after deleting the pods
    node_pv_mounted = helpers.verify_pv_mounted_on_node(node_pv_dict)
    for node, pvs in node_pv_mounted.items():
        assert not pvs, (
            f"PVs {pvs} are still present on node {node} after "
            f"deleting the pods."
        )
    log.info(
        "Verified: mount points are removed from nodes after deleting "
        "the pods"
    )

    # Set volume mode on PVC objects
    for pvc_obj in pvc_objs_new:
        pvc_info = pvc_obj.get()
        setattr(pvc_obj, 'volume_mode', pvc_info['spec']['volumeMode'])

    # Verify that PVCs are reusable by creating new pods
    all_pvc_objs = pvc_objs + pvc_objs_new
    pod_objs_re = helpers.create_pods(
        all_pvc_objs, pod_factory, interface, 2
    )

    # Verify pods are Running
    for pod_obj in pod_objs_re:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING
        )
        pod_obj.reload()
    log.info("Successfully created new pods using all PVCs.")

    # Run IO on each of the newly created pods
    for pod_obj in pod_objs_re:
        if pod_obj.pvc.volume_mode == 'Block':
            storage_type = 'block'
        else:
            storage_type = 'fs'
        pod_obj.run_io(
            storage_type=storage_type, size='1G', runtime=10,
            fio_filename=f'{pod_obj.name}_io_file2'
        )

    log.info("Fetching IO results from newly created pods")
    for pod_obj in pod_objs_re:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get('jobs')[0].get('error')
        assert err_count == 0, (
            f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        )
        log.info(f"IOPs after FIO on pod {pod_obj.name}:")
        log.info(
            f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}"
        )
        log.info(
            f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}"
        )
    log.info("Verified IO result on newly created pods.")
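
# Both tests in this class build the same node -> PV mapping before
# deleting pods. A minimal sketch of a shared helper that could replace
# those inline loops; the name '_map_nodes_to_pvs' is hypothetical and
# is not referenced by the tests, which keep the original inline code.
def _map_nodes_to_pvs(self, pod_objs, pvc_objs):
    """
    Map each node to the names of the PVs backing the given pods.

    Args:
        pod_objs (list): Pod objects to inspect
        pvc_objs (list): PVC objects to match against each pod's claim

    Returns:
        dict: node name -> list of backed PV names
    """
    node_pv_dict = {}
    for pod_obj in pod_objs:
        pod_info = pod_obj.get()
        node = pod_info['spec']['nodeName']
        claim = pod_info['spec']['volumes'][0][
            'persistentVolumeClaim'
        ]['claimName']
        # Find the PVC backing this pod and record its PV under the node
        for pvc_obj in pvc_objs:
            if pvc_obj.name == claim:
                pvc_obj.reload()
                node_pv_dict.setdefault(node, []).append(
                    pvc_obj.backed_pv
                )
                break
    return node_pv_dict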

def test_disruptive_during_pod_pvc_deletion_and_io(
    self, interface, resource_to_delete, setup_base
):
    """
    Delete a ceph/rook pod while PVC deletion, pod deletion and IO are
    in progress.
    """
    pvc_objs, pod_objs, rwx_pod_objs = setup_base
    sc_obj = pvc_objs[0].storageclass
    namespace = pvc_objs[0].project.namespace

    num_of_pods_to_delete = 10
    num_of_io_pods = 5

    # Select pods to be deleted
    pods_to_delete = pod_objs[:num_of_pods_to_delete]
    pods_to_delete.extend(
        [pod for pod in rwx_pod_objs for pod_obj in pods_to_delete if (
            pod_obj.pvc == pod.pvc
        )]
    )

    # Select pods to run IO
    io_pods = pod_objs[
        num_of_pods_to_delete:num_of_pods_to_delete + num_of_io_pods
    ]
    io_pods.extend(
        [pod for pod in rwx_pod_objs for pod_obj in io_pods if (
            pod_obj.pvc == pod.pvc
        )]
    )

    # Select pods whose PVCs will be deleted
    pods_for_pvc = pod_objs[num_of_pods_to_delete + num_of_io_pods:]
    pvcs_to_delete = [pod_obj.pvc for pod_obj in pods_for_pvc]
    pods_for_pvc.extend(
        [pod for pod in rwx_pod_objs for pod_obj in pods_for_pvc if (
            pod_obj.pvc == pod.pvc
        )]
    )

    log.info(
        f"{len(pods_to_delete)} pods selected for deletion, of which "
        f"{len(pods_to_delete) - num_of_pods_to_delete} pairs of pods "
        f"share the same RWX PVC"
    )
    log.info(
        f"{len(io_pods)} pods selected for running IO, of which "
        f"{len(io_pods) - num_of_io_pods} pairs of pods share the same "
        f"RWX PVC"
    )
    no_of_rwx_pvcs_delete = len(pods_for_pvc) - len(pvcs_to_delete)
    log.info(
        f"{len(pvcs_to_delete)} PVCs selected for deletion. "
        f"RWO PVCs: {len(pvcs_to_delete) - no_of_rwx_pvcs_delete}, "
        f"RWX PVCs: {no_of_rwx_pvcs_delete}"
    )

    pod_functions = {
        'mds': get_mds_pods, 'mon': get_mon_pods, 'mgr': get_mgr_pods,
        'osd': get_osd_pods
    }
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    executor = ThreadPoolExecutor(max_workers=len(pod_objs))

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(get_all_pods(namespace=namespace))
    initial_num_of_pvc = len(
        get_all_pvcs(namespace=namespace)['items']
    )

    # Fetch PV names to verify after deletion
    pv_objs = []
    for pvc_obj in pvcs_to_delete:
        pvc_obj.reload()
        pv_objs.append(pvc_obj.backed_pv_obj)

    # Fetch volume details from pods for the purpose of verification
    node_pv_dict = {}
    for pod_obj in pods_to_delete:
        pod_info = pod_obj.get()
        node = pod_info['spec']['nodeName']
        pvc = pod_info['spec']['volumes'][0][
            'persistentVolumeClaim'
        ]['claimName']
        for pvc_obj in pvc_objs:
            if pvc_obj.name == pvc:
                pvc_obj.reload()
                pv = pvc_obj.backed_pv
                break
        if node in node_pv_dict:
            node_pv_dict[node].append(pv)
        else:
            node_pv_dict[node] = [pv]

    # Fetch image uuid associated with PVCs to be deleted
    pvc_uuid_map = {}
    for pvc_obj in pvcs_to_delete:
        pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
    log.info("Fetched image uuid associated with each PVC")

    # Do setup on pods for running IO
    log.info("Setting up pods for running IO.")
    for pod_obj in pod_objs + rwx_pod_objs:
        executor.submit(pod_obj.workload_setup, storage_type='fs')

    # Wait for setup on pods to complete
    for pod_obj in pod_objs + rwx_pod_objs:
        for sample in TimeoutSampler(
            180, 2, getattr, pod_obj, 'wl_setup_done'
        ):
            if sample:
                log.info(
                    f"Setup for running IO is completed on pod "
                    f"{pod_obj.name}."
                )
                break
    log.info("Setup for running IO is completed on all pods.")

    # Start IO on pods having PVCs to delete to load data
    log.info("Starting IO on pods having PVCs to delete.")
    self.run_io_on_pods(pods_for_pvc)
    log.info("IO started on pods having PVCs to delete.")

    log.info("Fetching IO results from the pods having PVCs to delete.")
    for pod_obj in pods_for_pvc:
        get_fio_rw_iops(pod_obj)
    log.info("Verified IO result on pods having PVCs to delete.")

    # Delete pods having PVCs to delete
    assert self.delete_pods(pods_for_pvc), (
        "Couldn't delete the pods whose PVCs are to be deleted."
    )
    for pod_obj in pods_for_pvc:
        pod_obj.ocp.wait_for_delete(pod_obj.name)
    log.info("Verified: Deleted the pods whose PVCs are to be deleted.")

    # Start IO on pods to be deleted
    log.info("Starting IO on pods to be deleted.")
    self.run_io_on_pods(pods_to_delete)
    log.info("IO started on pods to be deleted.")

    # Start deleting PVCs
    pvc_bulk_delete = executor.submit(delete_pvcs, pvcs_to_delete)
    log.info("Started deleting PVCs")

    # Start deleting pods
    pod_bulk_delete = executor.submit(self.delete_pods, pods_to_delete)
    log.info("Started deleting pods")

    # Start IO on IO pods
    self.run_io_on_pods(io_pods)
    log.info("Started IO on IO pods")

    # Verify PVC deletion has started
    pvc_deleting = executor.submit(
        wait_for_resource_count_change, func_to_use=get_all_pvcs,
        previous_num=initial_num_of_pvc, namespace=namespace,
        change_type='decrease', min_difference=1, timeout=30,
        interval=0.01
    )

    # Verify pod deletion has started
    pod_deleting = executor.submit(
        wait_for_resource_count_change, func_to_use=get_all_pods,
        previous_num=initial_num_of_pods, namespace=namespace,
        change_type='decrease', min_difference=1, timeout=30,
        interval=0.01
    )

    assert pvc_deleting.result(), (
        "Wait timeout: PVCs are not being deleted."
    )
    log.info("PVCs deletion has started.")

    assert pod_deleting.result(), (
        "Wait timeout: Pods are not being deleted."
    )
    log.info("Pods deletion has started.")

    # Delete pod of type 'resource_to_delete'
    disruption.delete_resource()

    pods_deleted = pod_bulk_delete.result()
    assert pods_deleted, "Deletion of pods failed."

    # Verify pods are deleted
    for pod_obj in pods_to_delete:
        pod_obj.ocp.wait_for_delete(pod_obj.name)
    log.info("Verified: Pods are deleted.")

    # Verify that the mount points are removed from nodes after
    # deleting the pods
    node_pv_mounted = verify_pv_mounted_on_node(node_pv_dict)
    for node, pvs in node_pv_mounted.items():
        assert not pvs, (
            f"PVs {pvs} are still present on node {node} after "
            f"deleting the pods."
        )
    log.info(
        "Verified: mount points are removed from nodes after deleting "
        "the pods"
    )

    pvcs_deleted = pvc_bulk_delete.result()
    assert pvcs_deleted, "Deletion of PVCs failed."

    # Verify PVCs are deleted
    for pvc_obj in pvcs_to_delete:
        pvc_obj.ocp.wait_for_delete(pvc_obj.name)
    log.info("Verified: PVCs are deleted.")

    # Verify PVs are deleted
    for pv_obj in pv_objs:
        pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name, timeout=300)
    log.info("Verified: PVs are deleted.")

    # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
    for pvc_name, uuid in pvc_uuid_map.items():
        if interface == constants.CEPHBLOCKPOOL:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid,
                pool_name=sc_obj.ceph_pool.name
            )
        elif interface == constants.CEPHFILESYSTEM:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid
            )
        assert ret, (
            f"Volume associated with PVC {pvc_name} still exists "
            f"in backend"
        )

    log.info("Fetching IO results from the pods.")
    for pod_obj in io_pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get('jobs')[0].get('error')
        assert err_count == 0, (
            f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        )
    log.info("Verified IO result on pods.")

    # Verify number of pods of type 'resource_to_delete'
    final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
    assert final_num_resource_to_delete == num_of_resource_to_delete, (
        f"Total number of {resource_to_delete} pods does not match the "
        f"initial value. Total number of pods before deleting a pod: "
        f"{num_of_resource_to_delete}. Total number of pods present now: "
        f"{final_num_resource_to_delete}"
    )

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'])
    log.info("Ceph cluster health is OK")
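
# The FIO verification loop appears three times across these tests. A
# minimal sketch of a shared helper that could replace those loops; the
# name '_verify_fio_results' is hypothetical and is not wired into the
# tests above.
def _verify_fio_results(self, pod_objs, log_iops=True):
    """
    Assert that FIO completed without errors on each pod and optionally
    log the measured read/write IOPs.
    """
    for pod_obj in pod_objs:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get('jobs')[0].get('error')
        assert err_count == 0, (
            f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        )
        if log_iops:
            log.info(f"IOPs after FIO on pod {pod_obj.name}:")
            log.info(
                f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}"
            )
            log.info(
                f"Write: "
                f"{fio_result.get('jobs')[0].get('write').get('iops')}"
            )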