def test_create_scale_pods_and_pvcs_with_ms_consumers(
    self, create_scale_pods_and_pvcs_using_kube_job_on_ms_consumers
):
    """
    Test create scale pods and PVCs using a kube job with MS consumers
    """
    self.orig_index = config.cur_index

    self.consumer_i_per_fio_scale = (
        create_scale_pods_and_pvcs_using_kube_job_on_ms_consumers(
            scale_count=self.scale_count,
            pvc_per_pod_count=self.pvc_per_pod_count,
        )
    )
    assert config.cur_index == self.orig_index, "The current index has changed"

    config.switch_to_provider()
    time_to_wait_for_io_running = 120
    log.info(
        f"Wait {time_to_wait_for_io_running} seconds to check "
        f"that the IO is running as expected"
    )
    sleep(time_to_wait_for_io_running)
    ceph_health_check()

    log.info("Checking the Ceph Health on the consumers")
    consumer_indexes = config.get_consumer_indexes_list()
    for i in consumer_indexes:
        config.switch_ctx(i)
        ceph_health_check()

    self.check_scale_pods_and_pvcs_created_on_consumers()
    log.info(
        "The scale pods and PVCs were created successfully "
        "using a kube job with MS consumers"
    )
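# The "switch to each consumer, run a Ceph health check, then restore the
# original context" pattern above repeats in several tests in this section.
# The helper below is only an illustrative sketch (its name is hypothetical and
# not part of ocs-ci); it reuses the framework calls already shown above:
# config.get_consumer_indexes_list, config.switch_ctx and ceph_health_check.
def check_ceph_health_on_all_consumers():
    """Illustrative sketch: run ceph_health_check on every MS consumer cluster."""
    orig_index = config.cur_index
    try:
        for consumer_index in config.get_consumer_indexes_list():
            config.switch_ctx(consumer_index)
            ceph_health_check()
    finally:
        # Always restore the cluster context the caller was using
        config.switch_ctx(orig_index)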
def test_automated_recovery_from_failed_nodes_reactive_ms(
    self,
    nodes,
    failure,
):
    """
    We have 3 test cases to check when running IO in the background:
        A) Automated recovery from a stopped worker node
        B) Automated recovery from termination of a worker node
        C) Automated recovery from unscheduling and rescheduling a worker node.
    """
    self.create_resources()

    config.switch_to_provider()
    log.info("Start executing the node test function on the provider...")
    FAILURE_TYPE_FUNC_CALL_DICT[failure](nodes)

    # Verification steps after the automated recovery.
    assert check_pods_after_node_replacement(), "Not all the pods are running"
    assert (
        verify_worker_nodes_security_groups()
    ), "Not all the worker nodes security groups are set correctly"

    log.info("Checking that the ceph health is OK on the provider")
    ceph_health_check()

    log.info("Checking that the ceph health is OK on the consumers")
    consumer_indexes = config.get_consumer_indexes_list()
    for i in consumer_indexes:
        config.switch_ctx(i)
        ceph_health_check()
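# FAILURE_TYPE_FUNC_CALL_DICT above maps the parametrized "failure" string to a
# callable that injects that failure on a worker node. Below is a minimal sketch
# of how such a dispatch table could be wired; the key names and the helper
# functions are hypothetical placeholders for illustration, not the actual
# ocs-ci implementation.
def stop_worker_node(nodes):
    # Placeholder: stop one worker node through the platform "nodes" object
    ...


def terminate_worker_node(nodes):
    # Placeholder: terminate one worker node through the platform "nodes" object
    ...


FAILURE_TYPE_FUNC_CALL_DICT = {
    "stop_worker_node": stop_worker_node,
    "terminate_worker_node": terminate_worker_node,
}

# Usage inside the test: FAILURE_TYPE_FUNC_CALL_DICT[failure](nodes)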
def finalizer():
    # Use provider cluster in managed service platform
    if self.consumer_cluster_index is not None:
        config.switch_to_provider()

    # Validate all mon services are running
    if len(mon_svc_list) != len(
        get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
    ):
        # Restart the rook-operator pod
        operator_pod_obj = get_operator_pods()
        delete_pods(pod_objs=operator_pod_obj)
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.OPERATOR_LABEL,
        )

        # Wait till all mon services are up
        for svc_list in TimeoutSampler(
            1200,
            len(mon_svc_list),
            get_services_by_label,
            constants.MON_APP_LABEL,
            constants.OPENSHIFT_STORAGE_NAMESPACE,
        ):
            try:
                if len(svc_list) == len(mon_svc_list):
                    log.info("All expected mon services are up")
                    break
            except IndexError:
                log.error(
                    f"Not all the expected mon services are up. Only found: {svc_list}. "
                    f"Expected: {mon_svc_list}"
                )

        # Wait till all mon pods are running
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=len(mon_pods_list),
            timeout=600,
            sleep=3,
        )

        # Check the ceph health is OK
        ceph_health_check(tries=90, delay=15)

    # Switch the context to consumer cluster if needed
    if self.consumer_cluster_index is not None:
        config.switch_to_consumer(self.consumer_cluster_index)
def finalizer():
    config.switch_to_provider()
    log.info(
        "Verify that all the worker nodes are in a Ready state on the provider"
    )
    wnodes = get_nodes(node_type=constants.WORKER_MACHINE)
    for wnode in wnodes:
        is_recovered = recover_node_to_ready_state(wnode)
        if not is_recovered:
            log.warning(f"The node {wnode.name} has failed to recover")

    log.info("Verify again that the ceph health is OK")
    ceph_health_check()

    config.switch_ctx(self.orig_index)
def test_osd_node_restart_and_check_osd_pods_status(self, nodes):
    """
    1) Restart one of the osd nodes.
    2) Check that the osd pods associated with the node change to a Terminating state.
    3) Wait for the node to reach Ready state.
    4) Check that the new osd pods with the same ids start on the same node.
    5) Check the worker nodes security groups.
    """
    # This is a workaround due to the issue https://github.com/red-hat-storage/ocs-ci/issues/6162
    if is_ms_consumer_cluster():
        logger.info(
            "The test is applicable only for an MS provider cluster. "
            "Switching to the provider cluster..."
        )
        config.switch_to_provider()

    self.create_resources()

    osd_node_name = random.choice(get_osd_running_nodes())
    osd_node = get_node_objs([osd_node_name])[0]

    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    logger.info(f"osd pod ids: {old_osd_pod_ids}")
    node_osd_pods = pod.get_osd_pods_having_ids(old_osd_pod_ids)
    node_osd_pod_names = [p.name for p in node_osd_pods]

    logger.info(f"Going to restart the node {osd_node_name}")
    nodes.restart_nodes(nodes=[osd_node], wait=False)

    logger.info("Verify the node osd pods go into a Terminating state")
    res = pod.wait_for_pods_to_be_in_statuses(
        [constants.STATUS_TERMINATING], node_osd_pod_names
    )
    assert res, "Not all the node osd pods are in a Terminating state"

    wait_for_nodes_status(node_names=[osd_node_name])
    assert wait_for_osd_ids_come_up_on_node(
        osd_node_name, old_osd_pod_ids, timeout=300
    )
    logger.info(
        f"The osd ids {old_osd_pod_ids} successfully came up on the node {osd_node_name}"
    )

    logger.info("Verify the worker nodes security groups on the provider...")
    assert verify_worker_nodes_security_groups()
def test_create_scale_pods_and_pvcs_using_kube_job_ms(
    self, create_scale_pods_and_pvcs_using_kube_job
):
    """
    Test create scale pods and PVCs using a kube job with managed service
    """
    self.orig_index = config.cur_index

    config.switch_to_consumer()
    log.info("Start creating resources using kube job...")
    create_scale_pods_and_pvcs_using_kube_job()
    ceph_health_check()

    log.info("Switch to the provider")
    config.switch_to_provider()

    time_to_wait_for_io_running = 120
    log.info(
        f"Wait {time_to_wait_for_io_running} seconds to check "
        f"that the IO is running as expected"
    )
    sleep(time_to_wait_for_io_running)
    ceph_health_check()

    log.info("Switch back to the consumer")
    config.switch_to_consumer()

    log.info("The resources were created successfully using the kube job")
def get_admin_key_from_provider():
    """
    Get admin key from rook-ceph-tools pod on provider

    Returns:
        str: The admin key obtained from rook-ceph-tools pod on provider.
            Return empty string if admin key is not obtained.

    """
    initial_cluster_index = config.cur_index
    config.switch_to_provider()
    admin_key = ""
    try:
        # Get the key from provider cluster rook-ceph-tools pod
        provider_tools_pod = get_ceph_tools_pod()
        admin_key = (
            provider_tools_pod.exec_cmd_on_pod("grep key /etc/ceph/keyring")
            .strip()
            .split()[-1]
        )
    except Exception as exc:
        logger.error(
            f"Couldn't find admin key from provider due to the error:\n{str(exc)}"
        )
    finally:
        config.switch_ctx(initial_cluster_index)

    return admin_key
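# A minimal usage sketch for get_admin_key_from_provider. The calling context
# (a consumer-side check that needs the provider admin key) is assumed for
# illustration; only the function defined above and the logger are used.
admin_key = get_admin_key_from_provider()
if not admin_key:
    # The helper returns an empty string on failure, so callers should guard on it
    logger.warning("Admin key could not be fetched from the provider cluster")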
def test_pod_disruptions(self, create_pvcs_and_pods):
    """
    Test to perform pod disruption in consumer and provider cluster

    """
    # List of pods to be disrupted. Using different lists for consumer and
    # provider for ease of implementation
    pods_on_consumer = [
        "alertmanager_managed_ocs_alertmanager",
        "ocs_osd_controller_manager",
        "prometheus_managed_ocs_prometheus",
        "prometheus_operator",
        "ocs_operator",
    ]
    pods_on_provider = [
        "alertmanager_managed_ocs_alertmanager",
        "ocs_osd_controller_manager",
        "prometheus_managed_ocs_prometheus",
        "prometheus_operator",
        "ocs_provider_server",
        "ocs_operator",
    ]
    disruption_on_consumer = []
    disruption_on_provider = []

    # Start I/O
    log.info("Starting fio on all pods")
    for pod_obj in self.io_pods:
        if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK:
            storage_type = "block"
            direct = 1
        else:
            storage_type = "fs"
            direct = 0
        pod_obj.run_io(
            storage_type=storage_type,
            size="10G",
            fio_filename=f"{pod_obj.name}",
            runtime=320,
            end_fsync=1,
            direct=direct,
            invalidate=0,
            fio_installed=True,
        )

    consumer_index_iter = cycle(self.consumer_indexes)

    # Create Disruptions instance for each pod to be disrupted on consumer
    for pod_type in pods_on_consumer:
        consumer_index = next(consumer_index_iter)
        config.switch_ctx(consumer_index)
        disruption_obj = disruption_helpers.Disruptions()
        # Select each pod to be disrupted from different consumers
        disruption_obj.set_resource(resource=pod_type, cluster_index=consumer_index)
        disruption_obj.index_of_consumer = consumer_index
        disruption_on_consumer.append(disruption_obj)

    # Create Disruptions instance for each pod to be disrupted on provider
    config.switch_to_provider()
    for pod_type in pods_on_provider:
        disruption_obj = disruption_helpers.Disruptions()
        disruption_obj.set_resource(
            resource=pod_type, cluster_index=self.provider_cluster_index
        )
        disruption_on_provider.append(disruption_obj)

    # Delete pods on consumer one at a time
    log.info("Starting pod disruptions on consumer clusters")
    for disruptions_obj in disruption_on_consumer:
        disruptions_obj.delete_resource()
        # ocs-operator respin will trigger rook-ceph-tools pod respin.
        # Patch rook-ceph-tools pod to run ceph commands.
        if disruptions_obj.resource == "ocs_operator":
            config.switch_ctx(disruptions_obj.index_of_consumer)
            patch_consumer_toolbox()

    # Delete pods on provider one at a time
    log.info("Starting pod disruptions on provider cluster")
    for disruptions_obj in disruption_on_provider:
        disruptions_obj.delete_resource()

    log.info("Wait for IO to complete on pods")
    for pod_obj in self.io_pods:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on pod {pod_obj.name}")
    log.info("IO is successful on all pods")

    # Perform different checks in the clusters
    for cluster_index in [self.provider_cluster_index] + self.consumer_indexes:
        config.switch_ctx(cluster_index)

        # Verify managedocs components are Ready
        log.info("Verifying managedocs components state")
        managedocs_obj = OCP(
            kind="managedocs",
            resource_name="managedocs",
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        for component in {"alertmanager", "prometheus", "storageCluster"}:
            assert (
                managedocs_obj.get()["status"]["components"][component]["state"]
                == "Ready"
            ), f"{component} status is {managedocs_obj.get()['status']['components'][component]['state']}"

        # Verify storagecluster status
        log.info("Verifying storagecluster status")
        verify_storage_cluster()

        # Verify CSV status
        for managed_csv in {
            constants.OCS_CSV_PREFIX,
            constants.OSD_DEPLOYER,
            constants.OSE_PROMETHEUS_OPERATOR,
        }:
            csvs = csv.get_csvs_start_with_prefix(
                managed_csv, constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            assert (
                len(csvs) == 1
            ), f"Unexpected number of CSVs with {managed_csv} prefix: {len(csvs)}"
            csv_name = csvs[0]["metadata"]["name"]
            csv_obj = csv.CSV(
                resource_name=csv_name,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            log.info(f"Check if {csv_name} is in Succeeded phase.")
            csv_obj.wait_for_phase(phase="Succeeded", timeout=600)

        # Verify the phase of ceph cluster
        log.info("Verify the phase of ceph cluster")
        cephcluster = OCP(
            kind="CephCluster", namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        cephcluster_yaml = cephcluster.get().get("items")[0]
        expected_phase = "Connected"
        if cluster_index == self.provider_cluster_index:
            expected_phase = "Ready"
        assert (
            cephcluster_yaml["status"]["phase"] == expected_phase
        ), f"Status of cephcluster {cephcluster_yaml['metadata']['name']} is {cephcluster_yaml['status']['phase']}"

    # Create PVC and pods on all consumer clusters
    log.info("Creating new PVCs and pods")
    pods = list()
    for cluster_index in self.consumer_indexes:
        config.switch_ctx(cluster_index)
        consumer_cluster_kubeconfig = os.path.join(
            config.clusters[cluster_index].ENV_DATA["cluster_path"],
            config.clusters[cluster_index].RUN.get("kubeconfig_location"),
        )
        pvcs, io_pods = create_pvcs_and_pods(
            pvc_size=self.pvc_size,
            replica_count=1,
            pod_dict_path=constants.PERF_POD_YAML,
        )
        for pvc_obj in pvcs:
            pvc_obj.ocp.cluster_kubeconfig = consumer_cluster_kubeconfig
        for io_pod in io_pods:
            io_pod.ocp.cluster_kubeconfig = consumer_cluster_kubeconfig
        pvcs[0].project.cluster_kubeconfig = consumer_cluster_kubeconfig
        pods.extend(io_pods)

    # Run I/O on new pods
    log.info("Running I/O on new pods")
    for pod_obj in pods:
        if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK:
            storage_type = "block"
            direct = 1
        else:
            storage_type = "fs"
            direct = 0
        pod_obj.run_io(
            storage_type=storage_type,
            size="10G",
            fio_filename=f"{pod_obj.name}",
            runtime=320,
            end_fsync=1,
            direct=direct,
            invalidate=0,
            fio_installed=True,
        )

    log.info("Wait for I/O to complete on new pods")
    for pod_obj in pods:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on the new pod {pod_obj.name}")
    log.info("IO is successful on new pods")
def test_resource_deletion_during_pvc_clone(self, pvc_clone_factory, pod_factory):
    """
    Verify PVC clone will succeed if rook-ceph, csi pods are re-spun
    while creating the clone

    """
    pods_to_delete = [
        "rbdplugin_provisioner",
        "cephfsplugin_provisioner",
        "cephfsplugin",
        "rbdplugin",
        "osd",
        "mgr",
    ]
    executor = ThreadPoolExecutor(max_workers=len(self.pvcs) + len(pods_to_delete))
    disruption_ops = [disruption_helpers.Disruptions() for _ in pods_to_delete]
    file_name = "file_clone"

    # Run IO
    log.info("Running fio on all pods to create a file")
    for pod_obj in self.pods:
        storage_type = (
            "block"
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK)
            else "fs"
        )
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=30,
            fio_filename=file_name,
            end_fsync=1,
        )

    log.info("Wait for IO to complete on pods")
    for pod_obj in self.pods:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on pod {pod_obj.name}")
        # Calculate md5sum
        file_name_pod = (
            file_name
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM)
            else pod_obj.get_storage_path(storage_type="block")
        )
        pod_obj.pvc.md5sum = cal_md5sum(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        log.info(f"md5sum obtained from pod {pod_obj.name}")
    log.info("IO is successful on all pods")

    # Select the pods to be deleted
    for disruption, pod_type in zip(disruption_ops, pods_to_delete):
        cluster_index = None
        # 'provider_index' will not be None if the platform is Managed Services
        if self.provider_index is not None:
            if pod_type in ["osd", "mgr"]:
                cluster_index = self.provider_index
                config.switch_to_provider()
            else:
                cluster_index = self.consumer_index
                config.switch_ctx(cluster_index)

        disruption.set_resource(resource=pod_type, cluster_index=cluster_index)

    # Switch cluster context if the platform is MS.
    # 'provider_index' will not be None if platform is MS.
    if self.provider_index is not None:
        config.switch_ctx(self.consumer_index)

    # Clone PVCs
    log.info("Start creating clone of PVCs")
    for pvc_obj in self.pvcs:
        log.info(f"Creating clone of PVC {pvc_obj.name}")
        pvc_obj.clone_proc = executor.submit(
            pvc_clone_factory,
            pvc_obj=pvc_obj,
            status="",
            access_mode=pvc_obj.get_pvc_access_mode,
            volume_mode=pvc_obj.volume_mode,
        )
    log.info("Started creating clone")

    # Delete the pods 'pods_to_delete'
    log.info(f"Deleting pods {pods_to_delete}")
    for disruption in disruption_ops:
        disruption.delete_proc = executor.submit(disruption.delete_resource)

    # Wait for delete and recovery
    [disruption.delete_proc.result() for disruption in disruption_ops]

    # Get cloned PVCs
    clone_pvc_objs = []
    for pvc_obj in self.pvcs:
        clone_obj = pvc_obj.clone_proc.result()
        clone_pvc_objs.append(clone_obj)
        log.info(f"Created clone {clone_obj.name} of PVC {pvc_obj.name}")
    log.info("Created clone of all PVCs")

    # Confirm that the cloned PVCs are Bound
    log.info("Verifying the cloned PVCs are Bound")
    for pvc_obj in clone_pvc_objs:
        wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=300
        )
        pvc_obj.reload()
        pvc_obj.volume_mode = pvc_obj.data["spec"]["volumeMode"]
    log.info("Verified: Cloned PVCs are Bound.")

    clone_pod_objs = []

    # Attach the cloned PVCs to pods
    log.info("Attach the cloned PVCs to pods")
    for pvc_obj in clone_pvc_objs:
        if pvc_obj.volume_mode == constants.VOLUME_MODE_BLOCK:
            pod_dict_path = constants.CSI_RBD_RAW_BLOCK_POD_YAML
        else:
            pod_dict_path = ""
        restore_pod_obj = pod_factory(
            interface=pvc_obj.interface,
            pvc=pvc_obj,
            status="",
            pod_dict_path=pod_dict_path,
            raw_block_pv=pvc_obj.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        clone_pod_objs.append(restore_pod_obj)

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in clone_pod_objs:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    # Verify md5sum
    log.info("Verify md5sum")
    for pod_obj in clone_pod_objs:
        file_name_pod = (
            file_name
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM)
            else pod_obj.get_storage_path(storage_type="block")
        )
        verify_data_integrity(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.parent.md5sum,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        log.info(
            f"Verified: md5sum of {file_name_pod} on pod {pod_obj.name} "
            f"matches with the original md5sum"
        )
    log.info("Data integrity check passed on all pods")

    # Run IO
    log.info("Running IO on new pods")
    for pod_obj in clone_pod_objs:
        storage_type = (
            "block"
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK)
            else "fs"
        )
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=20,
            fio_filename=file_name,
            end_fsync=1,
        )

    log.info("Wait for IO to complete on new pods")
    for pod_obj in clone_pod_objs:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on new pod {pod_obj.name}")
    log.info("IO completed on new pods")
def test_pvc_creation_after_del_mon_services(self, interface, pod_factory):
    """
    1. Delete one mon service
    2. Edit the configmap rook-ceph-mon-endpoints and remove the entries
       of the deleted mon service
    3. Delete the deployment and pvc of the deleted mon service
    4. Restart rook-ceph-operator
    5. Make sure all mon pods are running
    6. Make sure the ceph health is OK and storage pods are running
    7. Sleep for 300 seconds before deleting another mon
    8. Repeat the above steps for all mons; at the end each mon should have
       a different endpoint
    9. Create PVC, should succeed.

    """
    if self.consumer_cluster_index is not None:
        # Switch to consumer to create PVC, pod and start IO
        config.switch_to_consumer(self.consumer_cluster_index)

    pod_obj = pod_factory(interface=interface)
    run_io_in_bg(pod_obj)

    if self.consumer_cluster_index is not None:
        # Switch to provider
        config.switch_to_provider()

    # Get all mon services
    mon_svc = get_services_by_label(
        label=constants.MON_APP_LABEL,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )

    # Get all mon pods
    mon_pods = get_mon_pods()
    mon_count = len(mon_pods)

    list_old_svc = []
    for svc in mon_svc:
        # Get rook-ceph-operator pod obj
        operator_pod_obj = get_operator_pods()
        operator_name = operator_pod_obj[0].name

        # Scale down rook-ceph-operator
        log.info("Scale down rook-ceph-operator")
        assert modify_deployment_replica_count(
            deployment_name="rook-ceph-operator", replica_count=0
        ), "Failed to scale down rook-ceph-operator to 0"
        log.info("Successfully scaled down rook-ceph-operator to 0")

        # Validate rook-ceph-operator pod not running
        POD_OBJ.wait_for_delete(resource_name=operator_name)

        svc_name = svc["metadata"]["name"]
        cluster_ip = svc["spec"]["clusterIP"]
        port = svc["spec"]["ports"][0]["port"]
        mon_endpoint = f"{cluster_ip}:{port}"
        mon_id = svc["spec"]["selector"]["mon"]
        list_old_svc.append(cluster_ip)

        # Delete deployment
        log.info("Delete mon deployments")
        del_obj = OCP(
            kind=constants.DEPLOYMENT,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        mon_info = del_obj.get(resource_name=svc_name)
        del_obj.delete(resource_name=svc_name)

        # Delete pvc
        if is_lso_cluster():
            mon_data_path = f"/var/lib/rook/mon-{mon_id}"
            mon_node = mon_info["spec"]["template"]["spec"]["nodeSelector"][
                "kubernetes.io/hostname"
            ]
            log.info(f"Delete the directory `{mon_data_path}` from {mon_node}")
            cmd = f"rm -rf {mon_data_path}"
            ocp_obj = OCP(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
            ocp_obj.exec_oc_debug_cmd(node=mon_node, cmd_list=[cmd])
        else:
            log.info("Delete mon PVC")
            pvc_name = svc["metadata"]["labels"]["pvc_name"]
            pvc_obj = OCP(
                kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            pvc_obj.delete(resource_name=pvc_name)

        # Delete the mon service
        log.info("Delete mon service")
        svc_obj = OCP(
            kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        svc_obj.delete(resource_name=svc_name)

        # Edit the cm
        log.info(f"Edit the configmap {constants.ROOK_CEPH_MON_ENDPOINTS}")
        configmap_obj = OCP(
            kind=constants.CONFIGMAP,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        output_get = configmap_obj.get(
            resource_name=constants.ROOK_CEPH_MON_ENDPOINTS
        )
        new_data = output_get["data"]
        new_data["csi-cluster-config-json"] = (
            new_data["csi-cluster-config-json"].replace(f'"{mon_endpoint}",', "")
            if new_data["csi-cluster-config-json"].find(f'"{mon_endpoint}",') != -1
            else new_data["csi-cluster-config-json"].replace(
                f',"{mon_endpoint}"', ""
            )
        )
        new_data["data"] = ",".join(
            [
                value
                for value in new_data["data"].split(",")
                if f"{mon_id}=" not in value
            ]
        )
        new_data["mapping"] = (
            new_data["mapping"].replace(f'"{mon_id}":null,', "")
            if new_data["mapping"].find(f'"{mon_id}":null,') != -1
            else new_data["mapping"].replace(f',"{mon_id}":null', "")
        )
        params = f'{{"data": {json.dumps(new_data)}}}'
        log.info(f"Removing {mon_id} entries from configmap")
        configmap_obj.patch(
            resource_name=constants.ROOK_CEPH_MON_ENDPOINTS,
            params=params,
            format_type="strategic",
        )
        log.info(
            f"Configmap {constants.ROOK_CEPH_MON_ENDPOINTS} edited successfully"
        )

        # Scale up rook-ceph-operator
        log.info("Scale up rook-ceph-operator")
        assert modify_deployment_replica_count(
            deployment_name="rook-ceph-operator", replica_count=1
        ), "Failed to scale up rook-ceph-operator to 1"
        log.info("Successfully scaled up rook-ceph-operator to 1")
        log.info("Validate rook-ceph-operator pod is running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.OPERATOR_LABEL,
            resource_count=1,
            timeout=600,
            sleep=5,
        )

        # Validate all mons are running
        log.info("Validate all mons are up and running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=mon_count,
            timeout=1200,
            sleep=5,
        )
        log.info("All mons are up and running")

        # Check the ceph health is OK
        ceph_health_check(tries=90, delay=15)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Sleep for some seconds before deleting another mon
        sleep_time = 300
        log.info(f"Waiting for {sleep_time} seconds before deleting another mon")
        time.sleep(sleep_time)

    # Check the endpoints are different
    log.info("Validate the mon endpoints are changed")
    new_mon_svc = get_services_by_label(
        label=constants.MON_APP_LABEL,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    list_new_svc = []
    for new_svc in new_mon_svc:
        cluster_ip = new_svc["spec"]["clusterIP"]
        list_new_svc.append(cluster_ip)
    diff = set(list_new_svc) ^ set(list_old_svc)
    assert len(diff) == len(list_old_svc + list_new_svc), (
        f"Not all endpoints are changed. Set of old "
        f"endpoints {list_old_svc} and new endpoints {list_new_svc}"
    )
    log.info(f"All new mon endpoints are created {list_new_svc}")

    if self.consumer_cluster_index is not None:
        # Switch to consumer to create PVC, pod and run IO
        config.switch_to_consumer(self.consumer_cluster_index)

    # Create PVC and pods
    log.info(f"Create {interface} PVC")
    pod_obj = pod_factory(interface=interface)
    pod_obj.run_io(storage_type="fs", size="500M")
def test_resource_deletion_during_pvc_expansion(self, resource_to_delete):
    """
    Verify PVC expansion will succeed when rook-ceph, csi pods are re-spun
    during expansion

    """
    pvc_size_expanded = 30
    executor = ThreadPoolExecutor(max_workers=len(self.pvcs))
    disruption_ops = disruption_helpers.Disruptions()

    # Run IO to fill some data
    log.info("Running IO on all pods to fill some data before PVC expansion.")
    for pod_obj in self.pods:
        storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
        pod_obj.run_io(
            storage_type=storage_type,
            size="4G",
            io_direction="write",
            runtime=30,
            rate="10M",
            fio_filename=f"{pod_obj.name}_f1",
        )

    log.info("Wait for IO to complete on pods")
    for pod_obj in self.pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert err_count == 0, (
            f"IO error on pod {pod_obj.name}. FIO result: {fio_result}"
        )
        log.info(f"Verified IO on pod {pod_obj.name}.")
    log.info("IO is successful on all pods before PVC expansion.")

    if self.provider_index is not None:
        # Switch to provider cluster context to get ceph pods
        config.switch_to_provider()

    # Select the pod to be deleted
    disruption_ops.set_resource(resource=resource_to_delete)

    if self.provider_index is not None:
        config.switch_ctx(self.consumer_index)

    log.info("Expanding all PVCs.")
    for pvc_obj in self.pvcs:
        log.info(f"Expanding size of PVC {pvc_obj.name} to {pvc_size_expanded}G")
        pvc_obj.expand_proc = executor.submit(
            pvc_obj.resize_pvc, pvc_size_expanded, True
        )

    # Delete the pod 'resource_to_delete'
    disruption_ops.delete_resource()

    # Verify pvc expand status
    for pvc_obj in self.pvcs:
        assert (
            pvc_obj.expand_proc.result()
        ), f"Expansion failed for PVC {pvc_obj.name}"
    log.info("PVC expansion was successful on all PVCs")

    log.info("Verifying new size on pods.")
    for pod_obj in self.pods:
        if pod_obj.pvc.volume_mode == "Block":
            log.info(
                f"Skipping check on pod {pod_obj.name} as volume mode is Block."
            )
            continue

        # Wait for 240 seconds to reflect the change on pod
        log.info(f"Checking pod {pod_obj.name} to verify the change.")
        for df_out in TimeoutSampler(
            240, 3, pod_obj.exec_cmd_on_pod, command="df -kh"
        ):
            if not df_out:
                continue
            df_out = df_out.split()
            new_size_mount = df_out[df_out.index(pod_obj.get_storage_path()) - 4]
            if new_size_mount in [
                f"{pvc_size_expanded - 0.1}G",
                f"{float(pvc_size_expanded)}G",
                f"{pvc_size_expanded}G",
            ]:
                log.info(
                    f"Verified: Expanded size of PVC {pod_obj.pvc.name} "
                    f"is reflected on pod {pod_obj.name}"
                )
                break
            log.info(
                f"Expanded size of PVC {pod_obj.pvc.name} is not reflected"
                f" on pod {pod_obj.name}. New size on mount is not "
                f"{pvc_size_expanded}G as expected, but {new_size_mount}. "
                f"Checking again."
            )
    log.info(
        f"Verified: Modified size {pvc_size_expanded}G is reflected on all pods."
    )

    # Run IO to fill more data
    log.info("Write more data after PVC expansion.")
    for pod_obj in self.pods:
        storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
        pod_obj.run_io(
            storage_type=storage_type,
            size="10G",
            io_direction="write",
            runtime=30,
            rate="10M",
            fio_filename=f"{pod_obj.name}_f2",
            end_fsync=1,
        )

    log.info("Wait for IO to complete on all pods")
    for pod_obj in self.pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert err_count == 0, (
            f"IO error on pod {pod_obj.name}. FIO result: {fio_result}"
        )
        log.info(f"Verified IO on pod {pod_obj.name}.")
    log.info("IO is successful on all pods after PVC expansion.")
def test_disruptive_during_pod_pvc_deletion_and_io(
    self, interface, resource_to_delete, setup_base
):
    """
    Delete a ceph/rook pod while PVC deletion, pod deletion and IO are
    in progress
    """
    # If the platform is Managed Services, then the ceph pods will be present
    # in the provider cluster. Consumer cluster will be the primary cluster.
    # Switching to the provider cluster is required to get ceph pods.
    switch_to_provider_needed = (
        True
        if (
            config.ENV_DATA["platform"].lower()
            in constants.MANAGED_SERVICE_PLATFORMS
        )
        and (resource_to_delete in ["mds", "mon", "mgr", "osd"])
        else False
    )

    pvc_objs, pod_objs, rwx_pod_objs = setup_base
    namespace = pvc_objs[0].project.namespace

    num_of_pods_to_delete = 3
    num_of_io_pods = 1

    # Select pods to be deleted
    pods_to_delete = pod_objs[:num_of_pods_to_delete]
    pods_to_delete.extend(
        [
            pod
            for pod in rwx_pod_objs
            for pod_obj in pods_to_delete
            if (pod_obj.pvc == pod.pvc)
        ]
    )

    # Select pods to run IO
    io_pods = pod_objs[
        num_of_pods_to_delete : num_of_pods_to_delete + num_of_io_pods
    ]
    io_pods.extend(
        [
            pod
            for pod in rwx_pod_objs
            for pod_obj in io_pods
            if (pod_obj.pvc == pod.pvc)
        ]
    )

    # Select pods which are having PVCs to delete
    pods_for_pvc = pod_objs[num_of_pods_to_delete + num_of_io_pods :]
    pvcs_to_delete = [pod_obj.pvc for pod_obj in pods_for_pvc]
    pods_for_pvc.extend(
        [
            pod
            for pod in rwx_pod_objs
            for pod_obj in pods_for_pvc
            if (pod_obj.pvc == pod.pvc)
        ]
    )

    io_pods = [
        pod_obj
        for pod_obj in io_pods
        if pod_obj.pvc in select_unique_pvcs([pod_obj.pvc for pod_obj in io_pods])
    ]

    log.info(
        f"{len(pods_to_delete)} pods selected for deletion in which "
        f"{len(pods_to_delete) - num_of_pods_to_delete} pairs of pods "
        f"share the same RWX PVC"
    )
    log.info(
        f"{len(io_pods)} pods selected for running IO in which one "
        f"pair of pods share the same RWX PVC"
    )
    no_of_rwx_pvcs_delete = len(pods_for_pvc) - len(pvcs_to_delete)
    log.info(
        f"{len(pvcs_to_delete)} PVCs selected for deletion. "
        f"RWO PVCs: {len(pvcs_to_delete) - no_of_rwx_pvcs_delete}, "
        f"RWX PVCs: {no_of_rwx_pvcs_delete}"
    )

    if switch_to_provider_needed:
        # Switch to provider cluster context to get ceph pods
        config.switch_to_provider()

    pod_functions = {
        "mds": partial(get_mds_pods),
        "mon": partial(get_mon_pods),
        "mgr": partial(get_mgr_pods),
        "osd": partial(get_osd_pods),
        "rbdplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin_provisioner": partial(get_cephfsplugin_provisioner_pods),
        "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
        "operator": partial(get_operator_pods),
    }

    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    executor = ThreadPoolExecutor(max_workers=len(pod_objs) + len(rwx_pod_objs))

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

    if switch_to_provider_needed:
        # Switch back to consumer cluster context to access PVCs and pods
        config.switch_to_consumer(self.consumer_cluster_index)

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(get_all_pods(namespace=namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

    # Fetch PV names to verify after deletion
    pv_objs = []
    for pvc_obj in pvcs_to_delete:
        pvc_obj.reload()
        pv_objs.append(pvc_obj.backed_pv_obj)

    # Fetch volume details from pods for the purpose of verification
    node_pv_dict = {}
    for pod_obj in pods_to_delete:
        pod_info = pod_obj.get()
        node = pod_info["spec"]["nodeName"]
        pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
        for pvc_obj in pvc_objs:
            if pvc_obj.name == pvc:
                pvc_obj.reload()
                pv = pvc_obj.backed_pv
                break
        if node in node_pv_dict:
            node_pv_dict[node].append(pv)
        else:
            node_pv_dict[node] = [pv]

    # Fetch image uuid associated with PVCs to be deleted
    pvc_uuid_map = {}
    for pvc_obj in pvcs_to_delete:
        pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
    log.info("Fetched image uuid associated with each PVC")

    # Do setup on pods for running IO
    log.info("Setting up pods for running IO.")
    for pod_obj in pod_objs + rwx_pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info["spec"]["volumeMode"] == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        executor.submit(pod_obj.workload_setup, storage_type=storage_type)

    # Wait for setup on pods to complete
    for pod_obj in pod_objs + rwx_pod_objs:
        log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
        for sample in TimeoutSampler(360, 2, getattr, pod_obj, "wl_setup_done"):
            if sample:
                log.info(
                    f"Setup for running IO is completed on pod {pod_obj.name}."
                )
                break
    log.info("Setup for running IO is completed on all pods.")

    # Start IO on pods having PVCs to delete to load data
    pods_for_pvc_io = [
        pod_obj
        for pod_obj in pods_for_pvc
        if pod_obj.pvc
        in select_unique_pvcs([pod_obj.pvc for pod_obj in pods_for_pvc])
    ]
    log.info("Starting IO on pods having PVCs to delete.")
    self.run_io_on_pods(pods_for_pvc_io)
    log.info("IO started on pods having PVCs to delete.")

    log.info("Fetching IO results from the pods having PVCs to delete.")
    for pod_obj in pods_for_pvc_io:
        get_fio_rw_iops(pod_obj)
    log.info("Verified IO result on pods having PVCs to delete.")

    # Delete pods having PVCs to delete.
    delete_pods(pods_for_pvc)
    for pod_obj in pods_for_pvc:
        pod_obj.ocp.wait_for_delete(pod_obj.name)
    log.info("Verified: Deleted pods which are having PVCs to delete.")

    # Start IO on pods to be deleted
    pods_to_delete_io = [
        pod_obj
        for pod_obj in pods_to_delete
        if pod_obj.pvc
        in select_unique_pvcs([pod_obj.pvc for pod_obj in pods_to_delete])
    ]
    log.info("Starting IO on selected pods to be deleted.")
    self.run_io_on_pods(pods_to_delete_io)
    log.info("IO started on selected pods to be deleted.")

    # Start deleting PVCs
    pvc_bulk_delete = executor.submit(delete_pvcs, pvcs_to_delete)
    log.info("Started deleting PVCs")

    # Start deleting pods
    pod_bulk_delete = executor.submit(delete_pods, pods_to_delete, wait=False)
    log.info("Started deleting pods")

    # Start IO on IO pods
    self.run_io_on_pods(io_pods)
    log.info("Started IO on IO pods")

    # Verify pvc deletion has started
    pvc_deleting = executor.submit(
        wait_for_resource_count_change,
        func_to_use=get_all_pvcs,
        previous_num=initial_num_of_pvc,
        namespace=namespace,
        change_type="decrease",
        min_difference=1,
        timeout=30,
        interval=0.01,
    )

    # Verify pod deletion has started
    pod_deleting = executor.submit(
        wait_for_resource_count_change,
        func_to_use=get_all_pods,
        previous_num=initial_num_of_pods,
        namespace=namespace,
        change_type="decrease",
        min_difference=1,
        timeout=30,
        interval=0.01,
    )

    assert pvc_deleting.result(), "Wait timeout: PVCs are not being deleted."
    log.info("PVCs deletion has started.")

    assert pod_deleting.result(), "Wait timeout: Pods are not being deleted."
    log.info("Pods deletion has started.")

    # Delete pod of type 'resource_to_delete'
    disruption.delete_resource()

    pod_bulk_delete.result()

    # Verify pods are deleted
    for pod_obj in pods_to_delete:
        pod_obj.ocp.wait_for_delete(pod_obj.name, 300)
    log.info("Verified: Pods are deleted.")

    # Verify that the mount point is removed from nodes after deleting pod
    node_pv_mounted = verify_pv_mounted_on_node(node_pv_dict)
    for node, pvs in node_pv_mounted.items():
        assert not pvs, (
            f"PVs {pvs} are still present on node {node} after deleting the pods."
        )
    log.info(
        "Verified: mount points are removed from nodes after deleting the pods"
    )

    log.info("Fetching IO results from the pods.")
    for pod_obj in io_pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
    log.info("Verified IO result on pods.")

    pvcs_deleted = pvc_bulk_delete.result()
    assert pvcs_deleted, "Deletion of PVCs failed."

    # Verify PVCs are deleted
    for pvc_obj in pvcs_to_delete:
        pvc_obj.ocp.wait_for_delete(pvc_obj.name)
    log.info("Verified: PVCs are deleted.")

    # Verify PVs are deleted
    for pv_obj in pv_objs:
        pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name, timeout=300)
    log.info("Verified: PVs are deleted.")

    # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
    pool_name = default_ceph_block_pool()
    for pvc_name, uuid in pvc_uuid_map.items():
        if interface == constants.CEPHBLOCKPOOL:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid, pool_name=pool_name
            )
        if interface == constants.CEPHFILESYSTEM:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid
            )
        assert ret, (
            f"Volume associated with PVC {pvc_name} still exists in backend"
        )

    if switch_to_provider_needed:
        # Switch to provider cluster context to get ceph pods
        config.switch_to_provider()

    # Verify number of pods of type 'resource_to_delete'
    final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
    assert final_num_resource_to_delete == num_of_resource_to_delete, (
        f"Total number of {resource_to_delete} pods is not matching with "
        f"initial value. Total number of pods before deleting a pod: "
        f"{num_of_resource_to_delete}. Total number of pods present now: "
        f"{final_num_resource_to_delete}"
    )

    if switch_to_provider_needed:
        # Switch back to consumer cluster context
        config.switch_to_consumer(self.consumer_cluster_index)

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
    log.info("Ceph cluster health is OK")
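# The comments in the test above explain that on Managed Services the ceph pods
# live on the provider, so provider-side lookups are bracketed by
# config.switch_to_provider() / config.switch_to_consumer(...). A context
# manager like the sketch below could express that bracketing; it is only an
# illustration (the name provider_context and its usage are assumptions, not
# ocs-ci code) and reuses the config calls already shown in this section.
from contextlib import contextmanager


@contextmanager
def provider_context(consumer_cluster_index):
    """Illustrative sketch: run a block against the provider, then switch back."""
    config.switch_to_provider()
    try:
        yield
    finally:
        config.switch_to_consumer(consumer_cluster_index)


# Hypothetical usage:
# with provider_context(self.consumer_cluster_index):
#     num_of_resource_to_delete = len(pod_functions[resource_to_delete]())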
def test_resource_deletion_during_pvc_pod_creation_and_io(
    self, interface, resource_to_delete, setup, multi_pvc_factory, pod_factory
):
    """
    Delete the resource 'resource_to_delete' while PVC creation, pod creation
    and IO operation are in progress.
    """
    # If the platform is Managed Services, then the ceph pods will be present
    # in the provider cluster. Consumer cluster will be the primary cluster.
    # Switching to the provider cluster is required to get ceph pods.
    switch_to_provider_needed = (
        True
        if (
            config.ENV_DATA["platform"].lower()
            in constants.MANAGED_SERVICE_PLATFORMS
        )
        and (resource_to_delete in ["mds", "mon", "mgr", "osd"])
        else False
    )

    num_of_new_pvcs = 5
    pvc_objs, io_pods, pvc_objs_new_pods, access_modes = setup
    proj_obj = pvc_objs[0].project
    storageclass = pvc_objs[0].storageclass

    if switch_to_provider_needed:
        # Switch to provider cluster context to get ceph pods
        config.switch_to_provider()

    pod_functions = {
        "mds": partial(get_mds_pods),
        "mon": partial(get_mon_pods),
        "mgr": partial(get_mgr_pods),
        "osd": partial(get_osd_pods),
        "rbdplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin_provisioner": partial(get_cephfsplugin_provisioner_pods),
        "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
        "operator": partial(get_operator_pods),
    }
    executor = ThreadPoolExecutor(max_workers=len(io_pods))
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)

    # Get number of pods of type 'resource_to_delete'
    initial_pods_num = len(pod_functions[resource_to_delete]())

    if switch_to_provider_needed:
        # Switch back to consumer cluster context to access PVCs and pods
        config.switch_to_consumer(self.consumer_cluster_index)

    # Do setup for running IO on pods
    log.info("Setting up pods for running IO")
    for pod_obj in io_pods:
        if pod_obj.pvc.volume_mode == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        executor.submit(pod_obj.workload_setup, storage_type=storage_type)

    # Wait for setup on pods to complete
    for pod_obj in io_pods:
        log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
        for sample in TimeoutSampler(360, 2, getattr, pod_obj, "wl_setup_done"):
            if sample:
                log.info(
                    f"Setup for running IO is completed on pod {pod_obj.name}."
                )
                break
    log.info("Setup for running IO is completed on pods")

    # Start creating new pods
    log.info("Start creating new pods.")
    bulk_pod_create = executor.submit(
        helpers.create_pods, pvc_objs_new_pods, pod_factory, interface, 2
    )

    # Start creation of new PVCs
    log.info("Start creating new PVCs.")
    bulk_pvc_create = executor.submit(
        multi_pvc_factory,
        interface=interface,
        project=proj_obj,
        storageclass=storageclass,
        size=self.pvc_size,
        access_modes=access_modes,
        access_modes_selection="distribute_random",
        status="",
        num_of_pvc=num_of_new_pvcs,
        wait_each=False,
    )

    # Start IO on each pod
    log.info("Start IO on pods")
    for pod_obj in io_pods:
        if pod_obj.pvc.volume_mode == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=10,
            fio_filename=f"{pod_obj.name}_io_file1",
        )
    log.info("IO started on all pods.")

    # Delete the resource
    disruption.delete_resource()

    # Getting result of PVC creation as list of PVC objects
    pvc_objs_new = bulk_pvc_create.result()

    # Confirm PVCs are Bound
    for pvc_obj in pvc_objs_new:
        helpers.wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=180
        )
        pvc_obj.reload()
    log.info("Verified: New PVCs are Bound.")

    # Getting result of pods creation as list of Pod objects
    pod_objs_new = bulk_pod_create.result()

    # Verify new pods are Running
    for pod_obj in pod_objs_new:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING
        )
        pod_obj.reload()
    log.info("Verified: All new pods are Running.")

    # Verify IO
    log.info("Fetching IO results from IO pods.")
    for pod_obj in io_pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        log.info(f"IOPs after FIO on pod {pod_obj.name}:")
        log.info(f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}")
        log.info(f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}")
    log.info("Verified IO result on IO pods.")

    all_pod_objs = io_pods + pod_objs_new

    # Fetch volume details from pods for the purpose of verification
    node_pv_dict = {}
    for pod in all_pod_objs:
        pod_info = pod.get()
        node = pod_info["spec"]["nodeName"]
        pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
        for pvc_obj in pvc_objs:
            if pvc_obj.name == pvc:
                pvc_obj.reload()
                pv = pvc_obj.backed_pv
                break
        if node in node_pv_dict:
            node_pv_dict[node].append(pv)
        else:
            node_pv_dict[node] = [pv]

    # Delete pods
    for pod_obj in all_pod_objs:
        pod_obj.delete(wait=False)

    # Verify pods are deleted
    for pod_obj in all_pod_objs:
        pod_obj.ocp.wait_for_delete(resource_name=pod_obj.name)

    if switch_to_provider_needed:
        # Switch to provider cluster context to get ceph pods
        config.switch_to_provider()

    # Verify number of 'resource_to_delete' type pods
    final_pods_num = len(pod_functions[resource_to_delete]())
    assert final_pods_num == initial_pods_num, (
        f"Total number of {resource_to_delete} pods is not matching with "
        f"initial value. Total number of pods before deleting a pod: "
        f"{initial_pods_num}. Total number of pods present now: "
        f"{final_pods_num}"
    )

    if switch_to_provider_needed:
        # Switch back to consumer cluster context
        config.switch_to_consumer(self.consumer_cluster_index)

    # Verify volumes are unmapped from nodes after deleting the pods
    node_pv_mounted = helpers.verify_pv_mounted_on_node(node_pv_dict)
    for node, pvs in node_pv_mounted.items():
        assert not pvs, (
            f"PVs {pvs} are still present on node {node} after deleting the pods."
        )
    log.info(
        "Verified: mount points are removed from nodes after deleting the pods"
    )

    # Set volume mode on PVC objects
    for pvc_obj in pvc_objs_new:
        pvc_info = pvc_obj.get()
        setattr(pvc_obj, "volume_mode", pvc_info["spec"]["volumeMode"])

    # Verify that PVCs are reusable by creating new pods
    all_pvc_objs = pvc_objs + pvc_objs_new
    pod_objs_re = helpers.create_pods(all_pvc_objs, pod_factory, interface, 2)

    # Verify pods are Running
    for pod_obj in pod_objs_re:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING
        )
        pod_obj.reload()
    log.info("Successfully created new pods using all PVCs.")

    # Run IO on each of the newly created pods
    for pod_obj in pod_objs_re:
        if pod_obj.pvc.volume_mode == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=10,
            fio_filename=f"{pod_obj.name}_io_file2",
        )

    log.info("Fetching IO results from newly created pods")
    for pod_obj in pod_objs_re:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        log.info(f"IOPs after FIO on pod {pod_obj.name}:")
        log.info(f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}")
        log.info(f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}")
    log.info("Verified IO result on newly created pods.")