def scan_cluster(self):
    """
    Get accurate info on current state of pods
    """
    self._ceph_pods = pod.get_all_pods(self._namespace)
    # TODO: Workaround for BZ1748325:
    mons = pod.get_mon_pods(self.mon_selector, self.namespace)
    for mon in mons:
        if mon.ocp.get_resource_status(mon.name) == constant.STATUS_RUNNING:
            self.mons.append(mon)
    # TODO: End of workaround for BZ1748325
    self.mdss = pod.get_mds_pods(self.mds_selector, self.namespace)
    self.mgrs = pod.get_mgr_pods(self.mgr_selector, self.namespace)
    self.osds = pod.get_osd_pods(self.osd_selector, self.namespace)
    self.noobaas = pod.get_noobaa_pods(self.noobaa_selector, self.namespace)
    self.toolbox = pod.get_ceph_tools_pod()

    # set port attrib on mon pods
    self.mons = list(map(self.set_port, self.mons))
    self.cluster.reload()
    if self.cephfs:
        self.cephfs.reload()
    else:
        try:
            self.cephfs_config = self.CEPHFS.get().get('items')[0]
            self.cephfs = ocs.OCS(**self.cephfs_config)
            self.cephfs.reload()
        except IndexError as e:
            logging.warning(e)
            logging.warning("No CephFS found")

    self.mon_count = len(self.mons)
    self.mds_count = len(self.mdss)
    self.mgr_count = len(self.mgrs)
    self.osd_count = len(self.osds)
    self.noobaa_count = len(self.noobaas)
def finalizer():
    must_gather_pods = pod.get_all_pods(selector_label='app=must-gather')
    logger.info(f"must_gather_pods: {must_gather_pods}")
    sample_pods = TimeoutSampler(
        timeout=30,
        sleep=3,
        func=check_for_must_gather_pod,
    )
    sample_namespace = TimeoutSampler(
        timeout=30,
        sleep=3,
        func=check_for_must_gather_project,
    )
    if sample_pods.wait_for_func_status(result=True):
        for must_gather_pod in must_gather_pods:
            self.ocp_obj.wait_for_delete(resource_name=must_gather_pod)
            logger.info(f"deleted pods: {must_gather_pods}")
    if not sample_namespace.wait_for_func_status(result=False):
        must_gather_namespace = check_for_must_gather_project()
        logger.info(f"namespace to delete: {must_gather_namespace}")
        self.ocp_obj.wait_for_delete(resource_name=must_gather_namespace)
def setup_persistent_monitoring():
    """
    Change monitoring backend to OCS
    """
    sc = helpers.default_storage_class(interface_type=constants.CEPHBLOCKPOOL)

    # Get the list of monitoring pods
    pods_list = get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE,
        selector=["prometheus", "alertmanager"],
    )

    # Create configmap cluster-monitoring-config and reconfigure
    # storage class and telemeter server (if the url is specified in a
    # config file)
    create_configmap_cluster_monitoring_pod(
        sc_name=sc.name,
        telemeter_server_url=config.ENV_DATA.get("telemeter_server_url"),
    )

    # Take some time to respin the pod
    waiting_time = 45
    logger.info(f"Waiting {waiting_time} seconds...")
    time.sleep(waiting_time)

    # Validate the pods are respinned and in running state
    retry((CommandFailed, ResourceWrongStatusException), tries=3, delay=15)(
        validate_pods_are_respinned_and_running_state
    )(pods_list)

    # Validate the PVC is created and bound on monitoring pods
    validate_pvc_created_and_bound_on_monitoring_pods()

    # Validate the PVCs are mounted on the pods
    retry((CommandFailed, AssertionError), tries=3, delay=15)(
        validate_pvc_are_mounted_on_monitoring_pods
    )(pods_list)
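# The retry(...)(func)(args) pattern above comes from a decorator factory: the
# first call builds a wrapper configured with the exception types, tries and
# delay, and the second call runs the target function with those retries.
# A minimal, self-contained sketch of that behaviour (the names below are
# illustrative only, not the ocs-ci implementation):
import time


def retry_sketch(exceptions, tries=3, delay=15):
    def decorator(func):
        def wrapper(*args, **kwargs):
            for attempt in range(1, tries + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    # Re-raise on the last attempt, otherwise back off and retry
                    if attempt == tries:
                        raise
                    time.sleep(delay)
        return wrapper
    return decorator


# Usage mirrors the calls above, e.g.:
# retry_sketch((CommandFailed,), tries=3, delay=15)(some_validation)(pods_list)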
def test_monitoring_after_restarting_prometheus_pod(self, pods):
    """
    Test case to validate that a prometheus pod restart does not have any
    functional impact
    """
    # Get the prometheus pods
    prometheus_pod_obj = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus'])

    for pod_object in prometheus_pod_obj:
        # Get the pvc which is mounted on the prometheus pod
        pod_info = pod_object.get()
        pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
            'claimName']

        # Restart the prometheus pod
        pod_object.delete(force=True)
        pod_obj = ocp.OCP(
            kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE)
        assert pod_obj.wait_for_resource(
            condition='Running', selector='app=prometheus', timeout=60)

        # Check the same pvc is mounted on the new pod
        pod_info = pod_object.get()
        assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
            'claimName'] == pvc_name, (
            f"Old pvc not found after restarting the prometheus pod "
            f"{pod_object.name}"
        )

    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created pvc {pod_obj.pvc.name} is not collected on "
            f"the prometheus pod"
        )
def check_health_of_clusterlogging():
    """
    * Checks for ElasticSearch, curator, fluentd and kibana pods in the
      openshift-logging namespace
    * Checks the health of cluster logging: if the status is green the
      cluster is healthy, if the status is red the health is bad

    Returns:
        list: All the pods that are present in the namespace

    """
    pod_list = []
    pods = get_all_pods(namespace='openshift-logging')
    logger.info("Pods that are created by the instance")
    for pod in pods:
        pod_list.append(pod.name)
    logger.info(pod_list)
    elasticsearch_pod = [
        pod for pod in pod_list if pod.startswith('elasticsearch')
    ]
    pod_obj = get_pod_obj(
        name=elasticsearch_pod[0], namespace='openshift-logging'
    )
    status_check = pod_obj.exec_cmd_on_pod(
        command='es_util --query=_cluster/health?pretty',
        out_yaml_format=False
    )
    logger.info(status_check)
    status_check = json.loads(status_check)
    if status_check['status'] == 'green':
        logger.info("Cluster logging is in Healthy state & Ready to use")
    else:
        logger.error("Cluster logging is in Bad state")
        raise UnexpectedBehaviour
    return pod_list
def respin_amq_app_pod(kafka_namespace, pod_pattern):
    """
    Respin amq pod

    Args:
        kafka_namespace (str): Namespace for kafka
        pod_pattern (str): The pattern for the pod

    """
    pod_obj = ocp.OCP(kind=constants.POD, namespace=kafka_namespace)
    pod_obj_list = get_all_pods(namespace=kafka_namespace)
    for pod in TimeoutSampler(
        300, 10, get_pod_name_by_pattern, pod_pattern, kafka_namespace
    ):
        try:
            if pod is not None:
                pod_obj.delete(resource_name=pod[0])
                assert pod_obj.wait_for_resource(
                    condition='Running',
                    resource_count=len(pod_obj_list),
                    timeout=300
                )
                break
        except IndexError as ie:
            log.error("Pod doesn't exist")
            raise ie
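# TimeoutSampler above is used as an iterator that keeps re-calling
# get_pod_name_by_pattern every `sleep` seconds until `timeout` expires,
# yielding the latest result on each pass. A minimal, self-contained sketch
# of that polling pattern (illustrative only, not the ocs-ci class):
import time


def timeout_sampler_sketch(timeout, sleep, func, *args, **kwargs):
    """Yield func(*args, **kwargs) repeatedly until `timeout` seconds pass."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        yield func(*args, **kwargs)
        time.sleep(sleep)
    raise TimeoutError(f"{func.__name__} did not succeed within {timeout}s")


# Callers break out of the loop once a sample satisfies them, e.g.:
# for result in timeout_sampler_sketch(300, 10, some_lookup, "kafka"):
#     if result:
#         break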
def test_disruptive_during_pod_pvc_deletion_and_io(
    self, interface, resource_to_delete, setup_base
):
    """
    Delete ceph/rook pod while PVCs deletion, pods deletion and IO are
    progressing
    """
    pvc_objs, pod_objs, rwx_pod_objs = setup_base
    namespace = pvc_objs[0].project.namespace

    num_of_pods_to_delete = 3
    num_of_io_pods = 1

    # Select pods to be deleted
    pods_to_delete = pod_objs[:num_of_pods_to_delete]
    pods_to_delete.extend(
        [
            pod
            for pod in rwx_pod_objs
            for pod_obj in pods_to_delete
            if (pod_obj.pvc == pod.pvc)
        ]
    )

    # Select pods to run IO
    io_pods = pod_objs[
        num_of_pods_to_delete : num_of_pods_to_delete + num_of_io_pods
    ]
    io_pods.extend(
        [
            pod
            for pod in rwx_pod_objs
            for pod_obj in io_pods
            if (pod_obj.pvc == pod.pvc)
        ]
    )

    # Select pods which are having PVCs to delete
    pods_for_pvc = pod_objs[num_of_pods_to_delete + num_of_io_pods :]
    pvcs_to_delete = [pod_obj.pvc for pod_obj in pods_for_pvc]
    pods_for_pvc.extend(
        [
            pod
            for pod in rwx_pod_objs
            for pod_obj in pods_for_pvc
            if (pod_obj.pvc == pod.pvc)
        ]
    )

    log.info(
        f"{len(pods_to_delete)} pods selected for deletion in which "
        f"{len(pods_to_delete) - num_of_pods_to_delete} pairs of pod "
        f"share same RWX PVC"
    )
    log.info(
        f"{len(io_pods)} pods selected for running IO in which "
        f"{len(io_pods) - num_of_io_pods} pairs of pod share same "
        f"RWX PVC"
    )
    no_of_rwx_pvcs_delete = len(pods_for_pvc) - len(pvcs_to_delete)
    log.info(
        f"{len(pvcs_to_delete)} PVCs selected for deletion. "
        f"RWO PVCs: {len(pvcs_to_delete) - no_of_rwx_pvcs_delete}, "
        f"RWX PVCs: {no_of_rwx_pvcs_delete}"
    )

    pod_functions = {
        "mds": partial(get_mds_pods),
        "mon": partial(get_mon_pods),
        "mgr": partial(get_mgr_pods),
        "osd": partial(get_osd_pods),
        "rbdplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin_provisioner": partial(get_cephfsplugin_provisioner_pods),
        "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
        "operator": partial(get_operator_pods),
    }
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    executor = ThreadPoolExecutor(max_workers=len(pod_objs) + len(rwx_pod_objs))

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(get_all_pods(namespace=namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

    # Fetch PV names to verify after deletion
    pv_objs = []
    for pvc_obj in pvcs_to_delete:
        pvc_obj.reload()
        pv_objs.append(pvc_obj.backed_pv_obj)

    # Fetch volume details from pods for the purpose of verification
    node_pv_dict = {}
    for pod_obj in pods_to_delete:
        pod_info = pod_obj.get()
        node = pod_info["spec"]["nodeName"]
        pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
        for pvc_obj in pvc_objs:
            if pvc_obj.name == pvc:
                pvc_obj.reload()
                pv = pvc_obj.backed_pv
                break
        if node in node_pv_dict:
            node_pv_dict[node].append(pv)
        else:
            node_pv_dict[node] = [pv]

    # Fetch image uuid associated with PVCs to be deleted
    pvc_uuid_map = {}
    for pvc_obj in pvcs_to_delete:
        pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
    log.info("Fetched image uuid associated with each PVC")

    # Do setup on pods for running IO
    log.info("Setting up pods for running IO.")
    for pod_obj in pod_objs + rwx_pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info["spec"]["volumeMode"] == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        executor.submit(pod_obj.workload_setup, storage_type=storage_type)

    # Wait for setup on pods to complete
    for pod_obj in pod_objs + rwx_pod_objs:
        log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
        for sample in TimeoutSampler(180, 2, getattr, pod_obj, "wl_setup_done"):
            if sample:
                log.info(
                    f"Setup for running IO is completed on pod "
                    f"{pod_obj.name}."
                )
                break
    log.info("Setup for running IO is completed on all pods.")

    # Start IO on pods having PVCs to delete to load data
    log.info("Starting IO on pods having PVCs to delete.")
    self.run_io_on_pods(pods_for_pvc)
    log.info("IO started on pods having PVCs to delete.")

    log.info("Fetching IO results from the pods having PVCs to delete.")
    for pod_obj in pods_for_pvc:
        get_fio_rw_iops(pod_obj)
    log.info("Verified IO result on pods having PVCs to delete.")

    # Delete pods having PVCs to delete.
    delete_pods(pods_for_pvc)
    for pod_obj in pods_for_pvc:
        pod_obj.ocp.wait_for_delete(pod_obj.name)
    log.info("Verified: Deleted pods which are having PVCs to delete.")

    # Start IO on pods to be deleted
    log.info("Starting IO on pods to be deleted.")
    self.run_io_on_pods(pods_to_delete)
    log.info("IO started on pods to be deleted.")

    # Start deleting PVCs
    pvc_bulk_delete = executor.submit(delete_pvcs, pvcs_to_delete)
    log.info("Started deleting PVCs")

    # Start deleting pods
    pod_bulk_delete = executor.submit(delete_pods, pods_to_delete, wait=False)
    log.info("Started deleting pods")

    # Start IO on IO pods
    self.run_io_on_pods(io_pods)
    log.info("Started IO on IO pods")

    # Verify pvc deletion has started
    pvc_deleting = executor.submit(
        wait_for_resource_count_change,
        func_to_use=get_all_pvcs,
        previous_num=initial_num_of_pvc,
        namespace=namespace,
        change_type="decrease",
        min_difference=1,
        timeout=30,
        interval=0.01,
    )

    # Verify pod deletion has started
    pod_deleting = executor.submit(
        wait_for_resource_count_change,
        func_to_use=get_all_pods,
        previous_num=initial_num_of_pods,
        namespace=namespace,
        change_type="decrease",
        min_difference=1,
        timeout=30,
        interval=0.01,
    )

    assert pvc_deleting.result(), "Wait timeout: PVCs are not being deleted."
    log.info("PVCs deletion has started.")

    assert pod_deleting.result(), "Wait timeout: Pods are not being deleted."
    log.info("Pods deletion has started.")

    # Delete pod of type 'resource_to_delete'
    disruption.delete_resource()

    pod_bulk_delete.result()

    # Verify pods are deleted
    for pod_obj in pods_to_delete:
        pod_obj.ocp.wait_for_delete(pod_obj.name, 300)
    log.info("Verified: Pods are deleted.")

    # Verify that the mount point is removed from nodes after deleting pod
    node_pv_mounted = verify_pv_mounted_on_node(node_pv_dict)
    for node, pvs in node_pv_mounted.items():
        assert not pvs, (
            f"PVs {pvs} is still present on node {node} after "
            f"deleting the pods."
        )
    log.info(
        "Verified: mount points are removed from nodes after deleting "
        "the pods"
    )

    pvcs_deleted = pvc_bulk_delete.result()
    assert pvcs_deleted, "Deletion of PVCs failed."

    # Verify PVCs are deleted
    for pvc_obj in pvcs_to_delete:
        pvc_obj.ocp.wait_for_delete(pvc_obj.name)
    log.info("Verified: PVCs are deleted.")

    # Verify PVs are deleted
    for pv_obj in pv_objs:
        pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name, timeout=300)
    log.info("Verified: PVs are deleted.")

    # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
    pool_name = default_ceph_block_pool()
    for pvc_name, uuid in pvc_uuid_map.items():
        if interface == constants.CEPHBLOCKPOOL:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid, pool_name=pool_name
            )
        if interface == constants.CEPHFILESYSTEM:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid
            )
        assert ret, (
            f"Volume associated with PVC {pvc_name} still exists "
            f"in backend"
        )

    log.info("Fetching IO results from the pods.")
    for pod_obj in io_pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
    log.info("Verified IO result on pods.")

    # Verify number of pods of type 'resource_to_delete'
    final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
    assert final_num_resource_to_delete == num_of_resource_to_delete, (
        f"Total number of {resource_to_delete} pods is not matching with "
        f"initial value. Total number of pods before deleting a pod: "
        f"{num_of_resource_to_delete}. Total number of pods present now: "
        f"{final_num_resource_to_delete}"
    )

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
    log.info("Ceph cluster health is OK")
def test_pvc_disruptive(self, storageclass, namespace, interface,
                        operation_to_disrupt, resource_to_delete,
                        teardown_factory):
    """
    Base function for PVC disruptive tests.
    Deletion of 'resource_to_delete' will be introduced while
    'operation_to_disrupt' is progressing.
    """
    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(pod.get_all_pods(namespace=namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)['items'])

    executor = ThreadPoolExecutor(max_workers=1)

    DISRUPTION_OPS.set_resource(resource=resource_to_delete)

    # Start creation of multiple PVCs. Create 5 PVCs
    bulk_pvc_create = executor.submit(
        helpers.create_multiple_pvcs, sc_name=storageclass.name,
        namespace=namespace, number_of_pvc=5)

    if operation_to_disrupt == 'create_pvc':
        # Ensure PVCs are being created before deleting the resource
        ret = self.verify_resource_creation(
            get_all_pvcs, initial_num_of_pvc, namespace)
        assert ret, "Wait timeout: PVCs are not being created."
        logging.info(f"PVCs creation has started.")
        DISRUPTION_OPS.delete_resource()

    pvc_objs = bulk_pvc_create.result()

    for pvc_obj in pvc_objs:
        teardown_factory(pvc_obj)

    # Verify PVCs are Bound
    for pvc_obj in pvc_objs:
        assert pvc_obj.ocp.wait_for_resource(
            condition=constants.STATUS_BOUND, resource_name=pvc_obj.name,
            timeout=120
        ), (f"Wait timeout: PVC {pvc_obj.name} is not in 'Bound' status "
            f"even after 120 seconds.")
    logging.info("Verified: PVCs are Bound.")

    # Start creating pods
    bulk_pod_create = executor.submit(
        helpers.create_pods, pvc_objs_list=pvc_objs,
        interface_type=interface, namespace=namespace)

    if operation_to_disrupt == 'create_pod':
        # Ensure that pods are being created before deleting the resource
        ret = self.verify_resource_creation(
            pod.get_all_pods, initial_num_of_pods, namespace)
        assert ret, "Wait timeout: Pods are not being created."
        logging.info(f"Pods creation has started.")
        DISRUPTION_OPS.delete_resource()

    pod_objs = bulk_pod_create.result()

    for pod_obj in pod_objs:
        teardown_factory(pod_obj)

    # Verify pods are Running
    for pod_obj in pod_objs:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING, resource_name=pod_obj.name,
            timeout=120), (
            f"Wait timeout: Pod {pod_obj.name} is not in 'Running' "
            f"state even after 120 seconds.")
    logging.info("Verified: All pods are Running.")

    # Start IO on each pod
    for pod_obj in pod_objs:
        pod_obj.run_io(storage_type='fs', size='1G', runtime=10,
                       fio_filename='fio-file1')
    logging.info("FIO started on all pods.")

    if operation_to_disrupt == 'run_io':
        DISRUPTION_OPS.delete_resource()

    logging.info("Fetching FIO results.")
    for pod_obj in pod_objs:
        fio_result = pod_obj.get_fio_results()
        logging.info(f"IOPs after FIO on pod {pod_obj.name}:")
        logging.info(
            f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}")
        logging.info(
            f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}")
    logging.info("Verified FIO result on pods.")

    # Delete pods
    for pod_obj in pod_objs:
        pod_obj.delete(wait=True)

    # Verify that PVCs are reusable by creating new pods
    create_pods = executor.submit(
        helpers.create_pods, pvc_objs_list=pvc_objs,
        interface_type=interface, namespace=namespace)
    pod_objs = create_pods.result()

    for pod_obj in pod_objs:
        teardown_factory(pod_obj)

    # Verify new pods are Running
    for pod_obj in pod_objs:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING, resource_name=pod_obj.name,
            timeout=120), (
            f"Wait timeout: Pod {pod_obj.name} is not in 'Running' "
            f"state even after 120 seconds.")
    logging.info("Verified: All new pods are Running.")

    # Run IO on each of the new pods
    for pod_obj in pod_objs:
        pod_obj.run_io(storage_type='fs', size='1G', runtime=10,
                       fio_filename='fio-file2')

    logging.info("Fetching FIO results from new pods")
    for pod_obj in pod_objs:
        fio_result = pod_obj.get_fio_results()
        logging.info(f"IOPs after FIO on pod {pod_obj.name}:")
        logging.info(
            f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}")
        logging.info(
            f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}")
    logging.info("Verified FIO result on new pods.")
def create_scale_pods(
    self,
    scale_count=1500,
    pods_per_iter=5,
    io_runtime=None,
    pvc_size=None,
    start_io=None,
):
    """
    Main function with the scale pod creation flow and checks to add nodes.
    For other platforms the instance_type param will not be considered.

    Args:
        scale_count (int): Scale pod+pvc count
        io_runtime (sec): Fio run time in seconds
        start_io (bool): If True start IO else don't
        pods_per_iter (int): Number of PVC-POD to be created per PVC type
        pvc_size (Gi): size of PVC
        Example: if pods_per_iter is 5, then 20 PVC+POD will be created with
        5 each of the 4 PVC types. Test value in-between 5-10

    """
    self.ms_name, all_pod_obj = ([] for i in range(2))
    if not 5 <= pods_per_iter <= 10:
        raise UnexpectedBehaviour("Pods_per_iter value should be in-between 5-10")

    # Check for expected worker count
    expected_worker_count = get_expected_worker_count(scale_count)
    if check_and_add_enough_worker(expected_worker_count):
        if (config.ENV_DATA["deployment_type"] == "ipi"
                and config.ENV_DATA["platform"].lower() == "aws"):
            for obj in machine.get_machineset_objs():
                if "app" in obj.name:
                    self.ms_name.append(obj.name)
        else:
            self.ms_name = []

    # Create namespace
    self.create_and_set_namespace()

    # Continue to iterate till the scale pvc limit is reached
    while True:
        if scale_count <= len(all_pod_obj):
            logger.info(f"Scaled {scale_count} pvc and pods")
            if cluster.validate_pg_balancer():
                logging.info(
                    "OSD consumption and PG distribution is good to continue"
                )
            else:
                raise UnexpectedBehaviour("Unequal PG distribution to OSDs")
            break
        else:
            logger.info(f"Scaled PVC and POD count {len(all_pod_obj)}")
            self.pod_obj, self.pvc_obj = self.create_multi_pvc_pod(
                pods_per_iter, io_runtime, start_io, pvc_size)
            all_pod_obj.extend(self.pod_obj)
            try:
                # Check enough resources available in the dedicated app workers
                check_enough_resource_available_in_workers(
                    self.ms_name, self.pod_dict_path)

                # Check for ceph cluster OSD utilization
                if not cluster.validate_osd_utilization(osd_used=75):
                    logging.info("Cluster OSD utilization is below 75%")
                elif not cluster.validate_osd_utilization(osd_used=83):
                    logger.warning("Cluster OSD utilization is above 75%")
                else:
                    raise CephHealthException("Cluster OSDs are near full")

                # Check for 500 pods per namespace
                pod_objs = pod.get_all_pods(
                    namespace=self.namespace_list[-1].namespace)
                if len(pod_objs) >= 500:
                    self.create_and_set_namespace()
            except UnexpectedBehaviour:
                logging.error(
                    f"Scaling of cluster failed after {len(all_pod_obj)} "
                    f"pod creation"
                )
                raise UnexpectedBehaviour(
                    "Scaling PVC+POD failed, analyze setup and log for "
                    "more details"
                )
def uninstall_cluster_logging():
    """
    Function to uninstall cluster-logging from the cluster.
    Deletes the projects "openshift-logging" and "openshift-operators-redhat"
    """
    # Validating the pods before deleting the instance
    pod_list = get_all_pods(namespace=constants.OPENSHIFT_LOGGING_NAMESPACE)
    for pod in pod_list:
        logger.info(f"Pods running in the openshift-logging namespace {pod.name}")

    # Excluding cluster-logging-operator from pod_list and getting pod names
    pod_names_list = [
        pod.name for pod in pod_list
        if not pod.name.startswith("cluster-logging-operator")
    ]

    # Deleting the clusterlogging instance
    clusterlogging_obj = ocp.OCP(
        kind=constants.CLUSTER_LOGGING,
        namespace=constants.OPENSHIFT_LOGGING_NAMESPACE
    )
    assert clusterlogging_obj.delete(resource_name="instance")
    check_pod_vanished(pod_names_list)

    # Deleting the PVCs
    pvc_obj = ocp.OCP(
        kind=constants.PVC, namespace=constants.OPENSHIFT_LOGGING_NAMESPACE
    )
    pvc_list = get_all_pvcs(namespace=constants.OPENSHIFT_LOGGING_NAMESPACE)
    for pvc in range(len(pvc_list["items"])):
        pvc_obj.delete(resource_name=pvc_list["items"][pvc]["metadata"]["name"])
        pvc_obj.wait_for_delete(
            resource_name=pvc_list["items"][pvc]["metadata"]["name"]
        )

    # Deleting the RBAC permission set
    rbac_role = ocp.OCP(
        kind=constants.ROLE,
        namespace=constants.OPENSHIFT_OPERATORS_REDHAT_NAMESPACE
    )
    rbac_role.delete(yaml_file=constants.EO_RBAC_YAML)

    # Deleting the projects
    openshift_logging_namespace = ocp.OCP(
        kind=constants.NAMESPACES,
        namespace=constants.OPENSHIFT_LOGGING_NAMESPACE
    )
    openshift_operators_redhat_namespace = ocp.OCP(
        kind=constants.NAMESPACES,
        namespace=constants.OPENSHIFT_OPERATORS_REDHAT_NAMESPACE,
    )
    if openshift_logging_namespace.get():
        assert openshift_logging_namespace.delete(
            resource_name=constants.OPENSHIFT_LOGGING_NAMESPACE
        )
        logger.info("The namespace openshift-logging got deleted successfully")
    if openshift_operators_redhat_namespace.get():
        assert openshift_operators_redhat_namespace.delete(
            resource_name=constants.OPENSHIFT_OPERATORS_REDHAT_NAMESPACE
        )
        logger.info(
            "The project openshift-operators-redhat got deleted successfully"
        )
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in
            the checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.

    """
    from ocs_ci.ocs.node import get_typed_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc

    number_of_worker_nodes = len(get_typed_nodes())
    namespace = config.ENV_DATA['cluster_namespace']
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()

    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data['spec']['version']
    ocs_version = config.ENV_DATA['ocs_version']
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}"
    )
    assert ocs_version in csv_version, (
        f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    )

    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        'ocs_registry_image'
    )
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.split(":")[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}"
        )
        ignore_csv_mismatch = config.DEPLOYMENT.get('ignore_csv_mismatch')
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV"
            )
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}"
            )

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(
        f"Check if StorageCluster: {storage_cluster_name} is in "
        f"Ready phase"
    )
    storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(
        kind=constants.POD, namespace=namespace
    )
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count'])
        * int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica'])
    )

    # check noobaa CR for min number of noobaa endpoint pods
    nb_obj = OCP(kind='noobaa', namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    min_eps = nb_obj.get().get('items')[0].get('spec').get('endpoints').get('minCount')
    max_eps = nb_obj.get().get('items')[0].get('spec').get('endpoints').get('maxCount')

    resources_dict = {
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_DB_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.MON_APP_LABEL: 3,
        constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
        constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
        constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
        constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
        constants.OSD_APP_LABEL: osd_count,
        constants.MGR_APP_LABEL: 1,
        constants.MDS_APP_LABEL: 2,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps
    }

    if config.ENV_DATA.get('platform') in constants.ON_PREM_PLATFORMS:
        # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1857802
        # - RGW count is 1 post upgrade to OCS 4.5. Tracked with
        # https://github.com/red-hat-storage/ocs-ci/issues/2532
        rgw_count = 2 if float(config.ENV_DATA['ocs_version']) >= 4.5 and not (
            post_upgrade_verification
        ) else 1
        resources_dict.update({constants.RGW_APP_LABEL: rgw_count})

    for label, count in resources_dict.items():
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout
        )

    nb_ep_pods = get_pods_having_label(
        label=constants.NOOBAA_ENDPOINT_POD_LABEL,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE
    )
    assert len(nb_ep_pods) <= max_eps, (
        f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
        f"is greater than the maximum defined in the NooBaa CR ({max_eps})"
    )

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(
        kind=constants.STORAGECLASS, namespace=namespace
    )
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    required_storage_classes = {
        f'{storage_cluster_name}-cephfs',
        f'{storage_cluster_name}-ceph-rbd'
    }
    storage_classes = storage_class.get()
    storage_class_names = {
        item['metadata']['name'] for item in storage_classes['items']
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not skip_osd_distribution_check:
        log.info("Verifying OSDs are distributed evenly across worker nodes")
        ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
        osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items']
        deviceset_count = get_deviceset_count()
        node_names = [osd['spec']['nodeName'] for osd in osds]
        for node in node_names:
            assert not node_names.count(node) > deviceset_count, (
                "OSDs are not distributed evenly across worker nodes"
            )

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    assert {defaults.CEPHFS_PROVISIONER, defaults.RBD_PROVISIONER} == (
        {item['metadata']['name'] for item in csi_driver.get()['items']}
    )

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    sc_rbd = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_RBD
    )
    sc_cephfs = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS
    )
    assert (
        sc_rbd['parameters']['csi.storage.k8s.io/node-stage-secret-name']
        == constants.RBD_NODE_SECRET
    )
    assert (
        sc_rbd['parameters']['csi.storage.k8s.io/provisioner-secret-name']
        == constants.RBD_PROVISIONER_SECRET
    )
    assert (
        sc_cephfs['parameters']['csi.storage.k8s.io/node-stage-secret-name']
        == constants.CEPHFS_NODE_SECRET
    )
    assert (
        sc_cephfs['parameters']['csi.storage.k8s.io/provisioner-secret-name']
        == constants.CEPHFS_PROVISIONER_SECRET
    )
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    log.info(
        "Verifying ceph osd tree output and checking for device set PVC names "
        "in the output."
    )
    if (
        config.DEPLOYMENT.get('local_storage')
        and config.ENV_DATA['platform'] != constants.BAREMETALPSI_PLATFORM
    ):
        deviceset_pvcs = get_compute_node_names()
    else:
        deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

    ct_pod = get_ceph_tools_pod()
    osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json')
    schemas = {
        'root': constants.OSD_TREE_ROOT,
        'rack': constants.OSD_TREE_RACK,
        'host': constants.OSD_TREE_HOST,
        'osd': constants.OSD_TREE_OSD,
        'region': constants.OSD_TREE_REGION,
        'zone': constants.OSD_TREE_ZONE
    }
    schemas['host']['properties']['name'] = {'enum': deviceset_pvcs}
    for item in osd_tree['nodes']:
        validate(instance=item, schema=schemas[item['type']])
        if item['type'] == 'host':
            deviceset_pvcs.remove(item['name'])
    assert not deviceset_pvcs, (
        f"These device set PVCs are not given in ceph osd tree output "
        f"- {deviceset_pvcs}"
    )
    log.info(
        "Verified ceph osd tree output. Device set PVC names are given in the "
        "output."
    )

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    log.info("Verifying CSI snapshotter is not present.")
    provisioner_pods = get_all_pods(
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        selector=[
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        ]
    )
    for pod_obj in provisioner_pods:
        pod_info = pod_obj.get()
        for container, image in get_images(data=pod_info).items():
            assert ('snapshot' not in container) and ('snapshot' not in image), (
                f"Snapshot container is present in {pod_obj.name} pod. "
                f"Container {container}. Image {image}"
            )
    deployments = ocs_csv.get()['spec']['install']['spec']['deployments']
    rook_ceph_operator_deployment = [
        deployment_val for deployment_val in deployments
        if deployment_val['name'] == 'rook-ceph-operator'
    ]
    assert {'name': 'CSI_ENABLE_SNAPSHOTTER', 'value': 'false'} in (
        rook_ceph_operator_deployment[0]['spec']['template']['spec'][
            'containers'][0]['env']
    ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
    log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(
            ceph_cmd='ceph osd crush dump', format=''
        )
        pool_names = [
            constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL
        ]
        crush_rules = [
            rule for rule in crush_dump['rules']
            if rule['rule_name'] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule['steps']
                if item.get('type') == 'zone'
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")

    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(
        namespace, health_check_tries, health_check_delay
    )
def deploy_ocs(self):
    """
    Handle OCS deployment, since OCS deployment steps are common to any
    platform, implementing OCS deployment here in base class.
    """
    ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace)
    try:
        ceph_cluster.get().get('items')[0]
        logger.warning("OCS cluster already exists")
        return
    except (IndexError, CommandFailed):
        logger.info("Running OCS basic installation")

    self.deploy_ocs_via_operator()

    pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
    cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace)

    # Check for Ceph pods
    assert pod.wait_for_resource(
        condition='Running', selector='app=rook-ceph-mon',
        resource_count=3, timeout=600)
    assert pod.wait_for_resource(
        condition='Running', selector='app=rook-ceph-mgr', timeout=600)
    assert pod.wait_for_resource(
        condition='Running', selector='app=rook-ceph-osd',
        resource_count=3, timeout=600)

    # validate ceph mon/osd volumes are backed by pvc
    validate_cluster_on_pvc()

    # validate PDB creation of MON, MDS, OSD pods
    validate_pdb_creation()

    # Creating toolbox pod
    setup_ceph_toolbox()
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING, selector='app=rook-ceph-tools',
        resource_count=1, timeout=600)

    # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1847098
    if config.DEPLOYMENT.get('local_storage'):
        tools_pod = run_cmd(
            f"oc -n {self.namespace} get pod -l 'app=rook-ceph-tools' "
            f"-o jsonpath='{{.items[0].metadata.name}}'")
        pgs_to_autoscale = [
            'ocs-storagecluster-cephblockpool',
            'ocs-storagecluster-cephfilesystem-data0'
        ]
        for pg in pgs_to_autoscale:
            run_cmd(f"oc -n {self.namespace} exec {tools_pod} -- "
                    f"ceph osd pool set {pg} pg_autoscale_mode on")

    # Check for CephFilesystem creation in ocp
    cfs_data = cfs.get()
    cfs_name = cfs_data['items'][0]['metadata']['name']

    if helpers.validate_cephfilesystem(cfs_name):
        logger.info("MDS deployment is successful!")
        defaults.CEPHFILESYSTEM_NAME = cfs_name
    else:
        logger.error("MDS deployment Failed! Please check logs!")

    # Change monitoring backend to OCS
    if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
            'persistent-monitoring'):
        sc = helpers.default_storage_class(
            interface_type=constants.CEPHBLOCKPOOL)

        # Get the list of monitoring pods
        pods_list = get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus', 'alertmanager'])

        # Create configmap cluster-monitoring-config and reconfigure
        # storage class and telemeter server (if the url is specified in a
        # config file)
        create_configmap_cluster_monitoring_pod(
            sc_name=sc.name,
            telemeter_server_url=config.ENV_DATA.get("telemeter_server_url"))

        # Take some time to respin the pod
        waiting_time = 45
        logger.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)

        # Validate the pods are respinned and in running state
        retry((CommandFailed, ResourceWrongStatusException), tries=3,
              delay=15)(validate_pods_are_respinned_and_running_state)(
            pods_list)

        # Validate the pvc is created on monitoring pods
        validate_pvc_created_and_bound_on_monitoring_pods()

        # Validate the pvc are mounted on pods
        retry((CommandFailed, AssertionError), tries=3, delay=15)(
            validate_pvc_are_mounted_on_monitoring_pods)(pods_list)
    elif config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
            "telemeter_server_url"):
        # Create configmap cluster-monitoring-config to reconfigure
        # telemeter server url when 'persistent-monitoring' is False
        create_configmap_cluster_monitoring_pod(
            telemeter_server_url=config.ENV_DATA["telemeter_server_url"])

    # Change registry backend to OCS CEPHFS RWX PVC
    registry.change_registry_backend_to_ocs()

    # Verify health of ceph cluster
    # TODO: move destroy cluster logic to new CLI usage pattern?
    logger.info("Done creating rook resources, waiting for HEALTH_OK")
    assert ceph_health_check(namespace=self.namespace)

    # patch gp2/thin storage class as 'non-default'
    self.patch_default_sc_to_non_default()
def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self, interface, pvc_factory, pod_factory, dc_pod_factory):
    """
    Knip-678 Automated recovery from failed nodes
    Proactive case - IPI
    """
    # Get OSD running nodes
    osd_running_nodes = get_osd_running_nodes()
    log.info(f"OSDs are running on nodes {osd_running_nodes}")

    # Label osd nodes with fedora app
    label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora')

    # Create DC app pods
    log.info("Creating DC based app pods")
    interface = (constants.CEPHBLOCKPOOL
                 if interface == 'rbd' else constants.CEPHFILESYSTEM)
    dc_pod_obj = []
    for i in range(2):
        dc_pod = dc_pod_factory(interface=interface,
                                node_selector={'dc': 'fedora'})
        pod.run_io_in_bg(dc_pod, fedora_dc=True)
        dc_pod_obj.append(dc_pod)

    # Get app pods running nodes
    dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
    log.info(f"DC app pod running nodes are {dc_pod_node_name}")

    # Get both osd and app pod running node
    common_nodes = get_both_osd_and_app_pod_running_node(
        osd_running_nodes, dc_pod_node_name)
    msg = "Common OSD and app running node(s) NOT found"
    assert (len(common_nodes) > 0), msg
    log.info(f"Common OSD and app pod running nodes are {common_nodes}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(common_nodes[0])
    log.info(f"{common_nodes[0]} associated machine is {machine_name}")

    # Get the machineset name using machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{common_nodes[0]} associated machineset is {machineset_name}")

    # Add a new node and label it
    add_new_node_and_label_it(machineset_name)

    # Delete the machine
    machine.delete_machine(machine_name)
    log.info(f"Successfully deleted machine {machine_name}")

    # DC app pods on the failed node will get automatically created on
    # another running node. Waiting for all dc app pods to reach running
    # state
    pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
    log.info("All the dc pods reached running state")

    # Check all OCS pods status, they should be in running state
    all_pod_obj = pod.get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    for pod_obj in all_pod_obj:
        if ('-1-deploy' not in pod_obj.name
                and 'ocs-deviceset' not in pod_obj.name):
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj, state=constants.STATUS_RUNNING,
                    timeout=200)
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node gets stuck in
                # Pending state. BZ 1810014 tracks it.
                # Ignoring 'rook-ceph-crashcollector' pod health check as
                # WA and deleting its deployment so that the pod
                # disappears. Will revert this WA once the BZ is fixed
                if 'rook-ceph-crashcollector' in pod_obj.name:
                    ocp_obj = ocp.OCP(
                        namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                    pod_name = pod_obj.name
                    deployment_name = '-'.join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")
                else:
                    raise

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def base_setup(self, interface, pvc_factory, pod_factory):
    """
    A setup phase for the test: get all the ceph pods information,
    create maxsize pvc, pod and run IO
    """
    # Setting the io_size_gb to 40% of the total PVC capacity
    ceph_pod = Pod.get_ceph_tools_pod()
    external = config.DEPLOYMENT["external_mode"]
    if external:
        ocp_obj = ocp.OCP()
        if interface == constants.CEPHBLOCKPOOL:
            resource_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        elif interface == constants.CEPHFILESYSTEM:
            resource_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS
        cmd = f"get sc {resource_name} -o yaml"
        pool_data = ocp_obj.exec_oc_cmd(cmd)
        pool = pool_data["parameters"]["pool"]
    else:
        pool = (constants.DEFAULT_BLOCKPOOL
                if interface == constants.CEPHBLOCKPOOL
                else constants.DATA_POOL)
    ceph_replica = ceph_pod.exec_ceph_cmd(
        ceph_cmd=f"ceph osd pool get {pool} size")
    replica = ceph_replica["size"]

    ceph_status = ceph_pod.exec_ceph_cmd(ceph_cmd="ceph df")
    ceph_capacity = (int(ceph_status["stats"]["total_bytes"])
                     / replica / constants.GB)
    pvc_size_gb = int(ceph_capacity * 0.5)
    io_size_gb = int(pvc_size_gb * 0.4)
    io_size_gb = 400 if io_size_gb >= 400 else io_size_gb

    pod_objs = get_all_pods(
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        selector=[
            "noobaa", "rook-ceph-osd-prepare", "rook-ceph-drain-canary"
        ],
        exclude_selector=True,
    )

    # Create maxsize pvc, app pod and run ios
    self.sc = default_storage_class(interface_type=interface)
    self.pvc_obj = pvc_factory(
        interface=interface,
        storageclass=self.sc,
        size=pvc_size_gb,
    )
    self.pod_obj = pod_factory(interface=interface, pvc=self.pvc_obj)

    log.info(f"Running FIO to fill PVC size: {io_size_gb}G")
    self.pod_obj.run_io(
        "fs", size=f"{io_size_gb}G", io_direction="write", runtime=480)

    log.info("Waiting for IO results")
    self.pod_obj.get_fio_results()

    return pod_objs
def disruptive_base(self, interface, operation_to_disrupt, resource_to_delete):
    """
    Base function for disruptive tests.
    Deletion of 'resource_to_delete' will be introduced while
    'operation_to_disrupt' is progressing.
    """
    pod_functions = {
        "mds": partial(get_mds_pods),
        "mon": partial(get_mon_pods),
        "mgr": partial(get_mgr_pods),
        "osd": partial(get_osd_pods),
        "rbdplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin_provisioner": partial(get_cephfsplugin_provisioner_pods),
        "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
        "operator": partial(get_operator_pods),
    }
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    executor = ThreadPoolExecutor(max_workers=1)

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(get_all_pods(namespace=self.namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=self.namespace)["items"])

    # Fetch PV names
    pv_objs = []
    for pvc_obj in self.pvc_objs:
        pvc_obj.reload()
        pv_objs.append(pvc_obj.backed_pv_obj)

    # Fetch volume details from pods for the purpose of verification
    node_pv_dict = {}
    for pod_obj in self.pod_objs:
        pod_info = pod_obj.get()
        node = pod_info["spec"]["nodeName"]
        pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"][
            "claimName"]
        for pvc_obj in self.pvc_objs:
            if pvc_obj.name == pvc:
                pvc_obj.reload()
                pv = pvc_obj.backed_pv
                break
        if node in node_pv_dict:
            node_pv_dict[node].append(pv)
        else:
            node_pv_dict[node] = [pv]

    # Do setup for running IO on pods
    log.info("Setting up pods for running IO")
    for pod_obj in self.pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info["spec"]["volumeMode"] == "Block":
            pod_obj.pvc.storage_type = "block"
        else:
            pod_obj.pvc.storage_type = "fs"
        pod_obj.workload_setup(storage_type=pod_obj.pvc.storage_type)
    log.info("Setup for running IO is completed on pods")

    # Start IO on each pod. RWX PVC will be used on two pods. So split the
    # size accordingly
    log.info("Starting IO on pods")
    for pod_obj in self.pod_objs:
        if pod_obj.pvc.access_mode == constants.ACCESS_MODE_RWX:
            io_size = int((self.pvc_size - 1) / 2)
        else:
            io_size = self.pvc_size - 1
        pod_obj.run_io(
            storage_type=pod_obj.pvc.storage_type,
            size=f"{io_size}G",
            fio_filename=f"{pod_obj.name}_io",
            end_fsync=1,
        )
    log.info("IO started on all pods.")

    # Start deleting pods
    pod_bulk_delete = executor.submit(delete_pods, self.pod_objs, wait=False)

    if operation_to_disrupt == "delete_pods":
        ret = wait_for_resource_count_change(
            get_all_pods,
            initial_num_of_pods,
            self.namespace,
            "decrease",
            timeout=50,
        )
        assert ret, "Wait timeout: Pods are not being deleted."
        log.info("Pods deletion has started.")
        disruption.delete_resource()

    pod_bulk_delete.result()

    # Verify pods are deleted
    for pod_obj in self.pod_objs:
        assert pod_obj.ocp.wait_for_delete(
            pod_obj.name, 180), f"Pod {pod_obj.name} is not deleted"
    log.info("Verified: Pods are deleted.")

    # Verify that the mount point is removed from nodes after deleting pod
    for node, pvs in node_pv_dict.items():
        cmd = f"oc debug nodes/{node} -- df"
        df_on_node = run_cmd(cmd)
        for pv in pvs:
            assert pv not in df_on_node, (
                f"{pv} is still present on node {node} after "
                f"deleting the pods.")
    log.info(
        "Verified: mount points are removed from nodes after deleting "
        "the pods")

    # Fetch image uuid associated with PVCs
    pvc_uuid_map = {}
    for pvc_obj in self.pvc_objs:
        pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
    log.info("Fetched image uuid associated with each PVC")

    # Start deleting PVCs
    pvc_bulk_delete = executor.submit(delete_pvcs, self.pvc_objs)

    if operation_to_disrupt == "delete_pvcs":
        ret = wait_for_resource_count_change(
            get_all_pvcs, initial_num_of_pvc, self.namespace, "decrease",
            timeout=50)
        assert ret, "Wait timeout: PVCs are not being deleted."
        log.info("PVCs deletion has started.")
        disruption.delete_resource()

    pvcs_deleted = pvc_bulk_delete.result()
    assert pvcs_deleted, "Deletion of PVCs failed."

    # Verify PVCs are deleted
    for pvc_obj in self.pvc_objs:
        assert pvc_obj.ocp.wait_for_delete(
            pvc_obj.name), f"PVC {pvc_obj.name} is not deleted"
    log.info("Verified: PVCs are deleted.")

    # Verify PVs are deleted
    for pv_obj in pv_objs:
        assert pv_obj.ocp.wait_for_delete(
            pv_obj.name, 120), f"PV {pv_obj.name} is not deleted"
    log.info("Verified: PVs are deleted.")

    # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
    pool_name = default_ceph_block_pool()
    for pvc_name, uuid in pvc_uuid_map.items():
        if interface == constants.CEPHBLOCKPOOL:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid, pool_name=pool_name)
        if interface == constants.CEPHFILESYSTEM:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid)
        assert ret, (
            f"Volume associated with PVC {pvc_name} still exists "
            f"in backend")

    # Verify number of pods of type 'resource_to_delete'
    final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
    assert final_num_resource_to_delete == num_of_resource_to_delete, (
        f"Total number of {resource_to_delete} pods is not matching with "
        f"initial value. Total number of pods before deleting a pod: "
        f"{num_of_resource_to_delete}. Total number of pods present now: "
        f"{final_num_resource_to_delete}")

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
    log.info("Ceph cluster health is OK")
def deploy_ocs(self):
    """
    Handle OCS deployment, since OCS deployment steps are common to any
    platform, implementing OCS deployment here in base class.
    """
    ceph_cluster = ocp.OCP(kind="CephCluster", namespace=self.namespace)
    try:
        ceph_cluster.get().get("items")[0]
        logger.warning("OCS cluster already exists")
        return
    except (IndexError, CommandFailed):
        logger.info("Running OCS basic installation")

    if config.DEPLOYMENT["external_mode"]:
        logger.info("Deploying OCS on external mode RHCS")
        return self.deploy_with_external_mode()

    self.deploy_ocs_via_operator()

    pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
    cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace)

    # Check for Ceph pods
    assert pod.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-mon",
        resource_count=3,
        timeout=600,
    )
    assert pod.wait_for_resource(
        condition="Running", selector="app=rook-ceph-mgr", timeout=600)
    assert pod.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-osd",
        resource_count=3,
        timeout=600,
    )

    # validate ceph mon/osd volumes are backed by pvc
    validate_cluster_on_pvc()

    # validate PDB creation of MON, MDS, OSD pods
    validate_pdb_creation()

    # Creating toolbox pod
    setup_ceph_toolbox()
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-tools",
        resource_count=1,
        timeout=600,
    )

    # Check for CephFilesystem creation in ocp
    cfs_data = cfs.get()
    cfs_name = cfs_data["items"][0]["metadata"]["name"]

    if helpers.validate_cephfilesystem(cfs_name):
        logger.info("MDS deployment is successful!")
        defaults.CEPHFILESYSTEM_NAME = cfs_name
    else:
        logger.error("MDS deployment Failed! Please check logs!")

    # Change monitoring backend to OCS
    if config.ENV_DATA.get("monitoring_enabled") and config.ENV_DATA.get(
            "persistent-monitoring"):
        sc = helpers.default_storage_class(
            interface_type=constants.CEPHBLOCKPOOL)

        # Get the list of monitoring pods
        pods_list = get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=["prometheus", "alertmanager"],
        )

        # Create configmap cluster-monitoring-config and reconfigure
        # storage class and telemeter server (if the url is specified in a
        # config file)
        create_configmap_cluster_monitoring_pod(
            sc_name=sc.name,
            telemeter_server_url=config.ENV_DATA.get("telemeter_server_url"),
        )

        # Take some time to respin the pod
        waiting_time = 45
        logger.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)

        # Validate the pods are respinned and in running state
        retry((CommandFailed, ResourceWrongStatusException), tries=3,
              delay=15)(validate_pods_are_respinned_and_running_state)(
            pods_list)

        # Validate the pvc is created on monitoring pods
        validate_pvc_created_and_bound_on_monitoring_pods()

        # Validate the pvc are mounted on pods
        retry((CommandFailed, AssertionError), tries=3, delay=15)(
            validate_pvc_are_mounted_on_monitoring_pods)(pods_list)
    elif config.ENV_DATA.get("monitoring_enabled") and config.ENV_DATA.get(
            "telemeter_server_url"):
        # Create configmap cluster-monitoring-config to reconfigure
        # telemeter server url when 'persistent-monitoring' is False
        create_configmap_cluster_monitoring_pod(
            telemeter_server_url=config.ENV_DATA["telemeter_server_url"])

    # Change registry backend to OCS CEPHFS RWX PVC
    registry.change_registry_backend_to_ocs()

    # Verify health of ceph cluster
    # TODO: move destroy cluster logic to new CLI usage pattern?
    logger.info("Done creating rook resources, waiting for HEALTH_OK")
    try:
        ceph_health_check(namespace=self.namespace, tries=30, delay=10)
    except CephHealthException as ex:
        err = str(ex)
        logger.warning(f"Ceph health check failed with {err}")
        if "clock skew detected" in err:
            logger.info(
                f"Changing NTP on compute nodes to {constants.RH_NTP_CLOCK}")
            if self.platform == constants.VSPHERE_PLATFORM:
                update_ntp_compute_nodes()
            assert ceph_health_check(
                namespace=self.namespace, tries=60, delay=10)

    # patch gp2/thin storage class as 'non-default'
    self.patch_default_sc_to_non_default()
def create_pod(interface_type=None, pvc_name=None, do_reload=True,
               namespace=defaults.ROOK_CLUSTER_NAMESPACE, node_name=None,
               pod_dict_path=None, sa_name=None, dc_deployment=False,
               raw_block_pv=False, raw_block_device=constants.RAW_BLOCK_DEVICE,
               replica_count=1, pod_name=None):
    """
    Create a pod

    Args:
        interface_type (str): The interface type (CephFS, RBD, etc.)
        pvc_name (str): The PVC that should be attached to the newly created pod
        do_reload (bool): True for reloading the object after creation,
            False otherwise
        namespace (str): The namespace for the new resource creation
        node_name (str): The name of specific node to schedule the pod
        pod_dict_path (str): YAML path for the pod
        sa_name (str): Serviceaccount name
        dc_deployment (bool): True if creating pod as deploymentconfig
        raw_block_pv (bool): True for creating raw block pv based pod,
            False otherwise
        raw_block_device (str): raw block device for the pod
        replica_count (int): Replica count for deployment config
        pod_name (str): Name of the pod to create

    Returns:
        Pod: A Pod instance

    Raises:
        AssertionError: In case of any failure
    """
    if interface_type == constants.CEPHBLOCKPOOL:
        pod_dict = pod_dict_path if pod_dict_path else constants.CSI_RBD_POD_YAML
        interface = constants.RBD_INTERFACE
    else:
        pod_dict = pod_dict_path if pod_dict_path else constants.CSI_CEPHFS_POD_YAML
        interface = constants.CEPHFS_INTERFACE
    if dc_deployment:
        pod_dict = pod_dict_path if pod_dict_path else constants.FEDORA_DC_YAML
    pod_data = templating.load_yaml(pod_dict)
    if not pod_name:
        pod_name = create_unique_resource_name(f'test-{interface}', 'pod')
    pod_data['metadata']['name'] = pod_name
    pod_data['metadata']['namespace'] = namespace
    if dc_deployment:
        pod_data['metadata']['labels']['app'] = pod_name
        pod_data['spec']['template']['metadata']['labels']['name'] = pod_name
        pod_data['spec']['replicas'] = replica_count

    if pvc_name:
        if dc_deployment:
            pod_data['spec']['template']['spec']['volumes'][0][
                'persistentVolumeClaim']['claimName'] = pvc_name
        else:
            pod_data['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName'] = pvc_name

    if interface_type == constants.CEPHBLOCKPOOL and raw_block_pv:
        pod_data['spec']['containers'][0]['volumeDevices'][0][
            'devicePath'] = raw_block_device
        pod_data['spec']['containers'][0]['volumeDevices'][0][
            'name'] = pod_data.get('spec').get('volumes')[0].get('name')

    if node_name:
        pod_data['spec']['nodeName'] = node_name
    else:
        if 'nodeName' in pod_data.get('spec'):
            del pod_data['spec']['nodeName']

    if sa_name and dc_deployment:
        pod_data['spec']['template']['spec']['serviceAccountName'] = sa_name

    if dc_deployment:
        ocs_obj = create_resource(**pod_data)
        logger.info(ocs_obj.name)
        assert (ocp.OCP(kind='pod', namespace=namespace)).wait_for_resource(
            condition=constants.STATUS_COMPLETED,
            resource_name=pod_name + '-1-deploy',
            resource_count=0,
            timeout=180,
            sleep=3
        )
        dpod_list = pod.get_all_pods(namespace=namespace)
        for dpod in dpod_list:
            if '-1-deploy' not in dpod.name:
                if pod_name in dpod.name:
                    return dpod
    else:
        pod_obj = pod.Pod(**pod_data)
        pod_name = pod_data.get('metadata').get('name')
        logger.info(f'Creating new Pod {pod_name} for test')
        created_resource = pod_obj.create(do_reload=do_reload)
        assert created_resource, (f"Failed to create Pod {pod_name}")
        return pod_obj
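# A minimal usage sketch of create_pod() based on the signature above. The
# PVC name is a hypothetical placeholder (assumed to already exist and be
# Bound), not a value taken from the source.
rbd_pod = create_pod(
    interface_type=constants.CEPHBLOCKPOOL,
    pvc_name="example-rbd-pvc",  # hypothetical PVC name
    namespace=defaults.ROOK_CLUSTER_NAMESPACE,
)
# With pod_name left as None, a unique name is generated from the interface
# type, and the returned Pod object can be used for IO or cleanup.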
def test_pvc_disruptive( self, interface, operation_to_disrupt, resource_to_delete, multi_pvc_factory, pod_factory, ): """ Base function for PVC disruptive tests. Deletion of 'resource_to_delete' will be introduced while 'operation_to_disrupt' is progressing. """ pod_functions = { "mds": partial(pod.get_mds_pods), "mon": partial(pod.get_mon_pods), "mgr": partial(pod.get_mgr_pods), "osd": partial(pod.get_osd_pods), "rbdplugin": partial(pod.get_plugin_pods, interface=interface), "cephfsplugin": partial(pod.get_plugin_pods, interface=interface), "cephfsplugin_provisioner": partial(pod.get_cephfsplugin_provisioner_pods), "rbdplugin_provisioner": partial(pod.get_rbdfsplugin_provisioner_pods), "operator": partial(pod.get_operator_pods), } # Get number of pods of type 'resource_to_delete' num_of_resource_to_delete = len(pod_functions[resource_to_delete]()) namespace = self.proj_obj.namespace # Fetch the number of Pods and PVCs initial_num_of_pods = len(pod.get_all_pods(namespace=namespace)) initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"]) DISRUPTION_OPS.set_resource(resource=resource_to_delete) access_modes = [constants.ACCESS_MODE_RWO] if interface == constants.CEPHFILESYSTEM: access_modes.append(constants.ACCESS_MODE_RWX) num_of_pvc = 8 access_mode_dist_ratio = [6, 2] # Modify access_modes list to create rbd `block` type volume with # RWX access mode. RWX is not supported in non-block type rbd if interface == constants.CEPHBLOCKPOOL: access_modes.extend([ f"{constants.ACCESS_MODE_RWO}-Block", f"{constants.ACCESS_MODE_RWX}-Block", ]) num_of_pvc = 9 access_mode_dist_ratio = [4, 3, 2] executor = ThreadPoolExecutor(max_workers=(2 * num_of_pvc)) # Start creation of PVCs bulk_pvc_create = executor.submit( multi_pvc_factory, interface=interface, project=self.proj_obj, size=5, access_modes=access_modes, access_modes_selection="distribute_random", access_mode_dist_ratio=access_mode_dist_ratio, status=constants.STATUS_BOUND, num_of_pvc=num_of_pvc, wait_each=False, timeout=90, ) if operation_to_disrupt == "create_pvc": # Ensure PVCs are being created before deleting the resource ret = helpers.wait_for_resource_count_change( get_all_pvcs, initial_num_of_pvc, namespace, "increase") assert ret, "Wait timeout: PVCs are not being created." logger.info("PVCs creation has started.") DISRUPTION_OPS.delete_resource() pvc_objs = bulk_pvc_create.result() # Confirm that PVCs are Bound for pvc_obj in pvc_objs: helpers.wait_for_resource_state(resource=pvc_obj, state=constants.STATUS_BOUND, timeout=120) pvc_obj.reload() logger.info("Verified: PVCs are Bound.") # Start creating pods bulk_pod_create = executor.submit( helpers.create_pods, pvc_objs, pod_factory, interface, 2, nodes=node.get_worker_nodes(), ) if operation_to_disrupt == "create_pod": # Ensure that pods are being created before deleting the resource ret = helpers.wait_for_resource_count_change( pod.get_all_pods, initial_num_of_pods, namespace, "increase") assert ret, "Wait timeout: Pods are not being created." 
logger.info("Pods creation has started.") DISRUPTION_OPS.delete_resource() pod_objs = bulk_pod_create.result() # Verify pods are Running for pod_obj in pod_objs: helpers.wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING, timeout=90) pod_obj.reload() logger.info("Verified: All pods are Running.") # Do setup on pods for running IO logger.info("Setting up pods for running IO.") for pod_obj in pod_objs: pvc_info = pod_obj.pvc.get() if pvc_info["spec"]["volumeMode"] == "Block": storage_type = "block" else: storage_type = "fs" executor.submit(pod_obj.workload_setup, storage_type=storage_type) # Wait for setup on pods to complete for pod_obj in pod_objs: logger.info( f"Waiting for IO setup to complete on pod {pod_obj.name}") for sample in TimeoutSampler(360, 2, getattr, pod_obj, "wl_setup_done"): if sample: logger.info(f"Setup for running IO is completed on pod " f"{pod_obj.name}.") break logger.info("Setup for running IO is completed on all pods.") # Start IO on each pod for pod_obj in pod_objs: pvc_info = pod_obj.pvc.get() if pvc_info["spec"]["volumeMode"] == "Block": storage_type = "block" else: storage_type = "fs" pod_obj.run_io( storage_type=storage_type, size="1G", runtime=10, fio_filename=f"{pod_obj.name}_io_file1", ) logger.info("FIO started on all pods.") if operation_to_disrupt == "run_io": DISRUPTION_OPS.delete_resource() logger.info("Fetching FIO results.") for pod_obj in pod_objs: fio_result = pod_obj.get_fio_results() err_count = fio_result.get("jobs")[0].get("error") assert ( err_count == 0 ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}" logger.info("Verified FIO result on pods.") # Delete pods for pod_obj in pod_objs: pod_obj.delete(wait=True) for pod_obj in pod_objs: pod_obj.ocp.wait_for_delete(pod_obj.name) # Verify that PVCs are reusable by creating new pods pod_objs = helpers.create_pods( pvc_objs, pod_factory, interface, 2, nodes=node.get_worker_nodes(), ) # Verify new pods are Running for pod_obj in pod_objs: helpers.wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING, timeout=90) pod_obj.reload() logger.info("Verified: All new pods are Running.") # Run IO on each of the new pods for pod_obj in pod_objs: pvc_info = pod_obj.pvc.get() if pvc_info["spec"]["volumeMode"] == "Block": storage_type = "block" else: storage_type = "fs" pod_obj.run_io( storage_type=storage_type, size="1G", runtime=10, fio_filename=f"{pod_obj.name}_io_file2", ) logger.info("Fetching FIO results from new pods") for pod_obj in pod_objs: fio_result = pod_obj.get_fio_results() err_count = fio_result.get("jobs")[0].get("error") assert ( err_count == 0 ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}" logger.info("Verified FIO result on new pods.") # Verify number of pods of type 'resource_to_delete' final_num_resource_to_delete = len(pod_functions[resource_to_delete]()) assert final_num_resource_to_delete == num_of_resource_to_delete, ( f"Total number of {resource_to_delete} pods is not matching with " f"initial value. Total number of pods before deleting a pod: " f"{num_of_resource_to_delete}. Total number of pods present now: " f"{final_num_resource_to_delete}") # Check ceph status ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"]) logger.info("Ceph cluster health is OK")
def test_monitoring_when_one_of_the_prometheus_node_down( self, test_fixture): """ Test case to validate when the prometheus pod is down and interaction with prometheus """ namespace_list, pvc_objs, pod_objs, sc = test_fixture aws_obj = aws.AWS() # Get all the openshift-monitoring pods monitoring_pod_obj_list = pod.get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE) # Get the worker node list workers = get_typed_nodes(node_type='worker') # Get all prometheus pods pod_obj_list = pod.get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']) for pod_obj in pod_obj_list: # Get the node where the prometheus pod is hosted prometheus_pod_obj = pod_obj.get() prometheus_node = prometheus_pod_obj['spec']['nodeName'] prometheus_node = [ node for node in workers if node.get().get('metadata').get('name') == prometheus_node ] # Make one of the node down where the prometheus pod is hosted instances = aws.get_instances_ids_and_names(prometheus_node) aws_obj.restart_ec2_instances(instances=instances, wait=True, force=True) # Validate all nodes are in READY state wait_for_nodes_status() # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check() # Check all the monitoring pods are up for pod_obj in monitoring_pod_obj_list: wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING) # Check for the created pvc metrics after nodes restarting for pvc_obj in pvc_objs: assert check_pvcdata_collected_on_prometheus(pvc_obj.name), ( f"On prometheus pod for created pvc {pvc_obj.name} related data is not collected" ) # Create projects after restarting nodes namespaces = helpers.create_multilpe_projects(number_of_project=1) namespace_list.extend(namespaces) # Create pvcs after restarting nodes pvcs = [ helpers.create_pvc(sc_name=sc.name, namespace=each_namespace.namespace) for each_namespace in namespaces ] for pvc_obj in pvcs: helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND) pvc_obj.reload() pvc_objs.extend(pvcs) # Create app pods after restarting nodes pods = [ helpers.create_pod(interface_type=constants.CEPHBLOCKPOOL, pvc_name=each_pvc.name, namespace=each_pvc.namespace) for each_pvc in pvcs ] for pod_obj in pods: helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING) pod_obj.reload() pod_objs.extend(pods) # Check for the created pvc metrics on prometheus pod after restarting nodes for pvc_obj in pvcs: assert check_pvcdata_collected_on_prometheus(pvc_obj.name), ( f"On prometheus pod for created pvc {pvc_obj.name} related data is not collected" )
def test_monitoring_delete_pvc(self):
    """
    Test case to validate that deleting the PVCs and configmap used by the
    monitoring stack, and then restoring them, has no functional impact
    """
    # Get 'cluster-monitoring-config' configmap
    ocp_configmap = ocp.OCP(namespace=constants.MONITORING_NAMESPACE,
                            kind='configmap')
    configmap_dict = ocp_configmap.get(
        resource_name='cluster-monitoring-config')
    dir_configmap = tempfile.mkdtemp(prefix='configmap_')
    yaml_file = f'{dir_configmap}/configmap.yaml'
    templating.dump_data_to_temp_yaml(configmap_dict, yaml_file)

    # Get prometheus and alertmanager pods
    prometheus_alertmanager_pods = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE,
        selector=['prometheus', 'alertmanager'])

    # Get all pvc on monitoring namespace
    pvc_objs_list = pvc.get_all_pvc_objs(
        namespace=constants.MONITORING_NAMESPACE)

    # Delete configmap
    ocp_configmap.delete(resource_name='cluster-monitoring-config')

    # Delete all pvcs on monitoring namespace
    pvc.delete_pvcs(pvc_objs=pvc_objs_list)

    # Check all the prometheus and alertmanager pods are up
    for pod_obj in prometheus_alertmanager_pods:
        wait_for_resource_state(resource=pod_obj,
                                state=constants.STATUS_RUNNING,
                                timeout=180)

    # Create configmap
    ocp_configmap.create(yaml_file=dir_configmap)

    # Check all the PVCs are up
    for pvc_obj in pvc_objs_list:
        wait_for_resource_state(resource=pvc_obj,
                                state=constants.STATUS_BOUND,
                                timeout=180)

    # Check all the prometheus and alertmanager pods are up
    # and pvc are mounted on monitoring pods
    for pod_obj in prometheus_alertmanager_pods:
        wait_for_resource_state(resource=pod_obj,
                                state=constants.STATUS_RUNNING,
                                timeout=180)
        mount_point = pod_obj.exec_cmd_on_pod(
            command="df -kh",
            out_yaml_format=False,
        )
        assert "/dev/rbd" in mount_point, (
            f"pvc is not mounted on pod {pod_obj.name}")
    log.info("Verified all PVCs are mounted on monitoring pods")

    # Validate the prometheus health is ok
    assert prometheus_health_check(), (
        "Prometheus cluster health is not OK")
def test_monitoring_after_draining_node_where_prometheus_hosted(self, pods):
    """
    Test case to validate that when the node hosting the prometheus pod
    is drained, the prometheus pod re-spins on a new healthy node
    without any data/metrics loss
    """
    # Get the prometheus pod
    pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE,
        selector=['prometheus'])

    for pod_obj in pod_obj_list:
        # Get the pvc which mounted on prometheus pod
        pod_info = pod_obj.get()
        pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
            'claimName']

        # Get the node where the prometheus pod is hosted
        prometheus_pod_obj = pod_obj.get()
        prometheus_node = prometheus_pod_obj['spec']['nodeName']

        # Drain node where the prometheus pod hosted
        drain_nodes([prometheus_node])

        # Validate node is in SchedulingDisabled state
        wait_for_nodes_status(
            [prometheus_node],
            status=constants.NODE_READY_SCHEDULING_DISABLED)

        # Validate all prometheus pods are running
        POD = ocp.OCP(kind=constants.POD,
                      namespace=defaults.OCS_MONITORING_NAMESPACE)
        assert POD.wait_for_resource(
            condition='Running', selector='app=prometheus', timeout=180), (
            "One or more prometheus pods are not in running state")

        # Validate prometheus pod is re-spinned on new healthy node
        pod_info = pod_obj.get()
        new_node = pod_info['spec']['nodeName']
        assert new_node != prometheus_node, (
            'Prometheus pod not re-spinned on new node')
        log.info(f"Prometheus pod re-spinned on new node {new_node}")

        # Validate same pvc is mounted on prometheus pod
        assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
            'claimName'] == pvc_name, (
            f"Old pvc not found after restarting the prometheus pod "
            f"{pod_obj.name}")

        # Validate the prometheus health is ok
        assert prometheus_health_check(), (
            "Prometheus cluster health is not OK")

        # Mark the nodes back to schedulable
        schedule_nodes([prometheus_node])

        # Wait some time after node scheduling back
        waiting_time = 30
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)

        # Validate node is in Ready State
        wait_for_nodes_status([prometheus_node], status=constants.NODE_READY)

        # Validate ceph health OK
        ceph_health_check(tries=40, delay=30)

    # Check the nodes are in Ready state and the cluster health is ok
    self.sanity_helpers.health_check()

    # Check for the created pvc metrics after draining the node
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"On prometheus pod for created pvc {pod_obj.pvc.name} "
            f"related data is not collected")
def test_must_gather(self): """ Tests functionality of: oc adm must-gather """ # Fetch pod details pods = pod.get_all_pods(namespace='openshift-storage') pods = [each.name for each in pods] # Make logs root directory logger.info("Creating logs Directory") directory = self.make_directory() logger.info(f"Creating {directory}_ocs_logs - Done!") # Collect OCS logs logger.info("Collecting Logs") collect_ocs_logs(dir_name=directory, ocp=False) logger.info("Collecting logs - Done!") # Compare running pods list to "/pods" subdirectories must_gather_helper = re.compile(r'must-gather-.*.-helper') logger.info("Checking logs tree") logs = [ logs for logs in self.get_log_directories(directory) if not (must_gather_helper.match(logs)) ] logger.info(f"Logs: {logs}") logger.info(f"pods list: {pods}") assert set(sorted(logs)) == set(sorted(pods)), ( f"List of openshift-storage pods are not equal to list of logs " f"directories list of pods: {pods} list of log directories: {logs}" ) # 2nd test: Verify logs file are not empty logs_dir_list = self.search_log_files(directory) assert self.check_file_size(logs_dir_list), ( "One or more log file are empty") # Find must_gather_commands directory for verification for dir_root, dirs, files in os.walk(directory + "_ocs_logs"): if os.path.basename(dir_root) == 'must_gather_commands': logger.info( f"Found must_gather_commands directory - {dir_root}") assert 'json_output' in dirs, ( "json_output directory is not present in " "must_gather_commands directory.") assert files, ( "No files present in must_gather_commands directory.") cmd_files_path = [ os.path.join(dir_root, file_name) for file_name in files ] json_output_dir = os.path.join(dir_root, 'json_output') break # Verify that command output files are present as expected assert sorted(constants.MUST_GATHER_COMMANDS) == sorted(files), ( f"Actual and expected commands output files are not matching.\n" f"Actual: {files}\nExpected: {constants.MUST_GATHER_COMMANDS}") # Verify that files for command output in json are present as expected commands_json = os.listdir(json_output_dir) assert sorted( constants.MUST_GATHER_COMMANDS_JSON) == sorted(commands_json), ( f"Actual and expected json output commands files are not " f"matching.\nActual: {commands_json}\n" f"Expected: {constants.MUST_GATHER_COMMANDS_JSON}") # Verify that command output files are not empty empty_files = [] json_cmd_files_path = [ os.path.join(json_output_dir, file_name) for file_name in commands_json ] for file_path in cmd_files_path + json_cmd_files_path: if not os.path.getsize(file_path) > 0: empty_files.append(file_path) assert not empty_files, f"These files are empty: {empty_files}"
def test_ceph_daemon_kill_during_pod_pvc_deletion(self, interface, operation_to_disrupt, resource_name, setup_base): """ Kill 'resource_name' daemon while deletion of PVCs/pods is progressing """ pvc_objs, self.pod_objs = setup_base sc_obj = pvc_objs[0].storageclass self.namespace = pvc_objs[0].project.namespace pod_functions = { 'mds': partial(get_mds_pods), 'mon': partial(get_mon_pods), 'mgr': partial(get_mgr_pods), 'osd': partial(get_osd_pods), 'rbdplugin': partial(get_plugin_pods, interface=interface), 'cephfsplugin': partial(get_plugin_pods, interface=interface), 'cephfsplugin_provisioner': partial(get_cephfsplugin_provisioner_pods), 'rbdplugin_provisioner': partial(get_rbdfsplugin_provisioner_pods), 'operator': partial(get_operator_pods) } disruption = disruption_helpers.Disruptions() disruption.set_resource(resource=resource_name) executor = ThreadPoolExecutor(max_workers=1) # Get number of pods of type 'resource_name' num_of_resource_pods = len(pod_functions[resource_name]()) # Fetch the number of Pods and PVCs initial_num_of_pods = len(get_all_pods(namespace=self.namespace)) initial_num_of_pvc = len( get_all_pvcs(namespace=self.namespace)['items']) # Fetch PV names pv_objs = [] for pvc_obj in pvc_objs: pvc_obj.reload() pv_objs.append(pvc_obj.backed_pv_obj) # Fetch volume details from pods for the purpose of verification node_pv_dict = {} for pod_obj in self.pod_objs: pod_info = pod_obj.get() node = pod_info['spec']['nodeName'] pvc = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][ 'claimName'] for pvc_obj in pvc_objs: if pvc_obj.name == pvc: pvc_obj.reload() pv = pvc_obj.backed_pv break if node in node_pv_dict: node_pv_dict[node].append(pv) else: node_pv_dict[node] = [pv] # Do setup for running IO on pods log.info("Setting up pods for running IO") for pod_obj in self.pod_objs: pod_obj.workload_setup(storage_type='fs') log.info("Setup for running IO is completed on pods") # Start IO on each pod. RWX PVC will be used on two pods. So split the # size accordingly log.info("Starting IO on pods") for pod_obj in self.pod_objs: if pod_obj.pvc.access_mode == constants.ACCESS_MODE_RWX: io_size = int((self.pvc_size - 1) / 2) else: io_size = self.pvc_size - 1 pod_obj.run_io(storage_type='fs', size=f'{io_size}G', fio_filename=f'{pod_obj.name}_io') log.info("IO started on all pods.") # Set the daemon to be killed disruption.select_daemon() # Start deleting pods pod_bulk_delete = executor.submit(self.delete_pods) if operation_to_disrupt == 'delete_pods': ret = self.verify_resource_deletion(get_all_pods, initial_num_of_pods) assert ret, "Wait timeout: Pods are not being deleted." log.info("Pods deletion has started.") disruption.kill_daemon() pods_deleted = pod_bulk_delete.result() assert pods_deleted, "Deletion of pods failed." 
# Verify pods are deleted for pod_obj in self.pod_objs: assert pod_obj.ocp.wait_for_delete( pod_obj.name, 180), (f"Pod {pod_obj.name} is not deleted") log.info("Verified: Pods are deleted.") # Verify that the mount point is removed from nodes after deleting pod for node, pvs in node_pv_dict.items(): cmd = f'oc debug nodes/{node} -- df' df_on_node = run_cmd(cmd) for pv in pvs: assert pv not in df_on_node, ( f"{pv} is still present on node {node} after " f"deleting the pods.") log.info( "Verified: mount points are removed from nodes after deleting " "the pods.") # Fetch image uuid associated with PVCs pvc_uuid_map = {} for pvc_obj in pvc_objs: pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid log.info("Fetched image uuid associated with each PVC") # Start deleting PVCs pvc_bulk_delete = executor.submit(delete_pvcs, pvc_objs) if operation_to_disrupt == 'delete_pvcs': ret = self.verify_resource_deletion(get_all_pvcs, initial_num_of_pvc) assert ret, "Wait timeout: PVCs are not being deleted." log.info("PVCs deletion has started.") disruption.kill_daemon() pvcs_deleted = pvc_bulk_delete.result() assert pvcs_deleted, "Deletion of PVCs failed." # Verify PVCs are deleted for pvc_obj in pvc_objs: assert pvc_obj.ocp.wait_for_delete( pvc_obj.name), (f"PVC {pvc_obj.name} is not deleted") log.info("Verified: PVCs are deleted.") # Verify PVs are deleted for pv_obj in pv_objs: assert pv_obj.ocp.wait_for_delete( pv_obj.name, 120), (f"PV {pv_obj.name} is not deleted") log.info("Verified: PVs are deleted.") # Verify PV using ceph toolbox. Image/Subvolume should be deleted. for pvc_name, uuid in pvc_uuid_map.items(): if interface == constants.CEPHBLOCKPOOL: ret = verify_volume_deleted_in_backend( interface=interface, image_uuid=uuid, pool_name=sc_obj.ceph_pool.name) if interface == constants.CEPHFILESYSTEM: ret = verify_volume_deleted_in_backend(interface=interface, image_uuid=uuid) assert ret, (f"Volume associated with PVC {pvc_name} still exists " f"in backend") # Verify number of pods of type 'resource_name' final_num_of_resource_pods = len(pod_functions[resource_name]()) assert final_num_of_resource_pods == num_of_resource_pods, ( f"Total number of {resource_name} pods is not matching with " f"initial value. Total number of pods before daemon kill: " f"{num_of_resource_pods}. Total number of pods present now: " f"{final_num_of_resource_pods}") # Check ceph status ceph_health_check(namespace=config.ENV_DATA['cluster_namespace']) log.info("Ceph cluster health is OK")
def deploy_ocs(self): """ Handle OCS deployment, since OCS deployment steps are common to any platform, implementing OCS deployment here in base class. """ _templating = templating.Templating() ceph_cluster = ocp.OCP( kind='CephCluster', namespace=self.namespace ) try: ceph_cluster.get().get('items')[0] logger.warning("OCS cluster already exists") return except (IndexError, CommandFailed): logger.info("Running OCS basic installation") if not self.ocs_operator_deployment: create_oc_resource( 'common.yaml', self.cluster_path, _templating, config.ENV_DATA ) run_cmd( f'oc label namespace {config.ENV_DATA["cluster_namespace"]} ' f'"openshift.io/cluster-monitoring=true"' ) run_cmd( f"oc policy add-role-to-user view " f"system:serviceaccount:openshift-monitoring:prometheus-k8s " f"-n {self.namespace}" ) # HACK: If you would like to drop this hack, make sure that you # also updated docs and write appropriate unit/integration tests # for config processing. if config.ENV_DATA.get('monitoring_enabled') in ( "true", "True", True ): # RBAC rules for monitoring, based on documentation change in # rook: # https://github.com/rook/rook/commit/1b6fe840f6ae7372a9675ba727ecc65326708aa8 # HACK: This should be dropped when OCS is managed by OLM apply_oc_resource( 'rbac.yaml', self.cluster_path, _templating, config.ENV_DATA, template_dir="monitoring" ) # Increased to 15 seconds as 10 is not enough # TODO: do the sampler function and check if resource exist wait_time = 15 logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) create_oc_resource( 'operator-openshift.yaml', self.cluster_path, _templating, config.ENV_DATA ) logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) run_cmd( f"oc wait --for condition=ready pod " f"-l app=rook-ceph-operator " f"-n {self.namespace} " f"--timeout=120s" ) run_cmd( f"oc wait --for condition=ready pod " f"-l app=rook-discover " f"-n {self.namespace} " f"--timeout=120s" ) create_oc_resource( 'cluster.yaml', self.cluster_path, _templating, config.ENV_DATA ) else: self.deploy_ocs_via_operator() pod = ocp.OCP( kind=constants.POD, namespace=self.namespace ) cfs = ocp.OCP( kind=constants.CEPHFILESYSTEM, namespace=self.namespace ) # Check for Ceph pods assert pod.wait_for_resource( condition='Running', selector='app=rook-ceph-mon', resource_count=3, timeout=600 ) assert pod.wait_for_resource( condition='Running', selector='app=rook-ceph-mgr', timeout=600 ) assert pod.wait_for_resource( condition='Running', selector='app=rook-ceph-osd', resource_count=3, timeout=600 ) # validate ceph mon/osd volumes are backed by pvc validate_cluster_on_pvc() # Creating toolbox pod setup_ceph_toolbox() assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector='app=rook-ceph-tools', resource_count=1, timeout=600 ) if not self.ocs_operator_deployment: logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) # HACK: This should be dropped (including service-monitor.yaml and # prometheus-rules.yaml files) when OCS is managed by OLM if config.ENV_DATA.get('monitoring_enabled') not in ( "true", "True", True ): # HACK: skip creation of rook-ceph-mgr service monitor when # monitoring is enabled (if this were not skipped, the step # would fail because rook would create the service monitor at # this point already) create_oc_resource( "service-monitor.yaml", self.cluster_path, _templating, config.ENV_DATA ) # HACK: skip creation of prometheus-rules, rook-ceph is # concerned with it's setup now, based on clarification from # Umanga Chapagain 
create_oc_resource( "prometheus-rules.yaml", self.cluster_path, _templating, config.ENV_DATA ) logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) # Create MDS pods for CephFileSystem fs_data = templating.load_yaml(constants.CEPHFILESYSTEM_YAML) fs_data['metadata']['namespace'] = self.namespace ceph_obj = OCS(**fs_data) ceph_obj.create() assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector='app=rook-ceph-mds', resource_count=2, timeout=600 ) # Check for CephFilesystem creation in ocp cfs_data = cfs.get() cfs_name = cfs_data['items'][0]['metadata']['name'] if helpers.validate_cephfilesystem(cfs_name): logger.info(f"MDS deployment is successful!") defaults.CEPHFILESYSTEM_NAME = cfs_name else: logger.error( f"MDS deployment Failed! Please check logs!" ) if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get('persistent-monitoring'): # Create a pool, secrets and sc secret_obj = helpers.create_secret(interface_type=constants.CEPHBLOCKPOOL) cbj_obj = helpers.create_ceph_block_pool() sc_obj = helpers.create_storage_class( interface_type=constants.CEPHBLOCKPOOL, interface_name=cbj_obj.name, secret_name=secret_obj.name ) # Get the list of monitoring pods pods_list = get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus', 'alertmanager'] ) # Create configmap cluster-monitoring-config create_configmap_cluster_monitoring_pod(sc_obj.name) # Take some time to respin the pod waiting_time = 30 logger.info(f"Waiting {waiting_time} seconds...") time.sleep(waiting_time) # Validate the pods are respinned and in running state validate_pods_are_respinned_and_running_state( pods_list ) # Validate the pvc is created on monitoring pods validate_pvc_created_and_bound_on_monitoring_pods() # Validate the pvc are mounted on pods validate_pvc_are_mounted_on_monitoring_pods(pods_list) # Change registry backend to OCS CEPHFS RWX PVC registry.change_registry_backend_to_ocs() # Verify health of ceph cluster # TODO: move destroy cluster logic to new CLI usage pattern? logger.info("Done creating rook resources, waiting for HEALTH_OK") assert ceph_health_check( namespace=self.namespace ) # patch gp2/thin storage class as 'non-default' self.patch_default_sc_to_non_default()
def ocs_install_verification(timeout=600, skip_osd_distribution_check=False): """ Perform steps necessary to verify a successful OCS installation Args: timeout (int): Number of seconds for timeout which will be used in the checks used in this function. skip_osd_distribution_check (bool): If true skip the check for osd distribution. """ from ocs_ci.ocs.node import get_typed_nodes from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods number_of_worker_nodes = len(get_typed_nodes()) namespace = config.ENV_DATA['cluster_namespace'] log.info("Verifying OCS installation") # Verify OCS CSV is in Succeeded phase log.info("verifying ocs csv") operator_selector = get_selector_for_ocs_operator() ocs_package_manifest = PackageManifest( resource_name=defaults.OCS_OPERATOR_NAME, selector=operator_selector, ) ocs_csv_name = ocs_package_manifest.get_current_csv() ocs_csv = CSV(resource_name=ocs_csv_name, namespace=namespace) log.info(f"Check if OCS operator: {ocs_csv_name} is in Succeeded phase.") ocs_csv.wait_for_phase(phase="Succeeded", timeout=timeout) # Verify OCS Cluster Service (ocs-storagecluster) is Ready storage_cluster_name = config.ENV_DATA['storage_cluster_name'] log.info("Verifying status of storage cluster: %s", storage_cluster_name) storage_cluster = StorageCluster( resource_name=storage_cluster_name, namespace=namespace, ) log.info(f"Check if StorageCluster: {storage_cluster_name} is in" f"Succeeded phase") storage_cluster.wait_for_phase(phase='Ready', timeout=timeout) # Verify pods in running state and proper counts log.info("Verifying pod states and counts") pod = OCP(kind=constants.POD, namespace=namespace) # ocs-operator assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.OCS_OPERATOR_LABEL, timeout=timeout) # rook-ceph-operator assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.OPERATOR_LABEL, timeout=timeout) # noobaa assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.NOOBAA_APP_LABEL, resource_count=2, timeout=timeout) # mons assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.MON_APP_LABEL, resource_count=3, timeout=timeout) # csi-cephfsplugin assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.CSI_CEPHFSPLUGIN_LABEL, resource_count=number_of_worker_nodes, timeout=timeout) # csi-cephfsplugin-provisioner assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL, resource_count=2, timeout=timeout) # csi-rbdplugin assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.CSI_RBDPLUGIN_LABEL, resource_count=number_of_worker_nodes, timeout=timeout) # csi-rbdplugin-provisioner assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL, resource_count=2, timeout=timeout) # osds osd_count = ( int(storage_cluster.data['spec']['storageDeviceSets'][0]['count']) * int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica'])) assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.OSD_APP_LABEL, resource_count=osd_count, timeout=timeout) # mgr assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.MGR_APP_LABEL, timeout=timeout) # mds assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.MDS_APP_LABEL, resource_count=2, 
timeout=timeout) # rgw check only for VmWare if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM: assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.RGW_APP_LABEL, resource_count=1, timeout=timeout) # Verify ceph health log.info("Verifying ceph health") assert utils.ceph_health_check(namespace=namespace) # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd) log.info("Verifying storage classes") storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace) storage_cluster_name = config.ENV_DATA['storage_cluster_name'] required_storage_classes = { f'{storage_cluster_name}-cephfs', f'{storage_cluster_name}-ceph-rbd' } storage_classes = storage_class.get() storage_class_names = { item['metadata']['name'] for item in storage_classes['items'] } assert required_storage_classes.issubset(storage_class_names) # Verify OSD's are distributed if not skip_osd_distribution_check: log.info("Verifying OSD's are distributed evenly across worker nodes") ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace) osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items'] node_names = [osd['spec']['nodeName'] for osd in osds] for node in node_names: assert not node_names.count(node) > 1, ( "OSD's are not distributed evenly across worker nodes") # Verify that CSI driver object contains provisioner names log.info("Verifying CSI driver object contains provisioner names.") csi_driver = OCP(kind="CSIDriver") assert {defaults.CEPHFS_PROVISIONER, defaults.RBD_PROVISIONER} == ({ item['metadata']['name'] for item in csi_driver.get()['items'] }) # Verify node and provisioner secret names in storage class log.info("Verifying node and provisioner secret names in storage class.") sc_rbd = storage_class.get( resource_name=constants.DEFAULT_STORAGECLASS_RBD) sc_cephfs = storage_class.get( resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS) assert sc_rbd['parameters'][ 'csi.storage.k8s.io/node-stage-secret-name'] == constants.RBD_NODE_SECRET assert sc_rbd['parameters'][ 'csi.storage.k8s.io/provisioner-secret-name'] == constants.RBD_PROVISIONER_SECRET assert sc_cephfs['parameters'][ 'csi.storage.k8s.io/node-stage-secret-name'] == constants.CEPHFS_NODE_SECRET assert sc_cephfs['parameters'][ 'csi.storage.k8s.io/provisioner-secret-name'] == constants.CEPHFS_PROVISIONER_SECRET log.info("Verified node and provisioner secret names in storage class.") # Verify ceph osd tree output log.info( "Verifying ceph osd tree output and checking for device set PVC names " "in the output.") deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()] ct_pod = get_ceph_tools_pod() osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json') schemas = { 'root': constants.OSD_TREE_ROOT, 'rack': constants.OSD_TREE_RACK, 'host': constants.OSD_TREE_HOST, 'osd': constants.OSD_TREE_OSD, 'region': constants.OSD_TREE_REGION, 'zone': constants.OSD_TREE_ZONE } schemas['host']['properties']['name'] = {'enum': deviceset_pvcs} for item in osd_tree['nodes']: validate(instance=item, schema=schemas[item['type']]) if item['type'] == 'host': deviceset_pvcs.remove(item['name']) assert not deviceset_pvcs, ( f"These device set PVCs are not given in ceph osd tree output " f"- {deviceset_pvcs}") log.info( "Verified ceph osd tree output. 
Device set PVC names are given in the " "output.") # TODO: Verify ceph osd tree output have osd listed as ssd # TODO: Verify ceph osd tree output have zone or rack based on AZ # Verify CSI snapshotter sidecar container is not present log.info("Verifying CSI snapshotter is not present.") provisioner_pods = get_all_pods( namespace=defaults.ROOK_CLUSTER_NAMESPACE, selector=[ constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL, constants.CSI_RBDPLUGIN_PROVISIONER_LABEL ]) for pod_obj in provisioner_pods: pod_info = pod_obj.get() for container, image in get_images(data=pod_info).items(): assert ('snapshot' not in container) and ( 'snapshot' not in image), ( f"Snapshot container is present in {pod_obj.name} pod. " f"Container {container}. Image {image}") assert { 'name': 'CSI_ENABLE_SNAPSHOTTER', 'value': 'false' } in (ocs_csv.get()['spec']['install']['spec']['deployments'][0]['spec'] ['template']['spec']['containers'][0]['env'] ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'." log.info("Verified: CSI snapshotter is not present.") # Verify pool crush rule is with "type": "zone" if utils.get_az_count() == 3: log.info("Verifying pool crush rule is with type: zone") crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd crush dump', format='') pool_names = [ constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL, constants.DATA_POOL ] crush_rules = [ rule for rule in crush_dump['rules'] if rule['rule_name'] in pool_names ] for crush_rule in crush_rules: assert [ item for item in crush_rule['steps'] if item.get('type') == 'zone' ], f"{crush_rule['rule_name']} is not with type as zone" log.info("Verified - pool crush rule is with type: zone")
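The osd tree verification above delegates the structural checks to jsonschema's validate call. A small self-contained sketch of how that call behaves; the schema and the node dict below are illustrative toys, not the real OSD_TREE_* constants:

# Illustrative only: mimics the validate() usage in the osd tree check above.
from jsonschema import validate, ValidationError

host_schema = {
    "type": "object",
    "properties": {
        "type": {"enum": ["host"]},
        "name": {"type": "string"},
    },
    "required": ["type", "name"],
}
osd_tree_node = {"type": "host", "name": "ocs-deviceset-0-data-0"}  # toy node

try:
    validate(instance=osd_tree_node, schema=host_schema)
    print("node matches the host schema")
except ValidationError as err:
    print(f"schema mismatch: {err.message}")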
def uninstall_ocs():
    """
    The function uninstalls the OCS operator from an OpenShift cluster and
    removes all its settings and dependencies
    """
    ocp_obj = ocp.OCP()
    log.info("deleting volume snapshots")
    vs_ocp_obj = ocp.OCP(kind=constants.VOLUMESNAPSHOT)
    vs_list = vs_ocp_obj.get(all_namespaces=True)["items"]
    for vs in vs_list:
        vs_obj = ocp.OCP(kind=constants.VOLUMESNAPSHOT,
                         namespace=vs.get("metadata").get("namespace"))
        vs_obj.delete(resource_name=vs.get("metadata").get("name"))
    log.info("Querying for OCS PVCs")
    provisioners = constants.OCS_PROVISIONERS
    sc_list = [
        sc for sc in get_all_storageclass()
        if sc.get("provisioner") in provisioners
    ]
    pvc_to_delete = []
    for sc in sc_list:
        pvc_to_delete.extend(pvc for pvc in get_all_pvcs_in_storageclass(
            sc.get("metadata").get("name")) if "noobaa" not in pvc.name)
    if config.ENV_DATA["platform"].lower() == constants.ROSA_PLATFORM:
        log.info("Deleting OCS PVCs")
        for pvc in pvc_to_delete:
            log.info(f"Deleting PVC: {pvc.name}")
            pvc.delete()
        rosa.delete_odf_addon(config.ENV_DATA["cluster_name"])
        return None
    log.info("Removing monitoring stack from OpenShift Container Storage")
    remove_monitoring_stack_from_ocs()
    log.info(
        "Removing OpenShift Container Platform registry from OpenShift Container Storage"
    )
    remove_ocp_registry_from_ocs(config.ENV_DATA["platform"])
    log.info(
        "Removing the cluster logging operator from OpenShift Container Storage"
    )
    try:
        remove_cluster_logging_operator_from_ocs()
    except CommandFailed:
        log.info("No cluster logging found")
    log.info("Deleting OCS PVCs")
    for pvc in pvc_to_delete:
        log.info(f"Deleting PVC: {pvc.name}")
        pvc.delete()
    storage_cluster = ocp.OCP(
        kind=constants.STORAGECLUSTER,
        resource_name=constants.DEFAULT_CLUSTERNAME,
        namespace="openshift-storage",
    )
    log.info("Checking for local storage")
    lso_sc = None
    if check_local_volume_local_volume_set():
        log.info("Local volume was found. Will be removed later")
        lso_sc = (storage_cluster.get().get("spec").get("storageDeviceSets")[0]
                  .get("dataPVCTemplate").get("spec").get("storageClassName"))
    cleanup_policy = (storage_cluster.get().get("metadata").get(
        "annotations").get("uninstall.ocs.openshift.io/cleanup-policy"))
    log.info("Deleting storageCluster object")
    storage_cluster.delete(resource_name=constants.DEFAULT_CLUSTERNAME)
    if cleanup_policy == "delete":
        log.info("Cleanup policy set to delete. Checking cleanup pods")
        cleanup_pods = [
            pod for pod in get_all_pods() if "cluster-cleanup-job" in pod.name
        ]
        for pod in cleanup_pods:
            while pod.get().get("status").get("phase") != "Succeeded":
                log.info(f"waiting for cleanup pod {pod.name} to complete")
                # Sleep between polls so the API server is not queried in a
                # tight loop
                time.sleep(10)
            log.info(f"Cleanup pod {pod.name} completed successfully")
        # no need to confirm /var/lib/rook was deleted from nodes if all
        # cleanup pods are completed.
    else:
        log.info("Cleanup policy set to retain. Skipping nodes cleanup")
    log.info("Deleting openshift-storage namespace")
    ocp_obj.delete_project(constants.OPENSHIFT_STORAGE_NAMESPACE)
    ocp_obj.wait_for_delete(constants.OPENSHIFT_STORAGE_NAMESPACE)
    switch_to_project(constants.DEFAULT_NAMESPACE)
    # step 10: TODO remove crypto from nodes.
    """for node in storage_node_list:
        log.info(f"removing encryption from {node}")
        ocp_obj.exec_oc_debug_cmd(node=node, cmd_list=[])"""
    if lso_sc is not None:
        log.info("Removing LSO")
        try:
            uninstall_lso(lso_sc)
        except Exception as e:
            log.info(f"LSO removal failed. {e}")
    log.info("deleting noobaa storage class")
    noobaa_sc = ocp.OCP(kind=constants.STORAGECLASS)
    noobaa_sc.delete(resource_name=constants.NOOBAA_SC)
    nodes = get_all_nodes()
    node_objs = get_node_objs(nodes)
    log.info("Unlabeling storage nodes")
    label_nodes(nodes=node_objs,
                label=constants.OPERATOR_NODE_LABEL[:-3] + "-")
    label_nodes(nodes=node_objs, label=constants.TOPOLOGY_ROOK_LABEL + "-")
    log.info("Removing taints from storage nodes")
    taint_nodes(nodes=nodes, taint_label=constants.OPERATOR_NODE_TAINT + "-")
    log.info("Deleting remaining OCS PVs (if there are any)")
    try:
        rbd_pv = ocp.OCP(kind=constants.PV,
                         resource_name="ocs-storagecluster-ceph-rbd")
        fs_pv = ocp.OCP(kind=constants.PV,
                        resource_name="ocs-storagecluster-cephfs")
        rbd_pv.delete()
        fs_pv.delete()
        log.info("OCS PVs deleted")
    except Exception as e:
        log.info(f"OCS PV(s) not found. {e}")
    log.info("Removing CRDs")
    crd_list = [
        "backingstores.noobaa.io",
        "bucketclasses.noobaa.io",
        "cephblockpools.ceph.rook.io",
        "cephclusters.ceph.rook.io",
        "cephfilesystems.ceph.rook.io",
        "cephnfses.ceph.rook.io",
        "cephobjectstores.ceph.rook.io",
        "cephobjectstoreusers.ceph.rook.io",
        "noobaas.noobaa.io",
        "ocsinitializations.ocs.openshift.io",
        "storageclusters.ocs.openshift.io",
        "cephclients.ceph.rook.io",
        "cephobjectrealms.ceph.rook.io",
        "cephobjectzonegroups.ceph.rook.io",
        "cephobjectzones.ceph.rook.io",
        "cephrbdmirrors.ceph.rook.io",
    ]
    for crd in crd_list:
        try:
            ocp_obj.exec_oc_cmd(f"delete crd {crd} --timeout=300m")
        except Exception:
            log.info(f"crd {crd} was not found")
def deploy_ocs(self): """ Handle OCS deployment, since OCS deployment steps are common to any platform, implementing OCS deployment here in base class. """ ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace) try: ceph_cluster.get().get('items')[0] logger.warning("OCS cluster already exists") return except (IndexError, CommandFailed): logger.info("Running OCS basic installation") self.deploy_ocs_via_operator() pod = ocp.OCP(kind=constants.POD, namespace=self.namespace) cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace) # Check for Ceph pods assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-mon', resource_count=3, timeout=600) assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-mgr', timeout=600) assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-osd', resource_count=3, timeout=600) # validate ceph mon/osd volumes are backed by pvc validate_cluster_on_pvc() # validate PDB creation of MON, MDS, OSD pods validate_pdb_creation() # Creating toolbox pod setup_ceph_toolbox() assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector='app=rook-ceph-tools', resource_count=1, timeout=600) # Check for CephFilesystem creation in ocp cfs_data = cfs.get() cfs_name = cfs_data['items'][0]['metadata']['name'] if helpers.validate_cephfilesystem(cfs_name): logger.info("MDS deployment is successful!") defaults.CEPHFILESYSTEM_NAME = cfs_name else: logger.error("MDS deployment Failed! Please check logs!") # Change monitoring backend to OCS if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get( 'persistent-monitoring'): sc = helpers.default_storage_class( interface_type=constants.CEPHBLOCKPOOL) # Get the list of monitoring pods pods_list = get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus', 'alertmanager']) # Create configmap cluster-monitoring-config and reconfigure # storage class and telemeter server (if the url is specified in a # config file) create_configmap_cluster_monitoring_pod( sc_name=sc.name, telemeter_server_url=config.ENV_DATA.get( "telemeter_server_url")) # Take some time to respin the pod waiting_time = 45 logger.info(f"Waiting {waiting_time} seconds...") time.sleep(waiting_time) # Validate the pods are respinned and in running state retry((CommandFailed, ResourceWrongStatusException), tries=3, delay=15)(validate_pods_are_respinned_and_running_state)( pods_list) # Validate the pvc is created on monitoring pods validate_pvc_created_and_bound_on_monitoring_pods() # Validate the pvc are mounted on pods retry((CommandFailed, AssertionError), tries=3, delay=15)( validate_pvc_are_mounted_on_monitoring_pods)(pods_list) elif config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get( "telemeter_server_url"): # Create configmap cluster-monitoring-config to reconfigure # telemeter server url when 'persistent-monitoring' is False create_configmap_cluster_monitoring_pod( telemeter_server_url=config.ENV_DATA["telemeter_server_url"]) # Change registry backend to OCS CEPHFS RWX PVC registry.change_registry_backend_to_ocs() # Verify health of ceph cluster # TODO: move destroy cluster logic to new CLI usage pattern? 
logger.info("Done creating rook resources, waiting for HEALTH_OK") try: ceph_health_check(namespace=self.namespace, tries=30, delay=10) except CephHealthException as ex: err = str(ex) logger.warning(f"Ceph health check failed with {err}") if "clock skew detected" in err: logger.info(f"Changing NTP on compute nodes to" f" {constants.RH_NTP_CLOCK}") update_ntp_compute_nodes() assert ceph_health_check(namespace=self.namespace, tries=60, delay=10) # patch gp2/thin storage class as 'non-default' self.patch_default_sc_to_non_default() if check_nodes_specs(min_cpu=constants.MIN_NODE_CPU, min_memory=constants.MIN_NODE_MEMORY): logger.info("The cluster specs meet the minimum requirements and " "therefore, NooBaa auto scale will be enabled") min_nb_eps = config.DEPLOYMENT.get('min_noobaa_endpoints') max_nb_eps = config.DEPLOYMENT.get('max_noobaa_endpoints') change_noobaa_endpoints_count(min_nb_eps=min_nb_eps, max_nb_eps=max_nb_eps) else: logger.warning( "The cluster specs do not meet the minimum requirements" " and therefore, NooBaa auto scale will remain disabled") change_noobaa_endpoints_count(min_nb_eps=1, max_nb_eps=1)
def ocs_install_verification( timeout=600, skip_osd_distribution_check=False, ocs_registry_image=None, post_upgrade_verification=False, version_before_upgrade=None, ): """ Perform steps necessary to verify a successful OCS installation Args: timeout (int): Number of seconds for timeout which will be used in the checks used in this function. skip_osd_distribution_check (bool): If true skip the check for osd distribution. ocs_registry_image (str): Specific image to check if it was installed properly. post_upgrade_verification (bool): Set to True if this function is called after upgrade. version_before_upgrade (float): Set to OCS version before upgrade """ from ocs_ci.ocs.node import get_nodes from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods from ocs_ci.ocs.cluster import validate_cluster_on_pvc from ocs_ci.ocs.resources.fips import check_fips_enabled number_of_worker_nodes = len(get_nodes()) namespace = config.ENV_DATA["cluster_namespace"] log.info("Verifying OCS installation") if config.ENV_DATA.get("disable_components"): for component in config.ENV_DATA["disable_components"]: config.COMPONENTS[f"disable_{component}"] = True disable_noobaa = config.COMPONENTS["disable_noobaa"] disable_rgw = config.COMPONENTS["disable_rgw"] disable_blockpools = config.COMPONENTS["disable_blockpools"] disable_cephfs = config.COMPONENTS["disable_cephfs"] # Verify OCS CSV is in Succeeded phase log.info("verifying ocs csv") ocs_csv = get_ocs_csv() # Verify if OCS CSV has proper version. csv_version = ocs_csv.data["spec"]["version"] ocs_version = version.get_semantic_ocs_version_from_config() log.info( f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}") assert ( f"{ocs_version}" in csv_version ), f"OCS version: {ocs_version} mismatch with CSV version {csv_version}" # Verify if OCS CSV has the same version in provided CI build. ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get( "ocs_registry_image") if ocs_registry_image and ocs_registry_image.endswith(".ci"): ocs_registry_image = ocs_registry_image.rsplit(":", 1)[1] log.info( f"Check if OCS registry image: {ocs_registry_image} matches with " f"CSV: {csv_version}") ignore_csv_mismatch = config.DEPLOYMENT.get("ignore_csv_mismatch") if ignore_csv_mismatch: log.info( "The possible mismatch will be ignored as you deployed " "the different version than the default version from the CSV") else: assert ocs_registry_image in csv_version, ( f"OCS registry image version: {ocs_registry_image} mismatch " f"with CSV version {csv_version}") # Verify Storage System status if ocs_version >= version.VERSION_4_9: log.info("Verifying storage system status") storage_system = OCP(kind=constants.STORAGESYSTEM, namespace=namespace) storage_system_data = storage_system.get() storage_system_status = {} for condition in storage_system_data["items"][0]["status"][ "conditions"]: storage_system_status[condition["type"]] = condition["status"] log.debug(f"storage system status: {storage_system_status}") assert storage_system_status == constants.STORAGE_SYSTEM_STATUS, ( f"Storage System status is not in expected state. 
Expected {constants.STORAGE_SYSTEM_STATUS}" f" but found {storage_system_status}") # Verify OCS Cluster Service (ocs-storagecluster) is Ready storage_cluster_name = config.ENV_DATA["storage_cluster_name"] log.info("Verifying status of storage cluster: %s", storage_cluster_name) storage_cluster = StorageCluster( resource_name=storage_cluster_name, namespace=namespace, ) log.info(f"Check if StorageCluster: {storage_cluster_name} is in" f"Succeeded phase") storage_cluster.wait_for_phase(phase="Ready", timeout=timeout) # Verify pods in running state and proper counts log.info("Verifying pod states and counts") pod = OCP(kind=constants.POD, namespace=namespace) if not config.DEPLOYMENT["external_mode"]: osd_count = int( storage_cluster.data["spec"]["storageDeviceSets"][0]["count"] ) * int( storage_cluster.data["spec"]["storageDeviceSets"][0]["replica"]) rgw_count = None if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS: if not disable_rgw: rgw_count = get_rgw_count(f"{ocs_version}", post_upgrade_verification, version_before_upgrade) min_eps = constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT max_eps = (constants.MAX_NB_ENDPOINT_COUNT if ocs_version >= version.VERSION_4_6 else 1) if config.ENV_DATA.get("platform") == constants.IBM_POWER_PLATFORM: min_eps = 1 max_eps = 1 nb_db_label = (constants.NOOBAA_DB_LABEL_46_AND_UNDER if ocs_version < version.VERSION_4_7 else constants.NOOBAA_DB_LABEL_47_AND_ABOVE) resources_dict = { nb_db_label: 1, constants.OCS_OPERATOR_LABEL: 1, constants.OPERATOR_LABEL: 1, constants.NOOBAA_OPERATOR_POD_LABEL: 1, constants.NOOBAA_CORE_POD_LABEL: 1, constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps, } if not config.DEPLOYMENT["external_mode"]: resources_dict.update({ constants.MON_APP_LABEL: 3, constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes, constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2, constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes, constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2, constants.OSD_APP_LABEL: osd_count, constants.MGR_APP_LABEL: 1, constants.MDS_APP_LABEL: 2, constants.RGW_APP_LABEL: rgw_count, }) if ocs_version >= version.VERSION_4_9: resources_dict.update({ constants.ODF_OPERATOR_CONTROL_MANAGER_LABEL: 1, }) for label, count in resources_dict.items(): if label == constants.RGW_APP_LABEL: if (not config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS or disable_rgw): continue if "noobaa" in label and disable_noobaa: continue if "mds" in label and disable_cephfs: continue assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=label, resource_count=count, timeout=timeout, ) if not disable_noobaa: nb_ep_pods = get_pods_having_label( label=constants.NOOBAA_ENDPOINT_POD_LABEL, namespace=defaults.ROOK_CLUSTER_NAMESPACE, ) assert len(nb_ep_pods) <= max_eps, ( f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) " f"is greater than the maximum defined in the NooBaa CR ({max_eps})" ) # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd) log.info("Verifying storage classes") storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace) storage_cluster_name = config.ENV_DATA["storage_cluster_name"] required_storage_classes = { f"{storage_cluster_name}-cephfs", f"{storage_cluster_name}-ceph-rbd", } if ocs_version >= version.VERSION_4_10: # TODO: Add rbd-thick storage class verification in external mode cluster upgraded # to OCS 4.8 when the bug 1978542 is fixed # Skip rbd-thick storage class verification in external mode upgraded cluster. 
This is blocked by bug 1978542 if not (config.DEPLOYMENT["external_mode"] and post_upgrade_verification): required_storage_classes.update( {f"{storage_cluster_name}-ceph-rbd-thick"}) skip_storage_classes = set() if disable_cephfs: skip_storage_classes.update({ f"{storage_cluster_name}-cephfs", }) if disable_blockpools: skip_storage_classes.update({ f"{storage_cluster_name}-ceph-rbd", }) required_storage_classes = required_storage_classes.difference( skip_storage_classes) if config.DEPLOYMENT["external_mode"]: required_storage_classes.update({ f"{storage_cluster_name}-ceph-rgw", f'{config.ENV_DATA["cluster_namespace"]}.noobaa.io', }) storage_classes = storage_class.get() storage_class_names = { item["metadata"]["name"] for item in storage_classes["items"] } # required storage class names should be observed in the cluster under test missing_scs = required_storage_classes.difference(storage_class_names) if len(missing_scs) > 0: log.error("few storage classess are not present: %s", missing_scs) assert list(missing_scs) == [] # Verify OSDs are distributed if not config.DEPLOYMENT["external_mode"]: if not skip_osd_distribution_check: log.info( "Verifying OSDs are distributed evenly across worker nodes") ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace) osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)["items"] deviceset_count = get_deviceset_count() node_names = [osd["spec"]["nodeName"] for osd in osds] for node in node_names: assert ( not node_names.count(node) > deviceset_count ), "OSD's are not distributed evenly across worker nodes" # Verify that CSI driver object contains provisioner names log.info("Verifying CSI driver object contains provisioner names.") csi_driver = OCP(kind="CSIDriver") csi_drivers = { item["metadata"]["name"] for item in csi_driver.get()["items"] } assert defaults.CSI_PROVISIONERS.issubset(csi_drivers) # Verify node and provisioner secret names in storage class log.info("Verifying node and provisioner secret names in storage class.") if config.DEPLOYMENT["external_mode"]: sc_rbd = storage_class.get( resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD) sc_cephfs = storage_class.get(resource_name=( constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS)) else: if not disable_blockpools: sc_rbd = storage_class.get( resource_name=constants.DEFAULT_STORAGECLASS_RBD) if not disable_cephfs: sc_cephfs = storage_class.get( resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS) if not disable_blockpools: assert ( sc_rbd["parameters"]["csi.storage.k8s.io/node-stage-secret-name"] == constants.RBD_NODE_SECRET) assert ( sc_rbd["parameters"]["csi.storage.k8s.io/provisioner-secret-name"] == constants.RBD_PROVISIONER_SECRET) if not disable_cephfs: assert (sc_cephfs["parameters"] ["csi.storage.k8s.io/node-stage-secret-name"] == constants.CEPHFS_NODE_SECRET) assert (sc_cephfs["parameters"] ["csi.storage.k8s.io/provisioner-secret-name"] == constants.CEPHFS_PROVISIONER_SECRET) log.info("Verified node and provisioner secret names in storage class.") ct_pod = get_ceph_tools_pod() # https://github.com/red-hat-storage/ocs-ci/issues/3820 # Verify ceph osd tree output if not (config.DEPLOYMENT.get("ui_deployment") or config.DEPLOYMENT["external_mode"]): log.info( "Verifying ceph osd tree output and checking for device set PVC names " "in the output.") if config.DEPLOYMENT.get("local_storage"): deviceset_pvcs = [osd.get_node() for osd in get_osd_pods()] # removes duplicate hostname deviceset_pvcs = list(set(deviceset_pvcs)) if config.ENV_DATA.get("platform") == 
constants.BAREMETAL_PLATFORM: deviceset_pvcs = [ deviceset.replace(".", "-") for deviceset in deviceset_pvcs ] else: deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()] osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd tree", format="json") schemas = { "root": constants.OSD_TREE_ROOT, "rack": constants.OSD_TREE_RACK, "host": constants.OSD_TREE_HOST, "osd": constants.OSD_TREE_OSD, "region": constants.OSD_TREE_REGION, "zone": constants.OSD_TREE_ZONE, } schemas["host"]["properties"]["name"] = {"enum": deviceset_pvcs} for item in osd_tree["nodes"]: validate(instance=item, schema=schemas[item["type"]]) if item["type"] == "host": deviceset_pvcs.remove(item["name"]) assert not deviceset_pvcs, ( f"These device set PVCs are not given in ceph osd tree output " f"- {deviceset_pvcs}") log.info( "Verified ceph osd tree output. Device set PVC names are given in the " "output.") # TODO: Verify ceph osd tree output have osd listed as ssd # TODO: Verify ceph osd tree output have zone or rack based on AZ # Verify CSI snapshotter sidecar container is not present # if the OCS version is < 4.6 if ocs_version < version.VERSION_4_6: log.info("Verifying CSI snapshotter is not present.") provisioner_pods = get_all_pods( namespace=defaults.ROOK_CLUSTER_NAMESPACE, selector=[ constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL, constants.CSI_RBDPLUGIN_PROVISIONER_LABEL, ], ) for pod_obj in provisioner_pods: pod_info = pod_obj.get() for container, image in get_images(data=pod_info).items(): assert ("snapshot" not in container) and ( "snapshot" not in image ), (f"Snapshot container is present in {pod_obj.name} pod. " f"Container {container}. Image {image}") deployments = ocs_csv.get()["spec"]["install"]["spec"]["deployments"] rook_ceph_operator_deployment = [ deployment_val for deployment_val in deployments if deployment_val["name"] == "rook-ceph-operator" ] assert { "name": "CSI_ENABLE_SNAPSHOTTER", "value": "false" } in (rook_ceph_operator_deployment[0]["spec"]["template"]["spec"] ["containers"][0]["env"] ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'." log.info("Verified: CSI snapshotter is not present.") # Verify pool crush rule is with "type": "zone" if utils.get_az_count() == 3: log.info("Verifying pool crush rule is with type: zone") crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd crush dump", format="") pool_names = [ constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL, constants.DATA_POOL, ] crush_rules = [ rule for rule in crush_dump["rules"] if rule["rule_name"] in pool_names ] for crush_rule in crush_rules: assert [ item for item in crush_rule["steps"] if item.get("type") == "zone" ], f"{crush_rule['rule_name']} is not with type as zone" log.info("Verified - pool crush rule is with type: zone") log.info("Validate cluster on PVC") validate_cluster_on_pvc() # Verify ceph health log.info("Verifying ceph health") health_check_tries = 20 health_check_delay = 30 if post_upgrade_verification: # In case of upgrade with FIO we have to wait longer time to see # health OK. 
See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(namespace, health_check_tries,
                                   health_check_delay)
    if config.ENV_DATA.get("fips"):
        # In case that fips is enabled when deploying,
        # a verification of the installation of it will run
        # on all running state pods
        check_fips_enabled()
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()
    if config.DEPLOYMENT.get("kms_deployment"):
        kms = KMS.get_kms_deployment()
        kms.post_deploy_verification()

    storage_cluster_obj = get_storage_cluster()
    is_flexible_scaling = (
        storage_cluster_obj.get()["items"][0].get("spec").get(
            "flexibleScaling", False))
    if is_flexible_scaling is True:
        failure_domain = storage_cluster_obj.data["items"][0]["status"][
            "failureDomain"]
        assert failure_domain == "host", (
            f"The expected failure domain on cluster with flexible scaling is 'host',"
            f" the actual failure domain is {failure_domain}")

    if ocs_version >= version.VERSION_4_7:
        log.info("Verifying images in storage cluster")
        verify_sc_images(storage_cluster)

    if config.ENV_DATA.get("is_multus_enabled"):
        verify_multus_network()
def test_pvc_disruptive(self, interface, operation_to_disrupt, resource_to_delete, multi_pvc_factory, pod_factory): """ Base function for PVC disruptive tests. Deletion of 'resource_to_delete' will be introduced while 'operation_to_disrupt' is progressing. """ pod_functions = { 'mds': partial(pod.get_mds_pods), 'mon': partial(pod.get_mon_pods), 'mgr': partial(pod.get_mgr_pods), 'osd': partial(pod.get_osd_pods), 'rbdplugin': partial(pod.get_plugin_pods, interface=interface), 'cephfsplugin': partial(pod.get_plugin_pods, interface=interface), 'cephfsplugin_provisioner': partial(pod.get_cephfsplugin_provisioner_pods), 'rbdplugin_provisioner': partial(pod.get_rbdfsplugin_provisioner_pods), 'operator': partial(pod.get_operator_pods) } # Get number of pods of type 'resource_to_delete' num_of_resource_to_delete = len(pod_functions[resource_to_delete]()) num_of_pvc = 12 namespace = self.proj_obj.namespace # Fetch the number of Pods and PVCs initial_num_of_pods = len(pod.get_all_pods(namespace=namespace)) initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)['items']) executor = ThreadPoolExecutor(max_workers=(2 * num_of_pvc)) DISRUPTION_OPS.set_resource(resource=resource_to_delete) access_modes = [constants.ACCESS_MODE_RWO] if interface == constants.CEPHFILESYSTEM: access_modes.append(constants.ACCESS_MODE_RWX) # Modify access_modes list to create rbd `block` type volume with # RWX access mode. RWX is not supported in non-block type rbd if interface == constants.CEPHBLOCKPOOL: access_modes.extend([ f'{constants.ACCESS_MODE_RWO}-Block', f'{constants.ACCESS_MODE_RWX}-Block' ]) # Start creation of PVCs bulk_pvc_create = executor.submit( multi_pvc_factory, interface=interface, project=self.proj_obj, size=5, access_modes=access_modes, access_modes_selection='distribute_random', status=constants.STATUS_BOUND, num_of_pvc=num_of_pvc, wait_each=False) if operation_to_disrupt == 'create_pvc': # Ensure PVCs are being created before deleting the resource ret = helpers.wait_for_resource_count_change( get_all_pvcs, initial_num_of_pvc, namespace, 'increase') assert ret, "Wait timeout: PVCs are not being created." logger.info(f"PVCs creation has started.") DISRUPTION_OPS.delete_resource() pvc_objs = bulk_pvc_create.result() # Confirm that PVCs are Bound for pvc_obj in pvc_objs: helpers.wait_for_resource_state(resource=pvc_obj, state=constants.STATUS_BOUND, timeout=120) pvc_obj.reload() logger.info("Verified: PVCs are Bound.") # Start creating pods bulk_pod_create = executor.submit(helpers.create_pods, pvc_objs, pod_factory, interface, 2) if operation_to_disrupt == 'create_pod': # Ensure that pods are being created before deleting the resource ret = helpers.wait_for_resource_count_change( pod.get_all_pods, initial_num_of_pods, namespace, 'increase') assert ret, "Wait timeout: Pods are not being created." 
logger.info(f"Pods creation has started.") DISRUPTION_OPS.delete_resource() pod_objs = bulk_pod_create.result() # Verify pods are Running for pod_obj in pod_objs: helpers.wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING) pod_obj.reload() logger.info("Verified: All pods are Running.") # Do setup on pods for running IO logger.info("Setting up pods for running IO.") for pod_obj in pod_objs: pvc_info = pod_obj.pvc.get() if pvc_info['spec']['volumeMode'] == 'Block': storage_type = 'block' else: storage_type = 'fs' executor.submit(pod_obj.workload_setup, storage_type=storage_type) # Wait for setup on pods to complete for pod_obj in pod_objs: for sample in TimeoutSampler(180, 2, getattr, pod_obj, 'wl_setup_done'): if sample: logger.info(f"Setup for running IO is completed on pod " f"{pod_obj.name}.") break logger.info("Setup for running IO is completed on all pods.") # Start IO on each pod for pod_obj in pod_objs: pvc_info = pod_obj.pvc.get() if pvc_info['spec']['volumeMode'] == 'Block': storage_type = 'block' else: storage_type = 'fs' pod_obj.run_io(storage_type=storage_type, size='1G', runtime=10, fio_filename=f'{pod_obj.name}_io_file1') logger.info("FIO started on all pods.") if operation_to_disrupt == 'run_io': DISRUPTION_OPS.delete_resource() logger.info("Fetching FIO results.") for pod_obj in pod_objs: fio_result = pod_obj.get_fio_results() err_count = fio_result.get('jobs')[0].get('error') assert err_count == 0, ( f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}") logger.info("Verified FIO result on pods.") # Delete pods for pod_obj in pod_objs: pod_obj.delete(wait=True) for pod_obj in pod_objs: pod_obj.ocp.wait_for_delete(pod_obj.name) # Verify that PVCs are reusable by creating new pods pod_objs = helpers.create_pods(pvc_objs, pod_factory, interface, 2) # Verify new pods are Running for pod_obj in pod_objs: helpers.wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING) pod_obj.reload() logging.info("Verified: All new pods are Running.") # Run IO on each of the new pods for pod_obj in pod_objs: pvc_info = pod_obj.pvc.get() if pvc_info['spec']['volumeMode'] == 'Block': storage_type = 'block' else: storage_type = 'fs' pod_obj.run_io(storage_type=storage_type, size='1G', runtime=10, fio_filename=f'{pod_obj.name}_io_file2') logger.info("Fetching FIO results from new pods") for pod_obj in pod_objs: fio_result = pod_obj.get_fio_results() err_count = fio_result.get('jobs')[0].get('error') assert err_count == 0, ( f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}") logger.info("Verified FIO result on new pods.") # Verify number of pods of type 'resource_to_delete' final_num_resource_to_delete = len(pod_functions[resource_to_delete]()) assert final_num_resource_to_delete == num_of_resource_to_delete, ( f"Total number of {resource_to_delete} pods is not matching with " f"initial value. Total number of pods before deleting a pod: " f"{num_of_resource_to_delete}. Total number of pods present now: " f"{final_num_resource_to_delete}") # Check ceph status ceph_health_check(namespace=config.ENV_DATA['cluster_namespace']) logger.info("Ceph cluster health is OK")
def test_non_ocs_taint_and_tolerations(self):
    """
    Test runs the following steps
    1. Taint ocs nodes with non-ocs taint
    2. Set tolerations on storagecluster, subscription, configmap and ocsinit
    3. Respin all ocs pods and check that they run on ocs nodes with the tolerations
    4. Add capacity
    """

    # Taint all nodes with non-ocs taint
    ocs_nodes = get_worker_nodes()
    taint_nodes(nodes=ocs_nodes, taint_label="xyz=true:NoSchedule")

    # Add tolerations to the storagecluster
    storagecluster_obj = ocp.OCP(
        resource_name=constants.DEFAULT_CLUSTERNAME,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        kind=constants.STORAGECLUSTER,
    )
    tolerations = (
        '{"tolerations": [{"effect": "NoSchedule", "key": "xyz",'
        '"operator": "Equal", "value": "true"}, '
        '{"effect": "NoSchedule", "key": "node.ocs.openshift.io/storage", '
        '"operator": "Equal", "value": "true"}]}'
    )
    param = (
        f'{{"spec": {{"placement": {{"all": {tolerations}, "mds": {tolerations}, '
        f'"noobaa-core": {tolerations}, "rgw": {tolerations}}}}}}}'
    )
    storagecluster_obj.patch(params=param, format_type="merge")

    # Add tolerations to the subscriptions
    sub_list = ocp.get_all_resource_names_of_a_kind(kind=constants.SUBSCRIPTION)
    param = (
        '{"spec": {"config": {"tolerations": '
        '[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
        '"value": "true"}]}}}'
    )
    for sub in sub_list:
        sub_obj = ocp.OCP(
            resource_name=sub,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            kind=constants.SUBSCRIPTION,
        )
        sub_obj.patch(params=param, format_type="merge")

    # Add tolerations to ocsinitializations.ocs.openshift.io
    param = (
        '{"spec": {"tolerations": '
        '[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
        '"value": "true"}]}}'
    )
    ocsini_obj = ocp.OCP(
        resource_name=constants.OCSINIT,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        kind=constants.OCSINITIALIZATION,
    )
    ocsini_obj.patch(params=param, format_type="merge")

    # Add tolerations to the configmap rook-ceph-operator-config
    configmap_obj = ocp.OCP(
        kind=constants.CONFIGMAP,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        resource_name=constants.ROOK_OPERATOR_CONFIGMAP,
    )
    toleration = configmap_obj.get().get("data").get("CSI_PLUGIN_TOLERATIONS")
    toleration += (
        '\n- key: xyz\n  operator: Equal\n  value: "true"\n  effect: NoSchedule'
    )
    toleration = toleration.replace('"', '\\"').replace("\n", "\\n")
    param_cmd = (
        f'[{{"op": "replace", "path": "/data/CSI_PLUGIN_TOLERATIONS", '
        f'"value": "{toleration}" }}, '
        f'{{"op": "replace", "path": "/data/CSI_PROVISIONER_TOLERATIONS", '
        f'"value": "{toleration}" }}]'
    )
    configmap_obj.patch(params=param_cmd, format_type="json")

    # A few pod respins are expected after the configmap edit
    assert wait_for_pods_to_be_running(timeout=600, sleep=15)

    # Respin all pods and verify they reach Running state again
    pod_list = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    for pod in pod_list:
        pod.delete(wait=False)
    assert wait_for_pods_to_be_running(timeout=600, sleep=15)
    self.sanity_helpers.health_check()

    # Add capacity to check that the new OSDs also get the toleration
    osd_size = storage_cluster.get_osd_size()
    count = storage_cluster.add_capacity(osd_size)
    pod = ocp.OCP(
        kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    if is_flexible_scaling_enabled():
        replica_count = 1
    else:
        replica_count = 3
    assert pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=count * replica_count,
    ), "New OSDs failed to reach running state"
    check_ceph_health_after_add_capacity(ceph_rebalance_timeout=2500)
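# --- Illustrative sketch (not part of ocs-ci): building the same storagecluster
# toleration merge-patch as a Python dict and serializing it with json.dumps,
# instead of hand-assembling the JSON string. The toleration fields (key,
# operator, value, effect) are standard Kubernetes fields; the "xyz" taint key
# matches the one used in the test above.
import json

toleration_xyz = {
    "effect": "NoSchedule",
    "key": "xyz",
    "operator": "Equal",
    "value": "true",
}
toleration_ocs = {
    "effect": "NoSchedule",
    "key": "node.ocs.openshift.io/storage",
    "operator": "Equal",
    "value": "true",
}

patch = {
    "spec": {
        "placement": {
            component: {"tolerations": [toleration_xyz, toleration_ocs]}
            for component in ("all", "mds", "noobaa-core", "rgw")
        }
    }
}

# The serialized form is equivalent to the hand-built `param` string above.
print(json.dumps(patch, indent=2))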