def add_disk_for_vsphere_platform():
    """
    Add RDM/VMDK disk for vSphere platform

    """
    platform = config.ENV_DATA.get("platform").lower()
    lso_type = config.DEPLOYMENT.get("type")
    if platform == constants.VSPHERE_PLATFORM:
        # Types of LSO Deployment
        # Importing here to avoid circular dependency
        from ocs_ci.deployment.vmware import VSPHEREBASE

        vsphere_base = VSPHEREBASE()

        if lso_type == constants.RDM:
            logger.info(f"LSO Deployment type: {constants.RDM}")
            vsphere_base.add_rdm_disks()

        if lso_type == constants.VMDK:
            logger.info(f"LSO Deployment type: {constants.VMDK}")
            vsphere_base.attach_disk(
                config.ENV_DATA.get("device_size", defaults.DEVICE_SIZE),
                config.DEPLOYMENT.get("provision_type", constants.VM_DISK_TYPE),
            )

        if lso_type == constants.DIRECTPATH:
            logger.info(f"LSO Deployment type: {constants.DIRECTPATH}")
            vsphere_base.add_pci_devices()

            # wipe partition table on newly added PCI devices
            compute_nodes = get_compute_node_names()
            for compute_node in compute_nodes:
                wipe_all_disk_partitions_for_node(compute_node)
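
# Hypothetical usage sketch (not part of the upstream module): the function
# takes no arguments and reads everything from the global ocs-ci config, so a
# caller only needs the relevant keys populated first. The literal values
# below are illustrative, not defaults enforced by this code.
#
#   config.ENV_DATA["platform"] = constants.VSPHERE_PLATFORM
#   config.DEPLOYMENT["type"] = constants.VMDK  # or constants.RDM / constants.DIRECTPATH
#   config.ENV_DATA["device_size"] = 256  # used only by the VMDK branch
#   add_disk_for_vsphere_platform()
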
def setup_local_storage(storageclass):
    """
    Setup the necessary resources for enabling local storage.

    Args:
        storageclass (string): storageClassName value to be used in
            LocalVolume CR based on LOCAL_VOLUME_YAML

    """
    # Get the worker nodes
    workers = get_nodes(node_type="worker")
    worker_names = [worker.name for worker in workers]
    logger.debug("Workers: %s", worker_names)

    ocp_version = version.get_semantic_ocp_version_from_config()
    ocs_version = version.get_semantic_ocs_version_from_config()
    ocp_ga_version = get_ocp_ga_version(ocp_version)
    if not ocp_ga_version:
        optional_operators_data = list(
            templating.load_yaml(
                constants.LOCAL_STORAGE_OPTIONAL_OPERATORS, multi_document=True
            )
        )
        optional_operators_yaml = tempfile.NamedTemporaryFile(
            mode="w+", prefix="optional_operators", delete=False
        )
        if config.DEPLOYMENT.get("optional_operators_image"):
            for _dict in optional_operators_data:
                if _dict.get("kind").lower() == "catalogsource":
                    _dict["spec"]["image"] = config.DEPLOYMENT.get(
                        "optional_operators_image"
                    )
        if config.DEPLOYMENT.get("disconnected"):
            # in case of disconnected environment, we have to mirror all the
            # optional_operators images
            icsp = None
            for _dict in optional_operators_data:
                if _dict.get("kind").lower() == "catalogsource":
                    index_image = _dict["spec"]["image"]
                if _dict.get("kind").lower() == "imagecontentsourcepolicy":
                    icsp = _dict
            mirrored_index_image = (
                f"{config.DEPLOYMENT['mirror_registry']}/"
                f"{index_image.split('/', 1)[-1]}"
            )
            prune_and_mirror_index_image(
                index_image,
                mirrored_index_image,
                constants.DISCON_CL_REQUIRED_PACKAGES,
                icsp,
            )
            _dict["spec"]["image"] = mirrored_index_image
        templating.dump_data_to_temp_yaml(
            optional_operators_data, optional_operators_yaml.name
        )
        with open(optional_operators_yaml.name, "r") as f:
            logger.info(f.read())
        logger.info(
            "Creating optional operators CatalogSource and ImageContentSourcePolicy"
        )
        run_cmd(f"oc create -f {optional_operators_yaml.name}")
        logger.info("Sleeping for 60 sec to start update machineconfigpool status")
        # sleep here to start update machineconfigpool status
        time.sleep(60)
        wait_for_machineconfigpool_status("all")

    logger.info("Retrieving local-storage-operator data from yaml")
    lso_data = list(
        templating.load_yaml(constants.LOCAL_STORAGE_OPERATOR, multi_document=True)
    )

    # ensure namespace is correct
    lso_namespace = config.ENV_DATA["local_storage_namespace"]
    for data in lso_data:
        if data["kind"] == "Namespace":
            data["metadata"]["name"] = lso_namespace
        else:
            data["metadata"]["namespace"] = lso_namespace
        if data["kind"] == "OperatorGroup":
            data["spec"]["targetNamespaces"] = [lso_namespace]

    # Update local-storage-operator subscription data with channel
    for data in lso_data:
        if data["kind"] == "Subscription":
            data["spec"]["channel"] = get_lso_channel()
        if not ocp_ga_version:
            if data["kind"] == "Subscription":
                data["spec"]["source"] = "optional-operators"

    # Create temp yaml file and create local storage operator
    logger.info(
        "Creating temp yaml file with local-storage-operator data:\n %s", lso_data
    )
    lso_data_yaml = tempfile.NamedTemporaryFile(
        mode="w+", prefix="local_storage_operator", delete=False
    )
    templating.dump_data_to_temp_yaml(lso_data, lso_data_yaml.name)
    with open(lso_data_yaml.name, "r") as f:
        logger.info(f.read())
    logger.info("Creating local-storage-operator")
    run_cmd(f"oc create -f {lso_data_yaml.name}")

    local_storage_operator = ocp.OCP(kind=constants.POD, namespace=lso_namespace)
    assert local_storage_operator.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.LOCAL_STORAGE_OPERATOR_LABEL,
        timeout=600,
    ), "Local storage operator did not reach running phase"

    # Add disks for vSphere/RHV platform
    platform = config.ENV_DATA.get("platform").lower()
    lso_type = config.DEPLOYMENT.get("type")
    if platform == constants.VSPHERE_PLATFORM:
        add_disk_for_vsphere_platform()
    if platform == constants.RHV_PLATFORM:
        add_disk_for_rhv_platform()

    if (ocp_version >= version.VERSION_4_6) and (ocs_version >= version.VERSION_4_6):
        # Pull local volume discovery yaml data
        logger.info("Pulling LocalVolumeDiscovery CR data from yaml")
        lvd_data = templating.load_yaml(constants.LOCAL_VOLUME_DISCOVERY_YAML)
        # Set local-volume-discovery namespace
        lvd_data["metadata"]["namespace"] = lso_namespace

        worker_nodes = get_compute_node_names(no_replace=True)

        # Update local volume discovery data with Worker node Names
        logger.info(
            "Updating LocalVolumeDiscovery CR data with worker nodes Name: %s",
            worker_nodes,
        )
        lvd_data["spec"]["nodeSelector"]["nodeSelectorTerms"][0]["matchExpressions"][
            0
        ]["values"] = worker_nodes
        lvd_data_yaml = tempfile.NamedTemporaryFile(
            mode="w+", prefix="local_volume_discovery", delete=False
        )
        templating.dump_data_to_temp_yaml(lvd_data, lvd_data_yaml.name)

        logger.info("Creating LocalVolumeDiscovery CR")
        run_cmd(f"oc create -f {lvd_data_yaml.name}")

        # Pull local volume set yaml data
        logger.info("Pulling LocalVolumeSet CR data from yaml")
        lvs_data = templating.load_yaml(constants.LOCAL_VOLUME_SET_YAML)

        # Since we don't have datastore with SSD on our current VMware machines,
        # localvolumeset doesn't detect NonRotational disk. As a workaround we are
        # setting Rotational to device MechanicalProperties to detect HDD disk
        if platform == constants.VSPHERE_PLATFORM or config.ENV_DATA.get(
            "local_storage_allow_rotational_disks"
        ):
            logger.info(
                "Adding Rotational for deviceMechanicalProperties spec"
                " to detect HDD disk"
            )
            lvs_data["spec"]["deviceInclusionSpec"][
                "deviceMechanicalProperties"
            ].append("Rotational")

        # Update local volume set data with Worker node Names
        logger.info(
            "Updating LocalVolumeSet CR data with worker nodes Name: %s", worker_nodes
        )
        lvs_data["spec"]["nodeSelector"]["nodeSelectorTerms"][0]["matchExpressions"][
            0
        ]["values"] = worker_nodes

        # Set storage class
        logger.info(
            "Updating LocalVolumeSet CR data with LSO storageclass: %s", storageclass
        )
        lvs_data["spec"]["storageClassName"] = storageclass

        # set volumeMode to Filesystem for MCG only deployment
        if config.ENV_DATA["mcg_only_deployment"]:
            lvs_data["spec"]["volumeMode"] = constants.VOLUME_MODE_FILESYSTEM

        lvs_data_yaml = tempfile.NamedTemporaryFile(
            mode="w+", prefix="local_volume_set", delete=False
        )
        templating.dump_data_to_temp_yaml(lvs_data, lvs_data_yaml.name)
        logger.info("Creating LocalVolumeSet CR")
        run_cmd(f"oc create -f {lvs_data_yaml.name}")
    else:
        # Retrieve NVME device path ID for each worker node
        device_paths = get_device_paths(worker_names)

        # Pull local volume yaml data
        logger.info("Pulling LocalVolume CR data from yaml")
        lv_data = templating.load_yaml(constants.LOCAL_VOLUME_YAML)

        # Set local-volume namespace
        lv_data["metadata"]["namespace"] = lso_namespace

        # Set storage class
        logger.info(
            "Updating LocalVolume CR data with LSO storageclass: %s", storageclass
        )
        for scd in lv_data["spec"]["storageClassDevices"]:
            scd["storageClassName"] = storageclass

        # Update local volume data with NVME IDs
        logger.info(
            "Updating LocalVolume CR data with device paths: %s", device_paths
        )
        lv_data["spec"]["storageClassDevices"][0]["devicePaths"] = device_paths

        # Create temp yaml file and create local volume
        lv_data_yaml = tempfile.NamedTemporaryFile(
            mode="w+", prefix="local_volume", delete=False
        )
        templating.dump_data_to_temp_yaml(lv_data, lv_data_yaml.name)
        logger.info("Creating LocalVolume CR")
        run_cmd(f"oc create -f {lv_data_yaml.name}")

    logger.info("Waiting 30 seconds for PVs to create")

    storage_class_device_count = 1
    if platform == constants.AWS_PLATFORM and not lso_type == constants.AWS_EBS:
        storage_class_device_count = 2
    elif platform == constants.IBM_POWER_PLATFORM:
        numberofstoragedisks = config.ENV_DATA.get("number_of_storage_disks", 1)
        storage_class_device_count = numberofstoragedisks
    elif platform == constants.VSPHERE_PLATFORM:
        # extra_disks is used in vSphere attach_disk() method
        storage_class_device_count = config.ENV_DATA.get("extra_disks", 1)
    expected_pvs = len(worker_names) * storage_class_device_count
    verify_pvs_created(expected_pvs, storageclass)
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
    version_before_upgrade=None,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.
        version_before_upgrade (float): Set to OCS version before upgrade

    """
    from ocs_ci.ocs.node import get_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    from ocs_ci.ocs.resources.fips import check_fips_enabled

    number_of_worker_nodes = len(get_nodes())
    namespace = config.ENV_DATA["cluster_namespace"]
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data["spec"]["version"]
    ocs_version = config.ENV_DATA["ocs_version"]
    log.info(f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}")
    assert (
        ocs_version in csv_version
    ), f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        "ocs_registry_image"
    )
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.split(":")[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}"
        )
        ignore_csv_mismatch = config.DEPLOYMENT.get("ignore_csv_mismatch")
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV"
            )
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}"
            )

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(f"Check if StorageCluster: {storage_cluster_name} is in Ready phase")
    storage_cluster.wait_for_phase(phase="Ready", timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    if not config.DEPLOYMENT["external_mode"]:
        osd_count = int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["count"]
        ) * int(storage_cluster.data["spec"]["storageDeviceSets"][0]["replica"])
    rgw_count = None
    if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
        # RGW count is 1 if OCS version < 4.5 or the cluster was upgraded
        # from version <= 4.4
        if (
            float(config.ENV_DATA["ocs_version"]) < 4.5
            or float(config.ENV_DATA["ocs_version"]) == 4.5
            and (post_upgrade_verification and float(version_before_upgrade) < 4.5)
        ):
            rgw_count = 1
        else:
            rgw_count = 2

    # With 4.4 OCS cluster deployed over Azure, RGW is the default backingstore
    if config.ENV_DATA.get("platform") == constants.AZURE_PLATFORM:
        if float(config.ENV_DATA["ocs_version"]) == 4.4 or (
            float(config.ENV_DATA["ocs_version"]) == 4.5
            and (post_upgrade_verification and float(version_before_upgrade) < 4.5)
        ):
            rgw_count = 1

    min_eps = constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT
    max_eps = (
        constants.MAX_NB_ENDPOINT_COUNT
        if float(config.ENV_DATA["ocs_version"]) >= 4.6
        else 1
    )
    if config.ENV_DATA.get("platform") == constants.IBM_POWER_PLATFORM:
        min_eps = 1
        max_eps = 1

    resources_dict = {
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_DB_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps,
    }
    if not config.DEPLOYMENT["external_mode"]:
        resources_dict.update(
            {
                constants.MON_APP_LABEL: 3,
                constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
                constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
                constants.OSD_APP_LABEL: osd_count,
                constants.MGR_APP_LABEL: 1,
                constants.MDS_APP_LABEL: 2,
                constants.RGW_APP_LABEL: rgw_count,
            }
        )
    for label, count in resources_dict.items():
        if label == constants.RGW_APP_LABEL:
            if config.ENV_DATA.get("platform") not in constants.ON_PREM_PLATFORMS:
                continue
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout,
        )

    nb_ep_pods = get_pods_having_label(
        label=constants.NOOBAA_ENDPOINT_POD_LABEL,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
    )
    assert len(nb_ep_pods) <= max_eps, (
        f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
        f"is greater than the maximum defined in the NooBaa CR ({max_eps})"
    )

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    required_storage_classes = {
        f"{storage_cluster_name}-cephfs",
        f"{storage_cluster_name}-ceph-rbd",
    }
    if config.DEPLOYMENT["external_mode"]:
        required_storage_classes.update(
            {
                f"{storage_cluster_name}-ceph-rgw",
                f'{config.ENV_DATA["cluster_namespace"]}.noobaa.io',
            }
        )
    storage_classes = storage_class.get()
    storage_class_names = {
        item["metadata"]["name"] for item in storage_classes["items"]
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not config.DEPLOYMENT["external_mode"]:
        if not skip_osd_distribution_check:
            log.info("Verifying OSDs are distributed evenly across worker nodes")
            ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
            osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)["items"]
            deviceset_count = get_deviceset_count()
            node_names = [osd["spec"]["nodeName"] for osd in osds]
            for node in node_names:
                assert (
                    not node_names.count(node) > deviceset_count
                ), "OSD's are not distributed evenly across worker nodes"

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    csi_drivers = {item["metadata"]["name"] for item in csi_driver.get()["items"]}
    assert defaults.CSI_PROVISIONERS.issubset(csi_drivers)

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    if config.DEPLOYMENT["external_mode"]:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        )
        sc_cephfs = storage_class.get(
            resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS
        )
    else:
        sc_rbd = storage_class.get(resource_name=constants.DEFAULT_STORAGECLASS_RBD)
        sc_cephfs = storage_class.get(
            resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS
        )
    assert (
        sc_rbd["parameters"]["csi.storage.k8s.io/node-stage-secret-name"]
        == constants.RBD_NODE_SECRET
    )
    assert (
        sc_rbd["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
        == constants.RBD_PROVISIONER_SECRET
    )
    assert (
        sc_cephfs["parameters"]["csi.storage.k8s.io/node-stage-secret-name"]
        == constants.CEPHFS_NODE_SECRET
    )
    assert (
        sc_cephfs["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
        == constants.CEPHFS_PROVISIONER_SECRET
    )
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    if not config.DEPLOYMENT["external_mode"]:
        log.info(
            "Verifying ceph osd tree output and checking for device set PVC names "
            "in the output."
        )
        if config.DEPLOYMENT.get("local_storage"):
            deviceset_pvcs = get_compute_node_names()
        else:
            deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]
        ct_pod = get_ceph_tools_pod()
        osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd tree", format="json")
        schemas = {
            "root": constants.OSD_TREE_ROOT,
            "rack": constants.OSD_TREE_RACK,
            "host": constants.OSD_TREE_HOST,
            "osd": constants.OSD_TREE_OSD,
            "region": constants.OSD_TREE_REGION,
            "zone": constants.OSD_TREE_ZONE,
        }
        schemas["host"]["properties"]["name"] = {"enum": deviceset_pvcs}
        for item in osd_tree["nodes"]:
            validate(instance=item, schema=schemas[item["type"]])
            if item["type"] == "host":
                deviceset_pvcs.remove(item["name"])
        assert not deviceset_pvcs, (
            f"These device set PVCs are not given in ceph osd tree output "
            f"- {deviceset_pvcs}"
        )
        log.info(
            "Verified ceph osd tree output. Device set PVC names are given in the "
            "output."
        )

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    # if the OCS version is < 4.6
    if float(config.ENV_DATA["ocs_version"]) < 4.6:
        log.info("Verifying CSI snapshotter is not present.")
        provisioner_pods = get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            selector=[
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
            ],
        )
        for pod_obj in provisioner_pods:
            pod_info = pod_obj.get()
            for container, image in get_images(data=pod_info).items():
                assert ("snapshot" not in container) and ("snapshot" not in image), (
                    f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}"
                )
        deployments = ocs_csv.get()["spec"]["install"]["spec"]["deployments"]
        rook_ceph_operator_deployment = [
            deployment_val
            for deployment_val in deployments
            if deployment_val["name"] == "rook-ceph-operator"
        ]
        assert {"name": "CSI_ENABLE_SNAPSHOTTER", "value": "false"} in (
            rook_ceph_operator_deployment[0]["spec"]["template"]["spec"][
                "containers"
            ][0]["env"]
        ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
        log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd crush dump", format="")
        pool_names = [
            constants.METADATA_POOL,
            constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL,
        ]
        crush_rules = [
            rule for rule in crush_dump["rules"] if rule["rule_name"] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule["steps"] if item.get("type") == "zone"
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")

    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(namespace, health_check_tries, health_check_delay)

    if config.ENV_DATA.get("fips"):
        # In case that fips is enabled when deploying,
        # a verification of the installation of it will run
        # on all running state pods
        check_fips_enabled()
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()