Example #1
def add_disk_for_vsphere_platform():
    """
    Add RDM/VMDK/DirectPath disk for vSphere platform

    """
    platform = config.ENV_DATA.get("platform").lower()
    lso_type = config.DEPLOYMENT.get("type")
    if platform == constants.VSPHERE_PLATFORM:
        # Types of LSO Deployment
        # Importing here to avoid circular dependency
        from ocs_ci.deployment.vmware import VSPHEREBASE

        vsphere_base = VSPHEREBASE()

        if lso_type == constants.RDM:
            logger.info(f"LSO Deployment type: {constants.RDM}")
            vsphere_base.add_rdm_disks()

        if lso_type == constants.VMDK:
            logger.info(f"LSO Deployment type: {constants.VMDK}")
            vsphere_base.attach_disk(
                config.ENV_DATA.get("device_size", defaults.DEVICE_SIZE),
                config.DEPLOYMENT.get("provision_type",
                                      constants.VM_DISK_TYPE),
            )

        if lso_type == constants.DIRECTPATH:
            logger.info(f"LSO Deployment type: {constants.DIRECTPATH}")
            vsphere_base.add_pci_devices()

            # wipe partition table on newly added PCI devices
            compute_nodes = get_compute_node_names()
            for compute_node in compute_nodes:
                wipe_all_disk_partitions_for_node(compute_node)
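
The function above dispatches on config.DEPLOYMENT["type"] with a chain of if blocks. A minimal, self-contained sketch of the same dispatch pattern follows; the handler bodies and the DEPLOYMENT dict are illustrative stand-ins, not ocs-ci APIs:

# Illustrative dispatch on an LSO deployment type; handlers and the
# DEPLOYMENT dict are placeholders, not ocs-ci code.
DEPLOYMENT = {"type": "vmdk"}

def add_rdm_disks():
    print("attaching RDM disks")

def attach_vmdk_disk():
    print("attaching VMDK disk")

def add_pci_devices():
    print("attaching DirectPath (PCI passthrough) devices")

handlers = {
    "rdm": add_rdm_disks,
    "vmdk": attach_vmdk_disk,
    "directpath": add_pci_devices,
}

lso_type = DEPLOYMENT.get("type")
handler = handlers.get(lso_type)
if handler is None:
    raise ValueError(f"Unsupported LSO deployment type: {lso_type}")
handler()
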
Example #2
def setup_local_storage(storageclass):
    """
    Setup the necessary resources for enabling local storage.

    Args:
        storageclass (string): storageClassName value to be used in
            LocalVolume CR based on LOCAL_VOLUME_YAML

    """
    # Get the worker nodes
    workers = get_nodes(node_type="worker")
    worker_names = [worker.name for worker in workers]
    logger.debug("Workers: %s", worker_names)

    ocp_version = version.get_semantic_ocp_version_from_config()
    ocs_version = version.get_semantic_ocs_version_from_config()
    ocp_ga_version = get_ocp_ga_version(ocp_version)
    if not ocp_ga_version:
        optional_operators_data = list(
            templating.load_yaml(constants.LOCAL_STORAGE_OPTIONAL_OPERATORS,
                                 multi_document=True))
        optional_operators_yaml = tempfile.NamedTemporaryFile(
            mode="w+", prefix="optional_operators", delete=False)
        if config.DEPLOYMENT.get("optional_operators_image"):
            for _dict in optional_operators_data:
                if _dict.get("kind").lower() == "catalogsource":
                    _dict["spec"]["image"] = config.DEPLOYMENT.get(
                        "optional_operators_image")
        if config.DEPLOYMENT.get("disconnected"):
            # in case of a disconnected environment, we have to mirror all
            # the optional operators images
            icsp = None
            for _dict in optional_operators_data:
                if _dict.get("kind").lower() == "catalogsource":
                    index_image = _dict["spec"]["image"]
                if _dict.get("kind").lower() == "imagecontentsourcepolicy":
                    icsp = _dict
            mirrored_index_image = (f"{config.DEPLOYMENT['mirror_registry']}/"
                                    f"{index_image.split('/', 1)[-1]}")
            prune_and_mirror_index_image(
                index_image,
                mirrored_index_image,
                constants.DISCON_CL_REQUIRED_PACKAGES,
                icsp,
            )
            _dict["spec"]["image"] = mirrored_index_image
        templating.dump_data_to_temp_yaml(optional_operators_data,
                                          optional_operators_yaml.name)
        with open(optional_operators_yaml.name, "r") as f:
            logger.info(f.read())
        logger.info(
            "Creating optional operators CatalogSource and ImageContentSourcePolicy"
        )
        run_cmd(f"oc create -f {optional_operators_yaml.name}")
        logger.info(
            "Sleeping for 60 sec to let the machineconfigpool status update start")
        # sleep here so the machineconfigpool status update has time to start
        time.sleep(60)
        wait_for_machineconfigpool_status("all")

    logger.info("Retrieving local-storage-operator data from yaml")
    lso_data = list(
        templating.load_yaml(constants.LOCAL_STORAGE_OPERATOR,
                             multi_document=True))

    # ensure namespace is correct
    lso_namespace = config.ENV_DATA["local_storage_namespace"]
    for data in lso_data:
        if data["kind"] == "Namespace":
            data["metadata"]["name"] = lso_namespace
        else:
            data["metadata"]["namespace"] = lso_namespace
        if data["kind"] == "OperatorGroup":
            data["spec"]["targetNamespaces"] = [lso_namespace]

    # Update local-storage-operator subscription data with channel
    for data in lso_data:
        if data["kind"] == "Subscription":
            data["spec"]["channel"] = get_lso_channel()
        if not ocp_ga_version:
            if data["kind"] == "Subscription":
                data["spec"]["source"] = "optional-operators"

    # Create temp yaml file and create local storage operator
    logger.info(
        "Creating temp yaml file with local-storage-operator data:\n %s",
        lso_data)
    lso_data_yaml = tempfile.NamedTemporaryFile(
        mode="w+", prefix="local_storage_operator", delete=False)
    templating.dump_data_to_temp_yaml(lso_data, lso_data_yaml.name)
    with open(lso_data_yaml.name, "r") as f:
        logger.info(f.read())
    logger.info("Creating local-storage-operator")
    run_cmd(f"oc create -f {lso_data_yaml.name}")

    local_storage_operator = ocp.OCP(kind=constants.POD,
                                     namespace=lso_namespace)
    assert local_storage_operator.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.LOCAL_STORAGE_OPERATOR_LABEL,
        timeout=600,
    ), "Local storage operator did not reach running phase"

    # Add disks for vSphere/RHV platform
    platform = config.ENV_DATA.get("platform").lower()
    lso_type = config.DEPLOYMENT.get("type")

    if platform == constants.VSPHERE_PLATFORM:
        add_disk_for_vsphere_platform()

    if platform == constants.RHV_PLATFORM:
        add_disk_for_rhv_platform()

    if (ocp_version >= version.VERSION_4_6) and (ocs_version >=
                                                 version.VERSION_4_6):
        # Pull local volume discovery yaml data
        logger.info("Pulling LocalVolumeDiscovery CR data from yaml")
        lvd_data = templating.load_yaml(constants.LOCAL_VOLUME_DISCOVERY_YAML)
        # Set local-volume-discovery namespace
        lvd_data["metadata"]["namespace"] = lso_namespace

        worker_nodes = get_compute_node_names(no_replace=True)

        # Update LocalVolumeDiscovery data with worker node names
        logger.info(
            "Updating LocalVolumeDiscovery CR data with worker node names: %s",
            worker_nodes,
        )
        lvd_data["spec"]["nodeSelector"]["nodeSelectorTerms"][0][
            "matchExpressions"][0]["values"] = worker_nodes
        lvd_data_yaml = tempfile.NamedTemporaryFile(
            mode="w+", prefix="local_volume_discovery", delete=False)
        templating.dump_data_to_temp_yaml(lvd_data, lvd_data_yaml.name)

        logger.info("Creating LocalVolumeDiscovery CR")
        run_cmd(f"oc create -f {lvd_data_yaml.name}")

        # Pull local volume set yaml data
        logger.info("Pulling LocalVolumeSet CR data from yaml")
        lvs_data = templating.load_yaml(constants.LOCAL_VOLUME_SET_YAML)

        # Since we don't have a datastore with SSDs on our current VMware
        # machines, the LocalVolumeSet doesn't detect any NonRotational disks.
        # As a workaround we add "Rotational" to deviceMechanicalProperties so
        # that HDD disks are detected.
        if platform == constants.VSPHERE_PLATFORM or config.ENV_DATA.get(
                "local_storage_allow_rotational_disks"):
            logger.info("Adding Rotational for deviceMechanicalProperties spec"
                        " to detect HDD disk")
            lvs_data["spec"]["deviceInclusionSpec"][
                "deviceMechanicalProperties"].append("Rotational")

        # Update LocalVolumeSet data with worker node names
        logger.info(
            "Updating LocalVolumeSet CR data with worker node names: %s",
            worker_nodes)
        lvs_data["spec"]["nodeSelector"]["nodeSelectorTerms"][0][
            "matchExpressions"][0]["values"] = worker_nodes

        # Set storage class
        logger.info(
            "Updating LocalVolumeSet CR data with LSO storageclass: %s",
            storageclass)
        lvs_data["spec"]["storageClassName"] = storageclass

        # set volumeMode to Filesystem for MCG only deployment
        if config.ENV_DATA["mcg_only_deployment"]:
            lvs_data["spec"]["volumeMode"] = constants.VOLUME_MODE_FILESYSTEM

        lvs_data_yaml = tempfile.NamedTemporaryFile(mode="w+",
                                                    prefix="local_volume_set",
                                                    delete=False)
        templating.dump_data_to_temp_yaml(lvs_data, lvs_data_yaml.name)
        logger.info("Creating LocalVolumeSet CR")
        run_cmd(f"oc create -f {lvs_data_yaml.name}")
    else:
        # Retrieve NVME device path ID for each worker node
        device_paths = get_device_paths(worker_names)

        # Pull local volume yaml data
        logger.info("Pulling LocalVolume CR data from yaml")
        lv_data = templating.load_yaml(constants.LOCAL_VOLUME_YAML)

        # Set local-volume namespace
        lv_data["metadata"]["namespace"] = lso_namespace

        # Set storage class
        logger.info("Updating LocalVolume CR data with LSO storageclass: %s",
                    storageclass)
        for scd in lv_data["spec"]["storageClassDevices"]:
            scd["storageClassName"] = storageclass

        # Update local volume data with NVME IDs
        logger.info("Updating LocalVolume CR data with device paths: %s",
                    device_paths)
        lv_data["spec"]["storageClassDevices"][0]["devicePaths"] = device_paths

        # Create temp yaml file and create local volume
        lv_data_yaml = tempfile.NamedTemporaryFile(mode="w+",
                                                   prefix="local_volume",
                                                   delete=False)
        templating.dump_data_to_temp_yaml(lv_data, lv_data_yaml.name)
        logger.info("Creating LocalVolume CR")
        run_cmd(f"oc create -f {lv_data_yaml.name}")
    logger.info("Waiting 30 seconds for PVs to create")
    storage_class_device_count = 1
    if platform == constants.AWS_PLATFORM and lso_type != constants.AWS_EBS:
        storage_class_device_count = 2
    elif platform == constants.IBM_POWER_PLATFORM:
        storage_class_device_count = config.ENV_DATA.get(
            "number_of_storage_disks", 1)
    elif platform == constants.VSPHERE_PLATFORM:
        # extra_disks is used in vSphere attach_disk() method
        storage_class_device_count = config.ENV_DATA.get("extra_disks", 1)
    expected_pvs = len(worker_names) * storage_class_device_count
    verify_pvs_created(expected_pvs, storageclass)
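
Most of the heavy lifting in this example is one recurring flow: load a (multi-document) YAML manifest, patch fields per kind, dump to a temp file, and `oc create -f` it. Below is a minimal sketch of that flow using plain PyYAML and subprocess; the manifest path and namespace are hypothetical, and ocs-ci wraps the same steps in its templating helpers:

import subprocess
import tempfile

import yaml

MANIFEST = "local-storage-operator.yaml"  # hypothetical multi-doc manifest
NAMESPACE = "openshift-local-storage"

with open(MANIFEST) as f:
    docs = list(yaml.safe_load_all(f))

# patch each document the way setup_local_storage() does
for doc in docs:
    if doc["kind"] == "Namespace":
        doc["metadata"]["name"] = NAMESPACE
    else:
        doc["metadata"]["namespace"] = NAMESPACE
    if doc["kind"] == "OperatorGroup":
        doc["spec"]["targetNamespaces"] = [NAMESPACE]

# dump to a temp file and apply it with oc
with tempfile.NamedTemporaryFile(mode="w+", suffix=".yaml", delete=False) as tmp:
    yaml.safe_dump_all(docs, tmp)

subprocess.run(["oc", "create", "-f", tmp.name], check=True)
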
Example #3
def setup_local_storage(storageclass):
    """
    Setup the necessary resources for enabling local storage.

    Args:
        storageclass (string): storageClassName value to be used in
            LocalVolume CR based on LOCAL_VOLUME_YAML

    """
    # Get the worker nodes
    workers = get_nodes(node_type="worker")
    worker_names = [worker.name for worker in workers]
    logger.debug("Workers: %s", worker_names)

    ocp_version = get_ocp_version()
    ocs_version = config.ENV_DATA.get("ocs_version")
    ocp_ga_version = get_ocp_ga_version(ocp_version)
    if not ocp_ga_version:
        optional_operators_data = templating.load_yaml(
            constants.LOCAL_STORAGE_OPTIONAL_OPERATORS, multi_document=True
        )
        logger.info(
            "Creating temp yaml file with optional operators data:\n %s",
            optional_operators_data,
        )
        optional_operators_yaml = tempfile.NamedTemporaryFile(
            mode="w+", prefix="optional_operators", delete=False
        )
        templating.dump_data_to_temp_yaml(
            optional_operators_data, optional_operators_yaml.name
        )
        with open(optional_operators_yaml.name, "r") as f:
            logger.info(f.read())
    logger.info(
        "Creating optional operators CatalogSource and ImageContentSourcePolicy"
    )
        run_cmd(f"oc create -f {optional_operators_yaml.name}")
        logger.info("Sleeping for 60 sec to start update machineconfigpool status")
        # sleep here to start update machineconfigpool status
        time.sleep(60)
        wait_for_machineconfigpool_status("all")

    logger.info("Retrieving local-storage-operator data from yaml")
    lso_data = list(
        templating.load_yaml(constants.LOCAL_STORAGE_OPERATOR, multi_document=True)
    )

    # ensure namespace is correct
    lso_namespace = config.ENV_DATA["local_storage_namespace"]
    for data in lso_data:
        if data["kind"] == "Namespace":
            data["metadata"]["name"] = lso_namespace
        else:
            data["metadata"]["namespace"] = lso_namespace
        if data["kind"] == "OperatorGroup":
            data["spec"]["targetNamespaces"] = [lso_namespace]

    # Update local-storage-operator subscription data with channel
    for data in lso_data:
        if data["kind"] == "Subscription":
            data["spec"]["channel"] = get_lso_channel()
        if not ocp_ga_version:
            if data["kind"] == "Subscription":
                data["spec"]["source"] = "optional-operators"

    # Create temp yaml file and create local storage operator
    logger.info(
        "Creating temp yaml file with local-storage-operator data:\n %s", lso_data
    )
    lso_data_yaml = tempfile.NamedTemporaryFile(
        mode="w+", prefix="local_storage_operator", delete=False
    )
    templating.dump_data_to_temp_yaml(lso_data, lso_data_yaml.name)
    with open(lso_data_yaml.name, "r") as f:
        logger.info(f.read())
    logger.info("Creating local-storage-operator")
    run_cmd(f"oc create -f {lso_data_yaml.name}")

    local_storage_operator = ocp.OCP(kind=constants.POD, namespace=lso_namespace)
    assert local_storage_operator.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.LOCAL_STORAGE_OPERATOR_LABEL,
        timeout=600,
    ), "Local storage operator did not reach running phase"

    # Add RDM/VMDK disk for vSphere platform
    platform = config.ENV_DATA.get("platform").lower()
    lso_type = config.DEPLOYMENT.get("type")
    if platform == constants.VSPHERE_PLATFORM:
        # Types of LSO Deployment
        # Importing here to avoid circular dependency
        from ocs_ci.deployment.vmware import VSPHEREBASE

        vsphere_base = VSPHEREBASE()

        if lso_type == constants.RDM:
            logger.info(f"LSO Deployment type: {constants.RDM}")
            vsphere_base.add_rdm_disks()

        if lso_type == constants.VMDK:
            logger.info(f"LSO Deployment type: {constants.VMDK}")
            vsphere_base.attach_disk(
                config.ENV_DATA.get("device_size", defaults.DEVICE_SIZE),
                config.DEPLOYMENT.get("provision_type", constants.VM_DISK_TYPE),
            )

        if lso_type == constants.DIRECTPATH:
            raise NotImplementedError(
                "LSO Deployment for VMDirectPath is not implemented"
            )
    if (ocp_version >= "4.6") and (ocs_version >= "4.6"):
        # Pull local volume discovery yaml data
        logger.info("Pulling LocalVolumeDiscovery CR data from yaml")
        lvd_data = templating.load_yaml(constants.LOCAL_VOLUME_DISCOVERY_YAML)
        # Set local-volume-discovery namespace
        lvd_data["metadata"]["namespace"] = lso_namespace

        worker_nodes = get_compute_node_names(no_replace=True)

        # Update LocalVolumeDiscovery data with worker node names
        logger.info(
            "Updating LocalVolumeDiscovery CR data with worker node names: %s",
            worker_nodes,
        )
        lvd_data["spec"]["nodeSelector"]["nodeSelectorTerms"][0]["matchExpressions"][0][
            "values"
        ] = worker_nodes
        lvd_data_yaml = tempfile.NamedTemporaryFile(
            mode="w+", prefix="local_volume_discovery", delete=False
        )
        templating.dump_data_to_temp_yaml(lvd_data, lvd_data_yaml.name)

        logger.info("Creating LocalVolumeDiscovery CR")
        run_cmd(f"oc create -f {lvd_data_yaml.name}")

        # Pull local volume set yaml data
        logger.info("Pulling LocalVolumeSet CR data from yaml")
        lvs_data = templating.load_yaml(constants.LOCAL_VOLUME_SET_YAML)

        # Since we don't have a datastore with SSDs on our current VMware
        # machines, the LocalVolumeSet doesn't detect any NonRotational disks.
        # As a workaround we add "Rotational" to deviceMechanicalProperties so
        # that HDD disks are detected.
        if platform == constants.VSPHERE_PLATFORM or config.ENV_DATA.get(
            "local_storage_allow_rotational_disks"
        ):
            logger.info(
                "Adding Rotational to deviceMechanicalProperties spec"
                " to detect HDD disks"
            )
            lvs_data["spec"]["deviceInclusionSpec"][
                "deviceMechanicalProperties"
            ].append("Rotational")

        # Update LocalVolumeSet data with worker node names
        logger.info(
            "Updating LocalVolumeSet CR data with worker node names: %s", worker_nodes
        )
        lvs_data["spec"]["nodeSelector"]["nodeSelectorTerms"][0]["matchExpressions"][0][
            "values"
        ] = worker_nodes

        # Set storage class
        logger.info(
            "Updating LocalVolumeSet CR data with LSO storageclass: %s", storageclass
        )
        lvs_data["spec"]["storageClassName"] = storageclass

        lvs_data_yaml = tempfile.NamedTemporaryFile(
            mode="w+", prefix="local_volume_set", delete=False
        )
        templating.dump_data_to_temp_yaml(lvs_data, lvs_data_yaml.name)
        logger.info("Creating LocalVolumeSet CR")
        run_cmd(f"oc create -f {lvs_data_yaml.name}")
    else:
        # Retrieve NVME device path ID for each worker node
        device_paths = get_device_paths(worker_names)

        # Pull local volume yaml data
        logger.info("Pulling LocalVolume CR data from yaml")
        lv_data = templating.load_yaml(constants.LOCAL_VOLUME_YAML)

        # Set local-volume namespace
        lv_data["metadata"]["namespace"] = lso_namespace

        # Set storage class
        logger.info(
            "Updating LocalVolume CR data with LSO storageclass: %s", storageclass
        )
        for scd in lv_data["spec"]["storageClassDevices"]:
            scd["storageClassName"] = storageclass

        # Update local volume data with NVME IDs
        logger.info("Updating LocalVolume CR data with device paths: %s", device_paths)
        lv_data["spec"]["storageClassDevices"][0]["devicePaths"] = device_paths

        # Create temp yaml file and create local volume
        lv_data_yaml = tempfile.NamedTemporaryFile(
            mode="w+", prefix="local_volume", delete=False
        )
        templating.dump_data_to_temp_yaml(lv_data, lv_data_yaml.name)
        logger.info("Creating LocalVolume CR")
        run_cmd(f"oc create -f {lv_data_yaml.name}")
    logger.info("Waiting 30 seconds for PVs to create")
    storage_class_device_count = 1
    if platform == constants.AWS_PLATFORM:
        storage_class_device_count = 2
    verify_pvs_created(len(worker_names) * storage_class_device_count)
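
Version comparisons in this function have to be numeric: comparing release strings lexicographically mis-orders two-digit minor versions. A tiny, self-contained demonstration of the pitfall and a tuple-based fix (generic Python, not an ocs-ci helper):

# Lexicographic vs numeric version comparison (generic illustration).
assert "4.10" < "4.6"  # string comparison: "1" < "6", so "4.10" sorts first

def as_tuple(ver: str) -> tuple:
    return tuple(int(part) for part in ver.split("."))

assert as_tuple("4.10") > as_tuple("4.6")  # numeric comparison: correct
print("version comparison checks passed")
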
Example #4
def ocs_install_verification(
    timeout=600, skip_osd_distribution_check=False, ocs_registry_image=None,
    post_upgrade_verification=False,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.

    """
    from ocs_ci.ocs.node import get_typed_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    number_of_worker_nodes = len(get_typed_nodes())
    namespace = config.ENV_DATA['cluster_namespace']
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data['spec']['version']
    ocs_version = config.ENV_DATA['ocs_version']
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}"
    )
    assert ocs_version in csv_version, (
        f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    )
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        'ocs_registry_image'
    )
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.split(":")[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}"
        )
        ignore_csv_mismatch = config.DEPLOYMENT.get('ignore_csv_mismatch')
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV"
            )
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}"
            )

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(
        f"Check if StorageCluster: {storage_cluster_name} is in "
        f"Ready phase"
    )
    storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(
        kind=constants.POD, namespace=namespace
    )
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count'])
        * int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica'])
    )

    # check noobaa CR for min number of noobaa endpoint pods
    nb_obj = OCP(kind='noobaa', namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    min_eps = nb_obj.get().get('items')[0].get('spec').get('endpoints').get('minCount')
    max_eps = nb_obj.get().get('items')[0].get('spec').get('endpoints').get('maxCount')

    resources_dict = {
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_DB_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.MON_APP_LABEL: 3,
        constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
        constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
        constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
        constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
        constants.OSD_APP_LABEL: osd_count,
        constants.MGR_APP_LABEL: 1,
        constants.MDS_APP_LABEL: 2,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps
    }
    if config.ENV_DATA.get('platform') in constants.ON_PREM_PLATFORMS:
        # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1857802 - RGW count is 1
        # post upgrade to OCS 4.5. Tracked with
        # https://github.com/red-hat-storage/ocs-ci/issues/2532
        if (float(config.ENV_DATA['ocs_version']) >= 4.5
                and not post_upgrade_verification):
            rgw_count = 2
        else:
            rgw_count = 1
        resources_dict.update({constants.RGW_APP_LABEL: rgw_count})
    for label, count in resources_dict.items():
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout
        )

    nb_ep_pods = get_pods_having_label(
        label=constants.NOOBAA_ENDPOINT_POD_LABEL, namespace=defaults.ROOK_CLUSTER_NAMESPACE
    )
    assert len(nb_ep_pods) <= max_eps, (
        f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
        f"is greater than the maximum defined in the NooBaa CR ({max_eps})"
    )

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(
        kind=constants.STORAGECLASS, namespace=namespace
    )
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    required_storage_classes = {
        f'{storage_cluster_name}-cephfs',
        f'{storage_cluster_name}-ceph-rbd'
    }
    storage_classes = storage_class.get()
    storage_class_names = {
        item['metadata']['name'] for item in storage_classes['items']
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not skip_osd_distribution_check:
        log.info("Verifying OSDs are distributed evenly across worker nodes")
        ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
        osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items']
        deviceset_count = get_deviceset_count()
        node_names = [osd['spec']['nodeName'] for osd in osds]
        for node in node_names:
            assert node_names.count(node) <= deviceset_count, (
                "OSDs are not distributed evenly across worker nodes"
            )

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    assert {defaults.CEPHFS_PROVISIONER, defaults.RBD_PROVISIONER} == (
        {item['metadata']['name'] for item in csi_driver.get()['items']}
    )

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    sc_rbd = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_RBD
    )
    sc_cephfs = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS
    )
    assert sc_rbd['parameters']['csi.storage.k8s.io/node-stage-secret-name'] == constants.RBD_NODE_SECRET
    assert sc_rbd['parameters']['csi.storage.k8s.io/provisioner-secret-name'] == constants.RBD_PROVISIONER_SECRET
    assert sc_cephfs['parameters']['csi.storage.k8s.io/node-stage-secret-name'] == constants.CEPHFS_NODE_SECRET
    assert sc_cephfs['parameters']['csi.storage.k8s.io/provisioner-secret-name'] == constants.CEPHFS_PROVISIONER_SECRET
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    log.info(
        "Verifying ceph osd tree output and checking for device set PVC names "
        "in the output."
    )

    if (
        config.DEPLOYMENT.get('local_storage')
        and config.ENV_DATA['platform'] != constants.BAREMETALPSI_PLATFORM
    ):
        deviceset_pvcs = get_compute_node_names()
    else:
        deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

    ct_pod = get_ceph_tools_pod()
    osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json')
    schemas = {
        'root': constants.OSD_TREE_ROOT,
        'rack': constants.OSD_TREE_RACK,
        'host': constants.OSD_TREE_HOST,
        'osd': constants.OSD_TREE_OSD,
        'region': constants.OSD_TREE_REGION,
        'zone': constants.OSD_TREE_ZONE
    }
    schemas['host']['properties']['name'] = {'enum': deviceset_pvcs}
    for item in osd_tree['nodes']:
        validate(instance=item, schema=schemas[item['type']])
        if item['type'] == 'host':
            deviceset_pvcs.remove(item['name'])
    assert not deviceset_pvcs, (
        f"These device set PVCs are not given in ceph osd tree output "
        f"- {deviceset_pvcs}"
    )
    log.info(
        "Verified ceph osd tree output. Device set PVC names are given in the "
        "output."
    )

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    log.info("Verifying CSI snapshotter is not present.")
    provisioner_pods = get_all_pods(
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        selector=[
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        ]
    )
    for pod_obj in provisioner_pods:
        pod_info = pod_obj.get()
        for container, image in get_images(data=pod_info).items():
            assert ('snapshot' not in container) and ('snapshot' not in image), (
                f"Snapshot container is present in {pod_obj.name} pod. "
                f"Container {container}. Image {image}"
            )
    deployments = ocs_csv.get()['spec']['install']['spec']['deployments']
    rook_ceph_operator_deployment = [
        deployment_val for deployment_val in deployments if deployment_val['name'] == 'rook-ceph-operator'
    ]
    assert {'name': 'CSI_ENABLE_SNAPSHOTTER', 'value': 'false'} in (
        rook_ceph_operator_deployment[0]['spec']['template']['spec']['containers'][0]['env']
    ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
    log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(
            ceph_cmd='ceph osd crush dump', format=''
        )
        pool_names = [
            constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL
        ]
        crush_rules = [rule for rule in crush_dump['rules'] if rule['rule_name'] in pool_names]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule['steps'] if item.get('type') == 'zone'
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(
        namespace, health_check_tries, health_check_delay
    )
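
The pod-count verification above boils down to a label-selector to expected-count table plus a wait loop. Here is a compact sketch of the same check against the official Kubernetes Python client rather than ocs-ci's OCP wrapper; it assumes a reachable cluster via ~/.kube/config, and the selectors, namespace, and counts are illustrative:

from kubernetes import client, config as kube_config

kube_config.load_kube_config()
v1 = client.CoreV1Api()

namespace = "openshift-storage"
expected = {  # illustrative label selectors and counts
    "app=rook-ceph-mon": 3,
    "app=rook-ceph-mgr": 1,
}

for selector, count in expected.items():
    pods = v1.list_namespaced_pod(namespace, label_selector=selector).items
    running = [p for p in pods if p.status.phase == "Running"]
    assert len(running) == count, (
        f"{selector}: expected {count} running pods, found {len(running)}"
    )
print("pod counts verified")
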
Example #5
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
    version_before_upgrade=None,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.
        version_before_upgrade (float): Set to OCS version before upgrade

    """
    from ocs_ci.ocs.node import get_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    from ocs_ci.ocs.resources.fips import check_fips_enabled

    number_of_worker_nodes = len(get_nodes())
    namespace = config.ENV_DATA["cluster_namespace"]
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data["spec"]["version"]
    ocs_version = config.ENV_DATA["ocs_version"]
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}")
    assert (
        ocs_version in csv_version
    ), f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        "ocs_registry_image")
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.split(":")[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}")
        ignore_csv_mismatch = config.DEPLOYMENT.get("ignore_csv_mismatch")
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV")
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}")

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(f"Check if StorageCluster: {storage_cluster_name} is in"
             f"Succeeded phase")
    storage_cluster.wait_for_phase(phase="Ready", timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    if not config.DEPLOYMENT["external_mode"]:
        osd_count = int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["count"]
        ) * int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["replica"])
    rgw_count = None
    if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
        # RGW count is 1 if the OCS version is < 4.5, or if the cluster
        # was upgraded from a version <= 4.4
        if float(config.ENV_DATA["ocs_version"]) < 4.5 or (
                float(config.ENV_DATA["ocs_version"]) == 4.5
                and post_upgrade_verification
                and float(version_before_upgrade) < 4.5):
            rgw_count = 1
        else:
            rgw_count = 2

    # With a 4.4 OCS cluster deployed over Azure, RGW is the default backingstore
    if config.ENV_DATA.get("platform") == constants.AZURE_PLATFORM:
        if float(config.ENV_DATA["ocs_version"]) == 4.4 or (
                float(config.ENV_DATA["ocs_version"]) == 4.5
                and post_upgrade_verification
                and float(version_before_upgrade) < 4.5):
            rgw_count = 1

    min_eps = constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT
    max_eps = (constants.MAX_NB_ENDPOINT_COUNT
               if float(config.ENV_DATA["ocs_version"]) >= 4.6 else 1)

    if config.ENV_DATA.get("platform") == constants.IBM_POWER_PLATFORM:
        min_eps = 1
        max_eps = 1

    resources_dict = {
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_DB_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps,
    }
    if not config.DEPLOYMENT["external_mode"]:
        resources_dict.update({
            constants.MON_APP_LABEL: 3,
            constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
            constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
            constants.OSD_APP_LABEL: osd_count,
            constants.MGR_APP_LABEL: 1,
            constants.MDS_APP_LABEL: 2,
            constants.RGW_APP_LABEL: rgw_count,
        })

    for label, count in resources_dict.items():
        if label == constants.RGW_APP_LABEL:
            if (config.ENV_DATA.get("platform")
                    not in constants.ON_PREM_PLATFORMS):
                continue
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout,
        )

    nb_ep_pods = get_pods_having_label(
        label=constants.NOOBAA_ENDPOINT_POD_LABEL,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
    )
    assert len(nb_ep_pods) <= max_eps, (
        f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
        f"is greater than the maximum defined in the NooBaa CR ({max_eps})")

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    required_storage_classes = {
        f"{storage_cluster_name}-cephfs",
        f"{storage_cluster_name}-ceph-rbd",
    }
    if config.DEPLOYMENT["external_mode"]:
        required_storage_classes.update({
            f"{storage_cluster_name}-ceph-rgw",
            f'{config.ENV_DATA["cluster_namespace"]}.noobaa.io',
        })
    storage_classes = storage_class.get()
    storage_class_names = {
        item["metadata"]["name"]
        for item in storage_classes["items"]
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not config.DEPLOYMENT["external_mode"]:
        if not skip_osd_distribution_check:
            log.info(
                "Verifying OSDs are distributed evenly across worker nodes")
            ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
            osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)["items"]
            deviceset_count = get_deviceset_count()
            node_names = [osd["spec"]["nodeName"] for osd in osds]
            for node in node_names:
                assert (
                    node_names.count(node) <= deviceset_count
                ), "OSDs are not distributed evenly across worker nodes"

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    csi_drivers = {
        item["metadata"]["name"]
        for item in csi_driver.get()["items"]
    }
    assert defaults.CSI_PROVISIONERS.issubset(csi_drivers)

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    if config.DEPLOYMENT["external_mode"]:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD)
        sc_cephfs = storage_class.get(resource_name=(
            constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS))
    else:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_STORAGECLASS_RBD)
        sc_cephfs = storage_class.get(
            resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS)
    assert (sc_rbd["parameters"]["csi.storage.k8s.io/node-stage-secret-name"]
            == constants.RBD_NODE_SECRET)
    assert (sc_rbd["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
            == constants.RBD_PROVISIONER_SECRET)
    assert (
        sc_cephfs["parameters"]["csi.storage.k8s.io/node-stage-secret-name"] ==
        constants.CEPHFS_NODE_SECRET)
    assert (
        sc_cephfs["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
        == constants.CEPHFS_PROVISIONER_SECRET)
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    if not config.DEPLOYMENT["external_mode"]:
        log.info(
            "Verifying ceph osd tree output and checking for device set PVC names "
            "in the output.")

        if config.DEPLOYMENT.get("local_storage"):
            deviceset_pvcs = get_compute_node_names()
        else:
            deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

        ct_pod = get_ceph_tools_pod()
        osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd tree",
                                        format="json")
        schemas = {
            "root": constants.OSD_TREE_ROOT,
            "rack": constants.OSD_TREE_RACK,
            "host": constants.OSD_TREE_HOST,
            "osd": constants.OSD_TREE_OSD,
            "region": constants.OSD_TREE_REGION,
            "zone": constants.OSD_TREE_ZONE,
        }
        schemas["host"]["properties"]["name"] = {"enum": deviceset_pvcs}
        for item in osd_tree["nodes"]:
            validate(instance=item, schema=schemas[item["type"]])
            if item["type"] == "host":
                deviceset_pvcs.remove(item["name"])
        assert not deviceset_pvcs, (
            f"These device set PVCs are not given in ceph osd tree output "
            f"- {deviceset_pvcs}")
        log.info(
            "Verified ceph osd tree output. Device set PVC names are given in the "
            "output.")

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    # if the OCS version is < 4.6
    if float(config.ENV_DATA["ocs_version"]) < 4.6:
        log.info("Verifying CSI snapshotter is not present.")
        provisioner_pods = get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            selector=[
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
            ],
        )
        for pod_obj in provisioner_pods:
            pod_info = pod_obj.get()
            for container, image in get_images(data=pod_info).items():
                assert ("snapshot" not in container) and (
                    "snapshot" not in image
                ), (f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}")
        deployments = ocs_csv.get()["spec"]["install"]["spec"]["deployments"]
        rook_ceph_operator_deployment = [
            deployment_val for deployment_val in deployments
            if deployment_val["name"] == "rook-ceph-operator"
        ]
        assert {
            "name": "CSI_ENABLE_SNAPSHOTTER",
            "value": "false"
        } in (rook_ceph_operator_deployment[0]["spec"]["template"]["spec"]
              ["containers"][0]["env"]
              ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
        log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd crush dump",
                                          format="")
        pool_names = [
            constants.METADATA_POOL,
            constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL,
        ]
        crush_rules = [
            rule for rule in crush_dump["rules"]
            if rule["rule_name"] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule["steps"]
                if item.get("type") == "zone"
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(namespace, health_check_tries,
                                   health_check_delay)
    if config.ENV_DATA.get("fips"):
        # If fips was enabled at deployment time, verify it is active
        # on all pods in Running state
        check_fips_enabled()
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()
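
The ceph_health_check(namespace, tries, delay) call near the end is a retry loop: poll, sleep, repeat, and fail once the budget is spent, which is why the tries are bumped from 20 to 180 after an upgrade. A generic sketch of that pattern; check_fn and the example lambda are placeholders, not ocs-ci APIs:

import time

def wait_until_healthy(check_fn, tries=20, delay=30):
    """Poll check_fn up to `tries` times, sleeping `delay` seconds between polls."""
    for attempt in range(1, tries + 1):
        if check_fn():
            return True
        time.sleep(delay)
    raise TimeoutError(f"health check failed after {tries} tries")

# e.g. wait_until_healthy(lambda: get_ceph_status() == "HEALTH_OK", tries=180)
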