Ejemplo n.º 1
0
def test_upgrade():
    ceph_cluster = CephCluster()
    ceph_cluster.enable_health_monitor()
    namespace = config.ENV_DATA['cluster_namespace']
    ocs_catalog = CatalogSource(
        resource_name=constants.OPERATOR_CATALOG_SOURCE_NAME,
        namespace="openshift-marketplace",
    )
    image_url = ocs_catalog.get_image_url()
    image_tag = ocs_catalog.get_image_name()
    if config.DEPLOYMENT.get('upgrade_to_latest', True):
        new_image_tag = get_latest_ds_olm_tag()
    else:
        new_image_tag = get_next_version_available_for_upgrade(image_tag)
    cs_data = deepcopy(ocs_catalog.data)
    cs_data['spec']['image'] = ':'.join([image_url, new_image_tag])
    package_manifest = PackageManifest(resource_name=OCS_OPERATOR_NAME)
    csv_name_pre_upgrade = package_manifest.get_current_csv()
    log.info(f"CSV name before upgrade is: {csv_name_pre_upgrade}")
    csv_pre_upgrade = CSV(resource_name=csv_name_pre_upgrade,
                          namespace=namespace)
    pre_upgrade_images = get_images(csv_pre_upgrade.get())

    with NamedTemporaryFile() as cs_yaml:
        dump_data_to_temp_yaml(cs_data, cs_yaml.name)
        ocs_catalog.apply(cs_yaml.name)
    # Wait for package manifest is ready
    package_manifest.wait_for_resource()
    subscription_plan_approval = config.DEPLOYMENT.get(
        'subscription_plan_approval')
    if subscription_plan_approval == 'Manual':
        wait_for_install_plan_and_approve(namespace)
    attempts = 145
    for attempt in range(1, attempts):
        if attempts == attempt:
            raise TimeoutException("No new CSV found after upgrade!")
        log.info(f"Attempt {attempt}/{attempts} to check CSV upgraded.")
        package_manifest.reload_data()
        csv_name_post_upgrade = package_manifest.get_current_csv()
        if csv_name_post_upgrade == csv_name_pre_upgrade:
            log.info(f"CSV is still: {csv_name_post_upgrade}")
            sleep(5)
        else:
            log.info(f"CSV now upgraded to: {csv_name_post_upgrade}")
            break
    csv_post_upgrade = CSV(resource_name=csv_name_post_upgrade,
                           namespace=namespace)
    log.info(
        f"Waiting for CSV {csv_name_post_upgrade} to be in succeeded state")
    csv_post_upgrade.wait_for_phase("Succeeded", timeout=600)
    post_upgrade_images = get_images(csv_post_upgrade.get())
    old_images, _, _ = get_upgrade_image_info(pre_upgrade_images,
                                              post_upgrade_images)
    verify_image_versions(old_images)
    ocs_install_verification(timeout=600, skip_osd_distribution_check=True)
    ceph_cluster.disable_health_monitor()
    if ceph_cluster.health_error_status:
        CephHealthException(f"During upgrade hit Ceph HEALTH_ERROR: "
                            f"{ceph_cluster.health_error_status}")
Ejemplo n.º 2
0
def get_version_info(namespace=None):
    package_manifest = PackageManifest(
        resource_name=defaults.OCS_OPERATOR_NAME)
    csv_name = package_manifest.get_current_csv()
    csv_pre = CSV(resource_name=csv_name, namespace=namespace)
    info = get_images(csv_pre.get())
    return info
Ejemplo n.º 3
0
def get_csv_version(channel):
    """
    Get the cluster-logging and Elasticsearch CSV version
    and Images of the pods

    Args:
        channel (str) : Logging Channel

    Returns:
        tuple: Tuple containing three elements
            cluster_logging_csv (str) : Name of the cluster-logging CSV
            elasticsearch_csv (str) : Name of the elasticsearch CSV
            dict: Images dict like: {'image_name': 'image.url.to:tag', ...}

    """

    clo_package_manifest = PackageManifest(resource_name="cluster-logging")
    cluster_logging_csv = clo_package_manifest.get_current_csv(channel)
    es_package_manifest = PackageManifest(resource_name="elasticsearch-operator")
    elasticsearch_csv = es_package_manifest.get_current_csv(channel)
    logging_csv = CSV(
        resource_name=cluster_logging_csv,
        namespace=constants.OPENSHIFT_LOGGING_NAMESPACE,
    )
    images = ocp.get_images(logging_csv.get())
    return cluster_logging_csv, elasticsearch_csv, images
Ejemplo n.º 4
0
def get_version_info(namespace=None):
    operator_selector = get_selector_for_ocs_operator()
    package_manifest = PackageManifest(
        resource_name=defaults.OCS_OPERATOR_NAME,
        selector=operator_selector,
    )
    csv_name = package_manifest.get_current_csv()
    csv_pre = CSV(resource_name=csv_name, namespace=namespace)
    info = get_images(csv_pre.get())
    return info
Ejemplo n.º 5
0
def get_version_info(namespace=None):
    operator_selector = get_selector_for_ocs_operator()
    subscription_plan_approval = config.DEPLOYMENT.get("subscription_plan_approval")
    package_manifest = PackageManifest(
        resource_name=defaults.OCS_OPERATOR_NAME,
        selector=operator_selector,
        subscription_plan_approval=subscription_plan_approval,
    )
    channel = config.DEPLOYMENT.get("ocs_csv_channel")
    csv_name = package_manifest.get_current_csv(channel)
    csv_pre = CSV(resource_name=csv_name, namespace=namespace)
    info = get_images(csv_pre.get())
    return info
Ejemplo n.º 6
0
    def get_images_post_upgrade(self, channel, pre_upgrade_images,
                                upgrade_version):
        """
        Checks if all images of OCS cluster upgraded,
            and return list of all images if upgrade success

        Args:
            channel: (str): OCS subscription channel
            pre_upgrade_images: (dict): Contains all OCS cluster images
            upgrade_version: (str): version to be upgraded

        Returns:
            set: Contains full path of OCS cluster old images

        """
        operator_selector = get_selector_for_ocs_operator()
        package_manifest = PackageManifest(
            resource_name=OCS_OPERATOR_NAME,
            selector=operator_selector,
            subscription_plan_approval=self.subscription_plan_approval,
        )
        csv_name_post_upgrade = package_manifest.get_current_csv(channel)
        csv_post_upgrade = CSV(resource_name=csv_name_post_upgrade,
                               namespace=self.namespace)
        log.info(
            f"Waiting for CSV {csv_name_post_upgrade} to be in succeeded state"
        )

        # Workaround for patching missing ceph-rook-tools pod after upgrade
        if self.version_before_upgrade == "4.2" and upgrade_version == "4.3":
            log.info("Force creating Ceph toolbox after upgrade 4.2 -> 4.3")
            setup_ceph_toolbox(force_setup=True)
        # End of workaround

        if config.DEPLOYMENT.get("external_mode") or config.ENV_DATA.get(
                "mcg_only_deployment"):
            timeout = 200
        else:
            timeout = 200 * get_osd_count()
        csv_post_upgrade.wait_for_phase("Succeeded", timeout=timeout)

        post_upgrade_images = get_images(csv_post_upgrade.get())
        old_images, _, _ = get_upgrade_image_info(pre_upgrade_images,
                                                  post_upgrade_images)

        return old_images
Ejemplo n.º 7
0
    def get_pre_upgrade_image(self, csv_name_pre_upgrade):
        """
        Getting all OCS cluster images, before upgrade

        Args:
            csv_name_pre_upgrade: (str): OCS operator name

        Returns:
            dict: Contains all OCS cluster images
                dict keys: Image name
                dict values: Image full path

        """
        csv_name_pre_upgrade = csv_name_pre_upgrade
        log.info(f"CSV name before upgrade is: {csv_name_pre_upgrade}")
        csv_pre_upgrade = CSV(resource_name=csv_name_pre_upgrade,
                              namespace=self.namespace)
        return get_images(csv_pre_upgrade.get())
Ejemplo n.º 8
0
def ocs_install_verification(timeout=600, skip_osd_distribution_check=False):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.

    """
    from ocs_ci.ocs.node import get_typed_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    number_of_worker_nodes = len(get_typed_nodes())
    namespace = config.ENV_DATA['cluster_namespace']
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    operator_selector = get_selector_for_ocs_operator()
    ocs_package_manifest = PackageManifest(
        resource_name=defaults.OCS_OPERATOR_NAME,
        selector=operator_selector,
    )
    ocs_csv_name = ocs_package_manifest.get_current_csv()
    ocs_csv = CSV(resource_name=ocs_csv_name, namespace=namespace)
    log.info(f"Check if OCS operator: {ocs_csv_name} is in Succeeded phase.")
    ocs_csv.wait_for_phase(phase="Succeeded", timeout=timeout)

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(f"Check if StorageCluster: {storage_cluster_name} is in"
             f"Succeeded phase")
    storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    # ocs-operator
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.OCS_OPERATOR_LABEL,
                                 timeout=timeout)
    # rook-ceph-operator
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.OPERATOR_LABEL,
                                 timeout=timeout)
    # noobaa
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.NOOBAA_APP_LABEL,
                                 resource_count=2,
                                 timeout=timeout)
    # mons
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MON_APP_LABEL,
                                 resource_count=3,
                                 timeout=timeout)
    # csi-cephfsplugin
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.CSI_CEPHFSPLUGIN_LABEL,
                                 resource_count=number_of_worker_nodes,
                                 timeout=timeout)
    # csi-cephfsplugin-provisioner
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
        resource_count=2,
        timeout=timeout)
    # csi-rbdplugin
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.CSI_RBDPLUGIN_LABEL,
                                 resource_count=number_of_worker_nodes,
                                 timeout=timeout)
    # csi-rbdplugin-provisioner
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
        resource_count=2,
        timeout=timeout)
    # osds
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count']) *
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica']))
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.OSD_APP_LABEL,
                                 resource_count=osd_count,
                                 timeout=timeout)
    # mgr
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MGR_APP_LABEL,
                                 timeout=timeout)
    # mds
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MDS_APP_LABEL,
                                 resource_count=2,
                                 timeout=timeout)

    # rgw check only for VmWare
    if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM:
        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector=constants.RGW_APP_LABEL,
                                     resource_count=1,
                                     timeout=timeout)

    # Verify ceph health
    log.info("Verifying ceph health")
    assert utils.ceph_health_check(namespace=namespace)

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    required_storage_classes = {
        f'{storage_cluster_name}-cephfs', f'{storage_cluster_name}-ceph-rbd'
    }
    storage_classes = storage_class.get()
    storage_class_names = {
        item['metadata']['name']
        for item in storage_classes['items']
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSD's are distributed
    if not skip_osd_distribution_check:
        log.info("Verifying OSD's are distributed evenly across worker nodes")
        ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
        osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items']
        node_names = [osd['spec']['nodeName'] for osd in osds]
        for node in node_names:
            assert not node_names.count(node) > 1, (
                "OSD's are not distributed evenly across worker nodes")

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    assert {defaults.CEPHFS_PROVISIONER, defaults.RBD_PROVISIONER} == ({
        item['metadata']['name']
        for item in csi_driver.get()['items']
    })

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    sc_rbd = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_RBD)
    sc_cephfs = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS)
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'] == constants.RBD_NODE_SECRET
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'] == constants.RBD_PROVISIONER_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'] == constants.CEPHFS_NODE_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'] == constants.CEPHFS_PROVISIONER_SECRET
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    log.info(
        "Verifying ceph osd tree output and checking for device set PVC names "
        "in the output.")
    deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]
    ct_pod = get_ceph_tools_pod()
    osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json')
    schemas = {
        'root': constants.OSD_TREE_ROOT,
        'rack': constants.OSD_TREE_RACK,
        'host': constants.OSD_TREE_HOST,
        'osd': constants.OSD_TREE_OSD,
        'region': constants.OSD_TREE_REGION,
        'zone': constants.OSD_TREE_ZONE
    }
    schemas['host']['properties']['name'] = {'enum': deviceset_pvcs}
    for item in osd_tree['nodes']:
        validate(instance=item, schema=schemas[item['type']])
        if item['type'] == 'host':
            deviceset_pvcs.remove(item['name'])
    assert not deviceset_pvcs, (
        f"These device set PVCs are not given in ceph osd tree output "
        f"- {deviceset_pvcs}")
    log.info(
        "Verified ceph osd tree output. Device set PVC names are given in the "
        "output.")

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    log.info("Verifying CSI snapshotter is not present.")
    provisioner_pods = get_all_pods(
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        selector=[
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        ])
    for pod_obj in provisioner_pods:
        pod_info = pod_obj.get()
        for container, image in get_images(data=pod_info).items():
            assert ('snapshot' not in container) and (
                'snapshot' not in image), (
                    f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}")
    assert {
        'name': 'CSI_ENABLE_SNAPSHOTTER',
        'value': 'false'
    } in (ocs_csv.get()['spec']['install']['spec']['deployments'][0]['spec']
          ['template']['spec']['containers'][0]['env']
          ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
    log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd crush dump',
                                          format='')
        pool_names = [
            constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL
        ]
        crush_rules = [
            rule for rule in crush_dump['rules']
            if rule['rule_name'] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule['steps']
                if item.get('type') == 'zone'
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
Ejemplo n.º 9
0
def ocs_install_verification(
    timeout=600, skip_osd_distribution_check=False, ocs_registry_image=None,
    post_upgrade_verification=False,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.

    """
    from ocs_ci.ocs.node import get_typed_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    number_of_worker_nodes = len(get_typed_nodes())
    namespace = config.ENV_DATA['cluster_namespace']
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data['spec']['version']
    ocs_version = config.ENV_DATA['ocs_version']
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}"
    )
    assert ocs_version in csv_version, (
        f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    )
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        'ocs_registry_image'
    )
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.split(":")[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}"
        )
        ignore_csv_mismatch = config.DEPLOYMENT.get('ignore_csv_mismatch')
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV"
            )
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}"
            )

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(
        f"Check if StorageCluster: {storage_cluster_name} is in"
        f"Succeeded phase"
    )
    storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(
        kind=constants.POD, namespace=namespace
    )
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count'])
        * int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica'])
    )

    # check noobaa CR for min number of noobaa endpoint pods
    nb_obj = OCP(kind='noobaa', namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    min_eps = nb_obj.get().get('items')[0].get('spec').get('endpoints').get('minCount')
    max_eps = nb_obj.get().get('items')[0].get('spec').get('endpoints').get('maxCount')

    resources_dict = {
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_DB_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.MON_APP_LABEL: 3,
        constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
        constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
        constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
        constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
        constants.OSD_APP_LABEL: osd_count,
        constants.MGR_APP_LABEL: 1,
        constants.MDS_APP_LABEL: 2,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps
    }
    if config.ENV_DATA.get('platform') in constants.ON_PREM_PLATFORMS:
        # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1857802 - RGW count is 1
        # post upgrade to OCS 4.5. Tracked with
        # https://github.com/red-hat-storage/ocs-ci/issues/2532
        rgw_count = 2 if float(config.ENV_DATA['ocs_version']) >= 4.5 and not (
            post_upgrade_verification
        ) else 1
        resources_dict.update({constants.RGW_APP_LABEL: rgw_count})
    for label, count in resources_dict.items():
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout
        )

    nb_ep_pods = get_pods_having_label(
        label=constants.NOOBAA_ENDPOINT_POD_LABEL, namespace=defaults.ROOK_CLUSTER_NAMESPACE
    )
    assert len(nb_ep_pods) <= max_eps, (
        f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
        f"is greater than the maximum defined in the NooBaa CR ({max_eps})"
    )

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(
        kind=constants.STORAGECLASS, namespace=namespace
    )
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    required_storage_classes = {
        f'{storage_cluster_name}-cephfs',
        f'{storage_cluster_name}-ceph-rbd'
    }
    storage_classes = storage_class.get()
    storage_class_names = {
        item['metadata']['name'] for item in storage_classes['items']
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not skip_osd_distribution_check:
        log.info("Verifying OSDs are distributed evenly across worker nodes")
        ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
        osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items']
        deviceset_count = get_deviceset_count()
        node_names = [osd['spec']['nodeName'] for osd in osds]
        for node in node_names:
            assert not node_names.count(node) > deviceset_count, (
                "OSD's are not distributed evenly across worker nodes"
            )

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    assert {defaults.CEPHFS_PROVISIONER, defaults.RBD_PROVISIONER} == (
        {item['metadata']['name'] for item in csi_driver.get()['items']}
    )

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    sc_rbd = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_RBD
    )
    sc_cephfs = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS
    )
    assert sc_rbd['parameters']['csi.storage.k8s.io/node-stage-secret-name'] == constants.RBD_NODE_SECRET
    assert sc_rbd['parameters']['csi.storage.k8s.io/provisioner-secret-name'] == constants.RBD_PROVISIONER_SECRET
    assert sc_cephfs['parameters']['csi.storage.k8s.io/node-stage-secret-name'] == constants.CEPHFS_NODE_SECRET
    assert sc_cephfs['parameters']['csi.storage.k8s.io/provisioner-secret-name'] == constants.CEPHFS_PROVISIONER_SECRET
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    log.info(
        "Verifying ceph osd tree output and checking for device set PVC names "
        "in the output."
    )

    if (
        config.DEPLOYMENT.get('local_storage')
        and config.ENV_DATA['platform'] != constants.BAREMETALPSI_PLATFORM
    ):
        deviceset_pvcs = get_compute_node_names()
    else:
        deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

    ct_pod = get_ceph_tools_pod()
    osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json')
    schemas = {
        'root': constants.OSD_TREE_ROOT,
        'rack': constants.OSD_TREE_RACK,
        'host': constants.OSD_TREE_HOST,
        'osd': constants.OSD_TREE_OSD,
        'region': constants.OSD_TREE_REGION,
        'zone': constants.OSD_TREE_ZONE
    }
    schemas['host']['properties']['name'] = {'enum': deviceset_pvcs}
    for item in osd_tree['nodes']:
        validate(instance=item, schema=schemas[item['type']])
        if item['type'] == 'host':
            deviceset_pvcs.remove(item['name'])
    assert not deviceset_pvcs, (
        f"These device set PVCs are not given in ceph osd tree output "
        f"- {deviceset_pvcs}"
    )
    log.info(
        "Verified ceph osd tree output. Device set PVC names are given in the "
        "output."
    )

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    log.info("Verifying CSI snapshotter is not present.")
    provisioner_pods = get_all_pods(
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        selector=[
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        ]
    )
    for pod_obj in provisioner_pods:
        pod_info = pod_obj.get()
        for container, image in get_images(data=pod_info).items():
            assert ('snapshot' not in container) and ('snapshot' not in image), (
                f"Snapshot container is present in {pod_obj.name} pod. "
                f"Container {container}. Image {image}"
            )
    deployments = ocs_csv.get()['spec']['install']['spec']['deployments']
    rook_ceph_operator_deployment = [
        deployment_val for deployment_val in deployments if deployment_val['name'] == 'rook-ceph-operator'
    ]
    assert {'name': 'CSI_ENABLE_SNAPSHOTTER', 'value': 'false'} in (
        rook_ceph_operator_deployment[0]['spec']['template']['spec']['containers'][0]['env']
    ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
    log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(
            ceph_cmd='ceph osd crush dump', format=''
        )
        pool_names = [
            constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL
        ]
        crush_rules = [rule for rule in crush_dump['rules'] if rule['rule_name'] in pool_names]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule['steps'] if item.get('type') == 'zone'
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(
        namespace, health_check_tries, health_check_delay
    )
Ejemplo n.º 10
0
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
    version_before_upgrade=None,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.
        version_before_upgrade (float): Set to OCS version before upgrade

    """
    from ocs_ci.ocs.node import get_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    from ocs_ci.ocs.resources.fips import check_fips_enabled

    number_of_worker_nodes = len(get_nodes())
    namespace = config.ENV_DATA["cluster_namespace"]
    log.info("Verifying OCS installation")
    if config.ENV_DATA.get("disable_components"):
        for component in config.ENV_DATA["disable_components"]:
            config.COMPONENTS[f"disable_{component}"] = True
    disable_noobaa = config.COMPONENTS["disable_noobaa"]
    disable_rgw = config.COMPONENTS["disable_rgw"]
    disable_blockpools = config.COMPONENTS["disable_blockpools"]
    disable_cephfs = config.COMPONENTS["disable_cephfs"]

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data["spec"]["version"]
    ocs_version = version.get_semantic_ocs_version_from_config()
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}")
    assert (
        f"{ocs_version}" in csv_version
    ), f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        "ocs_registry_image")
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.rsplit(":", 1)[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}")
        ignore_csv_mismatch = config.DEPLOYMENT.get("ignore_csv_mismatch")
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV")
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}")

    # Verify Storage System status
    if ocs_version >= version.VERSION_4_9:
        log.info("Verifying storage system status")
        storage_system = OCP(kind=constants.STORAGESYSTEM, namespace=namespace)
        storage_system_data = storage_system.get()
        storage_system_status = {}
        for condition in storage_system_data["items"][0]["status"][
                "conditions"]:
            storage_system_status[condition["type"]] = condition["status"]
        log.debug(f"storage system status: {storage_system_status}")
        assert storage_system_status == constants.STORAGE_SYSTEM_STATUS, (
            f"Storage System status is not in expected state. Expected {constants.STORAGE_SYSTEM_STATUS}"
            f" but found {storage_system_status}")

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(f"Check if StorageCluster: {storage_cluster_name} is in"
             f"Succeeded phase")
    storage_cluster.wait_for_phase(phase="Ready", timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    if not config.DEPLOYMENT["external_mode"]:
        osd_count = int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["count"]
        ) * int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["replica"])
    rgw_count = None
    if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
        if not disable_rgw:
            rgw_count = get_rgw_count(f"{ocs_version}",
                                      post_upgrade_verification,
                                      version_before_upgrade)

    min_eps = constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT
    max_eps = (constants.MAX_NB_ENDPOINT_COUNT
               if ocs_version >= version.VERSION_4_6 else 1)

    if config.ENV_DATA.get("platform") == constants.IBM_POWER_PLATFORM:
        min_eps = 1
        max_eps = 1

    nb_db_label = (constants.NOOBAA_DB_LABEL_46_AND_UNDER
                   if ocs_version < version.VERSION_4_7 else
                   constants.NOOBAA_DB_LABEL_47_AND_ABOVE)
    resources_dict = {
        nb_db_label: 1,
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps,
    }
    if not config.DEPLOYMENT["external_mode"]:
        resources_dict.update({
            constants.MON_APP_LABEL: 3,
            constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
            constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
            constants.OSD_APP_LABEL: osd_count,
            constants.MGR_APP_LABEL: 1,
            constants.MDS_APP_LABEL: 2,
            constants.RGW_APP_LABEL: rgw_count,
        })

    if ocs_version >= version.VERSION_4_9:
        resources_dict.update({
            constants.ODF_OPERATOR_CONTROL_MANAGER_LABEL: 1,
        })

    for label, count in resources_dict.items():
        if label == constants.RGW_APP_LABEL:
            if (not config.ENV_DATA.get("platform")
                    in constants.ON_PREM_PLATFORMS or disable_rgw):
                continue
        if "noobaa" in label and disable_noobaa:
            continue
        if "mds" in label and disable_cephfs:
            continue

        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout,
        )

    if not disable_noobaa:
        nb_ep_pods = get_pods_having_label(
            label=constants.NOOBAA_ENDPOINT_POD_LABEL,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        )
        assert len(nb_ep_pods) <= max_eps, (
            f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
            f"is greater than the maximum defined in the NooBaa CR ({max_eps})"
        )

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    required_storage_classes = {
        f"{storage_cluster_name}-cephfs",
        f"{storage_cluster_name}-ceph-rbd",
    }
    if ocs_version >= version.VERSION_4_10:
        # TODO: Add rbd-thick storage class verification in external mode cluster upgraded
        # to OCS 4.8 when the bug 1978542 is fixed
        # Skip rbd-thick storage class verification in external mode upgraded cluster. This is blocked by bug 1978542
        if not (config.DEPLOYMENT["external_mode"]
                and post_upgrade_verification):
            required_storage_classes.update(
                {f"{storage_cluster_name}-ceph-rbd-thick"})
    skip_storage_classes = set()
    if disable_cephfs:
        skip_storage_classes.update({
            f"{storage_cluster_name}-cephfs",
        })
    if disable_blockpools:
        skip_storage_classes.update({
            f"{storage_cluster_name}-ceph-rbd",
        })
    required_storage_classes = required_storage_classes.difference(
        skip_storage_classes)

    if config.DEPLOYMENT["external_mode"]:
        required_storage_classes.update({
            f"{storage_cluster_name}-ceph-rgw",
            f'{config.ENV_DATA["cluster_namespace"]}.noobaa.io',
        })
    storage_classes = storage_class.get()
    storage_class_names = {
        item["metadata"]["name"]
        for item in storage_classes["items"]
    }
    # required storage class names should be observed in the cluster under test
    missing_scs = required_storage_classes.difference(storage_class_names)
    if len(missing_scs) > 0:
        log.error("few storage classess are not present: %s", missing_scs)
    assert list(missing_scs) == []

    # Verify OSDs are distributed
    if not config.DEPLOYMENT["external_mode"]:
        if not skip_osd_distribution_check:
            log.info(
                "Verifying OSDs are distributed evenly across worker nodes")
            ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
            osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)["items"]
            deviceset_count = get_deviceset_count()
            node_names = [osd["spec"]["nodeName"] for osd in osds]
            for node in node_names:
                assert (
                    not node_names.count(node) > deviceset_count
                ), "OSD's are not distributed evenly across worker nodes"

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    csi_drivers = {
        item["metadata"]["name"]
        for item in csi_driver.get()["items"]
    }
    assert defaults.CSI_PROVISIONERS.issubset(csi_drivers)

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    if config.DEPLOYMENT["external_mode"]:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD)
        sc_cephfs = storage_class.get(resource_name=(
            constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS))
    else:
        if not disable_blockpools:
            sc_rbd = storage_class.get(
                resource_name=constants.DEFAULT_STORAGECLASS_RBD)
        if not disable_cephfs:
            sc_cephfs = storage_class.get(
                resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS)
    if not disable_blockpools:
        assert (
            sc_rbd["parameters"]["csi.storage.k8s.io/node-stage-secret-name"]
            == constants.RBD_NODE_SECRET)
        assert (
            sc_rbd["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
            == constants.RBD_PROVISIONER_SECRET)
    if not disable_cephfs:
        assert (sc_cephfs["parameters"]
                ["csi.storage.k8s.io/node-stage-secret-name"] ==
                constants.CEPHFS_NODE_SECRET)
        assert (sc_cephfs["parameters"]
                ["csi.storage.k8s.io/provisioner-secret-name"] ==
                constants.CEPHFS_PROVISIONER_SECRET)
    log.info("Verified node and provisioner secret names in storage class.")

    ct_pod = get_ceph_tools_pod()

    # https://github.com/red-hat-storage/ocs-ci/issues/3820
    # Verify ceph osd tree output
    if not (config.DEPLOYMENT.get("ui_deployment")
            or config.DEPLOYMENT["external_mode"]):
        log.info(
            "Verifying ceph osd tree output and checking for device set PVC names "
            "in the output.")
        if config.DEPLOYMENT.get("local_storage"):
            deviceset_pvcs = [osd.get_node() for osd in get_osd_pods()]
            # removes duplicate hostname
            deviceset_pvcs = list(set(deviceset_pvcs))
            if config.ENV_DATA.get("platform") == constants.BAREMETAL_PLATFORM:
                deviceset_pvcs = [
                    deviceset.replace(".", "-") for deviceset in deviceset_pvcs
                ]
        else:
            deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

        osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd tree",
                                        format="json")
        schemas = {
            "root": constants.OSD_TREE_ROOT,
            "rack": constants.OSD_TREE_RACK,
            "host": constants.OSD_TREE_HOST,
            "osd": constants.OSD_TREE_OSD,
            "region": constants.OSD_TREE_REGION,
            "zone": constants.OSD_TREE_ZONE,
        }
        schemas["host"]["properties"]["name"] = {"enum": deviceset_pvcs}
        for item in osd_tree["nodes"]:
            validate(instance=item, schema=schemas[item["type"]])
            if item["type"] == "host":
                deviceset_pvcs.remove(item["name"])
        assert not deviceset_pvcs, (
            f"These device set PVCs are not given in ceph osd tree output "
            f"- {deviceset_pvcs}")
        log.info(
            "Verified ceph osd tree output. Device set PVC names are given in the "
            "output.")

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    # if the OCS version is < 4.6
    if ocs_version < version.VERSION_4_6:
        log.info("Verifying CSI snapshotter is not present.")
        provisioner_pods = get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            selector=[
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
            ],
        )
        for pod_obj in provisioner_pods:
            pod_info = pod_obj.get()
            for container, image in get_images(data=pod_info).items():
                assert ("snapshot" not in container) and (
                    "snapshot" not in image
                ), (f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}")
        deployments = ocs_csv.get()["spec"]["install"]["spec"]["deployments"]
        rook_ceph_operator_deployment = [
            deployment_val for deployment_val in deployments
            if deployment_val["name"] == "rook-ceph-operator"
        ]
        assert {
            "name": "CSI_ENABLE_SNAPSHOTTER",
            "value": "false"
        } in (rook_ceph_operator_deployment[0]["spec"]["template"]["spec"]
              ["containers"][0]["env"]
              ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
        log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd crush dump",
                                          format="")
        pool_names = [
            constants.METADATA_POOL,
            constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL,
        ]
        crush_rules = [
            rule for rule in crush_dump["rules"]
            if rule["rule_name"] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule["steps"]
                if item.get("type") == "zone"
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(namespace, health_check_tries,
                                   health_check_delay)
    if config.ENV_DATA.get("fips"):
        # In case that fips is enabled when deploying,
        # a verification of the installation of it will run
        # on all running state pods
        check_fips_enabled()
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()
        if config.DEPLOYMENT.get("kms_deployment"):
            kms = KMS.get_kms_deployment()
            kms.post_deploy_verification()

    storage_cluster_obj = get_storage_cluster()
    is_flexible_scaling = (
        storage_cluster_obj.get()["items"][0].get("spec").get(
            "flexibleScaling", False))
    if is_flexible_scaling is True:
        failure_domain = storage_cluster_obj.data["items"][0]["status"][
            "failureDomain"]
        assert failure_domain == "host", (
            f"The expected failure domain on cluster with flexible scaling is 'host',"
            f" the actaul failure domain is {failure_domain}")

    if ocs_version >= version.VERSION_4_7:
        log.info("Verifying images in storage cluster")
        verify_sc_images(storage_cluster)

    if config.ENV_DATA.get("is_multus_enabled"):
        verify_multus_network()
Ejemplo n.º 11
0
def test_upgrade():
    ceph_cluster = CephCluster()
    with CephHealthMonitor(ceph_cluster):
        namespace = config.ENV_DATA['cluster_namespace']
        version_before_upgrade = config.ENV_DATA.get("ocs_version")
        upgrade_version = config.UPGRADE.get("upgrade_ocs_version",
                                             version_before_upgrade)
        ocs_registry_image = config.UPGRADE.get('upgrade_ocs_registry_image')
        if ocs_registry_image:
            upgrade_version = get_ocs_version_from_image(ocs_registry_image)
        parsed_version_before_upgrade = parse_version(version_before_upgrade)
        parsed_upgrade_version = parse_version(upgrade_version)
        assert parsed_upgrade_version >= parsed_version_before_upgrade, (
            f"Version you would like to upgrade to: {upgrade_version} "
            f"is not higher or equal to the version you currently running: "
            f"{version_before_upgrade}")
        operator_selector = get_selector_for_ocs_operator()
        package_manifest = PackageManifest(
            resource_name=OCS_OPERATOR_NAME,
            selector=operator_selector,
        )
        channel = config.DEPLOYMENT.get('ocs_csv_channel')
        csv_name_pre_upgrade = package_manifest.get_current_csv(channel)
        log.info(f"CSV name before upgrade is: {csv_name_pre_upgrade}")
        csv_pre_upgrade = CSV(resource_name=csv_name_pre_upgrade,
                              namespace=namespace)
        pre_upgrade_images = get_images(csv_pre_upgrade.get())
        version_change = parsed_upgrade_version > parsed_version_before_upgrade
        if version_change:
            version_config_file = os.path.join(constants.CONF_DIR,
                                               'ocs_version',
                                               f'ocs-{upgrade_version}.yaml')
            load_config_file(version_config_file)
        ocs_catalog = CatalogSource(
            resource_name=constants.OPERATOR_CATALOG_SOURCE_NAME,
            namespace=constants.MARKETPLACE_NAMESPACE,
        )
        upgrade_in_current_source = config.UPGRADE.get(
            'upgrade_in_current_source', False)
        if not upgrade_in_current_source:
            if not ocs_catalog.is_exist() and not upgrade_in_current_source:
                log.info("OCS catalog source doesn't exist. Creating new one.")
                create_catalog_source(ocs_registry_image, ignore_upgrade=True)
            image_url = ocs_catalog.get_image_url()
            image_tag = ocs_catalog.get_image_name()
            log.info(f"Current image is: {image_url}, tag: {image_tag}")
            if ocs_registry_image:
                image_url, new_image_tag = ocs_registry_image.split(':')
            elif (config.UPGRADE.get('upgrade_to_latest', True)
                  or version_change):
                new_image_tag = get_latest_ds_olm_tag()
            else:
                new_image_tag = get_next_version_available_for_upgrade(
                    image_tag)
            cs_data = deepcopy(ocs_catalog.data)
            image_for_upgrade = ':'.join([image_url, new_image_tag])
            log.info(f"Image: {image_for_upgrade} will be used for upgrade.")
            cs_data['spec']['image'] = image_for_upgrade

            with NamedTemporaryFile() as cs_yaml:
                dump_data_to_temp_yaml(cs_data, cs_yaml.name)
                ocs_catalog.apply(cs_yaml.name)
        # Wait for the new package manifest for upgrade.
        operator_selector = get_selector_for_ocs_operator()
        package_manifest = PackageManifest(
            resource_name=OCS_OPERATOR_NAME,
            selector=operator_selector,
        )
        package_manifest.wait_for_resource()
        channel = config.DEPLOYMENT.get('ocs_csv_channel')
        if not channel:
            channel = package_manifest.get_default_channel()

        # update subscription
        subscription = OCP(
            resource_name=constants.OCS_SUBSCRIPTION,
            kind='subscription',
            namespace=config.ENV_DATA['cluster_namespace'],
        )
        current_ocs_source = subscription.data['spec']['source']
        log.info(f"Current OCS subscription source: {current_ocs_source}")
        ocs_source = current_ocs_source if upgrade_in_current_source else (
            constants.OPERATOR_CATALOG_SOURCE_NAME)
        patch_subscription_cmd = (
            f'oc patch subscription {constants.OCS_SUBSCRIPTION} '
            f'-n {namespace} --type merge -p \'{{"spec":{{"channel": '
            f'"{channel}", "source": "{ocs_source}"}}}}\'')
        run_cmd(patch_subscription_cmd)

        subscription_plan_approval = config.DEPLOYMENT.get(
            'subscription_plan_approval')
        if subscription_plan_approval == 'Manual':
            wait_for_install_plan_and_approve(namespace)
        attempts = 145
        for attempt in range(1, attempts + 1):
            log.info(f"Attempt {attempt}/{attempts} to check CSV upgraded.")
            csv_name_post_upgrade = package_manifest.get_current_csv(channel)
            if csv_name_post_upgrade == csv_name_pre_upgrade:
                log.info(f"CSV is still: {csv_name_post_upgrade}")
                sleep(5)
            else:
                log.info(f"CSV now upgraded to: {csv_name_post_upgrade}")
                break
            if attempts == attempt:
                raise TimeoutException("No new CSV found after upgrade!")
        csv_post_upgrade = CSV(resource_name=csv_name_post_upgrade,
                               namespace=namespace)
        log.info(
            f"Waiting for CSV {csv_name_post_upgrade} to be in succeeded state"
        )
        if version_before_upgrade == '4.2' and upgrade_version == '4.3':
            log.info("Force creating Ceph toolbox after upgrade 4.2 -> 4.3")
            setup_ceph_toolbox(force_setup=True)
        osd_count = get_osd_count()
        csv_post_upgrade.wait_for_phase("Succeeded", timeout=200 * osd_count)
        post_upgrade_images = get_images(csv_post_upgrade.get())
        old_images, _, _ = get_upgrade_image_info(pre_upgrade_images,
                                                  post_upgrade_images)
        verify_image_versions(old_images, parsed_upgrade_version)
        ocs_install_verification(
            timeout=600,
            skip_osd_distribution_check=True,
            ocs_registry_image=ocs_registry_image,
            post_upgrade_verification=True,
        )
Ejemplo n.º 12
0
def test_upgrade():
    ceph_cluster = CephCluster()
    with CephHealthMonitor(ceph_cluster):
        namespace = config.ENV_DATA['cluster_namespace']
        ocs_catalog = CatalogSource(
            resource_name=constants.OPERATOR_CATALOG_SOURCE_NAME,
            namespace=constants.MARKETPLACE_NAMESPACE,
        )
        version_before_upgrade = config.ENV_DATA.get("ocs_version")
        upgrade_version = config.UPGRADE.get("upgrade_ocs_version",
                                             version_before_upgrade)
        parsed_version_before_upgrade = parse_version(version_before_upgrade)
        parsed_upgrade_version = parse_version(upgrade_version)
        assert parsed_upgrade_version >= parsed_version_before_upgrade, (
            f"Version you would like to upgrade to: {upgrade_version} "
            f"is not higher or equal to the version you currently running: "
            f"{version_before_upgrade}")
        version_change = parsed_upgrade_version > parsed_version_before_upgrade
        if version_change:
            version_config_file = os.path.join(constants.CONF_DIR,
                                               'ocs_version',
                                               f'ocs-{upgrade_version}.yaml')
            assert os.path.exists(version_config_file), (
                f"OCS version config file {version_config_file} doesn't exist!"
            )
            with open(os.path.abspath(
                    os.path.expanduser(version_config_file))) as file_stream:
                custom_config_data = yaml.safe_load(file_stream)
                config.update(custom_config_data)
        image_url = ocs_catalog.get_image_url()
        image_tag = ocs_catalog.get_image_name()
        log.info(f"Current image is: {image_url}, tag: {image_tag}")
        ocs_registry_image = config.UPGRADE.get('upgrade_ocs_registry_image')
        if ocs_registry_image:
            image_url, new_image_tag = ocs_registry_image.split(':')
        elif config.UPGRADE.get('upgrade_to_latest', True) or version_change:
            new_image_tag = get_latest_ds_olm_tag()
        else:
            new_image_tag = get_next_version_available_for_upgrade(image_tag)
        cs_data = deepcopy(ocs_catalog.data)
        image_for_upgrade = ':'.join([image_url, new_image_tag])
        log.info(f"Image: {image_for_upgrade} will be used for upgrade.")
        cs_data['spec']['image'] = image_for_upgrade
        operator_selector = get_selector_for_ocs_operator()
        package_manifest = PackageManifest(
            resource_name=OCS_OPERATOR_NAME,
            selector=operator_selector,
        )
        csv_name_pre_upgrade = package_manifest.get_current_csv()
        log.info(f"CSV name before upgrade is: {csv_name_pre_upgrade}")
        csv_pre_upgrade = CSV(resource_name=csv_name_pre_upgrade,
                              namespace=namespace)
        pre_upgrade_images = get_images(csv_pre_upgrade.get())

        with NamedTemporaryFile() as cs_yaml:
            dump_data_to_temp_yaml(cs_data, cs_yaml.name)
            ocs_catalog.apply(cs_yaml.name)
        # Wait for package manifest is ready
        package_manifest.wait_for_resource()
        subscription_plan_approval = config.DEPLOYMENT.get(
            'subscription_plan_approval')
        if subscription_plan_approval == 'Manual':
            wait_for_install_plan_and_approve(namespace)
        attempts = 145
        for attempt in range(1, attempts):
            if attempts == attempt:
                raise TimeoutException("No new CSV found after upgrade!")
            log.info(f"Attempt {attempt}/{attempts} to check CSV upgraded.")
            package_manifest.reload_data()
            csv_name_post_upgrade = package_manifest.get_current_csv()
            if csv_name_post_upgrade == csv_name_pre_upgrade:
                log.info(f"CSV is still: {csv_name_post_upgrade}")
                sleep(5)
            else:
                log.info(f"CSV now upgraded to: {csv_name_post_upgrade}")
                break
        csv_post_upgrade = CSV(resource_name=csv_name_post_upgrade,
                               namespace=namespace)
        log.info(
            f"Waiting for CSV {csv_name_post_upgrade} to be in succeeded state"
        )
        if version_before_upgrade == '4.2' and upgrade_version == '4.3':
            log.info("Force creating Ceph toolbox after upgrade 4.2 -> 4.3")
            setup_ceph_toolbox(force_setup=True)
        csv_post_upgrade.wait_for_phase("Succeeded", timeout=600)
        post_upgrade_images = get_images(csv_post_upgrade.get())
        old_images, _, _ = get_upgrade_image_info(pre_upgrade_images,
                                                  post_upgrade_images)
        verify_image_versions(old_images, parsed_upgrade_version)
        ocs_install_verification(timeout=600, skip_osd_distribution_check=True)
Ejemplo n.º 13
0
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
    version_before_upgrade=None,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.
        version_before_upgrade (float): Set to OCS version before upgrade

    """
    from ocs_ci.ocs.node import get_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    from ocs_ci.ocs.resources.fips import check_fips_enabled

    number_of_worker_nodes = len(get_nodes())
    namespace = config.ENV_DATA["cluster_namespace"]
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data["spec"]["version"]
    ocs_version = config.ENV_DATA["ocs_version"]
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}")
    assert (
        ocs_version in csv_version
    ), f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        "ocs_registry_image")
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.split(":")[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}")
        ignore_csv_mismatch = config.DEPLOYMENT.get("ignore_csv_mismatch")
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV")
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}")

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(f"Check if StorageCluster: {storage_cluster_name} is in"
             f"Succeeded phase")
    storage_cluster.wait_for_phase(phase="Ready", timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    if not config.DEPLOYMENT["external_mode"]:
        osd_count = int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["count"]
        ) * int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["replica"])
    rgw_count = None
    if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
        #  RGW count is 1 if OCS version < 4.5 or the cluster was upgraded from version <= 4.4
        if (float(config.ENV_DATA["ocs_version"]) < 4.5
                or float(config.ENV_DATA["ocs_version"]) == 4.5 and
            (post_upgrade_verification
             and float(version_before_upgrade) < 4.5)):
            rgw_count = 1
        else:
            rgw_count = 2

    # # With 4.4 OCS cluster deployed over Azure, RGW is the default backingstore
    if config.ENV_DATA.get("platform") == constants.AZURE_PLATFORM:
        if float(config.ENV_DATA["ocs_version"]) == 4.4 or (
                float(config.ENV_DATA["ocs_version"]) == 4.5 and
            (post_upgrade_verification
             and float(version_before_upgrade) < 4.5)):
            rgw_count = 1

    min_eps = constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT
    max_eps = (constants.MAX_NB_ENDPOINT_COUNT
               if float(config.ENV_DATA["ocs_version"]) >= 4.6 else 1)

    if config.ENV_DATA.get("platform") == constants.IBM_POWER_PLATFORM:
        min_eps = 1
        max_eps = 1

    resources_dict = {
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_DB_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps,
    }
    if not config.DEPLOYMENT["external_mode"]:
        resources_dict.update({
            constants.MON_APP_LABEL: 3,
            constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
            constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
            constants.OSD_APP_LABEL: osd_count,
            constants.MGR_APP_LABEL: 1,
            constants.MDS_APP_LABEL: 2,
            constants.RGW_APP_LABEL: rgw_count,
        })

    for label, count in resources_dict.items():
        if label == constants.RGW_APP_LABEL:
            if not config.ENV_DATA.get(
                    "platform") in constants.ON_PREM_PLATFORMS:
                continue
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout,
        )

    nb_ep_pods = get_pods_having_label(
        label=constants.NOOBAA_ENDPOINT_POD_LABEL,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
    )
    assert len(nb_ep_pods) <= max_eps, (
        f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
        f"is greater than the maximum defined in the NooBaa CR ({max_eps})")

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    required_storage_classes = {
        f"{storage_cluster_name}-cephfs",
        f"{storage_cluster_name}-ceph-rbd",
    }
    if config.DEPLOYMENT["external_mode"]:
        required_storage_classes.update({
            f"{storage_cluster_name}-ceph-rgw",
            f'{config.ENV_DATA["cluster_namespace"]}.noobaa.io',
        })
    storage_classes = storage_class.get()
    storage_class_names = {
        item["metadata"]["name"]
        for item in storage_classes["items"]
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not config.DEPLOYMENT["external_mode"]:
        if not skip_osd_distribution_check:
            log.info(
                "Verifying OSDs are distributed evenly across worker nodes")
            ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
            osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)["items"]
            deviceset_count = get_deviceset_count()
            node_names = [osd["spec"]["nodeName"] for osd in osds]
            for node in node_names:
                assert (
                    not node_names.count(node) > deviceset_count
                ), "OSD's are not distributed evenly across worker nodes"

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    csi_drivers = {
        item["metadata"]["name"]
        for item in csi_driver.get()["items"]
    }
    assert defaults.CSI_PROVISIONERS.issubset(csi_drivers)

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    if config.DEPLOYMENT["external_mode"]:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD)
        sc_cephfs = storage_class.get(resource_name=(
            constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS))
    else:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_STORAGECLASS_RBD)
        sc_cephfs = storage_class.get(
            resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS)
    assert (sc_rbd["parameters"]["csi.storage.k8s.io/node-stage-secret-name"]
            == constants.RBD_NODE_SECRET)
    assert (sc_rbd["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
            == constants.RBD_PROVISIONER_SECRET)
    assert (
        sc_cephfs["parameters"]["csi.storage.k8s.io/node-stage-secret-name"] ==
        constants.CEPHFS_NODE_SECRET)
    assert (
        sc_cephfs["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
        == constants.CEPHFS_PROVISIONER_SECRET)
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    if not config.DEPLOYMENT["external_mode"]:
        log.info(
            "Verifying ceph osd tree output and checking for device set PVC names "
            "in the output.")

        if config.DEPLOYMENT.get("local_storage"):
            deviceset_pvcs = get_compute_node_names()
        else:
            deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

        ct_pod = get_ceph_tools_pod()
        osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd tree",
                                        format="json")
        schemas = {
            "root": constants.OSD_TREE_ROOT,
            "rack": constants.OSD_TREE_RACK,
            "host": constants.OSD_TREE_HOST,
            "osd": constants.OSD_TREE_OSD,
            "region": constants.OSD_TREE_REGION,
            "zone": constants.OSD_TREE_ZONE,
        }
        schemas["host"]["properties"]["name"] = {"enum": deviceset_pvcs}
        for item in osd_tree["nodes"]:
            validate(instance=item, schema=schemas[item["type"]])
            if item["type"] == "host":
                deviceset_pvcs.remove(item["name"])
        assert not deviceset_pvcs, (
            f"These device set PVCs are not given in ceph osd tree output "
            f"- {deviceset_pvcs}")
        log.info(
            "Verified ceph osd tree output. Device set PVC names are given in the "
            "output.")

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    # if the OCS version is < 4.6
    if float(config.ENV_DATA["ocs_version"]) < 4.6:
        log.info("Verifying CSI snapshotter is not present.")
        provisioner_pods = get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            selector=[
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
            ],
        )
        for pod_obj in provisioner_pods:
            pod_info = pod_obj.get()
            for container, image in get_images(data=pod_info).items():
                assert ("snapshot" not in container) and (
                    "snapshot" not in image
                ), (f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}")
        deployments = ocs_csv.get()["spec"]["install"]["spec"]["deployments"]
        rook_ceph_operator_deployment = [
            deployment_val for deployment_val in deployments
            if deployment_val["name"] == "rook-ceph-operator"
        ]
        assert {
            "name": "CSI_ENABLE_SNAPSHOTTER",
            "value": "false"
        } in (rook_ceph_operator_deployment[0]["spec"]["template"]["spec"]
              ["containers"][0]["env"]
              ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
        log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd crush dump",
                                          format="")
        pool_names = [
            constants.METADATA_POOL,
            constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL,
        ]
        crush_rules = [
            rule for rule in crush_dump["rules"]
            if rule["rule_name"] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule["steps"]
                if item.get("type") == "zone"
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(namespace, health_check_tries,
                                   health_check_delay)
    if config.ENV_DATA.get("fips"):
        # In case that fips is enabled when deploying,
        # a verification of the installation of it will run
        # on all running state pods
        check_fips_enabled()
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()