Exemple #1
0
    def scan_cluster(self):
        """
        Get accurate info on current state of pods
        """
        self._ceph_pods = pod.get_all_pods(self._namespace)
        # TODO: Workaround for BZ1748325:
        mons = pod.get_mon_pods(self.mon_selector, self.namespace)
        for mon in mons:
            if mon.ocp.get_resource_status(
                    mon.name) == constant.STATUS_RUNNING:
                self.mons.append(mon)
        # TODO: End of workaround for BZ1748325
        self.mdss = pod.get_mds_pods(self.mds_selector, self.namespace)
        self.mgrs = pod.get_mgr_pods(self.mgr_selector, self.namespace)
        self.osds = pod.get_osd_pods(self.osd_selector, self.namespace)
        self.noobaas = pod.get_noobaa_pods(self.noobaa_selector,
                                           self.namespace)
        self.toolbox = pod.get_ceph_tools_pod()

        # set port attrib on mon pods
        self.mons = list(map(self.set_port, self.mons))
        self.cluster.reload()
        if self.cephfs:
            self.cephfs.reload()
        else:
            try:
                self.cephfs_config = self.CEPHFS.get().get('items')[0]
                self.cephfs = ocs.OCS(**self.cephfs_config)
                self.cephfs.reload()
            except IndexError as e:
                logging.warning(e)
                logging.warning("No CephFS found")

        self.mon_count = len(self.mons)
        self.mds_count = len(self.mdss)
        self.mgr_count = len(self.mgrs)
        self.osd_count = len(self.osds)
        self.noobaa_count = len(self.noobaas)
Exemple #2
0
 def finalizer():
     must_gather_pods = pod.get_all_pods(
         selector_label='app=must-gather')
     logger.info(f"must_gather_pods: {must_gather_pods} ")
     sample_pods = TimeoutSampler(
         timeout=30,
         sleep=3,
         func=check_for_must_gather_pod,
     )
     sample_namespace = TimeoutSampler(
         timeout=30,
         sleep=3,
         func=check_for_must_gather_project,
     )
     if sample_pods.wait_for_func_status(result=True):
         for must_gather_pod in must_gather_pods:
             self.ocp_obj.wait_for_delete(resource_name=must_gather_pod)
             logger.info(f"deleted pods: {must_gather_pods}")
     if not sample_namespace.wait_for_func_status(result=False):
         must_gather_namespace = check_for_must_gather_project()
         logger.info(f"namespace to delete: {must_gather_namespace}")
         self.ocp_obj.wait_for_delete(
             resource_name=must_gather_namespace)
Exemple #3
0
def setup_persistent_monitoring():
    """
    Change monitoring backend to OCS
    """
    sc = helpers.default_storage_class(interface_type=constants.CEPHBLOCKPOOL)

    # Get the list of monitoring pods
    pods_list = get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE,
        selector=["prometheus", "alertmanager"],
    )

    # Create configmap cluster-monitoring-config and reconfigure
    # storage class and telemeter server (if the url is specified in a
    # config file)
    create_configmap_cluster_monitoring_pod(
        sc_name=sc.name,
        telemeter_server_url=config.ENV_DATA.get("telemeter_server_url"),
    )

    # Take some time to respin the pod
    waiting_time = 45
    logger.info(f"Waiting {waiting_time} seconds...")
    time.sleep(waiting_time)

    # Validate the pods are respinned and in running state
    retry((CommandFailed, ResourceWrongStatusException), tries=3, delay=15)(
        validate_pods_are_respinned_and_running_state
    )(pods_list)

    # Validate the pvc is created on monitoring pods
    validate_pvc_created_and_bound_on_monitoring_pods()

    # Validate the pvc are mounted on pods
    retry((CommandFailed, AssertionError), tries=3, delay=15)(
        validate_pvc_are_mounted_on_monitoring_pods
    )(pods_list)
    def test_monitoring_after_restarting_prometheus_pod(self, pods):
        """
        Test case to validate prometheus pod restart
        should not have any functional impact

        """

        # Get the prometheus pod
        prometheus_pod_obj = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_object in prometheus_pod_obj:
            # Get the pvc which mounted on prometheus pod
            pod_info = pod_object.get()
            pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName']

            # Restart the prometheus pod
            pod_object.delete(force=True)
            pod_obj = ocp.OCP(kind=constants.POD,
                              namespace=defaults.OCS_MONITORING_NAMESPACE)
            assert pod_obj.wait_for_resource(condition='Running',
                                             selector='app=prometheus',
                                             timeout=60)

            # Check the same pvc is mounted on new pod
            pod_info = pod_object.get()
            assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName'] in pvc_name, (
                    f"Old pvc not found after restarting the prometheus pod {pod_object.name}"
                )

        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
def check_health_of_clusterlogging():
    """
    * Checks for ElasticSearch, curator, fluentd and kibana pods in
        openshift-logging namespace
    * And check for the health of cluster logging, If status is green then the
        cluster is healthy,if status is red then health is bad

    Returns:
        list: Gives all the pods that are present in the namespace

    """

    pod_list = []
    pods = get_all_pods(namespace='openshift-logging')
    logger.info("Pods that are created by the instance")
    for pod in pods:
        pod_list.append(pod.name)
    logger.info(pod_list)
    elasticsearch_pod = [
        pod for pod in pod_list if pod.startswith('elasticsearch')
    ]
    pod_obj = get_pod_obj(
        name=elasticsearch_pod[0], namespace='openshift-logging'
    )
    status_check = pod_obj.exec_cmd_on_pod(
        command='es_util --query=_cluster/health?pretty',
        out_yaml_format=False
    )
    logger.info(status_check)
    status_check = json.loads(status_check)
    if status_check['status'] == 'green':
        logger.info("Cluster logging is in Healthy state & Ready to use")
    else:
        raise UnexpectedBehaviour
        logger.error("Cluster logging is in Bad state")
    return pod_list
def respin_amq_app_pod(kafka_namespace, pod_pattern):
    """
    Respin amq pod

    Args:
        kafka_namespace (str): Namespace for kafka
        pod_pattern (str): The pattern for the pod

    """
    pod_obj = ocp.OCP(kind=constants.POD, namespace=kafka_namespace)
    pod_obj_list = get_all_pods(namespace=kafka_namespace)
    for pod in TimeoutSampler(300, 10, get_pod_name_by_pattern, pod_pattern,
                              kafka_namespace):
        try:
            if pod is not None:
                pod_obj.delete(resource_name=pod[0])
                assert pod_obj.wait_for_resource(
                    condition='Running',
                    resource_count=len(pod_obj_list),
                    timeout=300)
                break
        except IndexError as ie:
            log.error(" pod doesn't exist")
            raise ie
    def test_disruptive_during_pod_pvc_deletion_and_io(
        self, interface, resource_to_delete, setup_base
    ):
        """
        Delete ceph/rook pod while PVCs deletion, pods deletion and IO are
        progressing
        """
        pvc_objs, pod_objs, rwx_pod_objs = setup_base
        namespace = pvc_objs[0].project.namespace

        num_of_pods_to_delete = 3
        num_of_io_pods = 1

        # Select pods to be deleted
        pods_to_delete = pod_objs[:num_of_pods_to_delete]
        pods_to_delete.extend(
            [
                pod
                for pod in rwx_pod_objs
                for pod_obj in pods_to_delete
                if (pod_obj.pvc == pod.pvc)
            ]
        )

        # Select pods to run IO
        io_pods = pod_objs[
            num_of_pods_to_delete : num_of_pods_to_delete + num_of_io_pods
        ]
        io_pods.extend(
            [
                pod
                for pod in rwx_pod_objs
                for pod_obj in io_pods
                if (pod_obj.pvc == pod.pvc)
            ]
        )

        # Select pods which are having PVCs to delete
        pods_for_pvc = pod_objs[num_of_pods_to_delete + num_of_io_pods :]
        pvcs_to_delete = [pod_obj.pvc for pod_obj in pods_for_pvc]
        pods_for_pvc.extend(
            [
                pod
                for pod in rwx_pod_objs
                for pod_obj in pods_for_pvc
                if (pod_obj.pvc == pod.pvc)
            ]
        )

        log.info(
            f"{len(pods_to_delete)} pods selected for deletion in which "
            f"{len(pods_to_delete) - num_of_pods_to_delete} pairs of pod "
            f"share same RWX PVC"
        )
        log.info(
            f"{len(io_pods)} pods selected for running IO in which "
            f"{len(io_pods) - num_of_io_pods} pairs of pod share same "
            f"RWX PVC"
        )
        no_of_rwx_pvcs_delete = len(pods_for_pvc) - len(pvcs_to_delete)
        log.info(
            f"{len(pvcs_to_delete)} PVCs selected for deletion. "
            f"RWO PVCs: {len(pvcs_to_delete) - no_of_rwx_pvcs_delete}, "
            f"RWX PVCs: {no_of_rwx_pvcs_delete}"
        )

        pod_functions = {
            "mds": partial(get_mds_pods),
            "mon": partial(get_mon_pods),
            "mgr": partial(get_mgr_pods),
            "osd": partial(get_osd_pods),
            "rbdplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin_provisioner": partial(get_cephfsplugin_provisioner_pods),
            "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
            "operator": partial(get_operator_pods),
        }

        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        executor = ThreadPoolExecutor(max_workers=len(pod_objs) + len(rwx_pod_objs))

        # Get number of pods of type 'resource_to_delete'
        num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(get_all_pods(namespace=namespace))
        initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

        # Fetch PV names to verify after deletion
        pv_objs = []
        for pvc_obj in pvcs_to_delete:
            pvc_obj.reload()
            pv_objs.append(pvc_obj.backed_pv_obj)

        # Fetch volume details from pods for the purpose of verification
        node_pv_dict = {}
        for pod_obj in pods_to_delete:
            pod_info = pod_obj.get()
            node = pod_info["spec"]["nodeName"]
            pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
            for pvc_obj in pvc_objs:
                if pvc_obj.name == pvc:
                    pvc_obj.reload()
                    pv = pvc_obj.backed_pv
                    break
            if node in node_pv_dict:
                node_pv_dict[node].append(pv)
            else:
                node_pv_dict[node] = [pv]

        # Fetch image uuid associated with PVCs to be deleted
        pvc_uuid_map = {}
        for pvc_obj in pvcs_to_delete:
            pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
        log.info("Fetched image uuid associated with each PVC")

        # Do setup on pods for running IO
        log.info("Setting up pods for running IO.")
        for pod_obj in pod_objs + rwx_pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            executor.submit(pod_obj.workload_setup, storage_type=storage_type)

        # Wait for setup on pods to complete
        for pod_obj in pod_objs + rwx_pod_objs:
            log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
            for sample in TimeoutSampler(180, 2, getattr, pod_obj, "wl_setup_done"):
                if sample:
                    log.info(
                        f"Setup for running IO is completed on pod " f"{pod_obj.name}."
                    )
                    break
        log.info("Setup for running IO is completed on all pods.")

        # Start IO on pods having PVCs to delete to load data
        log.info("Starting IO on pods having PVCs to delete.")
        self.run_io_on_pods(pods_for_pvc)
        log.info("IO started on pods having PVCs to delete.")

        log.info("Fetching IO results from the pods having PVCs to delete.")
        for pod_obj in pods_for_pvc:
            get_fio_rw_iops(pod_obj)
        log.info("Verified IO result on pods having PVCs to delete.")

        # Delete pods having PVCs to delete.
        delete_pods(pods_for_pvc)
        for pod_obj in pods_for_pvc:
            pod_obj.ocp.wait_for_delete(pod_obj.name)
        log.info("Verified: Deleted pods which are having PVCs to delete.")

        # Start IO on pods to be deleted
        log.info("Starting IO on pods to be deleted.")
        self.run_io_on_pods(pods_to_delete)
        log.info("IO started on pods to be deleted.")

        # Start deleting PVCs
        pvc_bulk_delete = executor.submit(delete_pvcs, pvcs_to_delete)
        log.info("Started deleting PVCs")

        # Start deleting pods
        pod_bulk_delete = executor.submit(delete_pods, pods_to_delete, wait=False)
        log.info("Started deleting pods")

        # Start IO on IO pods
        self.run_io_on_pods(io_pods)
        log.info("Started IO on IO pods")

        # Verify pvc deletion has started
        pvc_deleting = executor.submit(
            wait_for_resource_count_change,
            func_to_use=get_all_pvcs,
            previous_num=initial_num_of_pvc,
            namespace=namespace,
            change_type="decrease",
            min_difference=1,
            timeout=30,
            interval=0.01,
        )

        # Verify pod deletion has started
        pod_deleting = executor.submit(
            wait_for_resource_count_change,
            func_to_use=get_all_pods,
            previous_num=initial_num_of_pods,
            namespace=namespace,
            change_type="decrease",
            min_difference=1,
            timeout=30,
            interval=0.01,
        )

        assert pvc_deleting.result(), "Wait timeout: PVCs are not being deleted."
        log.info("PVCs deletion has started.")

        assert pod_deleting.result(), "Wait timeout: Pods are not being deleted."
        log.info("Pods deletion has started.")

        # Delete pod of type 'resource_to_delete'
        disruption.delete_resource()

        pod_bulk_delete.result()

        # Verify pods are deleted
        for pod_obj in pods_to_delete:
            pod_obj.ocp.wait_for_delete(pod_obj.name, 300)
        log.info("Verified: Pods are deleted.")

        # Verify that the mount point is removed from nodes after deleting pod
        node_pv_mounted = verify_pv_mounted_on_node(node_pv_dict)
        for node, pvs in node_pv_mounted.items():
            assert not pvs, (
                f"PVs {pvs} is still present on node {node} after "
                f"deleting the pods."
            )
        log.info(
            "Verified: mount points are removed from nodes after deleting " "the pods"
        )

        pvcs_deleted = pvc_bulk_delete.result()
        assert pvcs_deleted, "Deletion of PVCs failed."

        # Verify PVCs are deleted
        for pvc_obj in pvcs_to_delete:
            pvc_obj.ocp.wait_for_delete(pvc_obj.name)
        log.info("Verified: PVCs are deleted.")

        # Verify PVs are deleted
        for pv_obj in pv_objs:
            pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name, timeout=300)
        log.info("Verified: PVs are deleted.")

        # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
        pool_name = default_ceph_block_pool()
        for pvc_name, uuid in pvc_uuid_map.items():
            if interface == constants.CEPHBLOCKPOOL:
                ret = verify_volume_deleted_in_backend(
                    interface=interface, image_uuid=uuid, pool_name=pool_name
                )
            if interface == constants.CEPHFILESYSTEM:
                ret = verify_volume_deleted_in_backend(
                    interface=interface, image_uuid=uuid
                )
            assert ret, (
                f"Volume associated with PVC {pvc_name} still exists " f"in backend"
            )

        log.info("Fetching IO results from the pods.")
        for pod_obj in io_pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        log.info("Verified IO result on pods.")

        # Verify number of pods of type 'resource_to_delete'
        final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
        assert final_num_resource_to_delete == num_of_resource_to_delete, (
            f"Total number of {resource_to_delete} pods is not matching with "
            f"initial value. Total number of pods before deleting a pod: "
            f"{num_of_resource_to_delete}. Total number of pods present now: "
            f"{final_num_resource_to_delete}"
        )

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
        log.info("Ceph cluster health is OK")
Exemple #8
0
    def test_pvc_disruptive(self, storageclass, namespace, interface,
                            operation_to_disrupt, resource_to_delete,
                            teardown_factory):
        """
        Base function for PVC disruptive tests.
        Deletion of 'resource_to_delete' will be introduced while
        'operation_to_disrupt' is progressing.
        """
        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(pod.get_all_pods(namespace=namespace))
        initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)['items'])

        executor = ThreadPoolExecutor(max_workers=1)

        DISRUPTION_OPS.set_resource(resource=resource_to_delete)

        # Start creation of multiple PVCs. Create 5 PVCs
        bulk_pvc_create = executor.submit(helpers.create_multiple_pvcs,
                                          sc_name=storageclass.name,
                                          namespace=namespace,
                                          number_of_pvc=5)

        if operation_to_disrupt == 'create_pvc':
            # Ensure PVCs are being created before deleting the resource
            ret = self.verify_resource_creation(get_all_pvcs,
                                                initial_num_of_pvc, namespace)
            assert ret, "Wait timeout: PVCs are not being created."
            logging.info(f"PVCs creation has started.")
            DISRUPTION_OPS.delete_resource()

        pvc_objs = bulk_pvc_create.result()

        for pvc_obj in pvc_objs:
            teardown_factory(pvc_obj)

        # Verify PVCs are Bound
        for pvc_obj in pvc_objs:
            assert pvc_obj.ocp.wait_for_resource(
                condition=constants.STATUS_BOUND,
                resource_name=pvc_obj.name,
                timeout=120
            ), (f"Wait timeout: PVC {pvc_obj.name} is not in 'Bound' status "
                f"even after 120 seconds.")
        logging.info("Verified: PVCs are Bound.")

        # Start creating pods
        bulk_pod_create = executor.submit(helpers.create_pods,
                                          pvc_objs_list=pvc_objs,
                                          interface_type=interface,
                                          namespace=namespace)

        if operation_to_disrupt == 'create_pod':
            # Ensure that pods are being created before deleting the resource
            ret = self.verify_resource_creation(pod.get_all_pods,
                                                initial_num_of_pods, namespace)
            assert ret, "Wait timeout: Pods are not being created."
            logging.info(f"Pods creation has started.")
            DISRUPTION_OPS.delete_resource()

        pod_objs = bulk_pod_create.result()

        for pod_obj in pod_objs:
            teardown_factory(pod_obj)

        # Verify pods are Running
        for pod_obj in pod_objs:
            assert pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=120), (
                    f"Wait timeout: Pod {pod_obj.name} is not in 'Running' "
                    f"state even after 120 seconds.")
        logging.info("Verified: All pods are Running.")

        # Start IO on each pod
        for pod_obj in pod_objs:
            pod_obj.run_io(storage_type='fs',
                           size='1G',
                           runtime=10,
                           fio_filename='fio-file1')
        logging.info("FIO started on all pods.")

        if operation_to_disrupt == 'run_io':
            DISRUPTION_OPS.delete_resource()

        logging.info("Fetching FIO results.")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            logging.info(f"IOPs after FIO on pod {pod_obj.name}:")
            logging.info(
                f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}")
            logging.info(
                f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}")
        logging.info("Verified FIO result on pods.")

        # Delete pods
        for pod_obj in pod_objs:
            pod_obj.delete(wait=True)

        # Verify that PVCs are reusable by creating new pods
        create_pods = executor.submit(helpers.create_pods,
                                      pvc_objs_list=pvc_objs,
                                      interface_type=interface,
                                      namespace=namespace)
        pod_objs = create_pods.result()

        for pod_obj in pod_objs:
            teardown_factory(pod_obj)

        # Verify new pods are Running
        for pod_obj in pod_objs:
            assert pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=120), (
                    f"Wait timeout: Pod {pod_obj.name} is not in 'Running' "
                    f"state even after 120 seconds.")
        logging.info("Verified: All new pods are Running.")

        # Run IO on each of the new pods
        for pod_obj in pod_objs:
            pod_obj.run_io(storage_type='fs',
                           size='1G',
                           runtime=10,
                           fio_filename='fio-file2')

        logging.info("Fetching FIO results from new pods")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            logging.info(f"IOPs after FIO on pod {pod_obj.name}:")
            logging.info(
                f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}")
            logging.info(
                f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}")
        logging.info("Verified FIO result on new pods.")
Exemple #9
0
    def create_scale_pods(
        self,
        scale_count=1500,
        pods_per_iter=5,
        io_runtime=None,
        pvc_size=None,
        start_io=None,
    ):
        """
        Main Function with scale pod creation flow and checks to add nodes.
        For other platforms will not be considering the instance_type param

        Args:
            scale_count (int): Scale pod+pvc count
            io_runtime (sec): Fio run time in seconds
            start_io (bool): If True start IO else don't
            pods_per_iter (int): Number of PVC-POD to be created per PVC type
            pvc_size (Gi): size of PVC
            Example, If 5 then 20 PVC+POD will be created with 5 each of 4 PVC types
            Test value in-between 5-10

        """
        self.ms_name, all_pod_obj = ([] for i in range(2))
        if not 5 <= pods_per_iter <= 10:
            raise UnexpectedBehaviour(
                "Pods_per_iter value should be in-between 5-15")

        # Check for expected worker count
        expected_worker_count = get_expected_worker_count(scale_count)
        if check_and_add_enough_worker(expected_worker_count):
            if (config.ENV_DATA["deployment_type"] == "ipi"
                    and config.ENV_DATA["platform"].lower() == "aws"):
                for obj in machine.get_machineset_objs():
                    if "app" in obj.name:
                        self.ms_name.append(obj.name)
            else:
                self.ms_name = []

        # Create namespace
        self.create_and_set_namespace()

        # Continue to iterate till the scale pvc limit is reached
        while True:
            if scale_count <= len(all_pod_obj):
                logger.info(f"Scaled {scale_count} pvc and pods")

                if cluster.validate_pg_balancer():
                    logging.info(
                        "OSD consumption and PG distribution is good to continue"
                    )
                else:
                    raise UnexpectedBehaviour(
                        "Unequal PG distribution to OSDs")

                break
            else:
                logger.info(f"Scaled PVC and POD count {len(all_pod_obj)}")
                self.pod_obj, self.pvc_obj = self.create_multi_pvc_pod(
                    pods_per_iter, io_runtime, start_io, pvc_size)
                all_pod_obj.extend(self.pod_obj)
                try:
                    # Check enough resources available in the dedicated app workers
                    check_enough_resource_available_in_workers(
                        self.ms_name, self.pod_dict_path)

                    # Check for ceph cluster OSD utilization
                    if not cluster.validate_osd_utilization(osd_used=75):
                        logging.info("Cluster OSD utilization is below 75%")
                    elif not cluster.validate_osd_utilization(osd_used=83):
                        logger.warning("Cluster OSD utilization is above 75%")
                    else:
                        raise CephHealthException("Cluster OSDs are near full")

                    # Check for 500 pods per namespace
                    pod_objs = pod.get_all_pods(
                        namespace=self.namespace_list[-1].namespace)
                    if len(pod_objs) >= 500:
                        self.create_and_set_namespace()

                except UnexpectedBehaviour:
                    logging.error(
                        f"Scaling of cluster failed after {len(all_pod_obj)} pod creation"
                    )
                    raise UnexpectedBehaviour(
                        "Scaling PVC+POD failed analyze setup and log for more details"
                    )
Exemple #10
0
def uninstall_cluster_logging():
    """
    Function to uninstall cluster-logging from the cluster
    Deletes the project "openshift-logging" and "openshift-operators-redhat"
    """
    # Validating the pods before deleting the instance
    pod_list = get_all_pods(namespace=constants.OPENSHIFT_LOGGING_NAMESPACE)

    for pod in pod_list:
        logger.info(f"Pods running in the openshift-logging namespace {pod.name}")

    # Excluding cluster-logging-operator from pod_list and getting pod names
    pod_names_list = [
        pod.name
        for pod in pod_list
        if not pod.name.startswith("cluster-logging-operator")
    ]

    # Deleting the clusterlogging instance
    clusterlogging_obj = ocp.OCP(
        kind=constants.CLUSTER_LOGGING, namespace=constants.OPENSHIFT_LOGGING_NAMESPACE
    )
    assert clusterlogging_obj.delete(resource_name="instance")

    check_pod_vanished(pod_names_list)

    # Deleting the PVCs
    pvc_obj = ocp.OCP(
        kind=constants.PVC, namespace=constants.OPENSHIFT_LOGGING_NAMESPACE
    )
    pvc_list = get_all_pvcs(namespace=constants.OPENSHIFT_LOGGING_NAMESPACE)
    for pvc in range(len(pvc_list) - 1):
        pvc_obj.delete(resource_name=pvc_list["items"][pvc]["metadata"]["name"])
        pvc_obj.wait_for_delete(
            resource_name=pvc_list["items"][pvc]["metadata"]["name"]
        )

    # Deleting the RBAC permission set
    rbac_role = ocp.OCP(
        kind=constants.ROLE, namespace=constants.OPENSHIFT_OPERATORS_REDHAT_NAMESPACE
    )
    rbac_role.delete(yaml_file=constants.EO_RBAC_YAML)

    # Deleting the projects
    openshift_logging_namespace = ocp.OCP(
        kind=constants.NAMESPACES, namespace=constants.OPENSHIFT_LOGGING_NAMESPACE
    )
    openshift_operators_redhat_namespace = ocp.OCP(
        kind=constants.NAMESPACES,
        namespace=constants.OPENSHIFT_OPERATORS_REDHAT_NAMESPACE,
    )

    if openshift_logging_namespace.get():
        assert openshift_logging_namespace.delete(
            resource_name=constants.OPENSHIFT_LOGGING_NAMESPACE
        )
        logger.info("The namespace openshift-logging got deleted successfully")
    if openshift_operators_redhat_namespace.get():
        assert openshift_operators_redhat_namespace.delete(
            resource_name=constants.OPENSHIFT_OPERATORS_REDHAT_NAMESPACE
        )
        logger.info("The project openshift-opertors-redhat got deleted successfully")
def ocs_install_verification(
    timeout=600, skip_osd_distribution_check=False, ocs_registry_image=None,
    post_upgrade_verification=False,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.

    """
    from ocs_ci.ocs.node import get_typed_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    number_of_worker_nodes = len(get_typed_nodes())
    namespace = config.ENV_DATA['cluster_namespace']
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data['spec']['version']
    ocs_version = config.ENV_DATA['ocs_version']
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}"
    )
    assert ocs_version in csv_version, (
        f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    )
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        'ocs_registry_image'
    )
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.split(":")[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}"
        )
        ignore_csv_mismatch = config.DEPLOYMENT.get('ignore_csv_mismatch')
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV"
            )
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}"
            )

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(
        f"Check if StorageCluster: {storage_cluster_name} is in"
        f"Succeeded phase"
    )
    storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(
        kind=constants.POD, namespace=namespace
    )
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count'])
        * int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica'])
    )

    # check noobaa CR for min number of noobaa endpoint pods
    nb_obj = OCP(kind='noobaa', namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    min_eps = nb_obj.get().get('items')[0].get('spec').get('endpoints').get('minCount')
    max_eps = nb_obj.get().get('items')[0].get('spec').get('endpoints').get('maxCount')

    resources_dict = {
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_DB_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.MON_APP_LABEL: 3,
        constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
        constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
        constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
        constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
        constants.OSD_APP_LABEL: osd_count,
        constants.MGR_APP_LABEL: 1,
        constants.MDS_APP_LABEL: 2,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps
    }
    if config.ENV_DATA.get('platform') in constants.ON_PREM_PLATFORMS:
        # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1857802 - RGW count is 1
        # post upgrade to OCS 4.5. Tracked with
        # https://github.com/red-hat-storage/ocs-ci/issues/2532
        rgw_count = 2 if float(config.ENV_DATA['ocs_version']) >= 4.5 and not (
            post_upgrade_verification
        ) else 1
        resources_dict.update({constants.RGW_APP_LABEL: rgw_count})
    for label, count in resources_dict.items():
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout
        )

    nb_ep_pods = get_pods_having_label(
        label=constants.NOOBAA_ENDPOINT_POD_LABEL, namespace=defaults.ROOK_CLUSTER_NAMESPACE
    )
    assert len(nb_ep_pods) <= max_eps, (
        f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
        f"is greater than the maximum defined in the NooBaa CR ({max_eps})"
    )

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(
        kind=constants.STORAGECLASS, namespace=namespace
    )
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    required_storage_classes = {
        f'{storage_cluster_name}-cephfs',
        f'{storage_cluster_name}-ceph-rbd'
    }
    storage_classes = storage_class.get()
    storage_class_names = {
        item['metadata']['name'] for item in storage_classes['items']
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not skip_osd_distribution_check:
        log.info("Verifying OSDs are distributed evenly across worker nodes")
        ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
        osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items']
        deviceset_count = get_deviceset_count()
        node_names = [osd['spec']['nodeName'] for osd in osds]
        for node in node_names:
            assert not node_names.count(node) > deviceset_count, (
                "OSD's are not distributed evenly across worker nodes"
            )

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    assert {defaults.CEPHFS_PROVISIONER, defaults.RBD_PROVISIONER} == (
        {item['metadata']['name'] for item in csi_driver.get()['items']}
    )

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    sc_rbd = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_RBD
    )
    sc_cephfs = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS
    )
    assert sc_rbd['parameters']['csi.storage.k8s.io/node-stage-secret-name'] == constants.RBD_NODE_SECRET
    assert sc_rbd['parameters']['csi.storage.k8s.io/provisioner-secret-name'] == constants.RBD_PROVISIONER_SECRET
    assert sc_cephfs['parameters']['csi.storage.k8s.io/node-stage-secret-name'] == constants.CEPHFS_NODE_SECRET
    assert sc_cephfs['parameters']['csi.storage.k8s.io/provisioner-secret-name'] == constants.CEPHFS_PROVISIONER_SECRET
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    log.info(
        "Verifying ceph osd tree output and checking for device set PVC names "
        "in the output."
    )

    if (
        config.DEPLOYMENT.get('local_storage')
        and config.ENV_DATA['platform'] != constants.BAREMETALPSI_PLATFORM
    ):
        deviceset_pvcs = get_compute_node_names()
    else:
        deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

    ct_pod = get_ceph_tools_pod()
    osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json')
    schemas = {
        'root': constants.OSD_TREE_ROOT,
        'rack': constants.OSD_TREE_RACK,
        'host': constants.OSD_TREE_HOST,
        'osd': constants.OSD_TREE_OSD,
        'region': constants.OSD_TREE_REGION,
        'zone': constants.OSD_TREE_ZONE
    }
    schemas['host']['properties']['name'] = {'enum': deviceset_pvcs}
    for item in osd_tree['nodes']:
        validate(instance=item, schema=schemas[item['type']])
        if item['type'] == 'host':
            deviceset_pvcs.remove(item['name'])
    assert not deviceset_pvcs, (
        f"These device set PVCs are not given in ceph osd tree output "
        f"- {deviceset_pvcs}"
    )
    log.info(
        "Verified ceph osd tree output. Device set PVC names are given in the "
        "output."
    )

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    log.info("Verifying CSI snapshotter is not present.")
    provisioner_pods = get_all_pods(
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        selector=[
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        ]
    )
    for pod_obj in provisioner_pods:
        pod_info = pod_obj.get()
        for container, image in get_images(data=pod_info).items():
            assert ('snapshot' not in container) and ('snapshot' not in image), (
                f"Snapshot container is present in {pod_obj.name} pod. "
                f"Container {container}. Image {image}"
            )
    deployments = ocs_csv.get()['spec']['install']['spec']['deployments']
    rook_ceph_operator_deployment = [
        deployment_val for deployment_val in deployments if deployment_val['name'] == 'rook-ceph-operator'
    ]
    assert {'name': 'CSI_ENABLE_SNAPSHOTTER', 'value': 'false'} in (
        rook_ceph_operator_deployment[0]['spec']['template']['spec']['containers'][0]['env']
    ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
    log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(
            ceph_cmd='ceph osd crush dump', format=''
        )
        pool_names = [
            constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL
        ]
        crush_rules = [rule for rule in crush_dump['rules'] if rule['rule_name'] in pool_names]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule['steps'] if item.get('type') == 'zone'
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(
        namespace, health_check_tries, health_check_delay
    )
Exemple #12
0
    def deploy_ocs(self):
        """
        Handle OCS deployment, since OCS deployment steps are common to any
        platform, implementing OCS deployment here in base class.
        """
        ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace)
        try:
            ceph_cluster.get().get('items')[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")
        self.deploy_ocs_via_operator()
        pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
        cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace)
        # Check for Ceph pods
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mon',
                                     resource_count=3,
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mgr',
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-osd',
                                     resource_count=3,
                                     timeout=600)

        # validate ceph mon/osd volumes are backed by pvc
        validate_cluster_on_pvc()

        # validate PDB creation of MON, MDS, OSD pods
        validate_pdb_creation()

        # Creating toolbox pod
        setup_ceph_toolbox()

        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector='app=rook-ceph-tools',
                                     resource_count=1,
                                     timeout=600)

        # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1847098
        if config.DEPLOYMENT.get('local_storage'):
            tools_pod = run_cmd(
                f"oc -n {self.namespace} get pod -l 'app=rook-ceph-tools' "
                f"-o jsonpath='{{.items[0].metadata.name}}'")
            pgs_to_autoscale = [
                'ocs-storagecluster-cephblockpool',
                'ocs-storagecluster-cephfilesystem-data0'
            ]
            for pg in pgs_to_autoscale:
                run_cmd(f"oc -n {self.namespace} exec {tools_pod} -- "
                        f"ceph osd pool set {pg} pg_autoscale_mode on")

        # Check for CephFilesystem creation in ocp
        cfs_data = cfs.get()
        cfs_name = cfs_data['items'][0]['metadata']['name']

        if helpers.validate_cephfilesystem(cfs_name):
            logger.info("MDS deployment is successful!")
            defaults.CEPHFILESYSTEM_NAME = cfs_name
        else:
            logger.error("MDS deployment Failed! Please check logs!")

        # Change monitoring backend to OCS
        if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
                'persistent-monitoring'):

            sc = helpers.default_storage_class(
                interface_type=constants.CEPHBLOCKPOOL)

            # Get the list of monitoring pods
            pods_list = get_all_pods(
                namespace=defaults.OCS_MONITORING_NAMESPACE,
                selector=['prometheus', 'alertmanager'])

            # Create configmap cluster-monitoring-config and reconfigure
            # storage class and telemeter server (if the url is specified in a
            # config file)
            create_configmap_cluster_monitoring_pod(
                sc_name=sc.name,
                telemeter_server_url=config.ENV_DATA.get(
                    "telemeter_server_url"))

            # Take some time to respin the pod
            waiting_time = 45
            logger.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate the pods are respinned and in running state
            retry((CommandFailed, ResourceWrongStatusException),
                  tries=3,
                  delay=15)(validate_pods_are_respinned_and_running_state)(
                      pods_list)

            # Validate the pvc is created on monitoring pods
            validate_pvc_created_and_bound_on_monitoring_pods()

            # Validate the pvc are mounted on pods
            retry((CommandFailed, AssertionError), tries=3, delay=15)(
                validate_pvc_are_mounted_on_monitoring_pods)(pods_list)
        elif config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
                "telemeter_server_url"):
            # Create configmap cluster-monitoring-config to reconfigure
            # telemeter server url when 'persistent-monitoring' is False
            create_configmap_cluster_monitoring_pod(
                telemeter_server_url=config.ENV_DATA["telemeter_server_url"])

        # Change registry backend to OCS CEPHFS RWX PVC
        registry.change_registry_backend_to_ocs()

        # Verify health of ceph cluster
        # TODO: move destroy cluster logic to new CLI usage pattern?
        logger.info("Done creating rook resources, waiting for HEALTH_OK")
        assert ceph_health_check(namespace=self.namespace)
        # patch gp2/thin storage class as 'non-default'
        self.patch_default_sc_to_non_default()
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, interface, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == 'rbd' else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        msg = "Common OSD and app running node(s) NOT found"
        assert (len(common_nodes) > 0), msg
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods on the failed node will get automatically created on
        # other running node. Waiting for all dc app pod to reach running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        # Check all OCS pods status, they should be in running state
        all_pod_obj = pod.get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        for pod_obj in all_pod_obj:
            if ('-1-deploy' or 'ocs-deviceset') not in pod_obj.name:
                try:
                    helpers.wait_for_resource_state(
                        resource=pod_obj,
                        state=constants.STATUS_RUNNING,
                        timeout=200)
                except ResourceWrongStatusException:
                    # 'rook-ceph-crashcollector' on the failed node stucks at
                    # pending state. BZ 1810014 tracks it.
                    # Ignoring 'rook-ceph-crashcollector' pod health check as
                    # WA and deleting its deployment so that the pod
                    # disappears. Will revert this WA once the BZ is fixed
                    if 'rook-ceph-crashcollector' in pod_obj.name:
                        ocp_obj = ocp.OCP(
                            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                        pod_name = pod_obj.name
                        deployment_name = '-'.join(pod_name.split("-")[:-2])
                        command = f"delete deployment {deployment_name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")
                    else:
                        raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
    def base_setup(self, interface, pvc_factory, pod_factory):
        """
        A setup phase for the test:
        get all the ceph pods information,
        create maxsize pvc, pod and run IO

        """
        # Setting the io_size_gb to 40% of the total PVC capacity
        ceph_pod = Pod.get_ceph_tools_pod()
        external = config.DEPLOYMENT["external_mode"]
        if external:
            ocp_obj = ocp.OCP()
            if interface == constants.CEPHBLOCKPOOL:
                resource_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
            elif interface == constants.CEPHFILESYSTEM:
                resource_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS
            cmd = f"get sc {resource_name} -o yaml"
            pool_data = ocp_obj.exec_oc_cmd(cmd)
            pool = pool_data["parameters"]["pool"]

        else:
            pool = (constants.DEFAULT_BLOCKPOOL if interface
                    == constants.CEPHBLOCKPOOL else constants.DATA_POOL)

        ceph_replica = ceph_pod.exec_ceph_cmd(
            ceph_cmd=f"ceph osd pool get {pool} size")
        replica = ceph_replica["size"]
        ceph_status = ceph_pod.exec_ceph_cmd(ceph_cmd="ceph df")
        ceph_capacity = (int(ceph_status["stats"]["total_bytes"]) / replica /
                         constants.GB)
        pvc_size_gb = int(ceph_capacity * 0.5)
        io_size_gb = int(pvc_size_gb * 0.4)
        io_size_gb = 400 if io_size_gb >= 400 else io_size_gb

        pod_objs = get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            selector=[
                "noobaa", "rook-ceph-osd-prepare", "rook-ceph-drain-canary"
            ],
            exclude_selector=True,
        )

        # Create maxsize pvc, app pod and run ios
        self.sc = default_storage_class(interface_type=interface)

        self.pvc_obj = pvc_factory(
            interface=interface,
            storageclass=self.sc,
            size=pvc_size_gb,
        )

        self.pod_obj = pod_factory(interface=interface, pvc=self.pvc_obj)

        log.info(f"Running FIO to fill PVC size: {io_size_gb}G")
        self.pod_obj.run_io("fs",
                            size=f"{io_size_gb}G",
                            io_direction="write",
                            runtime=480)

        log.info("Waiting for IO results")
        self.pod_obj.get_fio_results()

        return pod_objs
    def disruptive_base(self, interface, operation_to_disrupt,
                        resource_to_delete):
        """
        Base function for disruptive tests.
        Deletion of 'resource_to_delete' will be introduced while
        'operation_to_disrupt' is progressing.
        """
        pod_functions = {
            "mds": partial(get_mds_pods),
            "mon": partial(get_mon_pods),
            "mgr": partial(get_mgr_pods),
            "osd": partial(get_osd_pods),
            "rbdplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin_provisioner":
            partial(get_cephfsplugin_provisioner_pods),
            "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
            "operator": partial(get_operator_pods),
        }
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        executor = ThreadPoolExecutor(max_workers=1)

        # Get number of pods of type 'resource_to_delete'
        num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(get_all_pods(namespace=self.namespace))
        initial_num_of_pvc = len(
            get_all_pvcs(namespace=self.namespace)["items"])

        # Fetch PV names
        pv_objs = []
        for pvc_obj in self.pvc_objs:
            pvc_obj.reload()
            pv_objs.append(pvc_obj.backed_pv_obj)

        # Fetch volume details from pods for the purpose of verification
        node_pv_dict = {}
        for pod_obj in self.pod_objs:
            pod_info = pod_obj.get()
            node = pod_info["spec"]["nodeName"]
            pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"][
                "claimName"]
            for pvc_obj in self.pvc_objs:
                if pvc_obj.name == pvc:
                    pvc_obj.reload()
                    pv = pvc_obj.backed_pv
                    break
            if node in node_pv_dict:
                node_pv_dict[node].append(pv)
            else:
                node_pv_dict[node] = [pv]

        # Do setup for running IO on pods
        log.info("Setting up pods for running IO")
        for pod_obj in self.pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                pod_obj.pvc.storage_type = "block"
            else:
                pod_obj.pvc.storage_type = "fs"
            pod_obj.workload_setup(storage_type=pod_obj.pvc.storage_type)
        log.info("Setup for running IO is completed on pods")

        # Start IO on each pod. RWX PVC will be used on two pods. So split the
        # size accordingly
        log.info("Starting IO on pods")
        for pod_obj in self.pod_objs:
            if pod_obj.pvc.access_mode == constants.ACCESS_MODE_RWX:
                io_size = int((self.pvc_size - 1) / 2)
            else:
                io_size = self.pvc_size - 1
            pod_obj.run_io(
                storage_type=pod_obj.pvc.storage_type,
                size=f"{io_size}G",
                fio_filename=f"{pod_obj.name}_io",
                end_fsync=1,
            )
        log.info("IO started on all pods.")

        # Start deleting pods
        pod_bulk_delete = executor.submit(delete_pods,
                                          self.pod_objs,
                                          wait=False)

        if operation_to_disrupt == "delete_pods":
            ret = wait_for_resource_count_change(
                get_all_pods,
                initial_num_of_pods,
                self.namespace,
                "decrease",
                timeout=50,
            )
            assert ret, "Wait timeout: Pods are not being deleted."
            log.info("Pods deletion has started.")
            disruption.delete_resource()

        pod_bulk_delete.result()

        # Verify pods are deleted
        for pod_obj in self.pod_objs:
            assert pod_obj.ocp.wait_for_delete(
                pod_obj.name, 180), f"Pod {pod_obj.name} is not deleted"
        log.info("Verified: Pods are deleted.")

        # Verify that the mount point is removed from nodes after deleting pod
        for node, pvs in node_pv_dict.items():
            cmd = f"oc debug nodes/{node} -- df"
            df_on_node = run_cmd(cmd)
            for pv in pvs:
                assert pv not in df_on_node, (
                    f"{pv} is still present on node {node} after "
                    f"deleting the pods.")
        log.info(
            "Verified: mount points are removed from nodes after deleting "
            "the pods")

        # Fetch image uuid associated with PVCs
        pvc_uuid_map = {}
        for pvc_obj in self.pvc_objs:
            pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
        log.info("Fetched image uuid associated with each PVC")

        # Start deleting PVCs
        pvc_bulk_delete = executor.submit(delete_pvcs, self.pvc_objs)

        if operation_to_disrupt == "delete_pvcs":
            ret = wait_for_resource_count_change(get_all_pvcs,
                                                 initial_num_of_pvc,
                                                 self.namespace,
                                                 "decrease",
                                                 timeout=50)
            assert ret, "Wait timeout: PVCs are not being deleted."
            log.info("PVCs deletion has started.")
            disruption.delete_resource()

        pvcs_deleted = pvc_bulk_delete.result()

        assert pvcs_deleted, "Deletion of PVCs failed."

        # Verify PVCs are deleted
        for pvc_obj in self.pvc_objs:
            assert pvc_obj.ocp.wait_for_delete(
                pvc_obj.name), f"PVC {pvc_obj.name} is not deleted"
        log.info("Verified: PVCs are deleted.")

        # Verify PVs are deleted
        for pv_obj in pv_objs:
            assert pv_obj.ocp.wait_for_delete(
                pv_obj.name, 120), f"PV {pv_obj.name} is not deleted"
        log.info("Verified: PVs are deleted.")

        # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
        pool_name = default_ceph_block_pool()
        for pvc_name, uuid in pvc_uuid_map.items():
            if interface == constants.CEPHBLOCKPOOL:
                ret = verify_volume_deleted_in_backend(interface=interface,
                                                       image_uuid=uuid,
                                                       pool_name=pool_name)
            if interface == constants.CEPHFILESYSTEM:
                ret = verify_volume_deleted_in_backend(interface=interface,
                                                       image_uuid=uuid)
            assert ret, (f"Volume associated with PVC {pvc_name} still exists "
                         f"in backend")

        # Verify number of pods of type 'resource_to_delete'
        final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
        assert final_num_resource_to_delete == num_of_resource_to_delete, (
            f"Total number of {resource_to_delete} pods is not matching with "
            f"initial value. Total number of pods before deleting a pod: "
            f"{num_of_resource_to_delete}. Total number of pods present now: "
            f"{final_num_resource_to_delete}")

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
        log.info("Ceph cluster health is OK")
Exemple #16
0
    def deploy_ocs(self):
        """
        Handle OCS deployment, since OCS deployment steps are common to any
        platform, implementing OCS deployment here in base class.
        """
        ceph_cluster = ocp.OCP(kind="CephCluster", namespace=self.namespace)
        try:
            ceph_cluster.get().get("items")[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")

        if config.DEPLOYMENT["external_mode"]:
            logger.info("Deploying OCS on external mode RHCS")
            return self.deploy_with_external_mode()
        self.deploy_ocs_via_operator()
        pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
        cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace)
        # Check for Ceph pods
        assert pod.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-mon",
            resource_count=3,
            timeout=600,
        )
        assert pod.wait_for_resource(condition="Running",
                                     selector="app=rook-ceph-mgr",
                                     timeout=600)
        assert pod.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-osd",
            resource_count=3,
            timeout=600,
        )

        # validate ceph mon/osd volumes are backed by pvc
        validate_cluster_on_pvc()

        # validate PDB creation of MON, MDS, OSD pods
        validate_pdb_creation()

        # Creating toolbox pod
        setup_ceph_toolbox()

        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector="app=rook-ceph-tools",
            resource_count=1,
            timeout=600,
        )

        # Check for CephFilesystem creation in ocp
        cfs_data = cfs.get()
        cfs_name = cfs_data["items"][0]["metadata"]["name"]

        if helpers.validate_cephfilesystem(cfs_name):
            logger.info("MDS deployment is successful!")
            defaults.CEPHFILESYSTEM_NAME = cfs_name
        else:
            logger.error("MDS deployment Failed! Please check logs!")

        # Change monitoring backend to OCS
        if config.ENV_DATA.get("monitoring_enabled") and config.ENV_DATA.get(
                "persistent-monitoring"):

            sc = helpers.default_storage_class(
                interface_type=constants.CEPHBLOCKPOOL)

            # Get the list of monitoring pods
            pods_list = get_all_pods(
                namespace=defaults.OCS_MONITORING_NAMESPACE,
                selector=["prometheus", "alertmanager"],
            )

            # Create configmap cluster-monitoring-config and reconfigure
            # storage class and telemeter server (if the url is specified in a
            # config file)
            create_configmap_cluster_monitoring_pod(
                sc_name=sc.name,
                telemeter_server_url=config.ENV_DATA.get(
                    "telemeter_server_url"),
            )

            # Take some time to respin the pod
            waiting_time = 45
            logger.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate the pods are respinned and in running state
            retry((CommandFailed, ResourceWrongStatusException),
                  tries=3,
                  delay=15)(validate_pods_are_respinned_and_running_state)(
                      pods_list)

            # Validate the pvc is created on monitoring pods
            validate_pvc_created_and_bound_on_monitoring_pods()

            # Validate the pvc are mounted on pods
            retry((CommandFailed, AssertionError), tries=3, delay=15)(
                validate_pvc_are_mounted_on_monitoring_pods)(pods_list)
        elif config.ENV_DATA.get("monitoring_enabled") and config.ENV_DATA.get(
                "telemeter_server_url"):
            # Create configmap cluster-monitoring-config to reconfigure
            # telemeter server url when 'persistent-monitoring' is False
            create_configmap_cluster_monitoring_pod(
                telemeter_server_url=config.ENV_DATA["telemeter_server_url"])

        # Change registry backend to OCS CEPHFS RWX PVC
        registry.change_registry_backend_to_ocs()

        # Verify health of ceph cluster
        # TODO: move destroy cluster logic to new CLI usage pattern?
        logger.info("Done creating rook resources, waiting for HEALTH_OK")
        try:
            ceph_health_check(namespace=self.namespace, tries=30, delay=10)
        except CephHealthException as ex:
            err = str(ex)
            logger.warning(f"Ceph health check failed with {err}")
            if "clock skew detected" in err:
                logger.info(f"Changing NTP on compute nodes to"
                            f" {constants.RH_NTP_CLOCK}")
                if self.platform == constants.VSPHERE_PLATFORM:
                    update_ntp_compute_nodes()
                assert ceph_health_check(namespace=self.namespace,
                                         tries=60,
                                         delay=10)

        # patch gp2/thin storage class as 'non-default'
        self.patch_default_sc_to_non_default()
Exemple #17
0
def create_pod(interface_type=None,
               pvc_name=None,
               do_reload=True,
               namespace=defaults.ROOK_CLUSTER_NAMESPACE,
               node_name=None,
               pod_dict_path=None,
               sa_name=None,
               dc_deployment=False,
               raw_block_pv=False,
               raw_block_device=constants.RAW_BLOCK_DEVICE,
               replica_count=1,
               pod_name=None):
    """
    Create a pod

    Args:
        interface_type (str): The interface type (CephFS, RBD, etc.)
        pvc_name (str): The PVC that should be attached to the newly created pod
        do_reload (bool): True for reloading the object after creation, False otherwise
        namespace (str): The namespace for the new resource creation
        node_name (str): The name of specific node to schedule the pod
        pod_dict_path (str): YAML path for the pod
        sa_name (str): Serviceaccount name
        dc_deployment (bool): True if creating pod as deploymentconfig
        raw_block_pv (bool): True for creating raw block pv based pod, False otherwise
        raw_block_device (str): raw block device for the pod
        replica_count (int): Replica count for deployment config
        pod_name (str): Name of the pod to create

    Returns:
        Pod: A Pod instance

    Raises:
        AssertionError: In case of any failure
    """
    if interface_type == constants.CEPHBLOCKPOOL:
        pod_dict = pod_dict_path if pod_dict_path else constants.CSI_RBD_POD_YAML
        interface = constants.RBD_INTERFACE
    else:
        pod_dict = pod_dict_path if pod_dict_path else constants.CSI_CEPHFS_POD_YAML
        interface = constants.CEPHFS_INTERFACE
    if dc_deployment:
        pod_dict = pod_dict_path if pod_dict_path else constants.FEDORA_DC_YAML
    pod_data = templating.load_yaml(pod_dict)
    if not pod_name:
        pod_name = create_unique_resource_name(f'test-{interface}', 'pod')
    pod_data['metadata']['name'] = pod_name
    pod_data['metadata']['namespace'] = namespace
    if dc_deployment:
        pod_data['metadata']['labels']['app'] = pod_name
        pod_data['spec']['template']['metadata']['labels']['name'] = pod_name
        pod_data['spec']['replicas'] = replica_count

    if pvc_name:
        if dc_deployment:
            pod_data['spec']['template']['spec']['volumes'][0][
                'persistentVolumeClaim']['claimName'] = pvc_name
        else:
            pod_data['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName'] = pvc_name

    if interface_type == constants.CEPHBLOCKPOOL and raw_block_pv:
        pod_data['spec']['containers'][0]['volumeDevices'][0][
            'devicePath'] = raw_block_device
        pod_data['spec']['containers'][0]['volumeDevices'][0][
            'name'] = pod_data.get('spec').get('volumes')[0].get('name')

    if node_name:
        pod_data['spec']['nodeName'] = node_name
    else:
        if 'nodeName' in pod_data.get('spec'):
            del pod_data['spec']['nodeName']
    if sa_name and dc_deployment:
        pod_data['spec']['template']['spec']['serviceAccountName'] = sa_name
    if dc_deployment:
        ocs_obj = create_resource(**pod_data)
        logger.info(ocs_obj.name)
        assert (ocp.OCP(kind='pod', namespace=namespace)).wait_for_resource(
            condition=constants.STATUS_COMPLETED,
            resource_name=pod_name + '-1-deploy',
            resource_count=0,
            timeout=180,
            sleep=3)
        dpod_list = pod.get_all_pods(namespace=namespace)
        for dpod in dpod_list:
            if '-1-deploy' not in dpod.name:
                if pod_name in dpod.name:
                    return dpod
    else:
        pod_obj = pod.Pod(**pod_data)
        pod_name = pod_data.get('metadata').get('name')
        logger.info(f'Creating new Pod {pod_name} for test')
        created_resource = pod_obj.create(do_reload=do_reload)
        assert created_resource, (f"Failed to create Pod {pod_name}")

        return pod_obj
Exemple #18
0
    def test_pvc_disruptive(
        self,
        interface,
        operation_to_disrupt,
        resource_to_delete,
        multi_pvc_factory,
        pod_factory,
    ):
        """
        Base function for PVC disruptive tests.
        Deletion of 'resource_to_delete' will be introduced while
        'operation_to_disrupt' is progressing.
        """
        pod_functions = {
            "mds":
            partial(pod.get_mds_pods),
            "mon":
            partial(pod.get_mon_pods),
            "mgr":
            partial(pod.get_mgr_pods),
            "osd":
            partial(pod.get_osd_pods),
            "rbdplugin":
            partial(pod.get_plugin_pods, interface=interface),
            "cephfsplugin":
            partial(pod.get_plugin_pods, interface=interface),
            "cephfsplugin_provisioner":
            partial(pod.get_cephfsplugin_provisioner_pods),
            "rbdplugin_provisioner":
            partial(pod.get_rbdfsplugin_provisioner_pods),
            "operator":
            partial(pod.get_operator_pods),
        }

        # Get number of pods of type 'resource_to_delete'
        num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

        namespace = self.proj_obj.namespace

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(pod.get_all_pods(namespace=namespace))
        initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

        DISRUPTION_OPS.set_resource(resource=resource_to_delete)

        access_modes = [constants.ACCESS_MODE_RWO]
        if interface == constants.CEPHFILESYSTEM:
            access_modes.append(constants.ACCESS_MODE_RWX)
            num_of_pvc = 8
            access_mode_dist_ratio = [6, 2]

        # Modify access_modes list to create rbd `block` type volume with
        # RWX access mode. RWX is not supported in non-block type rbd
        if interface == constants.CEPHBLOCKPOOL:
            access_modes.extend([
                f"{constants.ACCESS_MODE_RWO}-Block",
                f"{constants.ACCESS_MODE_RWX}-Block",
            ])
            num_of_pvc = 9
            access_mode_dist_ratio = [4, 3, 2]

        executor = ThreadPoolExecutor(max_workers=(2 * num_of_pvc))

        # Start creation of PVCs
        bulk_pvc_create = executor.submit(
            multi_pvc_factory,
            interface=interface,
            project=self.proj_obj,
            size=5,
            access_modes=access_modes,
            access_modes_selection="distribute_random",
            access_mode_dist_ratio=access_mode_dist_ratio,
            status=constants.STATUS_BOUND,
            num_of_pvc=num_of_pvc,
            wait_each=False,
            timeout=90,
        )

        if operation_to_disrupt == "create_pvc":
            # Ensure PVCs are being created before deleting the resource
            ret = helpers.wait_for_resource_count_change(
                get_all_pvcs, initial_num_of_pvc, namespace, "increase")
            assert ret, "Wait timeout: PVCs are not being created."
            logger.info("PVCs creation has started.")
            DISRUPTION_OPS.delete_resource()

        pvc_objs = bulk_pvc_create.result()

        # Confirm that PVCs are Bound
        for pvc_obj in pvc_objs:
            helpers.wait_for_resource_state(resource=pvc_obj,
                                            state=constants.STATUS_BOUND,
                                            timeout=120)
            pvc_obj.reload()
        logger.info("Verified: PVCs are Bound.")

        # Start creating pods
        bulk_pod_create = executor.submit(
            helpers.create_pods,
            pvc_objs,
            pod_factory,
            interface,
            2,
            nodes=node.get_worker_nodes(),
        )

        if operation_to_disrupt == "create_pod":
            # Ensure that pods are being created before deleting the resource
            ret = helpers.wait_for_resource_count_change(
                pod.get_all_pods, initial_num_of_pods, namespace, "increase")
            assert ret, "Wait timeout: Pods are not being created."
            logger.info("Pods creation has started.")
            DISRUPTION_OPS.delete_resource()

        pod_objs = bulk_pod_create.result()

        # Verify pods are Running
        for pod_obj in pod_objs:
            helpers.wait_for_resource_state(resource=pod_obj,
                                            state=constants.STATUS_RUNNING,
                                            timeout=90)
            pod_obj.reload()
        logger.info("Verified: All pods are Running.")

        # Do setup on pods for running IO
        logger.info("Setting up pods for running IO.")
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            executor.submit(pod_obj.workload_setup, storage_type=storage_type)

        # Wait for setup on pods to complete
        for pod_obj in pod_objs:
            logger.info(
                f"Waiting for IO setup to complete on pod {pod_obj.name}")
            for sample in TimeoutSampler(360, 2, getattr, pod_obj,
                                         "wl_setup_done"):
                if sample:
                    logger.info(f"Setup for running IO is completed on pod "
                                f"{pod_obj.name}.")
                    break
        logger.info("Setup for running IO is completed on all pods.")

        # Start IO on each pod
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            pod_obj.run_io(
                storage_type=storage_type,
                size="1G",
                runtime=10,
                fio_filename=f"{pod_obj.name}_io_file1",
            )
        logger.info("FIO started on all pods.")

        if operation_to_disrupt == "run_io":
            DISRUPTION_OPS.delete_resource()

        logger.info("Fetching FIO results.")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        logger.info("Verified FIO result on pods.")

        # Delete pods
        for pod_obj in pod_objs:
            pod_obj.delete(wait=True)
        for pod_obj in pod_objs:
            pod_obj.ocp.wait_for_delete(pod_obj.name)

        # Verify that PVCs are reusable by creating new pods
        pod_objs = helpers.create_pods(
            pvc_objs,
            pod_factory,
            interface,
            2,
            nodes=node.get_worker_nodes(),
        )

        # Verify new pods are Running
        for pod_obj in pod_objs:
            helpers.wait_for_resource_state(resource=pod_obj,
                                            state=constants.STATUS_RUNNING,
                                            timeout=90)
            pod_obj.reload()
        logger.info("Verified: All new pods are Running.")

        # Run IO on each of the new pods
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            pod_obj.run_io(
                storage_type=storage_type,
                size="1G",
                runtime=10,
                fio_filename=f"{pod_obj.name}_io_file2",
            )

        logger.info("Fetching FIO results from new pods")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        logger.info("Verified FIO result on new pods.")

        # Verify number of pods of type 'resource_to_delete'
        final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
        assert final_num_resource_to_delete == num_of_resource_to_delete, (
            f"Total number of {resource_to_delete} pods is not matching with "
            f"initial value. Total number of pods before deleting a pod: "
            f"{num_of_resource_to_delete}. Total number of pods present now: "
            f"{final_num_resource_to_delete}")

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
        logger.info("Ceph cluster health is OK")
Exemple #19
0
    def test_monitoring_when_one_of_the_prometheus_node_down(
            self, test_fixture):
        """
        Test case to validate when the prometheus pod is down and
        interaction with prometheus
        """
        namespace_list, pvc_objs, pod_objs, sc = test_fixture

        aws_obj = aws.AWS()

        # Get all the openshift-monitoring pods
        monitoring_pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE)

        # Get the worker node list
        workers = get_typed_nodes(node_type='worker')

        # Get all prometheus pods
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:

            # Get the node where the prometheus pod is hosted
            prometheus_pod_obj = pod_obj.get()
            prometheus_node = prometheus_pod_obj['spec']['nodeName']

            prometheus_node = [
                node for node in workers
                if node.get().get('metadata').get('name') == prometheus_node
            ]

            # Make one of the node down where the prometheus pod is hosted
            instances = aws.get_instances_ids_and_names(prometheus_node)
            aws_obj.restart_ec2_instances(instances=instances,
                                          wait=True,
                                          force=True)

            # Validate all nodes are in READY state
            wait_for_nodes_status()

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check()

        # Check all the monitoring pods are up
        for pod_obj in monitoring_pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING)

        # Check for the created pvc metrics after nodes restarting
        for pvc_obj in pvc_objs:
            assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
                f"On prometheus pod for created pvc {pvc_obj.name} related data is not collected"
            )

        # Create projects after restarting nodes
        namespaces = helpers.create_multilpe_projects(number_of_project=1)
        namespace_list.extend(namespaces)

        # Create pvcs after restarting nodes
        pvcs = [
            helpers.create_pvc(sc_name=sc.name,
                               namespace=each_namespace.namespace)
            for each_namespace in namespaces
        ]
        for pvc_obj in pvcs:
            helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND)
            pvc_obj.reload()
        pvc_objs.extend(pvcs)

        # Create app pods after restarting nodes
        pods = [
            helpers.create_pod(interface_type=constants.CEPHBLOCKPOOL,
                               pvc_name=each_pvc.name,
                               namespace=each_pvc.namespace)
            for each_pvc in pvcs
        ]
        for pod_obj in pods:
            helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
            pod_obj.reload()
        pod_objs.extend(pods)

        # Check for the created pvc metrics on prometheus pod after restarting nodes
        for pvc_obj in pvcs:
            assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
                f"On prometheus pod for created pvc {pvc_obj.name} related data is not collected"
            )
    def test_monitoring_delete_pvc(self):
        """
        Test case to validate whether delete pvcs+configmap and recovery of a
        node where monitoring pods running has no functional impact

        """
        # Get 'cluster-monitoring-config' configmap
        ocp_configmap = ocp.OCP(namespace=constants.MONITORING_NAMESPACE,
                                kind='configmap')
        configmap_dict = ocp_configmap.get(
            resource_name='cluster-monitoring-config')
        dir_configmap = tempfile.mkdtemp(prefix='configmap_')
        yaml_file = f'{dir_configmap}/configmap.yaml'
        templating.dump_data_to_temp_yaml(configmap_dict, yaml_file)

        # Get prometheus and alertmanager pods
        prometheus_alertmanager_pods = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus', 'alertmanager'])

        # Get all pvc on monitoring namespace
        pvc_objs_list = pvc.get_all_pvc_objs(
            namespace=constants.MONITORING_NAMESPACE)

        # Delete configmap
        ocp_configmap.delete(resource_name='cluster-monitoring-config')

        # Delete all pvcs on monitoring namespace
        pvc.delete_pvcs(pvc_objs=pvc_objs_list)

        # Check all the prometheus and alertmanager pods are up
        for pod_obj in prometheus_alertmanager_pods:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)

        # Create configmap
        ocp_configmap.create(yaml_file=dir_configmap)

        # Check all the PVCs are up
        for pvc_obj in pvc_objs_list:
            wait_for_resource_state(resource=pvc_obj,
                                    state=constants.STATUS_BOUND,
                                    timeout=180)

        # Check all the prometheus and alertmanager pods are up
        # and pvc are mounted on monitoring pods
        for pod_obj in prometheus_alertmanager_pods:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)
            mount_point = pod_obj.exec_cmd_on_pod(
                command="df -kh",
                out_yaml_format=False,
            )
            assert "/dev/rbd" in mount_point, f"pvc is not mounted on pod {pod.name}"
        log.info("Verified all pvc are mounted on monitoring pods")

        # Validate the prometheus health is ok
        assert prometheus_health_check(), (
            "Prometheus cluster health is not OK")
    def test_monitoring_after_draining_node_where_prometheus_hosted(
            self, pods):
        """
        Test case to validate when node is drained where prometheus
        is hosted, prometheus pod should re-spin on new healthy node
        and shouldn't be any data/metrics loss

        """

        # Get the prometheus pod
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:
            # Get the pvc which mounted on prometheus pod
            pod_info = pod_obj.get()
            pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName']

            # Get the node where the prometheus pod is hosted
            prometheus_pod_obj = pod_obj.get()
            prometheus_node = prometheus_pod_obj['spec']['nodeName']

            # Drain node where the prometheus pod hosted
            drain_nodes([prometheus_node])

            # Validate node is in SchedulingDisabled state
            wait_for_nodes_status(
                [prometheus_node],
                status=constants.NODE_READY_SCHEDULING_DISABLED)

            # Validate all prometheus pod is running
            POD = ocp.OCP(kind=constants.POD,
                          namespace=defaults.OCS_MONITORING_NAMESPACE)
            assert POD.wait_for_resource(
                condition='Running', selector='app=prometheus', timeout=180), (
                    "One or more prometheus pods are not in running state")

            # Validate prometheus pod is re-spinned on new healthy node
            pod_info = pod_obj.get()
            new_node = pod_info['spec']['nodeName']
            assert new_node not in prometheus_node, (
                'Promethues pod not re-spinned on new node')
            log.info(f"Prometheus pod re-spinned on new node {new_node}")

            # Validate same pvc is mounted on prometheus pod
            assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName'] in pvc_name, (
                    f"Old pvc not found after restarting the prometheus pod {pod_obj.name}"
                )

            # Validate the prometheus health is ok
            assert prometheus_health_check(), (
                "Prometheus cluster health is not OK")

            # Mark the nodes back to schedulable
            schedule_nodes([prometheus_node])

            # Wait some time after node scheduling back
            waiting_time = 30
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate node is in Ready State
            wait_for_nodes_status([prometheus_node],
                                  status=constants.NODE_READY)

            # Validate ceph health OK
            ceph_health_check(tries=40, delay=30)

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check()

        # Check for the created pvc metrics after rebooting the master nodes
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
Exemple #22
0
    def test_must_gather(self):
        """
        Tests functionality of: oc adm must-gather

        """
        # Fetch pod details
        pods = pod.get_all_pods(namespace='openshift-storage')
        pods = [each.name for each in pods]

        # Make logs root directory
        logger.info("Creating logs Directory")
        directory = self.make_directory()
        logger.info(f"Creating {directory}_ocs_logs - Done!")

        # Collect OCS logs
        logger.info("Collecting Logs")
        collect_ocs_logs(dir_name=directory, ocp=False)
        logger.info("Collecting logs - Done!")

        # Compare running pods list to "/pods" subdirectories
        must_gather_helper = re.compile(r'must-gather-.*.-helper')
        logger.info("Checking logs tree")
        logs = [
            logs for logs in self.get_log_directories(directory)
            if not (must_gather_helper.match(logs))
        ]
        logger.info(f"Logs: {logs}")
        logger.info(f"pods list: {pods}")
        assert set(sorted(logs)) == set(sorted(pods)), (
            f"List of openshift-storage pods are not equal to list of logs "
            f"directories list of pods: {pods} list of log directories: {logs}"
        )

        # 2nd test: Verify logs file are not empty
        logs_dir_list = self.search_log_files(directory)
        assert self.check_file_size(logs_dir_list), (
            "One or more log file are empty")

        # Find must_gather_commands directory for verification
        for dir_root, dirs, files in os.walk(directory + "_ocs_logs"):
            if os.path.basename(dir_root) == 'must_gather_commands':
                logger.info(
                    f"Found must_gather_commands directory - {dir_root}")
                assert 'json_output' in dirs, (
                    "json_output directory is not present in "
                    "must_gather_commands directory.")
                assert files, (
                    "No files present in must_gather_commands directory.")
                cmd_files_path = [
                    os.path.join(dir_root, file_name) for file_name in files
                ]
                json_output_dir = os.path.join(dir_root, 'json_output')
                break

        # Verify that command output files are present as expected
        assert sorted(constants.MUST_GATHER_COMMANDS) == sorted(files), (
            f"Actual and expected commands output files are not matching.\n"
            f"Actual: {files}\nExpected: {constants.MUST_GATHER_COMMANDS}")

        # Verify that files for command output in json are present as expected
        commands_json = os.listdir(json_output_dir)
        assert sorted(
            constants.MUST_GATHER_COMMANDS_JSON) == sorted(commands_json), (
                f"Actual and expected json output commands files are not "
                f"matching.\nActual: {commands_json}\n"
                f"Expected: {constants.MUST_GATHER_COMMANDS_JSON}")

        # Verify that command output files are not empty
        empty_files = []
        json_cmd_files_path = [
            os.path.join(json_output_dir, file_name)
            for file_name in commands_json
        ]
        for file_path in cmd_files_path + json_cmd_files_path:
            if not os.path.getsize(file_path) > 0:
                empty_files.append(file_path)
        assert not empty_files, f"These files are empty: {empty_files}"
Exemple #23
0
    def test_ceph_daemon_kill_during_pod_pvc_deletion(self, interface,
                                                      operation_to_disrupt,
                                                      resource_name,
                                                      setup_base):
        """
        Kill 'resource_name' daemon while deletion of PVCs/pods is progressing
        """
        pvc_objs, self.pod_objs = setup_base
        sc_obj = pvc_objs[0].storageclass
        self.namespace = pvc_objs[0].project.namespace
        pod_functions = {
            'mds': partial(get_mds_pods),
            'mon': partial(get_mon_pods),
            'mgr': partial(get_mgr_pods),
            'osd': partial(get_osd_pods),
            'rbdplugin': partial(get_plugin_pods, interface=interface),
            'cephfsplugin': partial(get_plugin_pods, interface=interface),
            'cephfsplugin_provisioner':
            partial(get_cephfsplugin_provisioner_pods),
            'rbdplugin_provisioner': partial(get_rbdfsplugin_provisioner_pods),
            'operator': partial(get_operator_pods)
        }
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_name)
        executor = ThreadPoolExecutor(max_workers=1)

        # Get number of pods of type 'resource_name'
        num_of_resource_pods = len(pod_functions[resource_name]())

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(get_all_pods(namespace=self.namespace))
        initial_num_of_pvc = len(
            get_all_pvcs(namespace=self.namespace)['items'])

        # Fetch PV names
        pv_objs = []
        for pvc_obj in pvc_objs:
            pvc_obj.reload()
            pv_objs.append(pvc_obj.backed_pv_obj)

        # Fetch volume details from pods for the purpose of verification
        node_pv_dict = {}
        for pod_obj in self.pod_objs:
            pod_info = pod_obj.get()
            node = pod_info['spec']['nodeName']
            pvc = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName']
            for pvc_obj in pvc_objs:
                if pvc_obj.name == pvc:
                    pvc_obj.reload()
                    pv = pvc_obj.backed_pv
                    break
            if node in node_pv_dict:
                node_pv_dict[node].append(pv)
            else:
                node_pv_dict[node] = [pv]

        # Do setup for running IO on pods
        log.info("Setting up pods for running IO")
        for pod_obj in self.pod_objs:
            pod_obj.workload_setup(storage_type='fs')
        log.info("Setup for running IO is completed on pods")

        # Start IO on each pod. RWX PVC will be used on two pods. So split the
        # size accordingly
        log.info("Starting IO on pods")
        for pod_obj in self.pod_objs:
            if pod_obj.pvc.access_mode == constants.ACCESS_MODE_RWX:
                io_size = int((self.pvc_size - 1) / 2)
            else:
                io_size = self.pvc_size - 1
            pod_obj.run_io(storage_type='fs',
                           size=f'{io_size}G',
                           fio_filename=f'{pod_obj.name}_io')
        log.info("IO started on all pods.")

        # Set the daemon to be killed
        disruption.select_daemon()

        # Start deleting pods
        pod_bulk_delete = executor.submit(self.delete_pods)

        if operation_to_disrupt == 'delete_pods':
            ret = self.verify_resource_deletion(get_all_pods,
                                                initial_num_of_pods)
            assert ret, "Wait timeout: Pods are not being deleted."
            log.info("Pods deletion has started.")
            disruption.kill_daemon()

        pods_deleted = pod_bulk_delete.result()

        assert pods_deleted, "Deletion of pods failed."

        # Verify pods are deleted
        for pod_obj in self.pod_objs:
            assert pod_obj.ocp.wait_for_delete(
                pod_obj.name, 180), (f"Pod {pod_obj.name} is not deleted")
        log.info("Verified: Pods are deleted.")

        # Verify that the mount point is removed from nodes after deleting pod
        for node, pvs in node_pv_dict.items():
            cmd = f'oc debug nodes/{node} -- df'
            df_on_node = run_cmd(cmd)
            for pv in pvs:
                assert pv not in df_on_node, (
                    f"{pv} is still present on node {node} after "
                    f"deleting the pods.")
        log.info(
            "Verified: mount points are removed from nodes after deleting "
            "the pods.")

        # Fetch image uuid associated with PVCs
        pvc_uuid_map = {}
        for pvc_obj in pvc_objs:
            pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
        log.info("Fetched image uuid associated with each PVC")

        # Start deleting PVCs
        pvc_bulk_delete = executor.submit(delete_pvcs, pvc_objs)

        if operation_to_disrupt == 'delete_pvcs':
            ret = self.verify_resource_deletion(get_all_pvcs,
                                                initial_num_of_pvc)
            assert ret, "Wait timeout: PVCs are not being deleted."
            log.info("PVCs deletion has started.")
            disruption.kill_daemon()

        pvcs_deleted = pvc_bulk_delete.result()

        assert pvcs_deleted, "Deletion of PVCs failed."

        # Verify PVCs are deleted
        for pvc_obj in pvc_objs:
            assert pvc_obj.ocp.wait_for_delete(
                pvc_obj.name), (f"PVC {pvc_obj.name} is not deleted")
        log.info("Verified: PVCs are deleted.")

        # Verify PVs are deleted
        for pv_obj in pv_objs:
            assert pv_obj.ocp.wait_for_delete(
                pv_obj.name, 120), (f"PV {pv_obj.name} is not deleted")
        log.info("Verified: PVs are deleted.")

        # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
        for pvc_name, uuid in pvc_uuid_map.items():
            if interface == constants.CEPHBLOCKPOOL:
                ret = verify_volume_deleted_in_backend(
                    interface=interface,
                    image_uuid=uuid,
                    pool_name=sc_obj.ceph_pool.name)
            if interface == constants.CEPHFILESYSTEM:
                ret = verify_volume_deleted_in_backend(interface=interface,
                                                       image_uuid=uuid)
            assert ret, (f"Volume associated with PVC {pvc_name} still exists "
                         f"in backend")

        # Verify number of pods of type 'resource_name'
        final_num_of_resource_pods = len(pod_functions[resource_name]())
        assert final_num_of_resource_pods == num_of_resource_pods, (
            f"Total number of {resource_name} pods is not matching with "
            f"initial value. Total number of pods before daemon kill: "
            f"{num_of_resource_pods}. Total number of pods present now: "
            f"{final_num_of_resource_pods}")

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'])
        log.info("Ceph cluster health is OK")
Exemple #24
0
    def deploy_ocs(self):
        """
        Handle OCS deployment, since OCS deployment steps are common to any
        platform, implementing OCS deployment here in base class.
        """
        _templating = templating.Templating()

        ceph_cluster = ocp.OCP(
            kind='CephCluster', namespace=self.namespace
        )
        try:
            ceph_cluster.get().get('items')[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")

        if not self.ocs_operator_deployment:
            create_oc_resource(
                'common.yaml', self.cluster_path, _templating, config.ENV_DATA
            )
            run_cmd(
                f'oc label namespace {config.ENV_DATA["cluster_namespace"]} '
                f'"openshift.io/cluster-monitoring=true"'
            )
            run_cmd(
                f"oc policy add-role-to-user view "
                f"system:serviceaccount:openshift-monitoring:prometheus-k8s "
                f"-n {self.namespace}"
            )
            # HACK: If you would like to drop this hack, make sure that you
            # also updated docs and write appropriate unit/integration tests
            # for config processing.
            if config.ENV_DATA.get('monitoring_enabled') in (
                "true", "True", True
            ):
                # RBAC rules for monitoring, based on documentation change in
                # rook:
                # https://github.com/rook/rook/commit/1b6fe840f6ae7372a9675ba727ecc65326708aa8
                # HACK: This should be dropped when OCS is managed by OLM
                apply_oc_resource(
                    'rbac.yaml',
                    self.cluster_path,
                    _templating,
                    config.ENV_DATA,
                    template_dir="monitoring"
                )
            # Increased to 15 seconds as 10 is not enough
            # TODO: do the sampler function and check if resource exist
            wait_time = 15
            logger.info(f"Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            create_oc_resource(
                'operator-openshift.yaml', self.cluster_path,
                _templating, config.ENV_DATA
            )
            logger.info(f"Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            run_cmd(
                f"oc wait --for condition=ready pod "
                f"-l app=rook-ceph-operator "
                f"-n {self.namespace} "
                f"--timeout=120s"
            )
            run_cmd(
                f"oc wait --for condition=ready pod "
                f"-l app=rook-discover "
                f"-n {self.namespace} "
                f"--timeout=120s"
            )
            create_oc_resource(
                'cluster.yaml', self.cluster_path, _templating, config.ENV_DATA
            )
        else:
            self.deploy_ocs_via_operator()

        pod = ocp.OCP(
            kind=constants.POD, namespace=self.namespace
        )
        cfs = ocp.OCP(
            kind=constants.CEPHFILESYSTEM,
            namespace=self.namespace
        )
        # Check for Ceph pods
        assert pod.wait_for_resource(
            condition='Running', selector='app=rook-ceph-mon',
            resource_count=3, timeout=600
        )
        assert pod.wait_for_resource(
            condition='Running', selector='app=rook-ceph-mgr',
            timeout=600
        )
        assert pod.wait_for_resource(
            condition='Running', selector='app=rook-ceph-osd',
            resource_count=3, timeout=600
        )

        # validate ceph mon/osd volumes are backed by pvc
        validate_cluster_on_pvc()

        # Creating toolbox pod
        setup_ceph_toolbox()

        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector='app=rook-ceph-tools', resource_count=1, timeout=600
        )

        if not self.ocs_operator_deployment:
            logger.info(f"Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            # HACK: This should be dropped (including service-monitor.yaml and
            # prometheus-rules.yaml files) when OCS is managed by OLM
            if config.ENV_DATA.get('monitoring_enabled') not in (
                "true", "True", True
            ):
                # HACK: skip creation of rook-ceph-mgr service monitor when
                # monitoring is enabled (if this were not skipped, the step
                # would fail because rook would create the service monitor at
                # this point already)
                create_oc_resource(
                    "service-monitor.yaml", self.cluster_path, _templating,
                    config.ENV_DATA
                )
                # HACK: skip creation of prometheus-rules, rook-ceph is
                # concerned with it's setup now, based on clarification from
                # Umanga Chapagain
                create_oc_resource(
                    "prometheus-rules.yaml", self.cluster_path, _templating,
                    config.ENV_DATA
                )
            logger.info(f"Waiting {wait_time} seconds...")
            time.sleep(wait_time)

            # Create MDS pods for CephFileSystem
            fs_data = templating.load_yaml(constants.CEPHFILESYSTEM_YAML)
            fs_data['metadata']['namespace'] = self.namespace

            ceph_obj = OCS(**fs_data)
            ceph_obj.create()
            assert pod.wait_for_resource(
                condition=constants.STATUS_RUNNING, selector='app=rook-ceph-mds',
                resource_count=2, timeout=600
            )

        # Check for CephFilesystem creation in ocp
        cfs_data = cfs.get()
        cfs_name = cfs_data['items'][0]['metadata']['name']

        if helpers.validate_cephfilesystem(cfs_name):
            logger.info(f"MDS deployment is successful!")
            defaults.CEPHFILESYSTEM_NAME = cfs_name
        else:
            logger.error(
                f"MDS deployment Failed! Please check logs!"
            )

        if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get('persistent-monitoring'):
            # Create a pool, secrets and sc
            secret_obj = helpers.create_secret(interface_type=constants.CEPHBLOCKPOOL)
            cbj_obj = helpers.create_ceph_block_pool()
            sc_obj = helpers.create_storage_class(
                interface_type=constants.CEPHBLOCKPOOL,
                interface_name=cbj_obj.name,
                secret_name=secret_obj.name
            )

            # Get the list of monitoring pods
            pods_list = get_all_pods(
                namespace=defaults.OCS_MONITORING_NAMESPACE,
                selector=['prometheus', 'alertmanager']
            )

            # Create configmap cluster-monitoring-config
            create_configmap_cluster_monitoring_pod(sc_obj.name)

            # Take some time to respin the pod
            waiting_time = 30
            logger.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate the pods are respinned and in running state
            validate_pods_are_respinned_and_running_state(
                pods_list
            )

            # Validate the pvc is created on monitoring pods
            validate_pvc_created_and_bound_on_monitoring_pods()

            # Validate the pvc are mounted on pods
            validate_pvc_are_mounted_on_monitoring_pods(pods_list)

        # Change registry backend to OCS CEPHFS RWX PVC
        registry.change_registry_backend_to_ocs()

        # Verify health of ceph cluster
        # TODO: move destroy cluster logic to new CLI usage pattern?
        logger.info("Done creating rook resources, waiting for HEALTH_OK")
        assert ceph_health_check(
            namespace=self.namespace
        )
        # patch gp2/thin storage class as 'non-default'
        self.patch_default_sc_to_non_default()
Exemple #25
0
def ocs_install_verification(timeout=600, skip_osd_distribution_check=False):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.

    """
    from ocs_ci.ocs.node import get_typed_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    number_of_worker_nodes = len(get_typed_nodes())
    namespace = config.ENV_DATA['cluster_namespace']
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    operator_selector = get_selector_for_ocs_operator()
    ocs_package_manifest = PackageManifest(
        resource_name=defaults.OCS_OPERATOR_NAME,
        selector=operator_selector,
    )
    ocs_csv_name = ocs_package_manifest.get_current_csv()
    ocs_csv = CSV(resource_name=ocs_csv_name, namespace=namespace)
    log.info(f"Check if OCS operator: {ocs_csv_name} is in Succeeded phase.")
    ocs_csv.wait_for_phase(phase="Succeeded", timeout=timeout)

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(f"Check if StorageCluster: {storage_cluster_name} is in"
             f"Succeeded phase")
    storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    # ocs-operator
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.OCS_OPERATOR_LABEL,
                                 timeout=timeout)
    # rook-ceph-operator
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.OPERATOR_LABEL,
                                 timeout=timeout)
    # noobaa
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.NOOBAA_APP_LABEL,
                                 resource_count=2,
                                 timeout=timeout)
    # mons
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MON_APP_LABEL,
                                 resource_count=3,
                                 timeout=timeout)
    # csi-cephfsplugin
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.CSI_CEPHFSPLUGIN_LABEL,
                                 resource_count=number_of_worker_nodes,
                                 timeout=timeout)
    # csi-cephfsplugin-provisioner
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
        resource_count=2,
        timeout=timeout)
    # csi-rbdplugin
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.CSI_RBDPLUGIN_LABEL,
                                 resource_count=number_of_worker_nodes,
                                 timeout=timeout)
    # csi-rbdplugin-provisioner
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
        resource_count=2,
        timeout=timeout)
    # osds
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count']) *
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica']))
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.OSD_APP_LABEL,
                                 resource_count=osd_count,
                                 timeout=timeout)
    # mgr
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MGR_APP_LABEL,
                                 timeout=timeout)
    # mds
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MDS_APP_LABEL,
                                 resource_count=2,
                                 timeout=timeout)

    # rgw check only for VmWare
    if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM:
        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector=constants.RGW_APP_LABEL,
                                     resource_count=1,
                                     timeout=timeout)

    # Verify ceph health
    log.info("Verifying ceph health")
    assert utils.ceph_health_check(namespace=namespace)

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    required_storage_classes = {
        f'{storage_cluster_name}-cephfs', f'{storage_cluster_name}-ceph-rbd'
    }
    storage_classes = storage_class.get()
    storage_class_names = {
        item['metadata']['name']
        for item in storage_classes['items']
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSD's are distributed
    if not skip_osd_distribution_check:
        log.info("Verifying OSD's are distributed evenly across worker nodes")
        ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
        osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items']
        node_names = [osd['spec']['nodeName'] for osd in osds]
        for node in node_names:
            assert not node_names.count(node) > 1, (
                "OSD's are not distributed evenly across worker nodes")

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    assert {defaults.CEPHFS_PROVISIONER, defaults.RBD_PROVISIONER} == ({
        item['metadata']['name']
        for item in csi_driver.get()['items']
    })

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    sc_rbd = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_RBD)
    sc_cephfs = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS)
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'] == constants.RBD_NODE_SECRET
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'] == constants.RBD_PROVISIONER_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'] == constants.CEPHFS_NODE_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'] == constants.CEPHFS_PROVISIONER_SECRET
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    log.info(
        "Verifying ceph osd tree output and checking for device set PVC names "
        "in the output.")
    deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]
    ct_pod = get_ceph_tools_pod()
    osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json')
    schemas = {
        'root': constants.OSD_TREE_ROOT,
        'rack': constants.OSD_TREE_RACK,
        'host': constants.OSD_TREE_HOST,
        'osd': constants.OSD_TREE_OSD,
        'region': constants.OSD_TREE_REGION,
        'zone': constants.OSD_TREE_ZONE
    }
    schemas['host']['properties']['name'] = {'enum': deviceset_pvcs}
    for item in osd_tree['nodes']:
        validate(instance=item, schema=schemas[item['type']])
        if item['type'] == 'host':
            deviceset_pvcs.remove(item['name'])
    assert not deviceset_pvcs, (
        f"These device set PVCs are not given in ceph osd tree output "
        f"- {deviceset_pvcs}")
    log.info(
        "Verified ceph osd tree output. Device set PVC names are given in the "
        "output.")

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    log.info("Verifying CSI snapshotter is not present.")
    provisioner_pods = get_all_pods(
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        selector=[
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        ])
    for pod_obj in provisioner_pods:
        pod_info = pod_obj.get()
        for container, image in get_images(data=pod_info).items():
            assert ('snapshot' not in container) and (
                'snapshot' not in image), (
                    f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}")
    assert {
        'name': 'CSI_ENABLE_SNAPSHOTTER',
        'value': 'false'
    } in (ocs_csv.get()['spec']['install']['spec']['deployments'][0]['spec']
          ['template']['spec']['containers'][0]['env']
          ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
    log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd crush dump',
                                          format='')
        pool_names = [
            constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL
        ]
        crush_rules = [
            rule for rule in crush_dump['rules']
            if rule['rule_name'] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule['steps']
                if item.get('type') == 'zone'
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
Exemple #26
0
def uninstall_ocs():
    """
    The function uninstalls the OCS operator from a openshift
    cluster and removes all its settings and dependencies

    """
    ocp_obj = ocp.OCP()

    log.info("deleting volume snapshots")
    vs_ocp_obj = ocp.OCP(kind=constants.VOLUMESNAPSHOT)
    vs_list = vs_ocp_obj.get(all_namespaces=True)["items"]
    for vs in vs_list:
        vs_obj = ocp.OCP(kind=constants.VOLUMESNAPSHOT,
                         namespace=vs.get("metadata").get("namespace"))
        vs_obj.delete(resource_name=vs.get("metadata").get("name"))

    log.info("queering for OCS PVCs")
    provisioners = constants.OCS_PROVISIONERS
    sc_list = [
        sc for sc in get_all_storageclass()
        if sc.get("provisioner") in provisioners
    ]

    pvc_to_delete = []
    for sc in sc_list:
        pvc_to_delete.extend(pvc for pvc in get_all_pvcs_in_storageclass(
            sc.get("metadata").get("name")) if "noobaa" not in pvc.name)

    if config.ENV_DATA["platform"].lower() == constants.ROSA_PLATFORM:
        log.info("Deleting OCS PVCs")
        for pvc in pvc_to_delete:
            log.info(f"Deleting PVC: {pvc.name}")
            pvc.delete()
        rosa.delete_odf_addon(config.ENV_DATA["cluster_name"])
        return None
    log.info("Removing monitoring stack from OpenShift Container Storage")
    remove_monitoring_stack_from_ocs()

    log.info(
        "Removing OpenShift Container Platform registry from OpenShift Container Storage"
    )
    remove_ocp_registry_from_ocs(config.ENV_DATA["platform"])

    log.info(
        "Removing the cluster logging operator from OpenShift Container Storage"
    )
    try:
        remove_cluster_logging_operator_from_ocs()
    except CommandFailed:
        log.info("No cluster logging found")

    log.info("Deleting OCS PVCs")
    for pvc in pvc_to_delete:
        log.info(f"Deleting PVC: {pvc.name}")
        pvc.delete()

    storage_cluster = ocp.OCP(
        kind=constants.STORAGECLUSTER,
        resource_name=constants.DEFAULT_CLUSTERNAME,
        namespace="openshift-storage",
    )

    log.info("Checking for local storage")
    lso_sc = None
    if check_local_volume_local_volume_set():
        "Local volume was found. Will be removed later"
        lso_sc = (storage_cluster.get().get("spec").get("storageDeviceSets")[0]
                  .get("dataPVCTemplate").get("spec").get("storageClassName"))

    cleanup_policy = (storage_cluster.get().get("metadata").get(
        "annotations").get("uninstall.ocs.openshift.io/cleanup-policy"))

    log.info("Deleting storageCluster object")
    storage_cluster.delete(resource_name=constants.DEFAULT_CLUSTERNAME)

    if cleanup_policy == "delete":
        log.info("Cleanup policy set to delete. checking cleanup pods")
        cleanup_pods = [
            pod for pod in get_all_pods() if "cluster-cleanup-job" in pod.name
        ]
        for pod in cleanup_pods:
            while pod.get().get("status").get("phase") != "Succeeded":
                log.info(f"waiting for cleanup pod {pod.name} to complete")
                TimeoutSampler(timeout=10, sleep=30)
            log.info(f"Cleanup pod {pod.name} completed successfully ")
        # no need to confirm var/vib/rook was deleted from nodes if all cleanup pods are completed.
    else:
        log.info("Cleanup policy set to retain. skipping nodes cleanup")

    log.info("Deleting openshift-storage namespace")
    ocp_obj.delete_project(constants.OPENSHIFT_STORAGE_NAMESPACE)
    ocp_obj.wait_for_delete(constants.OPENSHIFT_STORAGE_NAMESPACE)
    switch_to_project(constants.DEFAULT_NAMESPACE)

    # step 10: TODO remove crypto from nodes.
    """for node in storage_node_list:
        log.info(f"removing encryption from {node}")
        ocp_obj.exec_oc_debug_cmd(node=node, cmd_list=[])"""

    if lso_sc is not None:
        log.info("Removing LSO")
        try:
            uninstall_lso(lso_sc)
        except Exception as e:
            log.info(f"LSO removal failed.{e}")

    log.info("deleting noobaa storage class")
    noobaa_sc = ocp.OCP(kind=constants.STORAGECLASS)
    noobaa_sc.delete(resource_name=constants.NOOBAA_SC)

    nodes = get_all_nodes()
    node_objs = get_node_objs(nodes)

    log.info("Unlabeling storage nodes")
    label_nodes(nodes=node_objs,
                label=constants.OPERATOR_NODE_LABEL[:-3] + "-")
    label_nodes(nodes=node_objs, label=constants.TOPOLOGY_ROOK_LABEL + "-")

    log.info("Removing taints from storage nodes")
    taint_nodes(nodes=nodes, taint_label=constants.OPERATOR_NODE_TAINT + "-")

    log.info("Deleting remaining OCS PVs (if there are any)")
    try:
        rbd_pv = ocp.OCP(kind=constants.PV,
                         resource_name="ocs-storagecluster-ceph-rbd")
        fs_pv = ocp.OCP(kind=constants.PV,
                        resource_name="ocs-storagecluster-cephfs")
        rbd_pv.delete()
        fs_pv.delete()
        log.info("OCS PVs deleted")
    except Exception as e:
        log.info(f"OCS PV(s) not found. {e}")

    log.info("Removing CRDs")
    crd_list = [
        "backingstores.noobaa.io",
        "bucketclasses.noobaa.io",
        "cephblockpools.ceph.rook.io",
        "cephclusters.ceph.rook.io",
        "cephfilesystems.ceph.rook.io",
        "cephnfses.ceph.rook.io",
        "cephobjectstores.ceph.rook.io",
        "cephobjectstoreusers.ceph.rook.io",
        "noobaas.noobaa.io",
        "ocsinitializations.ocs.openshift.io",
        "storageclusters.ocs.openshift.io",
        "cephclients.ceph.rook.io",
        "cephobjectrealms.ceph.rook.io",
        "cephobjectzonegroups.ceph.rook.io",
        "cephobjectzones.ceph.rook.io",
        "cephrbdmirrors.ceph.rook.io",
    ]

    for crd in crd_list:
        try:
            ocp_obj.exec_oc_cmd(f"delete crd {crd} --timeout=300m")
        except Exception:
            log.info(f"crd {crd} was not found")
Exemple #27
0
    def deploy_ocs(self):
        """
        Handle OCS deployment, since OCS deployment steps are common to any
        platform, implementing OCS deployment here in base class.
        """
        ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace)
        try:
            ceph_cluster.get().get('items')[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")
        self.deploy_ocs_via_operator()
        pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
        cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace)
        # Check for Ceph pods
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mon',
                                     resource_count=3,
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mgr',
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-osd',
                                     resource_count=3,
                                     timeout=600)

        # validate ceph mon/osd volumes are backed by pvc
        validate_cluster_on_pvc()

        # validate PDB creation of MON, MDS, OSD pods
        validate_pdb_creation()

        # Creating toolbox pod
        setup_ceph_toolbox()

        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector='app=rook-ceph-tools',
                                     resource_count=1,
                                     timeout=600)

        # Check for CephFilesystem creation in ocp
        cfs_data = cfs.get()
        cfs_name = cfs_data['items'][0]['metadata']['name']

        if helpers.validate_cephfilesystem(cfs_name):
            logger.info("MDS deployment is successful!")
            defaults.CEPHFILESYSTEM_NAME = cfs_name
        else:
            logger.error("MDS deployment Failed! Please check logs!")

        # Change monitoring backend to OCS
        if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
                'persistent-monitoring'):

            sc = helpers.default_storage_class(
                interface_type=constants.CEPHBLOCKPOOL)

            # Get the list of monitoring pods
            pods_list = get_all_pods(
                namespace=defaults.OCS_MONITORING_NAMESPACE,
                selector=['prometheus', 'alertmanager'])

            # Create configmap cluster-monitoring-config and reconfigure
            # storage class and telemeter server (if the url is specified in a
            # config file)
            create_configmap_cluster_monitoring_pod(
                sc_name=sc.name,
                telemeter_server_url=config.ENV_DATA.get(
                    "telemeter_server_url"))

            # Take some time to respin the pod
            waiting_time = 45
            logger.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate the pods are respinned and in running state
            retry((CommandFailed, ResourceWrongStatusException),
                  tries=3,
                  delay=15)(validate_pods_are_respinned_and_running_state)(
                      pods_list)

            # Validate the pvc is created on monitoring pods
            validate_pvc_created_and_bound_on_monitoring_pods()

            # Validate the pvc are mounted on pods
            retry((CommandFailed, AssertionError), tries=3, delay=15)(
                validate_pvc_are_mounted_on_monitoring_pods)(pods_list)
        elif config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
                "telemeter_server_url"):
            # Create configmap cluster-monitoring-config to reconfigure
            # telemeter server url when 'persistent-monitoring' is False
            create_configmap_cluster_monitoring_pod(
                telemeter_server_url=config.ENV_DATA["telemeter_server_url"])

        # Change registry backend to OCS CEPHFS RWX PVC
        registry.change_registry_backend_to_ocs()

        # Verify health of ceph cluster
        # TODO: move destroy cluster logic to new CLI usage pattern?
        logger.info("Done creating rook resources, waiting for HEALTH_OK")
        try:
            ceph_health_check(namespace=self.namespace, tries=30, delay=10)
        except CephHealthException as ex:
            err = str(ex)
            logger.warning(f"Ceph health check failed with {err}")
            if "clock skew detected" in err:
                logger.info(f"Changing NTP on compute nodes to"
                            f" {constants.RH_NTP_CLOCK}")
                update_ntp_compute_nodes()
                assert ceph_health_check(namespace=self.namespace,
                                         tries=60,
                                         delay=10)

        # patch gp2/thin storage class as 'non-default'
        self.patch_default_sc_to_non_default()
        if check_nodes_specs(min_cpu=constants.MIN_NODE_CPU,
                             min_memory=constants.MIN_NODE_MEMORY):
            logger.info("The cluster specs meet the minimum requirements and "
                        "therefore, NooBaa auto scale will be enabled")
            min_nb_eps = config.DEPLOYMENT.get('min_noobaa_endpoints')
            max_nb_eps = config.DEPLOYMENT.get('max_noobaa_endpoints')
            change_noobaa_endpoints_count(min_nb_eps=min_nb_eps,
                                          max_nb_eps=max_nb_eps)
        else:
            logger.warning(
                "The cluster specs do not meet the minimum requirements"
                " and therefore, NooBaa auto scale will remain disabled")
            change_noobaa_endpoints_count(min_nb_eps=1, max_nb_eps=1)
Exemple #28
0
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
    version_before_upgrade=None,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.
        version_before_upgrade (float): Set to OCS version before upgrade

    """
    from ocs_ci.ocs.node import get_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    from ocs_ci.ocs.resources.fips import check_fips_enabled

    number_of_worker_nodes = len(get_nodes())
    namespace = config.ENV_DATA["cluster_namespace"]
    log.info("Verifying OCS installation")
    if config.ENV_DATA.get("disable_components"):
        for component in config.ENV_DATA["disable_components"]:
            config.COMPONENTS[f"disable_{component}"] = True
    disable_noobaa = config.COMPONENTS["disable_noobaa"]
    disable_rgw = config.COMPONENTS["disable_rgw"]
    disable_blockpools = config.COMPONENTS["disable_blockpools"]
    disable_cephfs = config.COMPONENTS["disable_cephfs"]

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data["spec"]["version"]
    ocs_version = version.get_semantic_ocs_version_from_config()
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}")
    assert (
        f"{ocs_version}" in csv_version
    ), f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        "ocs_registry_image")
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.rsplit(":", 1)[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}")
        ignore_csv_mismatch = config.DEPLOYMENT.get("ignore_csv_mismatch")
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV")
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}")

    # Verify Storage System status
    if ocs_version >= version.VERSION_4_9:
        log.info("Verifying storage system status")
        storage_system = OCP(kind=constants.STORAGESYSTEM, namespace=namespace)
        storage_system_data = storage_system.get()
        storage_system_status = {}
        for condition in storage_system_data["items"][0]["status"][
                "conditions"]:
            storage_system_status[condition["type"]] = condition["status"]
        log.debug(f"storage system status: {storage_system_status}")
        assert storage_system_status == constants.STORAGE_SYSTEM_STATUS, (
            f"Storage System status is not in expected state. Expected {constants.STORAGE_SYSTEM_STATUS}"
            f" but found {storage_system_status}")

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(f"Check if StorageCluster: {storage_cluster_name} is in"
             f"Succeeded phase")
    storage_cluster.wait_for_phase(phase="Ready", timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    if not config.DEPLOYMENT["external_mode"]:
        osd_count = int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["count"]
        ) * int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["replica"])
    rgw_count = None
    if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
        if not disable_rgw:
            rgw_count = get_rgw_count(f"{ocs_version}",
                                      post_upgrade_verification,
                                      version_before_upgrade)

    min_eps = constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT
    max_eps = (constants.MAX_NB_ENDPOINT_COUNT
               if ocs_version >= version.VERSION_4_6 else 1)

    if config.ENV_DATA.get("platform") == constants.IBM_POWER_PLATFORM:
        min_eps = 1
        max_eps = 1

    nb_db_label = (constants.NOOBAA_DB_LABEL_46_AND_UNDER
                   if ocs_version < version.VERSION_4_7 else
                   constants.NOOBAA_DB_LABEL_47_AND_ABOVE)
    resources_dict = {
        nb_db_label: 1,
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps,
    }
    if not config.DEPLOYMENT["external_mode"]:
        resources_dict.update({
            constants.MON_APP_LABEL: 3,
            constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
            constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
            constants.OSD_APP_LABEL: osd_count,
            constants.MGR_APP_LABEL: 1,
            constants.MDS_APP_LABEL: 2,
            constants.RGW_APP_LABEL: rgw_count,
        })

    if ocs_version >= version.VERSION_4_9:
        resources_dict.update({
            constants.ODF_OPERATOR_CONTROL_MANAGER_LABEL: 1,
        })

    for label, count in resources_dict.items():
        if label == constants.RGW_APP_LABEL:
            if (not config.ENV_DATA.get("platform")
                    in constants.ON_PREM_PLATFORMS or disable_rgw):
                continue
        if "noobaa" in label and disable_noobaa:
            continue
        if "mds" in label and disable_cephfs:
            continue

        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout,
        )

    if not disable_noobaa:
        nb_ep_pods = get_pods_having_label(
            label=constants.NOOBAA_ENDPOINT_POD_LABEL,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        )
        assert len(nb_ep_pods) <= max_eps, (
            f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
            f"is greater than the maximum defined in the NooBaa CR ({max_eps})"
        )

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    required_storage_classes = {
        f"{storage_cluster_name}-cephfs",
        f"{storage_cluster_name}-ceph-rbd",
    }
    if ocs_version >= version.VERSION_4_10:
        # TODO: Add rbd-thick storage class verification in external mode cluster upgraded
        # to OCS 4.8 when the bug 1978542 is fixed
        # Skip rbd-thick storage class verification in external mode upgraded cluster. This is blocked by bug 1978542
        if not (config.DEPLOYMENT["external_mode"]
                and post_upgrade_verification):
            required_storage_classes.update(
                {f"{storage_cluster_name}-ceph-rbd-thick"})
    skip_storage_classes = set()
    if disable_cephfs:
        skip_storage_classes.update({
            f"{storage_cluster_name}-cephfs",
        })
    if disable_blockpools:
        skip_storage_classes.update({
            f"{storage_cluster_name}-ceph-rbd",
        })
    required_storage_classes = required_storage_classes.difference(
        skip_storage_classes)

    if config.DEPLOYMENT["external_mode"]:
        required_storage_classes.update({
            f"{storage_cluster_name}-ceph-rgw",
            f'{config.ENV_DATA["cluster_namespace"]}.noobaa.io',
        })
    storage_classes = storage_class.get()
    storage_class_names = {
        item["metadata"]["name"]
        for item in storage_classes["items"]
    }
    # required storage class names should be observed in the cluster under test
    missing_scs = required_storage_classes.difference(storage_class_names)
    if len(missing_scs) > 0:
        log.error("few storage classess are not present: %s", missing_scs)
    assert list(missing_scs) == []

    # Verify OSDs are distributed
    if not config.DEPLOYMENT["external_mode"]:
        if not skip_osd_distribution_check:
            log.info(
                "Verifying OSDs are distributed evenly across worker nodes")
            ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
            osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)["items"]
            deviceset_count = get_deviceset_count()
            node_names = [osd["spec"]["nodeName"] for osd in osds]
            for node in node_names:
                assert (
                    not node_names.count(node) > deviceset_count
                ), "OSD's are not distributed evenly across worker nodes"

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    csi_drivers = {
        item["metadata"]["name"]
        for item in csi_driver.get()["items"]
    }
    assert defaults.CSI_PROVISIONERS.issubset(csi_drivers)

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    if config.DEPLOYMENT["external_mode"]:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD)
        sc_cephfs = storage_class.get(resource_name=(
            constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS))
    else:
        if not disable_blockpools:
            sc_rbd = storage_class.get(
                resource_name=constants.DEFAULT_STORAGECLASS_RBD)
        if not disable_cephfs:
            sc_cephfs = storage_class.get(
                resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS)
    if not disable_blockpools:
        assert (
            sc_rbd["parameters"]["csi.storage.k8s.io/node-stage-secret-name"]
            == constants.RBD_NODE_SECRET)
        assert (
            sc_rbd["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
            == constants.RBD_PROVISIONER_SECRET)
    if not disable_cephfs:
        assert (sc_cephfs["parameters"]
                ["csi.storage.k8s.io/node-stage-secret-name"] ==
                constants.CEPHFS_NODE_SECRET)
        assert (sc_cephfs["parameters"]
                ["csi.storage.k8s.io/provisioner-secret-name"] ==
                constants.CEPHFS_PROVISIONER_SECRET)
    log.info("Verified node and provisioner secret names in storage class.")

    ct_pod = get_ceph_tools_pod()

    # https://github.com/red-hat-storage/ocs-ci/issues/3820
    # Verify ceph osd tree output
    if not (config.DEPLOYMENT.get("ui_deployment")
            or config.DEPLOYMENT["external_mode"]):
        log.info(
            "Verifying ceph osd tree output and checking for device set PVC names "
            "in the output.")
        if config.DEPLOYMENT.get("local_storage"):
            deviceset_pvcs = [osd.get_node() for osd in get_osd_pods()]
            # removes duplicate hostname
            deviceset_pvcs = list(set(deviceset_pvcs))
            if config.ENV_DATA.get("platform") == constants.BAREMETAL_PLATFORM:
                deviceset_pvcs = [
                    deviceset.replace(".", "-") for deviceset in deviceset_pvcs
                ]
        else:
            deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

        osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd tree",
                                        format="json")
        schemas = {
            "root": constants.OSD_TREE_ROOT,
            "rack": constants.OSD_TREE_RACK,
            "host": constants.OSD_TREE_HOST,
            "osd": constants.OSD_TREE_OSD,
            "region": constants.OSD_TREE_REGION,
            "zone": constants.OSD_TREE_ZONE,
        }
        schemas["host"]["properties"]["name"] = {"enum": deviceset_pvcs}
        for item in osd_tree["nodes"]:
            validate(instance=item, schema=schemas[item["type"]])
            if item["type"] == "host":
                deviceset_pvcs.remove(item["name"])
        assert not deviceset_pvcs, (
            f"These device set PVCs are not given in ceph osd tree output "
            f"- {deviceset_pvcs}")
        log.info(
            "Verified ceph osd tree output. Device set PVC names are given in the "
            "output.")

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    # if the OCS version is < 4.6
    if ocs_version < version.VERSION_4_6:
        log.info("Verifying CSI snapshotter is not present.")
        provisioner_pods = get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            selector=[
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
            ],
        )
        for pod_obj in provisioner_pods:
            pod_info = pod_obj.get()
            for container, image in get_images(data=pod_info).items():
                assert ("snapshot" not in container) and (
                    "snapshot" not in image
                ), (f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}")
        deployments = ocs_csv.get()["spec"]["install"]["spec"]["deployments"]
        rook_ceph_operator_deployment = [
            deployment_val for deployment_val in deployments
            if deployment_val["name"] == "rook-ceph-operator"
        ]
        assert {
            "name": "CSI_ENABLE_SNAPSHOTTER",
            "value": "false"
        } in (rook_ceph_operator_deployment[0]["spec"]["template"]["spec"]
              ["containers"][0]["env"]
              ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
        log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd crush dump",
                                          format="")
        pool_names = [
            constants.METADATA_POOL,
            constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL,
        ]
        crush_rules = [
            rule for rule in crush_dump["rules"]
            if rule["rule_name"] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule["steps"]
                if item.get("type") == "zone"
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(namespace, health_check_tries,
                                   health_check_delay)
    if config.ENV_DATA.get("fips"):
        # In case that fips is enabled when deploying,
        # a verification of the installation of it will run
        # on all running state pods
        check_fips_enabled()
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()
        if config.DEPLOYMENT.get("kms_deployment"):
            kms = KMS.get_kms_deployment()
            kms.post_deploy_verification()

    storage_cluster_obj = get_storage_cluster()
    is_flexible_scaling = (
        storage_cluster_obj.get()["items"][0].get("spec").get(
            "flexibleScaling", False))
    if is_flexible_scaling is True:
        failure_domain = storage_cluster_obj.data["items"][0]["status"][
            "failureDomain"]
        assert failure_domain == "host", (
            f"The expected failure domain on cluster with flexible scaling is 'host',"
            f" the actaul failure domain is {failure_domain}")

    if ocs_version >= version.VERSION_4_7:
        log.info("Verifying images in storage cluster")
        verify_sc_images(storage_cluster)

    if config.ENV_DATA.get("is_multus_enabled"):
        verify_multus_network()
    def test_pvc_disruptive(self, interface, operation_to_disrupt,
                            resource_to_delete, multi_pvc_factory,
                            pod_factory):
        """
        Base function for PVC disruptive tests.
        Deletion of 'resource_to_delete' will be introduced while
        'operation_to_disrupt' is progressing.
        """
        pod_functions = {
            'mds':
            partial(pod.get_mds_pods),
            'mon':
            partial(pod.get_mon_pods),
            'mgr':
            partial(pod.get_mgr_pods),
            'osd':
            partial(pod.get_osd_pods),
            'rbdplugin':
            partial(pod.get_plugin_pods, interface=interface),
            'cephfsplugin':
            partial(pod.get_plugin_pods, interface=interface),
            'cephfsplugin_provisioner':
            partial(pod.get_cephfsplugin_provisioner_pods),
            'rbdplugin_provisioner':
            partial(pod.get_rbdfsplugin_provisioner_pods),
            'operator':
            partial(pod.get_operator_pods)
        }

        # Get number of pods of type 'resource_to_delete'
        num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

        num_of_pvc = 12
        namespace = self.proj_obj.namespace

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(pod.get_all_pods(namespace=namespace))
        initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)['items'])

        executor = ThreadPoolExecutor(max_workers=(2 * num_of_pvc))

        DISRUPTION_OPS.set_resource(resource=resource_to_delete)

        access_modes = [constants.ACCESS_MODE_RWO]
        if interface == constants.CEPHFILESYSTEM:
            access_modes.append(constants.ACCESS_MODE_RWX)

        # Modify access_modes list to create rbd `block` type volume with
        # RWX access mode. RWX is not supported in non-block type rbd
        if interface == constants.CEPHBLOCKPOOL:
            access_modes.extend([
                f'{constants.ACCESS_MODE_RWO}-Block',
                f'{constants.ACCESS_MODE_RWX}-Block'
            ])

        # Start creation of PVCs
        bulk_pvc_create = executor.submit(
            multi_pvc_factory,
            interface=interface,
            project=self.proj_obj,
            size=5,
            access_modes=access_modes,
            access_modes_selection='distribute_random',
            status=constants.STATUS_BOUND,
            num_of_pvc=num_of_pvc,
            wait_each=False)

        if operation_to_disrupt == 'create_pvc':
            # Ensure PVCs are being created before deleting the resource
            ret = helpers.wait_for_resource_count_change(
                get_all_pvcs, initial_num_of_pvc, namespace, 'increase')
            assert ret, "Wait timeout: PVCs are not being created."
            logger.info(f"PVCs creation has started.")
            DISRUPTION_OPS.delete_resource()

        pvc_objs = bulk_pvc_create.result()

        # Confirm that PVCs are Bound
        for pvc_obj in pvc_objs:
            helpers.wait_for_resource_state(resource=pvc_obj,
                                            state=constants.STATUS_BOUND,
                                            timeout=120)
            pvc_obj.reload()
        logger.info("Verified: PVCs are Bound.")

        # Start creating pods
        bulk_pod_create = executor.submit(helpers.create_pods, pvc_objs,
                                          pod_factory, interface, 2)

        if operation_to_disrupt == 'create_pod':
            # Ensure that pods are being created before deleting the resource
            ret = helpers.wait_for_resource_count_change(
                pod.get_all_pods, initial_num_of_pods, namespace, 'increase')
            assert ret, "Wait timeout: Pods are not being created."
            logger.info(f"Pods creation has started.")
            DISRUPTION_OPS.delete_resource()

        pod_objs = bulk_pod_create.result()

        # Verify pods are Running
        for pod_obj in pod_objs:
            helpers.wait_for_resource_state(resource=pod_obj,
                                            state=constants.STATUS_RUNNING)
            pod_obj.reload()
        logger.info("Verified: All pods are Running.")

        # Do setup on pods for running IO
        logger.info("Setting up pods for running IO.")
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info['spec']['volumeMode'] == 'Block':
                storage_type = 'block'
            else:
                storage_type = 'fs'
            executor.submit(pod_obj.workload_setup, storage_type=storage_type)

        # Wait for setup on pods to complete
        for pod_obj in pod_objs:
            for sample in TimeoutSampler(180, 2, getattr, pod_obj,
                                         'wl_setup_done'):
                if sample:
                    logger.info(f"Setup for running IO is completed on pod "
                                f"{pod_obj.name}.")
                    break
        logger.info("Setup for running IO is completed on all pods.")

        # Start IO on each pod
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info['spec']['volumeMode'] == 'Block':
                storage_type = 'block'
            else:
                storage_type = 'fs'
            pod_obj.run_io(storage_type=storage_type,
                           size='1G',
                           runtime=10,
                           fio_filename=f'{pod_obj.name}_io_file1')
        logger.info("FIO started on all pods.")

        if operation_to_disrupt == 'run_io':
            DISRUPTION_OPS.delete_resource()

        logger.info("Fetching FIO results.")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get('jobs')[0].get('error')
            assert err_count == 0, (
                f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}")
        logger.info("Verified FIO result on pods.")

        # Delete pods
        for pod_obj in pod_objs:
            pod_obj.delete(wait=True)
        for pod_obj in pod_objs:
            pod_obj.ocp.wait_for_delete(pod_obj.name)

        # Verify that PVCs are reusable by creating new pods
        pod_objs = helpers.create_pods(pvc_objs, pod_factory, interface, 2)

        # Verify new pods are Running
        for pod_obj in pod_objs:
            helpers.wait_for_resource_state(resource=pod_obj,
                                            state=constants.STATUS_RUNNING)
            pod_obj.reload()
        logging.info("Verified: All new pods are Running.")

        # Run IO on each of the new pods
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info['spec']['volumeMode'] == 'Block':
                storage_type = 'block'
            else:
                storage_type = 'fs'
            pod_obj.run_io(storage_type=storage_type,
                           size='1G',
                           runtime=10,
                           fio_filename=f'{pod_obj.name}_io_file2')

        logger.info("Fetching FIO results from new pods")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get('jobs')[0].get('error')
            assert err_count == 0, (
                f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}")
        logger.info("Verified FIO result on new pods.")

        # Verify number of pods of type 'resource_to_delete'
        final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
        assert final_num_resource_to_delete == num_of_resource_to_delete, (
            f"Total number of {resource_to_delete} pods is not matching with "
            f"initial value. Total number of pods before deleting a pod: "
            f"{num_of_resource_to_delete}. Total number of pods present now: "
            f"{final_num_resource_to_delete}")

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'])
        logger.info("Ceph cluster health is OK")
    def test_non_ocs_taint_and_tolerations(self):
        """
        Test runs the following steps
        1. Taint ocs nodes with non-ocs taint
        2. Set tolerations on storagecluster, subscription, configmap and ocsinit
        3. Respin all ocs pods and check if it runs on ocs nodes with tolerations
        4. Add Capacity

        """

        # Taint all nodes with non-ocs taint
        ocs_nodes = get_worker_nodes()
        taint_nodes(nodes=ocs_nodes, taint_label="xyz=true:NoSchedule")

        # Add tolerations to the storagecluster
        storagecluster_obj = ocp.OCP(
            resource_name=constants.DEFAULT_CLUSTERNAME,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            kind=constants.STORAGECLUSTER,
        )
        tolerations = (
            '{"tolerations": [{"effect": "NoSchedule", "key": "xyz",'
            '"operator": "Equal", "value": "true"}, '
            '{"effect": "NoSchedule", "key": "node.ocs.openshift.io/storage", '
            '"operator": "Equal", "value": "true"}]}')
        param = (
            f'{{"spec": {{"placement": {{"all": {tolerations}, "mds": {tolerations}, '
            f'"noobaa-core": {tolerations}, "rgw": {tolerations}}}}}}}')
        storagecluster_obj.patch(params=param, format_type="merge")

        # Add tolerations to the subscription
        sub_list = ocp.get_all_resource_names_of_a_kind(
            kind=constants.SUBSCRIPTION)
        param = (
            '{"spec": {"config":  {"tolerations": '
            '[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
            '"value": "true"}]}}}')
        for sub in sub_list:
            sub_obj = ocp.OCP(
                resource_name=sub,
                namespace=defaults.ROOK_CLUSTER_NAMESPACE,
                kind=constants.SUBSCRIPTION,
            )
            sub_obj.patch(params=param, format_type="merge")

        # Add tolerations to the ocsinitializations.ocs.openshift.io
        param = (
            '{"spec":  {"tolerations": '
            '[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
            '"value": "true"}]}}')

        ocsini_obj = ocp.OCP(
            resource_name=constants.OCSINIT,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            kind=constants.OCSINITIALIZATION,
        )
        ocsini_obj.patch(params=param, format_type="merge")

        # Add tolerations to the configmap rook-ceph-operator-config
        configmap_obj = ocp.OCP(
            kind=constants.CONFIGMAP,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            resource_name=constants.ROOK_OPERATOR_CONFIGMAP,
        )
        toleration = configmap_obj.get().get("data").get(
            "CSI_PLUGIN_TOLERATIONS")
        toleration += (
            '\n- key: xyz\n  operator: Equal\n  value: "true"\n  effect: NoSchedule'
        )
        toleration = toleration.replace('"', '\\"').replace("\n", "\\n")
        param_cmd = (
            f'[{{"op": "replace", "path": "/data/CSI_PLUGIN_TOLERATIONS", "value": "{toleration}" }}, '
            f'{{"op": "replace", "path": "/data/CSI_PROVISIONER_TOLERATIONS", "value": "{toleration}" }}]'
        )
        configmap_obj.patch(params=param_cmd, format_type="json")

        # After edit noticed few pod respins as expected
        assert wait_for_pods_to_be_running(timeout=600, sleep=15)

        # Respin all pods and check it if is still running
        pod_list = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE, )
        for pod in pod_list:
            pod.delete(wait=False)

        assert wait_for_pods_to_be_running(timeout=600, sleep=15)
        self.sanity_helpers.health_check()

        # Add capacity to check if new osds has toleration
        osd_size = storage_cluster.get_osd_size()
        count = storage_cluster.add_capacity(osd_size)
        pod = ocp.OCP(kind=constants.POD,
                      namespace=config.ENV_DATA["cluster_namespace"])
        if is_flexible_scaling_enabled():
            replica_count = 1
        else:
            replica_count = 3
        assert pod.wait_for_resource(
            timeout=300,
            condition=constants.STATUS_RUNNING,
            selector=constants.OSD_APP_LABEL,
            resource_count=count * replica_count,
        ), "New OSDs failed to reach running state"
        check_ceph_health_after_add_capacity(ceph_rebalance_timeout=2500)