Example No. 1
    def test_registry_respin_pod(self, pod_name, iterations):
        """
        Test the registry workload, backed by OCS, while respinning Ceph pods
        """

        # Respin relevant pod
        log.info(f"Respin Ceph pod {pod_name}")
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=pod_name)
        disruption.delete_resource()

        # Start SVT workload for pushing images to registry
        svt_setup(iterations=iterations)

        # Image pull and push to registry
        image_pull(image_url=IMAGE_URL)
        self.image_path = image_push(
            image_url=IMAGE_URL, namespace=OPENSHIFT_IMAGE_REGISTRY_NAMESPACE
        )

        # List the images in registry
        img_list = image_list_all()
        log.info(f"Image list {img_list}")

        # Check whether the image is present in the registry
        validate = check_image_exists_in_registry(image_url=IMAGE_URL)
        if not validate:
            raise UnexpectedBehaviour("Image URL not present in registry")

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check()
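
The parametrization that supplies pod_name and iterations is not part of this snippet. Below is a hedged sketch of how such a test is typically driven, mirroring the pytest parametrize style shown in Example No. 12; the values, the stub body, and the function name are illustrative assumptions, not taken from the source.

import pytest


# Hypothetical parametrization for a respin test like the one above; the
# resource names and iteration counts are assumptions for illustration only.
@pytest.mark.parametrize(
    argnames=["pod_name", "iterations"],
    argvalues=[
        pytest.param(*['mgr', 5]),
        pytest.param(*['mon', 5]),
        pytest.param(*['osd', 5]),
    ],
)
def test_registry_respin_pod_sketch(pod_name, iterations):
    # Stub body standing in for the real test shown in Example No. 1.
    assert pod_name in ('mgr', 'mon', 'osd')
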
Example No. 2
    def test_registry_respin_pod(self, pod_name):
        """
        Test the registry workload, backed by OCS, while respinning Ceph pods
        """

        # Respin relevant pod
        log.info(f"Respin Ceph pod {pod_name}")
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=pod_name)
        disruption.delete_resource()

        # Pull and push images to registries
        log.info("Pull and push images to registries")
        image_pull_and_push(
            project_name=self.project_name,
            template='eap-cd-basic-s2i',
            image=(
                'registry.redhat.io/jboss-eap-7-tech-preview/'
                'eap-cd-openshift-rhel8:latest'
            ),
            pattern='eap-app'
        )

        # Validate image exists in registries path
        validate_image_exists(namespace=self.project_name)

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check()
    def disrupt_plugin_provisioner_pods(self, node_list):
        """
        Set leader plugin-provisioner resources for disruption; skip a
        provisioner if its leader pod is running on a node from node_list

        Args:
            node_list (list): list of node names to check

        Returns:
            list: list of Disruption objects
        """
        provisioner_resource = []
        for interface in [constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM]:
            provisioner_pod = pod.plugin_provisioner_leader(interface=interface)
            node_name = pod.get_pod_node(provisioner_pod).name
            if node_name not in node_list:
                if interface == constants.CEPHBLOCKPOOL:
                    provisioner_resource.append('rbdplugin_provisioner')
                else:
                    provisioner_resource.append('cephfsplugin_provisioner')

        disruptor = []
        for resource in provisioner_resource:
            disruption = disruption_helpers.Disruptions()
            disruption.set_resource(resource=resource)
            disruptor.append(disruption)

        return disruptor
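
The helper above only selects the provisioner resources; the returned Disruptions objects still have to be triggered by the caller. A minimal usage sketch, assuming it is called from the same class and using the delete_resource() call seen throughout these examples:

    def respin_leader_provisioners(self, node_list):
        """
        Usage sketch (assumed caller, not from the source): respin each leader
        provisioner whose pod is not running on a node from node_list.
        """
        for disruption in self.disrupt_plugin_provisioner_pods(node_list):
            # delete_resource() deletes the selected pod and waits for the
            # replacement pod to come back up, as in the other examples.
            disruption.delete_resource()
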
Example No. 4
    def test_run_pgsql(self, pgsql, transactions, pod_name):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=3,
                                       transactions=transactions,
                                       clients=3)

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Respin Ceph pod
        log.info(f"Respin Ceph pod {pod_name}")
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=pod_name)
        disruption.delete_resource()

        # Wait for the pgbench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)
    def test_respin_osd_pods_to_verify_logging(self, create_pvc_and_deploymentconfig_pod):
        """
        This function creates projects before and after an OSD respin
        and verifies project existence in the EFK stack.
        1. Creates new project with PVC and app-pods
        2. Respins osd
        3. Logs into the EFK stack and checks for the health of cluster-logging
        4. Logs into the EFK stack and checks project existence
        5. Checks for the shards of the project in the EFK stack
        6. Creates new project and checks the existence again
        """

        # Create 1st project and app_pod
        dc_pod_obj, dc_pvc_obj = create_pvc_and_deploymentconfig_pod

        project1 = dc_pvc_obj.project.namespace

        # Delete the OSD pod
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource='osd')
        disruption.delete_resource()

        # Check the health of the cluster-logging
        assert ocp_logging_obj.check_health_of_clusterlogging()

        # Check that the 1st project, created before the respin, exists in the EFK stack
        self.validate_project_exists(dc_pvc_obj)

        # Check the files in the project
        elasticsearch_pod_obj = self.get_elasticsearch_pod_obj()

        project1_filecount = elasticsearch_pod_obj.exec_cmd_on_pod(
            command=f'es_util --query=project.{project1}.*/_count'
        )
        assert project1_filecount['_shards']['successful'] != 0, (
            f"No files found in project {project1}"
        )
        logger.info(f'Total number of files in project 1 {project1_filecount}')

        # Create another app_pod in new project
        pod_obj, pvc_obj = create_pvc_and_deploymentconfig_pod

        project2 = pvc_obj.project.namespace

        # Check the 2nd project exists in the EFK stack
        self.validate_project_exists(pvc_obj)

        project2_filecount = elasticsearch_pod_obj.exec_cmd_on_pod(
            command=f'es_util --query=project.{project2}.*/_count', out_yaml_format=True
        )
        assert project2_filecount['_shards']['successful'] != 0, (
            f"No files found in project {project2}"
        )
        logger.info(f'Total number of files in project 2 {project2_filecount}')
Example No. 6
    def test_monitoring_after_respinning_ceph_pods(self, test_fixture):
        """
        Test case to validate respinning the Ceph pods and
        their interaction with the Prometheus pod
        """
        namespace_list, pvc_objs, pod_objs, sc = test_fixture

        # Re-spin the Ceph pods (i.e. mgr, mon, osd) one by one
        resource_to_delete = ['mgr', 'mon', 'osd']
        disruption = disruption_helpers.Disruptions()
        for res_to_del in resource_to_delete:
            disruption.set_resource(resource=res_to_del)
            disruption.delete_resource()

        # Check for the created pvc metrics after respinning ceph pods
        for pvc_obj in pvc_objs:
            assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
                f"On prometheus pod for created pvc {pvc_obj.name} related data is not collected"
            )

        # Create projects after respinning the Ceph pods
        namespaces = helpers.create_multilpe_projects(number_of_project=2)
        namespace_list.extend(namespaces)

        # Create PVCs after respinning the Ceph pods
        pvcs = [
            helpers.create_pvc(sc_name=sc.name,
                               namespace=each_namespace.namespace)
            for each_namespace in namespaces
        ]
        for pvc_obj in pvcs:
            helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND)
            pvc_obj.reload()
        pvc_objs.extend(pvcs)

        # Create app pods after respinning the Ceph pods
        pods = [
            helpers.create_pod(interface_type=constants.CEPHBLOCKPOOL,
                               pvc_name=each_pvc.name,
                               namespace=each_pvc.namespace)
            for each_pvc in pvcs
        ]
        for pod_obj in pods:
            helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
            pod_obj.reload()
        pod_objs.extend(pods)

        # Check for the created pvc metrics on prometheus pod
        for pvc_obj in pvcs:
            assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
                f"On prometheus pod for created pvc {pvc_obj.name} related data is not collected"
            )
Example No. 7
    def respin_ceph_pod(self, resource_to_delete):
        """
        Respin Ceph pods one by one. The delete_resource function checks that
        the deleted pod comes back up and is running.

        Args:
            resource_to_delete (str): Ceph resource type to be deleted, eg: mgr/mon/osd/mds
        """
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        no_of_resource = disruption.resource_count
        for i in range(0, no_of_resource):
            disruption.delete_resource(resource_id=i)
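
To cycle through every Ceph daemon type, a helper like the one above can simply be called once per type, the same way Example No. 6 loops over a resource list. A short sketch, assuming the method lives on the same test class and that 'mds' is also a valid resource type (as the docstring suggests):

    def respin_all_ceph_pods(self):
        """
        Sketch (not from the source): respin every instance of each Ceph
        resource type, one type at a time, using respin_ceph_pod() above.
        """
        for resource_to_delete in ['mgr', 'mon', 'osd', 'mds']:
            self.respin_ceph_pod(resource_to_delete=resource_to_delete)
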
Example No. 8
    def test_pv_scale_out_create_pvcs_and_respin_ceph_pods(
        self, fioscale, resource_to_delete,
    ):
        """
        Test case to scale PVCs and pods across multiple projects and reach the expected PVC count
        """

        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        no_of_resource = disruption.resource_count
        for i in range(0, no_of_resource):
            disruption.delete_resource(resource_id=i)

        utils.ceph_health_check()
Example No. 9
    def test_run_pgsql_respin_pod(self, pgsql, transactions, pod_name):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=3,
                                       transactions=transactions,
                                       clients=3)
        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Check worker node utilization(adm_top)
        get_node_resource_utilization_from_adm_top(node_type='worker',
                                                   print_table=True)

        # Respin relevant pod
        if pod_name == 'postgres':
            pgsql.respin_pgsql_app_pod()
        else:
            log.info(f"Respin Ceph pod {pod_name}")
            disruption = disruption_helpers.Disruptions()
            disruption.set_resource(resource=pod_name)
            disruption.delete_resource()

        # Wait for the pgbench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached to completed state after {diff_time.seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)
Example No. 10
    def test_respin_osd_pods_to_verify_logging(
            self, create_pvc_and_deploymentconfig_pod):
        """
        This function creates projects before and after an OSD respin
        and verifies project existence in the EFK stack.
        1. Creates new project with PVC and app-pods
        2. Respins osd
        3. Logs into the EFK stack and checks for the health of cluster-logging
        4. Logs into the EFK stack and checks project existence
        5. Checks for the shards of the project in the EFK stack
        6. Creates new project and checks the existence again
        """

        # Create 1st project and app_pod
        dc_pod_obj, dc_pvc_obj = create_pvc_and_deploymentconfig_pod

        project1 = dc_pvc_obj.project.namespace

        # Delete the OSD pod
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource='osd')
        disruption.delete_resource()

        # Check the health of the cluster-logging
        assert ocp_logging_obj.check_health_of_clusterlogging()

        # Check for the 1st project created in EFK stack before the respin
        self.validate_project_exists(project1)

        # Check the files in the project
        self.check_filecount_in_project(project1)

        # Create another app_pod in new project
        pod_obj, pvc_obj = create_pvc_and_deploymentconfig_pod

        project2 = pvc_obj.project.namespace

        # Check the 2nd project exists in the EFK stack
        self.validate_project_exists(project2)

        self.check_filecount_in_project(project2)
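
Example No. 10 is the same scenario as the earlier OSD-respin logging snippet, with the inline Elasticsearch queries factored into helpers. Below is a hedged reconstruction of what check_filecount_in_project might look like, modelled on the inline es_util query shown earlier; the real helper's signature and implementation may differ.

    def check_filecount_in_project(self, project):
        """
        Hypothetical reconstruction of the helper used above, based on the
        inline es_util query from the earlier snippet; not the actual source.
        """
        elasticsearch_pod_obj = self.get_elasticsearch_pod_obj()
        filecount = elasticsearch_pod_obj.exec_cmd_on_pod(
            command=f'es_util --query=project.{project}.*/_count',
            out_yaml_format=True,
        )
        assert filecount['_shards']['successful'] != 0, (
            f"No files found in project {project}"
        )
        logger.info(f"Total number of files in project {project}: {filecount}")
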
Example No. 11
    def test_run_jenkins_respin_pod(self, jenkins, pod_name, num_projects,
                                    num_of_builds):
        """
        Test jenkins workload
        """
        # Init number of projects
        jenkins.number_projects = num_projects

        # Create app jenkins
        jenkins.create_app_jenkins()

        # Create jenkins pvc
        jenkins.create_jenkins_pvc()

        # Create jenkins build config
        jenkins.create_jenkins_build_config()

        # Wait for the jenkins deploy pod to reach Completed state
        jenkins.wait_for_jenkins_deploy_status(status=STATUS_COMPLETED)

        # Init number of builds per project
        jenkins.number_builds_per_project = num_of_builds

        # Start Builds
        jenkins.start_build()

        # Respin pod
        log.info(f"Respin pod {pod_name}")
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=pod_name)
        disruption.delete_resource()

        # Wait for builds to reach 'Complete' state
        jenkins.wait_for_build_to_complete()

        # Print table of builds
        jenkins.print_completed_builds_results()
Example No. 12
import logging
from concurrent.futures import ThreadPoolExecutor
import pytest
from functools import partial

from ocs_ci.framework.testlib import ManageTest, tier4, tier4a
from ocs_ci.framework import config
from ocs_ci.ocs import constants
from ocs_ci.ocs.resources.pvc import get_all_pvcs
from ocs_ci.ocs.resources import pod
from ocs_ci.utility.utils import TimeoutSampler, ceph_health_check
from tests import helpers, disruption_helpers

logger = logging.getLogger(__name__)

DISRUPTION_OPS = disruption_helpers.Disruptions()


@tier4
@tier4a
@pytest.mark.parametrize(
    argnames=["interface", "operation_to_disrupt", "resource_to_delete"],
    argvalues=[
        pytest.param(*[constants.CEPHBLOCKPOOL, 'create_pvc', 'mgr'],
                     marks=pytest.mark.polarion_id("OCS-568")),
        pytest.param(*[constants.CEPHBLOCKPOOL, 'create_pod', 'mgr'],
                     marks=pytest.mark.polarion_id("OCS-569")),
        pytest.param(*[constants.CEPHBLOCKPOOL, 'run_io', 'mgr'],
                     marks=pytest.mark.polarion_id("OCS-570")),
        pytest.param(*[constants.CEPHBLOCKPOOL, 'create_pvc', 'mon'],
                     marks=pytest.mark.polarion_id("OCS-561")),
Example No. 13
    def test_ceph_daemon_kill_during_pod_pvc_deletion(self, interface,
                                                      operation_to_disrupt,
                                                      resource_name,
                                                      setup_base):
        """
        Kill 'resource_name' daemon while deletion of PVCs/pods is progressing
        """
        pvc_objs, self.pod_objs = setup_base
        sc_obj = pvc_objs[0].storageclass
        self.namespace = pvc_objs[0].project.namespace
        pod_functions = {
            'mds': partial(get_mds_pods),
            'mon': partial(get_mon_pods),
            'mgr': partial(get_mgr_pods),
            'osd': partial(get_osd_pods),
            'rbdplugin': partial(get_plugin_pods, interface=interface),
            'cephfsplugin': partial(get_plugin_pods, interface=interface),
            'cephfsplugin_provisioner':
            partial(get_cephfsplugin_provisioner_pods),
            'rbdplugin_provisioner': partial(get_rbdfsplugin_provisioner_pods),
            'operator': partial(get_operator_pods)
        }
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_name)
        executor = ThreadPoolExecutor(max_workers=1)

        # Get number of pods of type 'resource_name'
        num_of_resource_pods = len(pod_functions[resource_name]())

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(get_all_pods(namespace=self.namespace))
        initial_num_of_pvc = len(
            get_all_pvcs(namespace=self.namespace)['items'])

        # Fetch PV names
        pv_objs = []
        for pvc_obj in pvc_objs:
            pvc_obj.reload()
            pv_objs.append(pvc_obj.backed_pv_obj)

        # Fetch volume details from pods for the purpose of verification
        node_pv_dict = {}
        for pod_obj in self.pod_objs:
            pod_info = pod_obj.get()
            node = pod_info['spec']['nodeName']
            pvc = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName']
            for pvc_obj in pvc_objs:
                if pvc_obj.name == pvc:
                    pvc_obj.reload()
                    pv = pvc_obj.backed_pv
                    break
            if node in node_pv_dict:
                node_pv_dict[node].append(pv)
            else:
                node_pv_dict[node] = [pv]

        # Do setup for running IO on pods
        log.info("Setting up pods for running IO")
        for pod_obj in self.pod_objs:
            pod_obj.workload_setup(storage_type='fs')
        log.info("Setup for running IO is completed on pods")

        # Start IO on each pod. RWX PVC will be used on two pods. So split the
        # size accordingly
        log.info("Starting IO on pods")
        for pod_obj in self.pod_objs:
            if pod_obj.pvc.access_mode == constants.ACCESS_MODE_RWX:
                io_size = int((self.pvc_size - 1) / 2)
            else:
                io_size = self.pvc_size - 1
            pod_obj.run_io(storage_type='fs',
                           size=f'{io_size}G',
                           fio_filename=f'{pod_obj.name}_io')
        log.info("IO started on all pods.")

        # Set the daemon to be killed
        disruption.select_daemon()

        # Start deleting pods
        pod_bulk_delete = executor.submit(self.delete_pods)

        if operation_to_disrupt == 'delete_pods':
            ret = self.verify_resource_deletion(get_all_pods,
                                                initial_num_of_pods)
            assert ret, "Wait timeout: Pods are not being deleted."
            log.info("Pods deletion has started.")
            disruption.kill_daemon()

        pods_deleted = pod_bulk_delete.result()

        assert pods_deleted, "Deletion of pods failed."

        # Verify pods are deleted
        for pod_obj in self.pod_objs:
            assert pod_obj.ocp.wait_for_delete(
                pod_obj.name, 180), (f"Pod {pod_obj.name} is not deleted")
        log.info("Verified: Pods are deleted.")

        # Verify that the mount point is removed from nodes after deleting pod
        for node, pvs in node_pv_dict.items():
            cmd = f'oc debug nodes/{node} -- df'
            df_on_node = run_cmd(cmd)
            for pv in pvs:
                assert pv not in df_on_node, (
                    f"{pv} is still present on node {node} after "
                    f"deleting the pods.")
        log.info(
            "Verified: mount points are removed from nodes after deleting "
            "the pods.")

        # Fetch image uuid associated with PVCs
        pvc_uuid_map = {}
        for pvc_obj in pvc_objs:
            pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
        log.info("Fetched image uuid associated with each PVC")

        # Start deleting PVCs
        pvc_bulk_delete = executor.submit(delete_pvcs, pvc_objs)

        if operation_to_disrupt == 'delete_pvcs':
            ret = self.verify_resource_deletion(get_all_pvcs,
                                                initial_num_of_pvc)
            assert ret, "Wait timeout: PVCs are not being deleted."
            log.info("PVCs deletion has started.")
            disruption.kill_daemon()

        pvcs_deleted = pvc_bulk_delete.result()

        assert pvcs_deleted, "Deletion of PVCs failed."

        # Verify PVCs are deleted
        for pvc_obj in pvc_objs:
            assert pvc_obj.ocp.wait_for_delete(
                pvc_obj.name), (f"PVC {pvc_obj.name} is not deleted")
        log.info("Verified: PVCs are deleted.")

        # Verify PVs are deleted
        for pv_obj in pv_objs:
            assert pv_obj.ocp.wait_for_delete(
                pv_obj.name, 120), (f"PV {pv_obj.name} is not deleted")
        log.info("Verified: PVs are deleted.")

        # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
        for pvc_name, uuid in pvc_uuid_map.items():
            if interface == constants.CEPHBLOCKPOOL:
                ret = verify_volume_deleted_in_backend(
                    interface=interface,
                    image_uuid=uuid,
                    pool_name=sc_obj.ceph_pool.name)
            if interface == constants.CEPHFILESYSTEM:
                ret = verify_volume_deleted_in_backend(interface=interface,
                                                       image_uuid=uuid)
            assert ret, (f"Volume associated with PVC {pvc_name} still exists "
                         f"in backend")

        # Verify number of pods of type 'resource_name'
        final_num_of_resource_pods = len(pod_functions[resource_name]())
        assert final_num_of_resource_pods == num_of_resource_pods, (
            f"Total number of {resource_name} pods is not matching with "
            f"initial value. Total number of pods before daemon kill: "
            f"{num_of_resource_pods}. Total number of pods present now: "
            f"{final_num_of_resource_pods}")

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'])
        log.info("Ceph cluster health is OK")
    def test_daemon_kill_during_pvc_pod_creation_and_io(
        self, interface, resource_name, setup, multi_pvc_factory,
        pod_factory
    ):
        """
        Kill 'resource_name' daemon while PVCs creation, pods
        creation and IO operation are progressing.
        """
        num_of_new_pvcs = 5
        pvc_objs, io_pods, pvc_objs_new_pods, access_modes = setup
        proj_obj = pvc_objs[0].project
        storageclass = pvc_objs[0].storageclass

        pod_functions = {
            'mds': partial(get_mds_pods), 'mon': partial(get_mon_pods),
            'mgr': partial(get_mgr_pods), 'osd': partial(get_osd_pods),
            'rbdplugin': partial(get_plugin_pods, interface=interface),
            'cephfsplugin': partial(get_plugin_pods, interface=interface),
            'cephfsplugin_provisioner': partial(get_cephfsplugin_provisioner_pods),
            'rbdplugin_provisioner': partial(get_rbdfsplugin_provisioner_pods),
            'operator': partial(get_operator_pods)
        }

        executor = ThreadPoolExecutor(max_workers=len(io_pods))

        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_name)

        # Get number of pods of type 'resource_name'
        resource_pods_num = len(pod_functions[resource_name]())

        # Do setup for running IO on pods
        log.info("Setting up pods for running IO")
        for pod_obj in io_pods:
            if pod_obj.pvc.volume_mode == 'Block':
                storage_type = 'block'
            else:
                storage_type = 'fs'
            executor.submit(pod_obj.workload_setup, storage_type=storage_type)

        # Wait for setup on pods to complete
        for pod_obj in io_pods:
            log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
            for sample in TimeoutSampler(
                180, 2, getattr, pod_obj, 'wl_setup_done'
            ):
                if sample:
                    log.info(
                        f"Setup for running IO is completed on pod "
                        f"{pod_obj.name}."
                    )
                    break
        log.info("Setup for running IO is completed on pods")

        # Set daemon to be killed
        disruption.select_daemon()

        # Start creating new pods
        log.info("Start creating new pods.")
        bulk_pod_create = executor.submit(
            helpers.create_pods, pvc_objs_new_pods, pod_factory, interface, 2
        )

        # Start creation of new PVCs
        log.info("Start creating new PVCs.")
        bulk_pvc_create = executor.submit(
            multi_pvc_factory, interface=interface,
            project=proj_obj, storageclass=storageclass, size=self.pvc_size,
            access_modes=access_modes,
            access_modes_selection='distribute_random',
            status="", num_of_pvc=num_of_new_pvcs, wait_each=False
        )

        # Start IO on each pod
        log.info("Start IO on pods")
        for pod_obj in io_pods:
            if pod_obj.pvc.volume_mode == 'Block':
                storage_type = 'block'
            else:
                storage_type = 'fs'
            pod_obj.run_io(
                storage_type=storage_type, size='1G', runtime=10,
                fio_filename=f'{pod_obj.name}_io_file1'
            )
        log.info("IO started on all pods.")

        # Kill daemon
        disruption.kill_daemon()

        # Getting result of PVC creation as list of PVC objects
        pvc_objs_new = bulk_pvc_create.result()

        # Confirm PVCs are Bound
        for pvc_obj in pvc_objs_new:
            helpers.wait_for_resource_state(
                resource=pvc_obj, state=constants.STATUS_BOUND, timeout=180
            )
            pvc_obj.reload()
        log.info("Verified: New PVCs are Bound.")

        # Getting result of pods creation as list of Pod objects
        pod_objs_new = bulk_pod_create.result()

        # Verify new pods are Running
        for pod_obj in pod_objs_new:
            helpers.wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING
            )
            pod_obj.reload()
        log.info("Verified: All new pods are Running.")

        # Verify IO
        log.info("Fetching IO results from IO pods.")
        for pod_obj in io_pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get('jobs')[0].get('error')
            assert err_count == 0, (
                f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
            )
            log.info(f"IOPs after FIO on pod {pod_obj.name}:")
            log.info(
                f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}"
            )
            log.info(
                f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}"
            )
        log.info("Verified IO result on IO pods.")

        all_pod_objs = io_pods + pod_objs_new

        # Fetch volume details from pods for the purpose of verification
        node_pv_dict = {}
        for pod in all_pod_objs:
            pod_info = pod.get()
            node = pod_info['spec']['nodeName']
            pvc = pod_info['spec']['volumes'][0]['persistentVolumeClaim']['claimName']
            for pvc_obj in pvc_objs:
                if pvc_obj.name == pvc:
                    pvc_obj.reload()
                    pv = pvc_obj.backed_pv
                    break
            if node in node_pv_dict:
                node_pv_dict[node].append(pv)
            else:
                node_pv_dict[node] = [pv]

        # Delete pods
        for pod_obj in all_pod_objs:
            pod_obj.delete(wait=False)

        # Verify pods are deleted
        for pod_obj in all_pod_objs:
            pod_obj.ocp.wait_for_delete(resource_name=pod_obj.name)

        # Verify number of 'resource_name' type pods
        final_resource_pods_num = len(pod_functions[resource_name]())
        assert final_resource_pods_num == resource_pods_num, (
            f"Total number of {resource_name} pods is not matching with "
            f"initial value. Total number of pods before daemon kill: "
            f"{resource_pods_num}. Total number of pods present now: "
            f"{final_resource_pods_num}"
        )

        # Verify volumes are unmapped from nodes after deleting the pods
        node_pv_mounted = helpers.verify_pv_mounted_on_node(node_pv_dict)
        for node, pvs in node_pv_mounted.items():
            assert not pvs, (
                f"PVs {pvs} is still present on node {node} after "
                f"deleting the pods."
            )
        log.info(
            "Verified: mount points are removed from nodes after deleting "
            "the pods"
        )

        # Set volume mode on PVC objects
        for pvc_obj in pvc_objs_new:
            pvc_info = pvc_obj.get()
            setattr(pvc_obj, 'volume_mode', pvc_info['spec']['volumeMode'])

        # Verify that PVCs are reusable by creating new pods
        all_pvc_objs = pvc_objs + pvc_objs_new
        pod_objs_re = helpers.create_pods(
            all_pvc_objs, pod_factory, interface, 2
        )

        # Verify pods are Running
        for pod_obj in pod_objs_re:
            helpers.wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING
            )
            pod_obj.reload()
        log.info("Successfully created new pods using all PVCs.")

        # Run IO on each of the newly created pods
        for pod_obj in pod_objs_re:
            if pod_obj.pvc.volume_mode == 'Block':
                storage_type = 'block'
            else:
                storage_type = 'fs'
            pod_obj.run_io(
                storage_type=storage_type, size='1G', runtime=10,
                fio_filename=f'{pod_obj.name}_io_file2'
            )

        log.info("Fetching IO results from newly created pods")
        for pod_obj in pod_objs_re:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get('jobs')[0].get('error')
            assert err_count == 0, (
                f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
            )
            log.info(f"IOPs after FIO on pod {pod_obj.name}:")
            log.info(
                f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}"
            )
            log.info(
                f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}"
            )
        log.info("Verified IO result on newly created pods.")
    def test_disruptive_during_pod_pvc_deletion_and_io(
        self, interface, resource_to_delete,
        setup_base
    ):
        """
        Delete ceph/rook pod while PVCs deletion, pods deletion and IO are
        progressing
        """
        pvc_objs, pod_objs, rwx_pod_objs = setup_base
        sc_obj = pvc_objs[0].storageclass
        namespace = pvc_objs[0].project.namespace

        num_of_pods_to_delete = 10
        num_of_io_pods = 5

        # Select pods to be deleted
        pods_to_delete = pod_objs[:num_of_pods_to_delete]
        pods_to_delete.extend(
            [pod for pod in rwx_pod_objs for pod_obj in pods_to_delete if (
                pod_obj.pvc == pod.pvc
            )]
        )

        # Select pods to run IO
        io_pods = pod_objs[num_of_pods_to_delete:num_of_pods_to_delete + num_of_io_pods]
        io_pods.extend(
            [pod for pod in rwx_pod_objs for pod_obj in io_pods if (
                pod_obj.pvc == pod.pvc
            )]
        )

        # Select pods which are having PVCs to delete
        pods_for_pvc = pod_objs[num_of_pods_to_delete + num_of_io_pods:]
        pvcs_to_delete = [pod_obj.pvc for pod_obj in pods_for_pvc]
        pods_for_pvc.extend(
            [pod for pod in rwx_pod_objs for pod_obj in pods_for_pvc if (
                pod_obj.pvc == pod.pvc
            )]
        )

        log.info(
            f"{len(pods_to_delete)} pods selected for deletion in which "
            f"{len(pods_to_delete) - num_of_pods_to_delete} pairs of pod "
            f"share same RWX PVC"
        )
        log.info(
            f"{len(io_pods)} pods selected for running IO in which "
            f"{len(io_pods) - num_of_io_pods} pairs of pod share same "
            f"RWX PVC"
        )
        no_of_rwx_pvcs_delete = len(pods_for_pvc) - len(pvcs_to_delete)
        log.info(
            f"{len(pvcs_to_delete)} PVCs selected for deletion. "
            f"RWO PVCs: {len(pvcs_to_delete) - no_of_rwx_pvcs_delete}, "
            f"RWX PVCs: {no_of_rwx_pvcs_delete}"
        )

        pod_functions = {
            'mds': get_mds_pods, 'mon': get_mon_pods, 'mgr': get_mgr_pods,
            'osd': get_osd_pods
        }
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        executor = ThreadPoolExecutor(max_workers=len(pod_objs))

        # Get number of pods of type 'resource_to_delete'
        num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(get_all_pods(namespace=namespace))
        initial_num_of_pvc = len(
            get_all_pvcs(namespace=namespace)['items']
        )

        # Fetch PV names to verify after deletion
        pv_objs = []
        for pvc_obj in pvcs_to_delete:
            pvc_obj.reload()
            pv_objs.append(pvc_obj.backed_pv_obj)

        # Fetch volume details from pods for the purpose of verification
        node_pv_dict = {}
        for pod_obj in pods_to_delete:
            pod_info = pod_obj.get()
            node = pod_info['spec']['nodeName']
            pvc = pod_info['spec']['volumes'][0]['persistentVolumeClaim']['claimName']
            for pvc_obj in pvc_objs:
                if pvc_obj.name == pvc:
                    pvc_obj.reload()
                    pv = pvc_obj.backed_pv
                    break
            if node in node_pv_dict:
                node_pv_dict[node].append(pv)
            else:
                node_pv_dict[node] = [pv]

        # Fetch image uuid associated with PVCs to be deleted
        pvc_uuid_map = {}
        for pvc_obj in pvcs_to_delete:
            pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
        log.info("Fetched image uuid associated with each PVC")

        # Do setup on pods for running IO
        log.info("Setting up pods for running IO.")
        for pod_obj in pod_objs + rwx_pod_objs:
            executor.submit(pod_obj.workload_setup, storage_type='fs')

        # Wait for setup on pods to complete
        for pod_obj in pod_objs + rwx_pod_objs:
            for sample in TimeoutSampler(
                180, 2, getattr, pod_obj, 'wl_setup_done'
            ):
                if sample:
                    log.info(
                        f"Setup for running IO is completed on pod "
                        f"{pod_obj.name}."
                    )
                    break
        log.info("Setup for running IO is completed on all pods.")

        # Start IO on pods having PVCs to delete to load data
        log.info("Starting IO on pods having PVCs to delete.")
        self.run_io_on_pods(pods_for_pvc)
        log.info("IO started on pods having PVCs to delete.")

        log.info("Fetching IO results from the pods having PVCs to delete.")
        for pod_obj in pods_for_pvc:
            get_fio_rw_iops(pod_obj)
        log.info("Verified IO result on pods having PVCs to delete.")

        # Delete pods having PVCs to delete.
        assert self.delete_pods(pods_for_pvc), (
            "Couldn't delete pods which are having PVCs to delete."
        )
        for pod_obj in pods_for_pvc:
            pod_obj.ocp.wait_for_delete(pod_obj.name)
        log.info("Verified: Deleted pods which are having PVCs to delete.")

        # Start IO on pods to be deleted
        log.info("Starting IO on pods to be deleted.")
        self.run_io_on_pods(pods_to_delete)
        log.info("IO started on pods to be deleted.")

        # Start deleting PVCs
        pvc_bulk_delete = executor.submit(delete_pvcs, pvcs_to_delete)
        log.info("Started deleting PVCs")

        # Start deleting pods
        pod_bulk_delete = executor.submit(self.delete_pods, pods_to_delete)
        log.info("Started deleting pods")

        # Start IO on IO pods
        self.run_io_on_pods(io_pods)
        log.info("Started IO on IO pods")

        # Verify pvc deletion has started
        pvc_deleting = executor.submit(
            wait_for_resource_count_change, func_to_use=get_all_pvcs,
            previous_num=initial_num_of_pvc, namespace=namespace,
            change_type='decrease', min_difference=1, timeout=30, interval=0.01
        )

        # Verify pod deletion has started
        pod_deleting = executor.submit(
            wait_for_resource_count_change, func_to_use=get_all_pods,
            previous_num=initial_num_of_pods, namespace=namespace,
            change_type='decrease', min_difference=1, timeout=30, interval=0.01
        )

        assert pvc_deleting.result(), (
            "Wait timeout: PVCs are not being deleted."
        )
        log.info("PVCs deletion has started.")

        assert pod_deleting.result(), (
            "Wait timeout: Pods are not being deleted."
        )
        log.info("Pods deletion has started.")

        # Delete pod of type 'resource_to_delete'
        disruption.delete_resource()

        pods_deleted = pod_bulk_delete.result()
        assert pods_deleted, "Deletion of pods failed."

        # Verify pods are deleted
        for pod_obj in pods_to_delete:
            pod_obj.ocp.wait_for_delete(pod_obj.name)
        log.info("Verified: Pods are deleted.")

        # Verify that the mount point is removed from nodes after deleting pod
        node_pv_mounted = verify_pv_mounted_on_node(node_pv_dict)
        for node, pvs in node_pv_mounted.items():
            assert not pvs, (
                f"PVs {pvs} is still present on node {node} after "
                f"deleting the pods."
            )
        log.info(
            "Verified: mount points are removed from nodes after deleting "
            "the pods"
        )

        pvcs_deleted = pvc_bulk_delete.result()
        assert pvcs_deleted, "Deletion of PVCs failed."

        # Verify PVCs are deleted
        for pvc_obj in pvcs_to_delete:
            pvc_obj.ocp.wait_for_delete(pvc_obj.name)
        logging.info("Verified: PVCs are deleted.")

        # Verify PVs are deleted
        for pv_obj in pv_objs:
            pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name, timeout=300)
        logging.info("Verified: PVs are deleted.")

        # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
        for pvc_name, uuid in pvc_uuid_map.items():
            if interface == constants.CEPHBLOCKPOOL:
                ret = verify_volume_deleted_in_backend(
                    interface=interface, image_uuid=uuid,
                    pool_name=sc_obj.ceph_pool.name
                )
            if interface == constants.CEPHFILESYSTEM:
                ret = verify_volume_deleted_in_backend(
                    interface=interface, image_uuid=uuid
                )
            assert ret, (
                f"Volume associated with PVC {pvc_name} still exists "
                f"in backend"
            )

        log.info("Fetching IO results from the pods.")
        for pod_obj in io_pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get('jobs')[0].get('error')
            assert err_count == 0, (
                f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
            )
        log.info("Verified IO result on pods.")

        # Verify number of pods of type 'resource_to_delete'
        final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
        assert final_num_resource_to_delete == num_of_resource_to_delete, (
            f"Total number of {resource_to_delete} pods is not matching with "
            f"initial value. Total number of pods before deleting a pod: "
            f"{num_of_resource_to_delete}. Total number of pods present now: "
            f"{final_num_resource_to_delete}"
        )

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'])
        log.info("Ceph cluster health is OK")
Example No. 16
    def test_run_pgsql(self, transactions, pod_name):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        log.info("Create resource file for pgbench workload")
        pg_trans = transactions
        timeout = pg_trans * 3
        pg_data = templating.load_yaml(constants.PGSQL_BENCHMARK_YAML)
        pg_data['spec']['workload']['args']['transactions'] = pg_trans
        pg_obj = OCS(**pg_data)
        pg_obj.create()

        # Wait for pgbench pod to be created
        for pgbench_pod in TimeoutSampler(
            pg_trans, 3, get_pod_name_by_pattern,
            'pgbench', 'my-ripsaw'
        ):
            try:
                if pgbench_pod[0] is not None:
                    pgbench_client_pod = pgbench_pod[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        # Respin Ceph pod
        resource_osd = [pod_name]
        log.info(f"Respin Ceph pod {pod_name}")
        disruption = disruption_helpers.Disruptions()
        for resource in resource_osd:
            disruption.set_resource(resource=resource)
            disruption.delete_resource()

        # Wait for the pgbench pod to initialize and complete
        log.info("Waiting for pgbench_client to complete")
        pod_obj = OCP(kind='pod')
        pod_obj.wait_for_resource(
            condition='Completed',
            resource_name=pgbench_client_pod,
            timeout=timeout,
            sleep=10,
        )

        # Fetch the pgbench logs and parse them
        output = run_cmd(f'oc logs {pgbench_client_pod}')
        pg_output = utils.parse_pgsql_logs(output)
        log.info(
            "*******PGBench output log*********\n"
            f"{pg_output}"
        )
        for data in pg_output:
            latency_avg = data['latency_avg']
            if not latency_avg:
                raise UnexpectedBehaviour(
                    "PGBench failed to run, no data found on latency_avg"
                )
        log.info("PGBench has completed successfully")

        # Collect data and export to Google doc spreadsheet
        g_sheet = GoogleSpreadSheetAPI(sheet_name="OCS PGSQL", sheet_index=2)
        for lat in pg_output:
            lat_avg = lat['latency_avg']
            lat_stddev = lat['lat_stddev']
            tps_incl = lat['tps_incl']
            tps_excl = lat['tps_excl']
            g_sheet.insert_row(
                [int(lat_avg),
                 int(lat_stddev),
                 int(tps_incl),
                 int(tps_excl)], 2
            )
        # Clean up pgbench benchmark
        log.info("Deleting PG bench benchmark")
        pg_obj.delete()
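
The wait for the pgbench client pod above relies on ocs_ci's TimeoutSampler, which keeps calling the supplied function every `sleep` seconds until `timeout` expires, yielding each result. A small sketch of the same polling idiom; the stand-in function and its canned return value are assumptions so that the loop terminates immediately:

from ocs_ci.utility.utils import TimeoutSampler


def list_matching_pods():
    # Placeholder for get_pod_name_by_pattern('pgbench', 'my-ripsaw'); returns
    # a canned, non-empty result here so the sketch breaks out right away.
    return ['pgbench-1-client-xyz']


# Poll every 3 seconds for up to 60 seconds, breaking as soon as the check
# returns something truthy -- the idiom used above to wait for the pgbench pod.
for sample in TimeoutSampler(60, 3, list_matching_pods):
    if sample:
        pgbench_client_pod = sample[0]
        break
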
Example No. 17
    def test_resource_deletion_during_pvc_expansion(self, resource_to_delete):
        """
        Verify PVC expansion will succeed when rook-ceph and csi pods are re-spun
        during expansion

        """
        pvc_size_expanded = 30
        executor = ThreadPoolExecutor(max_workers=len(self.pvcs))
        disruption_ops = disruption_helpers.Disruptions()

        # Run IO to fill some data
        log.info(
            "Running IO on all pods to fill some data before PVC expansion."
        )
        for pod_obj in self.pods:
            storage_type = (
                'block' if pod_obj.pvc.volume_mode == 'Block' else 'fs'
            )
            pod_obj.run_io(
                storage_type=storage_type, size='4G', io_direction='write',
                runtime=30, rate='10M', fio_filename=f'{pod_obj.name}_f1'
            )

        log.info("Wait for IO to complete on pods")
        for pod_obj in self.pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get('jobs')[0].get('error')
            assert err_count == 0, (
                f"IO error on pod {pod_obj.name}. "
                f"FIO result: {fio_result}"
            )
            log.info(f"Verified IO on pod {pod_obj.name}.")
        log.info("IO is successful on all pods before PVC expansion.")

        # Select the pod to be deleted
        disruption_ops.set_resource(resource=resource_to_delete)

        log.info("Expanding all PVCs.")
        for pvc_obj in self.pvcs:
            log.info(
                f"Expanding size of PVC {pvc_obj.name} to {pvc_size_expanded}G"
            )
            pvc_obj.expand_proc = executor.submit(
                pvc_obj.resize_pvc, pvc_size_expanded, True
            )

        # Delete the pod 'resource_to_delete'
        disruption_ops.delete_resource()

        # Verify pvc expand status
        for pvc_obj in self.pvcs:
            assert pvc_obj.expand_proc.result(), (
                f"Expansion failed for PVC {pvc_obj.name}"
            )
        log.info("PVC expansion was successful on all PVCs")

        # Run IO to fill more data
        log.info("Write more data after PVC expansion.")
        for pod_obj in self.pods:
            storage_type = (
                'block' if pod_obj.pvc.volume_mode == 'Block' else 'fs'
            )
            pod_obj.run_io(
                storage_type=storage_type, size='10G', io_direction='write',
                runtime=30, rate='10M', fio_filename=f'{pod_obj.name}_f2'
            )

        log.info("Wait for IO to complete on all pods")
        for pod_obj in self.pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get('jobs')[0].get('error')
            assert err_count == 0, (
                f"IO error on pod {pod_obj.name}. "
                f"FIO result: {fio_result}"
            )
            log.info(f"Verified IO on pod {pod_obj.name}.")
        log.info("IO is successful on all pods after PVC expansion.")
    def operations_base(self, resource_to_delete):
        """
        Delete resource 'resource_to_delete' while PVCs creation, Pods
        creation and IO operation are progressing.
        Verifies PVCs can be re-used by creating new pods.

        Steps:
        1. Create pods for running IO and verify they are Running.
        2. Start creating more pods.
        3. Start creating new PVCs.
        4. Start IO on pods created in Step 1.
        5. Delete the resource 'resource_to_delete'.
        6. Verify that PVCs created in Step 3 are in Bound state.
        7. Verify that pods created in Step 2 are Running.
        8. Verify IO results.
        9. Delete pods created in Steps 1 and 2.
        10. Verify the total number of 'resource_to_delete' pods.
        11. Verify volumes are unmapped from nodes after deleting pods.
        12. Use all PVCs to create new pods. One PVC for one pod.
        13. Start IO on all pods created in Step 12.
        14. Verify IO results.
        """
        # Separate the available PVCs
        pvc_objs_for_io_pods = self.pvc_objs[0:self.pvc_num_for_io_pods]
        pvc_objs_new_pods = self.pvc_objs[self.pvc_num_for_io_pods:]

        pod_functions = {
            'mds': get_mds_pods,
            'mon': get_mon_pods,
            'mgr': get_mgr_pods,
            'osd': get_osd_pods
        }

        executor = ThreadPoolExecutor(max_workers=2)

        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)

        # Get number of pods
        initial_pods_num = len(pod_functions[resource_to_delete]())

        # Create pods for running IO
        io_pods = helpers.create_pods(pvc_objs_list=pvc_objs_for_io_pods,
                                      interface_type=self.interface,
                                      desired_status=constants.STATUS_RUNNING,
                                      wait=True,
                                      namespace=self.namespace)

        # Updating self.pod_objs for the purpose of teardown
        self.pod_objs.extend(io_pods)

        # Do setup for running IO on pods
        log.info("Setting up pods for running IO")
        for pod_obj in io_pods:
            pod_obj.workload_setup(storage_type='fs')
        log.info("Setup for running IO is completed on pods")

        # Start creating new pods
        log.info("Start creating new pods.")
        bulk_pod_create = executor.submit(helpers.create_pods,
                                          pvc_objs_list=pvc_objs_new_pods,
                                          interface_type=self.interface,
                                          wait=False,
                                          namespace=self.namespace)

        # Start creation of new PVCs
        log.info("Start creating new PVCs.")
        bulk_pvc_create = executor.submit(helpers.create_multiple_pvcs,
                                          sc_name=self.sc_obj.name,
                                          namespace=self.namespace,
                                          number_of_pvc=self.num_of_new_pvcs,
                                          size=self.pvc_size,
                                          wait=False)

        # Start IO on each pod
        log.info("Start IO on pods")
        for pod_obj in io_pods:
            pod_obj.run_io(storage_type='fs', size=f'{self.pvc_size_int - 1}G')
        log.info("IO started on all pods.")

        # Delete the resource
        disruption.delete_resource()

        # Getting result of PVC creation as list of PVC objects
        pvc_objs_new = bulk_pvc_create.result()

        # Updating self.pvc_objs_new for the purpose of teardown
        self.pvc_objs_new.extend(pvc_objs_new)

        # Verify PVCs are Bound
        for pvc_obj in pvc_objs_new:
            assert pvc_obj.ocp.wait_for_resource(
                condition=constants.STATUS_BOUND,
                resource_name=pvc_obj.name,
                timeout=240,
                sleep=10
            ), (f"Wait timeout: PVC {pvc_obj.name} is not in 'Bound' status")
        log.info("Verified: New PVCs are Bound.")

        # Getting result of pods creation as list of Pod objects
        pod_objs_new = bulk_pod_create.result()

        # Updating self.pod_objs for the purpose of teardown
        self.pod_objs.extend(pod_objs_new)

        # Verify new pods are Running
        for pod_obj in pod_objs_new:
            assert pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=240,
                sleep=10), (
                    f"Wait timeout: Pod {pod_obj.name} is not in 'Running' "
                    f"state even after 120 seconds.")
        log.info("Verified: All pods are Running.")

        # Verify IO
        log.info("Fetching IO results.")
        for pod_obj in io_pods:
            get_fio_rw_iops(pod_obj)
        log.info("Verified IO result on pods.")

        all_pod_objs = io_pods + pod_objs_new

        # Fetch volume details from pods for the purpose of verification
        node_pv_dict = {}
        for pod in all_pod_objs:
            pod_info = pod.get()
            node = pod_info['spec']['nodeName']
            pvc = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName']
            for pvc_obj in self.pvc_objs:
                if pvc_obj.name == pvc:
                    pvc_obj.reload()
                    pv = pvc_obj.backed_pv
                    break
            if node in node_pv_dict:
                node_pv_dict[node].append(pv)
            else:
                node_pv_dict[node] = [pv]

        # Delete pods
        for pod_obj in all_pod_objs:
            pod_obj.delete(wait=False)

        # Verify pods are deleted
        for pod_obj in all_pod_objs:
            pod_obj.ocp.wait_for_delete(resource_name=pod_obj.name)

        # Updating self.pod_objs for the purpose of teardown
        self.pod_objs.clear()

        # Verify number of 'resource_to_delete' type pods
        final_pods_num = len(pod_functions[resource_to_delete]())
        assert final_pods_num == initial_pods_num, (
            f"Total number of {resource_to_delete} pods is not matching with "
            f"initial value. Total number of pods before deleting a pod: "
            f"{initial_pods_num}. Total number of pods present now: "
            f"{final_pods_num}")

        # Verify volumes are unmapped from nodes after deleting the pods
        for node, pvs in node_pv_dict.items():
            cmd = f'oc debug nodes/{node} -- df'
            df_on_node = run_cmd(cmd)
            for pv in pvs:
                assert pv not in df_on_node, (
                    f"{pv} is still present on node {node} after "
                    f"deleting the pods.")

        # Verify that PVCs are reusable by creating new pods
        all_pvc_objs = self.pvc_objs + pvc_objs_new
        pod_objs_re = helpers.create_pods(
            pvc_objs_list=all_pvc_objs,
            interface_type=self.interface,
            desired_status=constants.STATUS_RUNNING,
            wait=True,
            namespace=self.namespace)
        log.info("Successfully created new pods using all PVCs.")

        # Updating self.pod_objs for the purpose of teardown
        self.pod_objs.extend(pod_objs_re)

        # Run IO on each of the newly created pods
        for pod_obj in pod_objs_re:
            pod_obj.run_io(storage_type='fs',
                           size='100M',
                           runtime=10,
                           fio_filename='fio-file-retest')

        log.info("Fetching IO results from newly created pods")
        for pod_obj in pod_objs_re:
            get_fio_rw_iops(pod_obj)
        log.info("Verified IO result on newly created pods.")
Example No. 19
    def test_ceph_daemon_kill_during_resource_creation(
        self, interface, operation_to_disrupt, resource_to_delete,
        multi_pvc_factory, pod_factory
    ):
        """
        Base function for ceph daemon kill disruptive tests.
        The 'resource_to_delete' daemon will be killed while
        'operation_to_disrupt' is progressing.
        """
        disruption = disruption_helpers.Disruptions()
        pod_functions = {
            'mds': partial(pod.get_mds_pods), 'mon': partial(pod.get_mon_pods),
            'mgr': partial(pod.get_mgr_pods), 'osd': partial(pod.get_osd_pods),
            'rbdplugin': partial(pod.get_plugin_pods, interface=interface),
            'cephfsplugin': partial(pod.get_plugin_pods, interface=interface),
            'cephfsplugin_provisioner': partial(
                pod.get_cephfsplugin_provisioner_pods
            ),
            'rbdplugin_provisioner': partial(
                pod.get_rbdfsplugin_provisioner_pods
            ),
            'operator': partial(pod.get_operator_pods)
        }

        # Get number of pods of type 'resource_to_delete'
        num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

        num_of_pvc = 12
        namespace = self.proj_obj.namespace

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(pod.get_all_pods(namespace=namespace))
        initial_num_of_pvc = len(
            get_all_pvcs(namespace=namespace)['items']
        )

        executor = ThreadPoolExecutor(max_workers=(2 * num_of_pvc))

        disruption.set_resource(resource=resource_to_delete)
        disruption.select_daemon()

        access_modes = [constants.ACCESS_MODE_RWO]
        if interface == constants.CEPHFILESYSTEM:
            access_modes.append(constants.ACCESS_MODE_RWX)

        # Modify access_modes list to create rbd `block` type volume with
        # RWX access mode. RWX is not supported in non-block type rbd
        if interface == constants.CEPHBLOCKPOOL:
            access_modes.extend(
                [
                    f'{constants.ACCESS_MODE_RWO}-Block',
                    f'{constants.ACCESS_MODE_RWX}-Block'
                ]
            )

        # Start creation of PVCs
        bulk_pvc_create = executor.submit(
            multi_pvc_factory, interface=interface,
            project=self.proj_obj, size=8,
            access_modes=access_modes,
            access_modes_selection='distribute_random',
            status=constants.STATUS_BOUND, num_of_pvc=num_of_pvc,
            wait_each=False
        )

        if operation_to_disrupt == 'create_pvc':
            # Ensure PVCs are being created before deleting the resource
            ret = helpers.wait_for_resource_count_change(
                get_all_pvcs, initial_num_of_pvc, namespace, 'increase'
            )
            assert ret, "Wait timeout: PVCs are not being created."
            log.info("PVCs creation has started.")
            disruption.kill_daemon()

        pvc_objs = bulk_pvc_create.result()

        # Confirm that PVCs are Bound
        for pvc_obj in pvc_objs:
            helpers.wait_for_resource_state(
                resource=pvc_obj, state=constants.STATUS_BOUND, timeout=120
            )
            pvc_obj.reload()
        log.info("Verified: PVCs are Bound.")

        # Start creating pods
        bulk_pod_create = executor.submit(
            helpers.create_pods, pvc_objs, pod_factory, interface, 2
        )

        if operation_to_disrupt == 'create_pod':
            # Ensure that pods are being created before deleting the resource
            ret = helpers.wait_for_resource_count_change(
                pod.get_all_pods, initial_num_of_pods, namespace, 'increase'
            )
            assert ret, "Wait timeout: Pods are not being created."
            log.info(f"Pods creation has started.")
            disruption.kill_daemon()

        pod_objs = bulk_pod_create.result()

        # Verify pods are Running
        for pod_obj in pod_objs:
            helpers.wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180
            )
            pod_obj.reload()
        log.info("Verified: All pods are Running.")

        # Do setup on pods for running IO
        log.info("Setting up pods for running IO.")
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info['spec']['volumeMode'] == 'Block':
                storage_type = 'block'
            else:
                storage_type = 'fs'
            executor.submit(pod_obj.workload_setup, storage_type=storage_type)

        # Wait for setup on pods to complete
        for pod_obj in pod_objs:
            for sample in TimeoutSampler(
                180, 2, getattr, pod_obj, 'wl_setup_done'
            ):
                if sample:
                    log.info(
                        f"Setup for running IO is completed on pod "
                        f"{pod_obj.name}."
                    )
                    break
        log.info("Setup for running IO is completed on all pods.")

        # Start IO on each pod
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info['spec']['volumeMode'] == 'Block':
                storage_type = 'block'
            else:
                storage_type = 'fs'
            pod_obj.run_io(
                storage_type=storage_type, size='2G', runtime=30,
                fio_filename=f'{pod_obj.name}_io_file1'
            )
        log.info("FIO started on all pods.")

        if operation_to_disrupt == 'run_io':
            disruption.kill_daemon()

        log.info("Fetching FIO results.")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get('jobs')[0].get('error')
            assert err_count == 0, (
                f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
            )
        log.info("Verified FIO result on pods.")

        # Delete pods
        for pod_obj in pod_objs:
            pod_obj.delete(wait=True)
        for pod_obj in pod_objs:
            pod_obj.ocp.wait_for_delete(pod_obj.name)

        # Verify that PVCs are reusable by creating new pods
        pod_objs = helpers.create_pods(pvc_objs, pod_factory, interface, 2)

        # Verify new pods are Running
        for pod_obj in pod_objs:
            helpers.wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING
            )
            pod_obj.reload()
        log.info("Verified: All new pods are Running.")

        # Run IO on each of the new pods
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info['spec']['volumeMode'] == 'Block':
                storage_type = 'block'
            else:
                storage_type = 'fs'
            pod_obj.run_io(
                storage_type=storage_type, size='1G', runtime=10,
                fio_filename=f'{pod_obj.name}_io_file2'
            )

        log.info("Fetching FIO results from new pods")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get('jobs')[0].get('error')
            assert err_count == 0, (
                f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
            )
        log.info("Verified FIO result on new pods.")

        # Verify number of pods of type 'resource_to_delete'
        final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
        assert final_num_resource_to_delete == num_of_resource_to_delete, (
            f"Total number of {resource_to_delete} pods is not matching with "
            f"initial value. Total number of pods before deleting a pod: "
            f"{num_of_resource_to_delete}. Total number of pods present now: "
            f"{final_num_resource_to_delete}"
        )

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'])
        log.info("Ceph cluster health is OK")