Example no. 1
    def test_monitoring_after_respinning_ceph_pods(self, test_fixture):
        """
        Test case to validate respinning the ceph pods and
        its interaction with prometheus pod
        """
        namespace_list, pvc_objs, pod_objs, sc = test_fixture

        # Re-spin the ceph pods (i.e. mgr, mon, osd) one by one
        resource_to_delete = ['mgr', 'mon', 'osd']
        disruption = disruption_helpers.Disruptions()
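        # Disruptions deletes one pod of the selected resource type and
        # waits for a replacement pod to come up before returning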
        for res_to_del in resource_to_delete:
            disruption.set_resource(resource=res_to_del)
            disruption.delete_resource()

        # Check for the created pvc metrics after respinning the ceph pods
        for pvc_obj in pvc_objs:
            assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
                f"Data for created PVC {pvc_obj.name} is not collected on the Prometheus pod"
            )

        # Create projects after respinning the ceph pods
        namespaces = helpers.create_multilpe_projects(number_of_project=2)
        namespace_list.extend(namespaces)

        # Create pvcs after respinning the ceph pods
        pvcs = [
            helpers.create_pvc(sc_name=sc.name,
                               namespace=each_namespace.namespace)
            for each_namespace in namespaces
        ]
        for pvc_obj in pvcs:
            helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND)
            pvc_obj.reload()
        pvc_objs.extend(pvcs)

        # Create app pods after respinning the ceph pods
        pods = [
            helpers.create_pod(interface_type=constants.CEPHBLOCKPOOL,
                               pvc_name=each_pvc.name,
                               namespace=each_pvc.namespace)
            for each_pvc in pvcs
        ]
        for pod_obj in pods:
            helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
            pod_obj.reload()
        pod_objs.extend(pods)

        # Check for the created pvc metrics on prometheus pod
        for pvc_obj in pvcs:
            assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
                f"On prometheus pod for created pvc {pvc_obj.name} related data is not collected"
            )
Example no. 2
    def test_monitoring_when_osd_down(self, pods):
        """
        Test case to validate monitoring when osd is down

        """

        # Get osd pods
        osd_pod_list = pod.get_osd_pods()

        # Bring down one of the osds (the first one) by scaling its deployment to 0
        resource_name = osd_pod_list[0].get().get('metadata').get('name')
        assert modify_osd_replica_count(resource_name=resource_name,
                                        replica_count=0)

        # Validate osd is down
        pod_obj = ocp.OCP(kind=constants.POD,
                          namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        assert pod_obj.wait_for_delete(resource_name=resource_name), (
            f"Resource {resource_name} is not deleted")

        # Check for the created pvc metrics when osd is down
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )

        # Bring the osd back up by scaling the replica count back to 1
        assert modify_osd_replica_count(resource_name=resource_name,
                                        replica_count=1)

        # Validate osd is up and ceph health is ok
        self.sanity_helpers.health_check()
Example no. 3
    @pytest.fixture()
    def pods(self, multi_pvc_factory, dc_pod_factory):
        """
        Prepare multiple dc pods for the test

        Returns:
            list: Pod instances

        """
        sc = default_storage_class(interface_type=constants.CEPHBLOCKPOOL)

        pvc_objs = multi_pvc_factory(interface=constants.CEPHBLOCKPOOL,
                                     storageclass=sc,
                                     size=self.pvc_size,
                                     num_of_pvc=self.num_of_pvcs)

        pod_objs = []
        for pvc_obj in pvc_objs:
            pod_objs.append(dc_pod_factory(pvc=pvc_obj))

        # Check for the created pvc metrics on prometheus pod
        for pod_obj in pod_objs:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
        return pod_objs
Example no. 4
    def test_monitoring_after_rebooting_node_where_mgr_is_running(self):
        """
        Test case to validate rebooting a node where mgr is running
        should not delete the data collected on prometheus pod
        """

        aws_obj = aws.AWS()

        # Get the mgr pod obj
        mgr_pod_obj = pod.get_mgr_pods()

        # Get the node where the mgr pod is hosted
        mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

        # Reboot the node where the mgr pod is hosted
        instances = aws.get_instances_ids_and_names([mgr_node_obj])
        aws_obj.restart_ec2_instances(instances=instances,
                                      wait=True,
                                      force=True)

        # Validate all nodes are in READY state
        wait_for_nodes_status()

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check that the ceph health metrics are updated with the new mgr pod
        wait_to_update_mgrpod_info_prometheus_pod()

        # Check for the created pvc metrics after rebooting the node where mgr pod was running
        for pod_obj in self.pod_objs:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
Example no. 5
@pytest.fixture()
def test_fixture(request, storageclass_factory):
    """
    Setup and teardown
    """
    def teardown():

        # Delete created app pods and pvcs
        assert pod.delete_pods(pod_objs)
        assert pvc.delete_pvcs(pvc_objs)

        # Switch to default project
        ret = ocp.switch_to_default_rook_cluster_project()
        assert ret, 'Failed to switch to default rook cluster project'

        # Delete created projects
        for prj in namespace_list:
            prj.delete(resource_name=prj.namespace)

        # Validate all nodes are in READY state
        wait_for_nodes_status()

    request.addfinalizer(teardown)

    # Create a storage class
    sc = storageclass_factory()

    # Create projects
    namespace_list = helpers.create_multilpe_projects(number_of_project=1)

    # Create pvcs
    pvc_objs = [
        helpers.create_pvc(sc_name=sc.name, namespace=each_namespace.namespace)
        for each_namespace in namespace_list
    ]
    for pvc_obj in pvc_objs:
        helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND)
        pvc_obj.reload()

    # Create app pods
    pod_objs = [
        helpers.create_pod(interface_type=constants.CEPHBLOCKPOOL,
                           pvc_name=each_pvc.name,
                           namespace=each_pvc.namespace)
        for each_pvc in pvc_objs
    ]
    for pod_obj in pod_objs:
        helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
        pod_obj.reload()

    # Check for the created pvc metrics on prometheus pod
    for pvc_obj in pvc_objs:
        assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
            f"On prometheus pod for created pvc {pvc_obj.name} related data is not collected"
        )

    return namespace_list, pvc_objs, pod_objs, sc
Example no. 6
    def test_monitoring_after_rebooting_node_where_mgr_is_running(
            self, nodes, pods):
        """
        Test case to validate rebooting a node where mgr is running
        should not delete the data collected on prometheus pod

        """

        # Get the mgr pod obj
        mgr_pod_obj = pod.get_mgr_pods()

        # Get the node where the mgr pod is hosted
        mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

        # Reboot the node where the mgr pod is hosted
        nodes.restart_nodes([mgr_node_obj])

        # Validate all nodes are in READY state
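        # Note: retry() returns a decorator, so wrap the function object
        # itself and then call the wrapped function to get retried behaviour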
        retry((CommandFailed, ResourceWrongStatusException),
              tries=20,
              delay=15)(wait_for_nodes_status)()

        # Check for Ceph pods
        pod_obj = ocp.OCP(kind=constants.POD,
                          namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        assert pod_obj.wait_for_resource(condition="Running",
                                         selector="app=rook-ceph-mgr",
                                         timeout=600)
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-mon",
            resource_count=3,
            timeout=600,
        )
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-osd",
            resource_count=3,
            timeout=600,
        )

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check(tries=40)

        # Check that the ceph health metrics are updated with the new mgr pod
        wait_to_update_mgrpod_info_prometheus_pod()

        # Check for the created pvc metrics after rebooting the node where mgr pod was running
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
Example no. 7
    def test_monitoring_after_rebooting_master_node(self, pod_factory):
        """
        Test case to validate rebooting the master nodes and their
        interaction with the prometheus pods
        """
        aws_obj = aws.AWS()

        # Get the master node list
        master_nodes = get_typed_nodes(node_type='master')

        # Reboot the master nodes one by one
        for node in master_nodes:
            instances = aws.get_instances_ids_and_names([node])
            aws_obj.restart_ec2_instances(instances=instances,
                                          wait=True,
                                          force=True)

            # Validate the master nodes are back in READY state
            wait_for_master_node_to_be_running_state()

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check for the created pvc metrics after rebooting the master nodes
        for pod_obj in self.pod_objs:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )

        pod_obj = pod_factory(interface=constants.CEPHBLOCKPOOL,
                              status=constants.STATUS_RUNNING)
        self.pod_objs.extend([pod_obj])

        # Check for the newly created pvc metrics on the prometheus pod
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created PVC {pod_obj.pvc.name} is not collected on the Prometheus pod"
        )
Example no. 8
    @pytest.fixture(autouse=True)
    def test_fixture(self, pod_factory, num_of_pod=2):
        """
        Create resources for the test
        """
        self.pod_objs = [
            pod_factory(interface=constants.CEPHBLOCKPOOL,
                        status=constants.STATUS_RUNNING)
            for _ in range(num_of_pod)
        ]

        # Check for the created pvc metrics on prometheus pod
        for pod_obj in self.pod_objs:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
Example no. 9
def wait_for_nodes_status_and_prometheus_health_check(pods):
    """
    Wait for all the nodes to be in Ready state
    and also check the prometheus health

    Args:
        pods (list): List of pod objects

    """

    # Validate all nodes are in READY state
    wait_for_nodes_status(timeout=900)

    # Check that the created pvc metrics are still collected
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
        )

    assert prometheus_health_check(), "Prometheus health is degraded"
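
A minimal sketch of how this helper might be wired into a test as a teardown step; the fixture name health_checker below is illustrative, not from the source:

@pytest.fixture()
def health_checker(request, pods):
    def finalizer():
        # Re-validate node status, pvc metrics and prometheus health
        # once the test has finished
        wait_for_nodes_status_and_prometheus_health_check(pods)

    request.addfinalizer(finalizer)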
Example no. 10
    def test_monitoring_shutdown_and_recovery_prometheus_node(
            self, nodes, pods):
        """
        Test case to validate whether shutdown and recovery of a
        node where monitoring pods are running has no functional impact

        """
        # Get all prometheus pods
        prometheus_pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for prometheus_pod_obj in prometheus_pod_obj_list:
            # Get the node where the prometheus pod is hosted
            prometheus_node_obj = pod.get_pod_node(prometheus_pod_obj)

            # Shut down and recover (i.e. restart) the node where the prometheus pod is hosted
            nodes.stop_nodes([prometheus_node_obj])

            waiting_time = 20
            log.info(f"Waiting for {waiting_time} seconds")
            time.sleep(waiting_time)

            nodes.start_nodes(nodes=[prometheus_node_obj])

            # Validate all nodes are in READY state
            retry((CommandFailed, ResourceWrongStatusException),
                  tries=20,
                  delay=15)(wait_for_nodes_status)()

        # Check all the prometheus pods are up
        for pod_obj in prometheus_pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check(tries=40)

        # Check for the created pvc metrics after shutdown and recovery of prometheus nodes
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
Example no. 11
    def test_monitoring_after_respinning_ceph_pods(self, pods):
        """
        Test case to validate respinning the ceph pods and
        its interaction with prometheus pod

        """

        # Re-spin the ceph pods (i.e. mgr, mon, osd) one by one
        resource_to_delete = ['mgr', 'mon', 'osd']
        disruption = Disruptions()
        for res_to_del in resource_to_delete:
            disruption.set_resource(resource=res_to_del)
            disruption.delete_resource()

        # Check for the created pvc metrics on prometheus pod
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
Example no. 12
    def test_monitoring_when_one_of_the_prometheus_node_down(
            self, nodes, pods):
        """
        Test case to validate monitoring when a node hosting a
        prometheus pod is restarted

        """

        # Get all prometheus pods
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:
            # Get the node where the prometheus pod is hosted
            pod_node_obj = pod.get_pod_node(pod_obj)

            # Restart the node where the prometheus pod is hosted
            nodes.restart_nodes([pod_node_obj])

            # Validate all nodes are in READY state
            retry((CommandFailed, ResourceWrongStatusException),
                  tries=20,
                  delay=15)(wait_for_nodes_status)()

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check(tries=40)

        # Check all the prometheus pods are up
        for pod_obj in pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)

        # Check for the created pvc metrics after restarting node where prometheus pod is hosted
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
            log.info(
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is collected"
            )
Example no. 13
    def test_monitoring_after_respinning_ceph_pods(self, pods):
        """
        Test case to validate respinning the ceph pods and
        its interaction with prometheus pod

        """

        # Re-spin the ceph pods (i.e. mgr, mon, osd) one by one
        resource_to_delete = ["mgr", "mon", "osd"]
        disruption = Disruptions()
        for res_to_del in resource_to_delete:
            disruption.set_resource(resource=res_to_del)
            disruption.delete_resource()

        # Check for the created pvc metrics on prometheus pod
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"

        # Validate ceph health is OK after respinning the pods
        self.sanity_helpers.health_check(tries=40)
Example no. 14
    def test_monitoring_after_restarting_prometheus_pod(self, pods):
        """
        Test case to validate prometheus pod restart
        should not have any functional impact

        """

        # Get the prometheus pods
        prometheus_pod_obj = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE, selector=["prometheus"]
        )

        for pod_object in prometheus_pod_obj:
            # Get the pvc mounted on the prometheus pod
            pod_info = pod_object.get()
            pvc_name = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"][
                "claimName"
            ]

            # Restart the prometheus pod
            pod_object.delete(force=True)
            pod_obj = ocp.OCP(
                kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE
            )
            assert pod_obj.wait_for_resource(
                condition="Running", selector="app=prometheus", timeout=60
            )

            # Check the same pvc is mounted on new pod
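            # (the prometheus pods are managed by a StatefulSet, so the
            # replacement pod comes back with the same name and
            # pod_object.get() returns the new pod)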
            pod_info = pod_object.get()
            assert (
                pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
                == pvc_name
            ), f"Old pvc not found after restarting the prometheus pod {pod_object.name}"

        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
Example no. 15
    def test_monitoring_after_draining_node_where_prometheus_hosted(
            self, pods):
        """
        Test case to validate that when the node hosting prometheus
        is drained, the prometheus pod re-spins on a new healthy node
        and there is no data/metrics loss

        """

        # Get the prometheus pods
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:
            # Get the pvc mounted on the prometheus pod
            pod_info = pod_obj.get()
            pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName']

            # Get the node where the prometheus pod is hosted
            prometheus_pod_obj = pod_obj.get()
            prometheus_node = prometheus_pod_obj['spec']['nodeName']

            # Drain the node where the prometheus pod is hosted
            drain_nodes([prometheus_node])
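            # Draining evicts the pods on the node, so the prometheus pod
            # gets rescheduled onto another schedulable node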

            # Validate node is in SchedulingDisabled state
            wait_for_nodes_status(
                [prometheus_node],
                status=constants.NODE_READY_SCHEDULING_DISABLED)

            # Validate all prometheus pods are running
            POD = ocp.OCP(kind=constants.POD,
                          namespace=defaults.OCS_MONITORING_NAMESPACE)
            assert POD.wait_for_resource(
                condition='Running', selector='app=prometheus', timeout=180), (
                    "One or more prometheus pods are not in running state")

            # Validate the prometheus pod is re-spun on a new healthy node
            pod_info = pod_obj.get()
            new_node = pod_info['spec']['nodeName']
            assert new_node != prometheus_node, (
                'Prometheus pod not re-spun on a new node')
            log.info(f"Prometheus pod re-spun on new node {new_node}")

            # Validate the same pvc is mounted on the new prometheus pod
            assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName'] == pvc_name, (
                    f"Old pvc not found after restarting the prometheus pod {pod_obj.name}"
                )

            # Validate the prometheus health is ok
            assert prometheus_health_check(), (
                "Prometheus cluster health is not OK")

            # Mark the nodes back to schedulable
            schedule_nodes([prometheus_node])

            # Wait some time after scheduling the node back
            waiting_time = 30
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate node is in Ready State
            wait_for_nodes_status([prometheus_node],
                                  status=constants.NODE_READY)

            # Validate ceph health OK
            ceph_health_check(tries=40, delay=30)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check for the created pvc metrics after draining the node
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
Example no. 16
    def test_monitoring_when_one_of_the_prometheus_node_down(
            self, test_fixture):
        """
        Test case to validate monitoring when a node hosting a
        prometheus pod is restarted
        """
        namespace_list, pvc_objs, pod_objs, sc = test_fixture

        aws_obj = aws.AWS()

        # Get all the openshift-monitoring pods
        monitoring_pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE)

        # Get the worker node list
        workers = get_typed_nodes(node_type='worker')

        # Get all prometheus pods
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:

            # Get the name of the node where the prometheus pod is hosted
            prometheus_pod_obj = pod_obj.get()
            prometheus_node_name = prometheus_pod_obj['spec']['nodeName']

            prometheus_node = [
                node for node in workers
                if node.get().get('metadata').get('name') == prometheus_node_name
            ]

            # Reboot the node where the prometheus pod is hosted
            instances = aws.get_instances_ids_and_names(prometheus_node)
            aws_obj.restart_ec2_instances(instances=instances,
                                          wait=True,
                                          force=True)

            # Validate all nodes are in READY state
            wait_for_nodes_status()

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check all the monitoring pods are up
        for pod_obj in monitoring_pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING)

        # Check for the created pvc metrics after restarting the nodes
        for pvc_obj in pvc_objs:
            assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
                f"Data for created PVC {pvc_obj.name} is not collected on the Prometheus pod"
            )

        # Create projects after restarting nodes
        namespaces = helpers.create_multilpe_projects(number_of_project=1)
        namespace_list.extend(namespaces)

        # Create pvcs after restarting nodes
        pvcs = [
            helpers.create_pvc(sc_name=sc.name,
                               namespace=each_namespace.namespace)
            for each_namespace in namespaces
        ]
        for pvc_obj in pvcs:
            helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND)
            pvc_obj.reload()
        pvc_objs.extend(pvcs)

        # Create app pods after restarting nodes
        pods = [
            helpers.create_pod(interface_type=constants.CEPHBLOCKPOOL,
                               pvc_name=each_pvc.name,
                               namespace=each_pvc.namespace)
            for each_pvc in pvcs
        ]
        for pod_obj in pods:
            helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
            pod_obj.reload()
        pod_objs.extend(pods)

        # Check for the created pvc metrics on prometheus pod after restarting nodes
        for pvc_obj in pvcs:
            assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
                f"On prometheus pod for created pvc {pvc_obj.name} related data is not collected"
            )