Code example #1
def test_monitoring_before_ocp_upgrade(pre_upgrade_monitoring_pvc):
    """
    Test monitoring health and PVC presence before the OCP upgrade

    """
    assert pre_upgrade_monitoring_pvc, "No monitoring PVCs found before the OCP upgrade"
    assert prometheus_health_check(), "Prometheus health is degraded"
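Examples #1 and #5 take `pre_upgrade_monitoring_pvc` as a pytest fixture that captures the monitoring PVC objects before the upgrade starts. The fixture itself is not part of the snippets; a minimal sketch, assuming the ocs-ci helper `get_list_pvc_objs_created_on_monitoring_pods` used in example #5 (the import path is an assumption), could look like this:

import pytest

# Import path is an assumption; adjust it to your ocs-ci tree.
from ocs_ci.ocs.monitoring import get_list_pvc_objs_created_on_monitoring_pods


@pytest.fixture(scope="session")
def pre_upgrade_monitoring_pvc():
    # Snapshot the PVCs backing the monitoring stack before the upgrade so
    # post-upgrade tests can verify the same PVCs are still in use.
    return get_list_pvc_objs_created_on_monitoring_pods()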
Code example #2
        def finalizer():
            """
            Make sure all nodes are schedulable and in Ready state,
            then verify Prometheus health.
            """
            # Validate all nodes are schedulable
            scheduling_disabled_nodes = [
                n.name for n in get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_READY_SCHEDULING_DISABLED
            ]
            if scheduling_disabled_nodes:
                schedule_nodes(scheduling_disabled_nodes)

            # Validate all nodes are in READY state
            not_ready_nodes = [
                n for n in get_node_objs() if n.ocp.get_resource_status(n.name)
                == constants.NODE_NOT_READY
            ]
            if not_ready_nodes:
                log.warning(
                    f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes_by_stop_and_start(not_ready_nodes)
                wait_for_nodes_status()

            log.info("All nodes are in Ready status")

            assert prometheus_health_check(), "Prometheus health is degraded"
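The finalizer above is only a nested function; in pytest it has to be registered through a fixture's `request` object so it runs during teardown even when the test fails. A minimal sketch of that wiring, assuming a `nodes` fixture that provides the platform node operations used in the body:

import pytest


@pytest.fixture(autouse=True)
def node_cleanup(request, nodes):
    def finalizer():
        # Body as shown above: re-schedule cordoned nodes, restart NotReady
        # nodes, and assert Prometheus health.
        ...

    # Register the teardown so it runs after the test, pass or fail.
    request.addfinalizer(finalizer)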
Code example #3
    def test_monitoring_delete_pvc(self):
        """
        Test case to validate that deleting the monitoring PVCs and the
        cluster-monitoring-config configmap, then restoring the configmap,
        has no functional impact on the monitoring pods

        """
        # Get 'cluster-monitoring-config' configmap
        ocp_configmap = ocp.OCP(
            namespace=constants.MONITORING_NAMESPACE, kind="configmap"
        )
        configmap_dict = ocp_configmap.get(resource_name="cluster-monitoring-config")
        dir_configmap = tempfile.mkdtemp(prefix="configmap_")
        yaml_file = f"{dir_configmap}/configmap.yaml"
        templating.dump_data_to_temp_yaml(configmap_dict, yaml_file)

        # Get prometheus and alertmanager pods
        prometheus_alertmanager_pods = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=["prometheus", "alertmanager"],
        )

        # Get all pvc on monitoring namespace
        pvc_objs_list = pvc.get_all_pvc_objs(namespace=constants.MONITORING_NAMESPACE)

        # Delete configmap
        ocp_configmap.delete(resource_name="cluster-monitoring-config")

        # Delete all pvcs on monitoring namespace
        pvc.delete_pvcs(pvc_objs=pvc_objs_list)

        # Check all the prometheus and alertmanager pods are up
        for pod_obj in prometheus_alertmanager_pods:
            wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180
            )

        # Create configmap
        ocp_configmap.create(yaml_file=yaml_file)

        # Check all the PVCs are up
        for pvc_obj in pvc_objs_list:
            wait_for_resource_state(
                resource=pvc_obj, state=constants.STATUS_BOUND, timeout=180
            )

        # Check all the prometheus and alertmanager pods are up
        # and pvc are mounted on monitoring pods
        for pod_obj in prometheus_alertmanager_pods:
            wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180
            )
            mount_point = pod_obj.exec_cmd_on_pod(
                command="df -kh",
                out_yaml_format=False,
            )
            assert "/dev/rbd" in mount_point, f"pvc is not mounted on pod {pod.name}"
        log.info("Verified all pvc are mounted on monitoring pods")

        # Validate the prometheus health is ok
        assert prometheus_health_check(), "Prometheus cluster health is not OK"
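These snippets come from tests that import the ocs-ci framework at module level. A hedged sketch of the imports they assume (the module paths reflect the upstream ocs-ci layout and are assumptions that may differ between releases):

import logging
import tempfile
import time

# ocs-ci framework modules; paths are assumptions and may vary by release.
from ocs_ci.ocs import constants, defaults, ocp
from ocs_ci.ocs.resources import pod, pvc
from ocs_ci.ocs.node import (
    drain_nodes,
    get_node_objs,
    schedule_nodes,
    wait_for_nodes_status,
)
from ocs_ci.ocs.monitoring import (
    check_pvcdata_collected_on_prometheus,
    get_list_pvc_objs_created_on_monitoring_pods,
    prometheus_health_check,
)
from ocs_ci.utility import templating
from ocs_ci.utility.utils import ceph_health_check
from ocs_ci.helpers.helpers import wait_for_resource_state  # assumption

log = logging.getLogger(__name__)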
Code example #4
def wait_for_nodes_status_and_prometheus_health_check(pods):
    """
    Waits for all the nodes to be in Ready state
    and also checks Prometheus health

    """

    # Validate all nodes are in READY state
    wait_for_nodes_status(timeout=900)

    # Check the created PVC metrics after the nodes have recovered
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created PVC {pod_obj.pvc.name} is not collected on Prometheus"
        )

    assert prometheus_health_check(), "Prometheus health is degraded"
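A hedged usage sketch for the helper above: a hypothetical test that restarts the cluster nodes and then verifies that monitoring recovered (the `nodes` object and the `pods` fixture are assumptions carried over from the other examples):

def test_monitoring_after_node_restart(nodes, pods):
    # Restart all cluster nodes by stop/start, then wait for them to come
    # back and confirm Prometheus still collects data for the test PVCs.
    nodes.restart_nodes_by_stop_and_start(get_node_objs())
    wait_for_nodes_status_and_prometheus_health_check(pods)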
Code example #5
def test_monitoring_after_ocp_upgrade(pre_upgrade_monitoring_pvc):
    """
    After ocp upgrade validate all monitoring pods are up and running,
    its health is OK and also confirm no new monitoring
    pvc created instead using previous one.

    """
    pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE)

    # Wait for all monitoring pods to reach Running state
    POD = ocp.OCP(
        kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE)
    POD.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_count=len(pod_obj_list),
        timeout=180,
    )
    post_upgrade_monitoring_pvc = get_list_pvc_objs_created_on_monitoring_pods()

    assert len(pre_upgrade_monitoring_pvc) == len(post_upgrade_monitoring_pvc), (
        "PVC lists before and after the OCP upgrade do not match. "
        f"pre_upgrade_monitoring_pvc: {[pvc_obj.name for pvc_obj in pre_upgrade_monitoring_pvc]}. "
        f"post_upgrade_monitoring_pvc: {[pvc_obj.name for pvc_obj in post_upgrade_monitoring_pvc]}"
    )

    before_upgrade_pv_list = []
    after_upgrade_pv_list = []
    for before_upgrade_pvc_obj in pre_upgrade_monitoring_pvc:
        before_upgrade_pv_list.append(
            before_upgrade_pvc_obj.get().get("spec").get("volumeName"))

    for after_upgrade_pvc_obj in post_upgrade_monitoring_pvc:
        after_upgrade_pv_list.append(
            after_upgrade_pvc_obj.get().get("spec").get("volumeName"))
        assert after_upgrade_pvc_obj.get().get("status").get("phase") == "Bound", (
            f"PVC {after_upgrade_pvc_obj.name} is not in Bound state after the upgrade"
        )

    assert set(before_upgrade_pv_list) == set(
        after_upgrade_pv_list
    ), "PV lists before and after the OCP upgrade do not match"
    assert prometheus_health_check(), "Prometheus health is degraded"
Code example #6
    def test_monitoring_after_draining_node_where_prometheus_hosted(
            self, pods):
        """
        Test case to validate that when the node hosting prometheus
        is drained, the prometheus pod is rescheduled on a new healthy node
        without any data/metrics loss

        """

        # Get the prometheus pod
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:
            # Get the pvc which mounted on prometheus pod
            pod_info = pod_obj.get()
            pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName']

            # Get the node where the prometheus pod is hosted
            prometheus_node = pod_info['spec']['nodeName']

            # Drain node where the prometheus pod hosted
            drain_nodes([prometheus_node])

            # Validate node is in SchedulingDisabled state
            wait_for_nodes_status(
                [prometheus_node],
                status=constants.NODE_READY_SCHEDULING_DISABLED)

            # Validate all prometheus pod is running
            POD = ocp.OCP(kind=constants.POD,
                          namespace=defaults.OCS_MONITORING_NAMESPACE)
            assert POD.wait_for_resource(
                condition='Running', selector='app=prometheus', timeout=180), (
                    "One or more prometheus pods are not in running state")

            # Validate the prometheus pod is rescheduled on a new healthy node
            pod_info = pod_obj.get()
            new_node = pod_info['spec']['nodeName']
            assert new_node != prometheus_node, (
                'Prometheus pod was not rescheduled on a new node')
            log.info(f"Prometheus pod rescheduled on new node {new_node}")

            # Validate the same pvc is mounted on the prometheus pod
            assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName'] == pvc_name, (
                    f"Old pvc not found after restarting the prometheus pod {pod_obj.name}"
                )

            # Validate the prometheus health is ok
            assert prometheus_health_check(), (
                "Prometheus cluster health is not OK")

            # Mark the nodes back to schedulable
            schedule_nodes([prometheus_node])

            # Wait some time after node scheduling back
            waiting_time = 30
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate node is in Ready State
            wait_for_nodes_status([prometheus_node],
                                  status=constants.NODE_READY)

            # Validate ceph health OK
            ceph_health_check(tries=40, delay=30)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check the created PVC metrics after draining the node
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"Data for created PVC {pod_obj.pvc.name} is not collected on Prometheus"
            )
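Examples #4 and #6 also assume a `pods` fixture: a list of running pods, each backed by a PVC whose metrics Prometheus should be collecting. A minimal sketch, assuming the standard ocs-ci `pvc_factory`/`pod_factory` fixtures (their availability and signatures are assumptions):

import pytest

from ocs_ci.ocs import constants


@pytest.fixture()
def pods(pvc_factory, pod_factory):
    # Create a few RBD-backed PVCs and attach one pod to each, so the tests
    # can later check that Prometheus collected data for every PVC.
    pod_objs = []
    for _ in range(3):
        pvc_obj = pvc_factory(interface=constants.CEPHBLOCKPOOL)
        pod_objs.append(pod_factory(pvc=pvc_obj))
    return pod_objs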