Example #1
    def stop(self, node, timeout):
        """
        Stop the kubelet service using the parent service class, then ensure
        the corresponding OCP node moves to NotReady state.

        Args:
            node (object): Node object
            timeout (int): time in seconds to wait for the service to stop.

        """
        super().stop(node, timeout)
        wait_for_nodes_status(node_names=[node.name],
                              status=constants.NODE_NOT_READY,
                              timeout=timeout)
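
A complementary sketch (an assumption, not taken from the source): a start override mirroring stop above, waiting for the node to return to Ready, in line with the restart override shown in Example #4.

    def start(self, node, timeout):
        """
        Start the kubelet service using the parent service class, then wait
        for the corresponding OCP node to move back to Ready state.
        (Hypothetical counterpart to stop(); assumes the parent service class
        exposes a matching start() method.)

        Args:
            node (object): Node object
            timeout (int): time in seconds to wait for the service to start

        """
        super().start(node, timeout)
        wait_for_nodes_status(node_names=[node.name],
                              status=constants.NODE_READY,
                              timeout=timeout)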
Example #2
    def test_registry_shutdown_and_recovery_node(self, nodes):
        """
        Test the registry workload when backed by OCS and its impact
        when a node is shut down and recovered

        """

        # Pull and push images to registries
        log.info("Pull and push images to registries")
        image_pull_and_push(project_name=self.project_name)

        # Get the node list
        node_list = get_nodes(node_type="worker")

        for node in node_list:

            # Stop node
            nodes.stop_nodes(nodes=[node])

            # Validate node reached NotReady state
            wait_for_nodes_status(
                node_names=[node.name], status=constants.NODE_NOT_READY
            )

            # Start node
            nodes.start_nodes(nodes=[node])

            # Validate all nodes are in READY state and up
            retry(
                (
                    CommandFailed,
                    TimeoutError,
                    AssertionError,
                    ResourceWrongStatusException,
                ),
                tries=60,
                delay=15,
            )(wait_for_nodes_status)(timeout=900)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check(tries=40)

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate image exists in registries path
        validate_image_exists()

        def finalizer():

            # Validate all nodes are in READY state
            not_ready_nodes = [
                n for n in get_node_objs() if n.ocp.get_resource_status(n.name)
                == constants.NODE_NOT_READY
            ]
            if not_ready_nodes:
                log.warning(
                    f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes_by_stop_and_start(not_ready_nodes)
                wait_for_nodes_status()

            log.info("All nodes are in Ready status")
Example #4
    def restart(self, node, timeout):
        """
        Restart the kubelet service using the parent service class, then ensure
        the corresponding OCP node is connectable and moves to Ready state.

        Args:
            node (object): Node object
            timeout (int): time in seconds to wait for the service to restart.

        """
        super().restart(node, timeout)
        wait_for_cluster_connectivity(tries=900)
        wait_for_nodes_status(node_names=[node.name],
                              status=constants.NODE_READY,
                              timeout=timeout)

    def restart_ocs_operator_node(self):
        """
        Restart node that runs OCS operator pod
        """

        pod_obj = pod.get_ocs_operator_pod()
        node_obj = pod.get_pod_node(pod_obj)

        self.nodes.restart_nodes([node_obj])

        wait_for_nodes_status()

        pod.wait_for_pods_to_be_running(
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, pod_names=[pod_obj.name]
        )
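
A hedged usage sketch (assumed test context, not from the source): calling the helper above from a test class that also owns the sanity helpers used elsewhere in these examples.

        # Hypothetical caller inside a test class providing sanity_helpers
        self.restart_ocs_operator_node()
        self.sanity_helpers.health_check(tries=40)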
Example #6
    def teardown():

        # Delete created app pods and pvcs
        assert pod.delete_pods(pod_objs)
        assert pvc.delete_pvcs(pvc_objs)

        # Switch to default project
        ret = ocp.switch_to_default_rook_cluster_project()
        assert ret, 'Failed to switch to default rook cluster project'

        # Delete created projects
        for prj in namespace_list:
            prj.delete(resource_name=prj.namespace)

        # Validate all nodes are in READY state
        wait_for_nodes_status()
Example #7
    def test_monitoring_after_rebooting_node_where_mgr_is_running(
            self, nodes, pods):
        """
        Test case to validate that rebooting the node where the mgr pod is
        running does not delete the data collected on the prometheus pod

        """

        # Get the mgr pod obj
        mgr_pod_obj = pod.get_mgr_pods()

        # Get the node where the mgr pod is hosted
        mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

        # Reboot the node where the mgr pod is hosted
        nodes.restart_nodes([mgr_node_obj])

        # Validate all nodes are in READY state
        wait_for_nodes_status()

        # Check for Ceph pods
        pod_obj = ocp.OCP(kind=constants.POD,
                          namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        assert pod_obj.wait_for_resource(condition='Running',
                                         selector='app=rook-ceph-mgr',
                                         timeout=600)
        assert pod_obj.wait_for_resource(condition='Running',
                                         selector='app=rook-ceph-mon',
                                         resource_count=3,
                                         timeout=600)
        assert pod_obj.wait_for_resource(condition='Running',
                                         selector='app=rook-ceph-osd',
                                         resource_count=3,
                                         timeout=600)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check that the ceph health check metrics are updated with the new mgr pod
        wait_to_update_mgrpod_info_prometheus_pod()

        # Check for the created pvc metrics after rebooting the node where the mgr pod was running
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"Data for created PVC {pod_obj.pvc.name} is not collected on the prometheus pod"
            )
Example #8
        def finalizer():
            """
            Removes huge pages on worker nodes and verifies all pods are up

            """
            disable_huge_pages()

            wait_for_nodes_status(status=constants.NODE_READY, timeout=600)

            nodes = get_nodes()
            for node in nodes:
                assert (node.get()["status"]["allocatable"]["hugepages-2Mi"] ==
                        "0"), f"Huge pages are not applied on {node.name}"

            log.info("Wait for all pods to be in running state")
            wait_for_pods_to_be_running(timeout=600)
            sanity_helpers.ceph_health_check(tries=120)
Example #9
def wait_for_nodes_status_and_prometheus_health_check(pods):
    """
    Waits for all the nodes to be in Ready state
    and also checks prometheus health

    """

    # Validate all nodes are in READY state
    wait_for_nodes_status(timeout=900)

    # Check for the created pvc metrics after rebooting the master nodes
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created PVC {pod_obj.pvc.name} is not collected on the prometheus pod"
        )

    assert prometheus_health_check(), "Prometheus health is degraded"
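
A hedged usage sketch (assumed context; names reused from the other examples here): this helper would typically run after restarting the master nodes in a monitoring test, passing the app pod objects whose PVC metrics should still be collected.

    # Hypothetical caller: reboot the master nodes, then confirm node status,
    # PVC metrics and prometheus health in one call.
    master_nodes = get_nodes(node_type="master")
    nodes.restart_nodes(master_nodes, wait=False)
    wait_for_nodes_status_and_prometheus_health_check(pods)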
Example #10
    def start_baremetal_machines(self, baremetal_machine, wait=True):
        """
        Start Baremetal Machines

        Args:
            baremetal_machine (list): BM objects
            wait (bool): Wait for BMs to start

        """
        for node in baremetal_machine:
            if self.mgmt_details[node.name]:
                ipmi_ctx = self.get_ipmi_ctx(
                    host=self.mgmt_details[node.name]["mgmt_console"],
                    user=self.mgmt_details[node.name]["mgmt_username"],
                    password=self.mgmt_details[node.name]["mgmt_password"],
                )
                logger.info(f"Powering On {node.name}")
                ipmi_ctx.chassis_control_power_up()
            if wait:
                if self.mgmt_details[node.name]:
                    ipmi_ctx = self.get_ipmi_ctx(
                        host=self.mgmt_details[node.name]["mgmt_console"],
                        user=self.mgmt_details[node.name]["mgmt_username"],
                        password=self.mgmt_details[node.name]["mgmt_password"],
                    )
                    for status in TimeoutSampler(
                        600, 5, self.get_power_status, ipmi_ctx
                    ):
                        logger.info(
                            f"Waiting for Baremetal Machine {node.name} to power on. "
                            f"Current Baremetal status: {status}"
                        )
                        if status == VM_POWERED_ON:
                            logger.info(
                                f"Baremetal Machine {node.name} reached poweredOn status"
                            )
                            ipmi_ctx.session.close()
                            break

        wait_for_cluster_connectivity(tries=400)
        wait_for_nodes_status(
            node_names=get_master_nodes(), status=constants.NODE_READY, timeout=800
        )
        wait_for_nodes_status(
            node_names=get_worker_nodes(), status=constants.NODE_READY, timeout=800
        )
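
A counterpart sketch (an assumption, not from the source): powering the machines off through the same IPMI context pattern; chassis_control_power_down() is assumed to mirror the chassis_control_power_up() call used above.

    def stop_baremetal_machines(self, baremetal_machine):
        """
        Power off Baremetal Machines (hypothetical mirror of the start path)

        Args:
            baremetal_machine (list): BM objects

        """
        for node in baremetal_machine:
            if self.mgmt_details[node.name]:
                ipmi_ctx = self.get_ipmi_ctx(
                    host=self.mgmt_details[node.name]["mgmt_console"],
                    user=self.mgmt_details[node.name]["mgmt_username"],
                    password=self.mgmt_details[node.name]["mgmt_password"],
                )
                logger.info(f"Powering Off {node.name}")
                # Assumed IPMI call, mirroring chassis_control_power_up()
                ipmi_ctx.chassis_control_power_down()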
Example #11
    def stop_node():
        """
        Turn off one worker node for 6 minutes.

        Returns:
            str: Node that was turned down

        """
        # run_time of operation
        run_time = 60 * 6
        nonlocal node
        logger.info(f"Turning off node {node.name}")
        nodes.stop_nodes(nodes=[node])
        # Validate node reached NotReady state
        wait_for_nodes_status(node_names=[node.name], status=constants.NODE_NOT_READY)
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return node.name
Example #12
    def test_osd_node_restart_and_check_osd_pods_status(self, nodes):
        """
        1) Restart one of the osd nodes.
        2) Check that the osd pods associated with the node should change to a Terminating state.
        3) Wait for the node to reach Ready state.
        4) Check that the new osd pods with the same ids start on the same node.
        5) Check the worker nodes security groups.
        """
        # This is a workaround due to the issue https://github.com/red-hat-storage/ocs-ci/issues/6162
        if is_ms_consumer_cluster():
            logger.info(
                "The test is applicable only for an MS provider cluster. "
                "Switching to the provider cluster...")
            config.switch_to_provider()

        self.create_resources()

        osd_node_name = random.choice(get_osd_running_nodes())
        osd_node = get_node_objs([osd_node_name])[0]

        old_osd_pod_ids = get_node_osd_ids(osd_node_name)
        logger.info(f"osd pod ids: {old_osd_pod_ids}")
        node_osd_pods = pod.get_osd_pods_having_ids(old_osd_pod_ids)
        node_osd_pod_names = [p.name for p in node_osd_pods]

        logger.info(f"Going to restart the node {osd_node_name}")
        nodes.restart_nodes(nodes=[osd_node], wait=False)

        logger.info("Verify the node osd pods go into a Terminating state")
        res = pod.wait_for_pods_to_be_in_statuses(
            [constants.STATUS_TERMINATING], node_osd_pod_names)
        assert res, "Not all the node osd pods are in a Terminating state"

        wait_for_nodes_status(node_names=[osd_node_name])
        assert wait_for_osd_ids_come_up_on_node(osd_node_name,
                                                old_osd_pod_ids,
                                                timeout=300)
        logger.info(
            f"The osd ids {old_osd_pod_ids} successfully came up on the node {osd_node_name}"
        )

        logger.info(
            "Verify the worker nodes security groups on the provider...")
        assert verify_worker_nodes_security_groups()
Example #13
    def stop_nodes():
        """
        Turn off test nodes for 5 minutes.

        Returns:
            list: Names of nodes that were turned down

        """
        # run_time of operation
        run_time = 60 * 5
        nonlocal test_nodes
        node_names = [node.name for node in test_nodes]
        logger.info(f"Turning off nodes {node_names}")
        nodes.stop_nodes(nodes=test_nodes)
        # Validate node reached NotReady state
        wait_for_nodes_status(node_names=node_names, status=constants.NODE_NOT_READY)
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return node_names
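
A complementary sketch (an assumed helper, following the conventions of stop_nodes above): bringing the stopped test nodes back up and waiting for Ready.

    def start_test_nodes():
        """
        Turn the test nodes back on and wait for them to reach Ready state.
        (Hypothetical counterpart to stop_nodes above.)

        Returns:
            list: Names of nodes that were turned back on

        """
        nonlocal test_nodes
        node_names = [node.name for node in test_nodes]
        logger.info(f"Turning on nodes {node_names}")
        nodes.start_nodes(nodes=test_nodes)
        # Validate nodes reached Ready state
        wait_for_nodes_status(node_names=node_names, status=constants.NODE_READY)
        return node_names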
Example #14
    def test_monitoring_shutdown_and_recovery_prometheus_node(
            self, nodes, pods):
        """
        Test case to validate that shutdown and recovery of a node where
        monitoring pods are running has no functional impact

        """
        # Get all prometheus pods
        prometheus_pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for prometheus_pod_obj in prometheus_pod_obj_list:
            # Get the node where the prometheus pod is hosted
            prometheus_node_obj = pod.get_pod_node(prometheus_pod_obj)

            # Shut down and recover (i.e. restart) the node where the prometheus pod is hosted
            nodes.stop_nodes([prometheus_node_obj])

            waiting_time = 20
            log.info(f"Waiting for {waiting_time} seconds")
            time.sleep(waiting_time)

            nodes.start_nodes([prometheus_node_obj])

            # Validate all nodes are in READY state
            wait_for_nodes_status()

        # Check all the prometheus pods are up
        for pod_obj in prometheus_pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check for the created pvc metrics after shutdown and recovery of prometheus nodes
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"Data for created PVC {pod_obj.pvc.name} is not collected on the prometheus pod"
            )

    def test_node_maintenance_restart_activate(self, nodes, pvc_factory,
                                               pod_factory, node_type):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get a list of 2 nodes. Pick one of them after checking
        # which one doesn't have the rook operator running on it
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
        rook_operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(rook_operator_pod)
        if operator_node.get().get('metadata').get('name') == typed_node_name:
            typed_node_name = typed_nodes[1].name
        # End of workaround for BZ 1778488

        # Put the node into maintenance (unschedule and drain). The function contains logging
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=True)

        wait_for_nodes_status(node_names=[typed_node_name],
                              status=constants.NODE_READY_SCHEDULING_DISABLED)
        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and basic cluster functionality by
        # creating resources (pools, storageclasses, PVCs, pods - both CephFS
        # and RBD), running IO and deleting the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()
Example #16
        def finalizer():
            not_ready_nodes = [
                n for n in node.get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_NOT_READY
            ]
            if not_ready_nodes:
                logger.warning(
                    f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

            # Restart the node if an osd pod stays in CrashLoopBackOff (CLBO) state
            osd_pods_obj_list = get_osd_pods()
            for osd_pod in osd_pods_obj_list:
                container_state = (
                    osd_pod.get()["status"]["containerStatuses"][0].get("state", {})
                )
                if container_state.get("waiting", {}).get("reason") == constants.STATUS_CLBO:
                    node_obj = get_pod_node(osd_pod)
                    nodes.restart_nodes([node_obj])
                    node.wait_for_nodes_status([node_obj.name])
Example #17
    def test_hugepages_post_odf_deployment(
        self,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
        node_restart_teardown,
    ):
        """
        Test to verify that after enabling huge pages the nodes come up with
        a higher page size and all ODF cluster pods come back up.

        """
        # Applies huge pages on the cluster nodes
        enable_huge_pages()

        log.info("Wait for all worker node to be READY state")
        wait_for_nodes_status(status=constants.NODE_READY, timeout=600)

        nodes = get_nodes()
        for node in nodes:
            assert (node.get()["status"]["allocatable"]["hugepages-2Mi"] ==
                    "64Mi"), f"Huge pages are not applied on {node.name}"

        log.info("Wait for all storage cluster pods to be in running state")
        wait_for_pods_to_be_running(timeout=600)

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory, False)

        # Deleting Resources
        log.info("Deleting the resources created")
        self.sanity_helpers.delete_resources()

        # Verify everything running fine
        log.info(
            "Verifying all resources are running and match the expected result")
        self.sanity_helpers.health_check(tries=120)

        def finalizer():
            # Start the powered off nodes
            nodes.restart_nodes_teardown()
            try:
                node.wait_for_nodes_status(status=constants.NODE_READY)
            except ResourceWrongStatusException:
                # Restart the nodes if in NotReady state
                not_ready_nodes = [
                    n for n in node.get_node_objs() if n
                    .ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
                ]
                if not_ready_nodes:
                    logger.info(
                        f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                    )
                    nodes.restart_nodes(not_ready_nodes)
                    node.wait_for_nodes_status(status=constants.NODE_READY)

            # Check ceph health
            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")
Example #19
    def test_monitoring_when_one_of_the_prometheus_node_down(
            self, nodes, pods):
        """
        Test case to validate that bringing down a node hosting a prometheus
        pod does not break the interaction with prometheus

        """

        # Get all prometheus pods
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:
            # Get the node where the prometheus pod is hosted
            pod_node_obj = pod.get_pod_node(pod_obj)

            # Restart the node where the prometheus pod is hosted
            nodes.restart_nodes([pod_node_obj])

            # Validate all nodes are in READY state
            wait_for_nodes_status()

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check all the prometheus pods are up
        for pod_obj in pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)

        # Check for the created pvc metrics after restarting the node where the prometheus pod is hosted
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"Data for created PVC {pod_obj.pvc.name} is not collected on the prometheus pod"
            )
            log.info(
                f"Data for created PVC {pod_obj.pvc.name} is collected on the prometheus pod"
            )
Example #20
    def test_node_maintenance_restart_activate(
        self, ec2_instances, aws_obj, pvc_factory, pod_factory, node_type
    ):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node's ec2 instance
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node
        typed_node = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_node, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_node[0].name

        # Put the node into maintenance (unschedule and drain). The function contains logging
        node.drain_nodes([typed_node_name])

        instance = aws.get_instances_ids_and_names(typed_node)
        assert instance, f"Failed to get ec2 instances for node {typed_node_name}"

        # Restarting ec2 instance
        aws_obj.restart_ec2_instances(instances=instance, wait=True)

        node.wait_for_nodes_status(
            node_names=[typed_node_name], status=constants.NODE_READY_SCHEDULING_DISABLED
        )
        # Mark the node back to schedulable
        node.schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and basic cluster functionality by
        # creating resources (pools, storageclasses, PVCs, pods - both CephFS
        # and RBD), running IO and deleting the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()
Example #21
    def test_add_node(self):
        """
        Test for adding worker nodes to the cluster while IOs are running
        """
        dt = config.ENV_DATA['deployment_type']
        if dt == 'ipi':
            before_replica_counts = dict()
            count = 2
            machines = machine_utils.get_machinesets()
            for machine in machines:
                before_replica_counts.update(
                    {machine: machine_utils.get_replica_count(machine)})
            worker_nodes_before = helpers.get_worker_nodes()
            logger.info(
                f'The number of worker nodes before adding new nodes is {len(worker_nodes_before)}'
            )
            after_replica_counts = dict()
            for machine in machines:
                machine_utils.add_node(machine, count=count)
                after_replica_counts.update(({
                    machine:
                    machine_utils.get_replica_count(machine)
                }))
            logger.info(after_replica_counts)
            for sample in TimeoutSampler(timeout=300,
                                         sleep=3,
                                         func=helpers.get_worker_nodes):
                if len(sample) == count * len(machines):
                    break

            worker_nodes_after = helpers.get_worker_nodes()
            logger.info(
                f'The number of worker nodes after adding new nodes is {len(worker_nodes_after)}'
            )
            wait_for_nodes_status(node_names=worker_nodes_after,
                                  status=constants.NODE_READY)
        else:
            pytest.skip("UPI not yet supported")
Example #22
    def test_node_maintenance_restart_activate(self, nodes, pvc_factory,
                                               pod_factory, node_type):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Put the node into maintenance (unschedule and drain). The function contains logging
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=False)

        wait_for_nodes_status(
            node_names=[typed_node_name],
            status=constants.NODE_NOT_READY_SCHEDULING_DISABLED)

        wait_for_nodes_status(node_names=[typed_node_name],
                              status=constants.NODE_READY_SCHEDULING_DISABLED)
        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and basic cluster functionality by
        # creating resources (pools, storageclasses, PVCs, pods - both CephFS
        # and RBD), running IO and deleting the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    def test_run_couchbase_node_reboot(self, cb_setup, nodes,
                                       pod_name_of_node):
        """
        Test couchbase workload with node reboot
        """
        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)
        get_node_resource_utilization_from_adm_top(node_type="master",
                                                   print_table=True)

        if pod_name_of_node == "couchbase":
            node_list = self.cb.get_couchbase_nodes()
        elif pod_name_of_node == "osd":
            node_list = get_osd_running_nodes()
        elif pod_name_of_node == "master":
            master_node = get_nodes(pod_name_of_node, num_of_nodes=1)

        # Restart relevant node
        if pod_name_of_node == "master":
            nodes.restart_nodes(master_node, wait=False)
            waiting_time = 40
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)
        else:
            restart_node = get_node_objs([random.choice(node_list)])
            nodes.restart_nodes(restart_node)

        # Validate all nodes and services are in READY state and up

        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(ocp.wait_for_cluster_connectivity)(tries=400)
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_nodes_status)(timeout=1800)
        bg_handler = flowtest.BackgroundOps()
        bg_ops = [self.cb.result]
        retry((CommandFailed), tries=60,
              delay=15)(bg_handler.wait_for_bg_operations)(bg_ops, timeout=3600)
        self.sanity_helpers.health_check(tries=40)
Example #24
    def test_amq_after_rebooting_node(self, node_type, nodes, amq_setup):
        """
        Test case to validate that rebooting a node shouldn't affect
        the amq workloads running in the background

        """
        # Get all amq pods
        pod_obj_list = get_all_pods(namespace=constants.AMQ_NAMESPACE)

        # Get the node list
        node = get_nodes(node_type, num_of_nodes=1)

        # Reboot one node of the requested type
        nodes.restart_nodes(node, wait=False)

        # Wait some time after rebooting the node
        waiting_time = 40
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)

        # Validate all nodes and services are in READY state and up
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(ocp.wait_for_cluster_connectivity)(tries=400)
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_nodes_status)(timeout=1800)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check all amq pods are up and running
        assert POD.wait_for_resource(condition="Running",
                                     resource_count=len(pod_obj_list),
                                     timeout=300)

        # Validate the results
        log.info("Validate message run completely")
        for thread in self.threads:
            thread.result(timeout=1800)

    def test_amq_after_shutdown_and_recovery_worker_node(
            self, nodes, amq_setup):
        """
        Test case to validate that shutdown and recovery of a node
        doesn't affect the amq workloads running in the background

        """
        # Get all amq pods
        pod_obj_list = get_all_pods(namespace=constants.AMQ_NAMESPACE)

        # Get the node list
        node = get_typed_nodes(node_type='worker', num_of_nodes=1)

        # Stop one worker node
        nodes.stop_nodes(nodes=node)

        waiting_time = 20
        log.info(f"Waiting for {waiting_time} seconds")
        time.sleep(waiting_time)

        nodes.start_nodes(nodes=node)

        # Validate all nodes are in READY state and up
        retry((CommandFailed, TimeoutError, AssertionError,
               ResourceWrongStatusException),
              tries=30,
              delay=15)(wait_for_nodes_status)(timeout=1800)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check all amq pods are up and running
        assert POD.wait_for_resource(condition='Running',
                                     resource_count=len(pod_obj_list),
                                     timeout=300)

        # Validate the results
        log.info("Validate message run completely")
        for thread in self.threads:
            thread.result(timeout=1800)
Example #26
    def add_nodes(self):
        """
        Add new nodes to the cluster
        """
        # create separate directory for scale-up terraform data
        scaleup_terraform_data_dir = os.path.join(
            self.cluster_path,
            constants.TERRAFORM_DATA_DIR,
            constants.SCALEUP_TERRAFORM_DATA_DIR
        )
        create_directory_path(scaleup_terraform_data_dir)
        logger.info(
            f"scale-up terraform data directory: {scaleup_terraform_data_dir}"
        )

        # git clone repo from openshift-misc
        clone_repo(
            constants.VSPHERE_SCALEUP_REPO, self.upi_scale_up_repo_path
        )

        # modify scale-up repo
        self.modify_scaleup_repo()

        config.ENV_DATA['vsphere_resource_pool'] = config.ENV_DATA.get(
            "cluster_name"
        )

        # sync guest time with host
        if config.ENV_DATA.get('sync_time_with_host'):
            sync_time_with_host(constants.SCALEUP_VSPHERE_MACHINE_CONF, True)

        # get the RHCOS worker list
        self.rhcos_ips = get_node_ips()
        logger.info(f"RHCOS IP's: {json.dumps(self.rhcos_ips)}")

        # generate terraform variable for scaling nodes
        self.generate_terraform_vars_for_scaleup()

        # Add nodes using terraform
        scaleup_terraform = Terraform(constants.SCALEUP_VSPHERE_DIR)
        previous_dir = os.getcwd()
        os.chdir(scaleup_terraform_data_dir)
        scaleup_terraform.initialize()
        scaleup_terraform.apply(self.scale_up_terraform_var)
        scaleup_terraform_tfstate = os.path.join(
            scaleup_terraform_data_dir,
            "terraform.tfstate"
        )
        out = scaleup_terraform.output(
            scaleup_terraform_tfstate,
            "rhel_worker"
        )
        rhel_worker_nodes = json.loads(out)['value']
        logger.info(f"RHEL worker nodes: {rhel_worker_nodes}")
        os.chdir(previous_dir)

        # Install OCP on rhel nodes
        rhel_install = OCPINSTALLRHEL(rhel_worker_nodes)
        rhel_install.prepare_rhel_nodes()
        rhel_install.execute_ansible_playbook()

        # Give the new nodes some time to settle down
        time.sleep(self.wait_time)

        # wait for nodes to be in READY state
        wait_for_nodes_status(timeout=300)
Example #27
    def test_worker_node_restart_during_pvc_expansion(self, nodes):
        """
        Verify PVC expansion will succeed if a worker node is restarted
        during expansion

        """
        pvc_size_expanded = 30
        executor = ThreadPoolExecutor(max_workers=len(self.pods))
        selected_node = node.get_nodes(
            node_type=constants.WORKER_MACHINE, num_of_nodes=1
        )

        # Restart node
        log.info(f"Restart node {selected_node[0].name}")
        restart_thread = executor.submit(nodes.restart_nodes, nodes=selected_node)

        log.info("Expanding all PVCs.")
        for pvc_obj in self.pvcs:
            log.info(f"Expanding size of PVC {pvc_obj.name} to {pvc_size_expanded}G")
            pvc_obj.expand_proc = executor.submit(
                pvc_obj.resize_pvc, pvc_size_expanded, True
            )

        # Check result of node 'restart_nodes'
        restart_thread.result()

        log.info("Verify status of node.")
        node.wait_for_nodes_status(
            node_names=[node.get_node_name(selected_node[0])],
            status=constants.NODE_READY,
            timeout=300,
        )

        # Verify pvc expansion status
        for pvc_obj in self.pvcs:
            assert (
                pvc_obj.expand_proc.result()
            ), f"Expansion failed for PVC {pvc_obj.name}"
        log.info("PVC expansion was successful on all PVCs")

        # Run IO
        log.info("Run IO after PVC expansion.")
        for pod_obj in self.pods:
            wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
            storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
            pod_obj.io_proc = executor.submit(
                pod_obj.run_io,
                storage_type=storage_type,
                size="6G",
                runtime=30,
                fio_filename=f"{pod_obj.name}_file",
            )

        log.info("Wait for IO to complete on all pods")
        for pod_obj in self.pods:
            pod_obj.io_proc.result()
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert err_count == 0, (
                f"IO error on pod {pod_obj.name}. " f"FIO result: {fio_result}"
            )
            log.info(f"Verified IO on pod {pod_obj.name}.")
        log.info("IO is successful on all pods after PVC expansion.")
    def test_monitoring_after_draining_node_where_prometheus_hosted(
            self, pods):
        """
        Test case to validate that when the node hosting prometheus is
        drained, the prometheus pod re-spins on a new healthy node
        without any data/metrics loss

        """

        # Get the prometheus pod
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:
            # Get the pvc which mounted on prometheus pod
            pod_info = pod_obj.get()
            pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName']

            # Get the node where the prometheus pod is hosted
            prometheus_pod_obj = pod_obj.get()
            prometheus_node = prometheus_pod_obj['spec']['nodeName']

            # Drain the node where the prometheus pod is hosted
            drain_nodes([prometheus_node])

            # Validate node is in SchedulingDisabled state
            wait_for_nodes_status(
                [prometheus_node],
                status=constants.NODE_READY_SCHEDULING_DISABLED)

            # Validate all prometheus pods are running
            POD = ocp.OCP(kind=constants.POD,
                          namespace=defaults.OCS_MONITORING_NAMESPACE)
            assert POD.wait_for_resource(
                condition='Running', selector='app=prometheus', timeout=180), (
                    "One or more prometheus pods are not in running state")

            # Validate the prometheus pod is re-spun on a new healthy node
            pod_info = pod_obj.get()
            new_node = pod_info['spec']['nodeName']
            assert new_node != prometheus_node, (
                'Prometheus pod was not re-spun on a new node')
            log.info(f"Prometheus pod re-spun on new node {new_node}")

            # Validate the same pvc is mounted on the prometheus pod
            assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName'] == pvc_name, (
                    f"Old pvc not found after restarting the prometheus pod {pod_obj.name}"
                )

            # Validate the prometheus health is ok
            assert prometheus_health_check(), (
                "Prometheus cluster health is not OK")

            # Mark the nodes back to schedulable
            schedule_nodes([prometheus_node])

            # Wait some time after node scheduling back
            waiting_time = 30
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate node is in Ready State
            wait_for_nodes_status([prometheus_node],
                                  status=constants.NODE_READY)

            # Validate ceph health OK
            ceph_health_check(tries=40, delay=30)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check for the created pvc metrics after draining the nodes hosting prometheus
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"Data for created PVC {pod_obj.pvc.name} is not collected on the prometheus pod"
            )
Example #29
    def test_monitoring_when_one_of_the_prometheus_node_down(
            self, test_fixture):
        """
        Test case to validate that restarting a node hosting a prometheus
        pod does not break the interaction with prometheus
        """
        namespace_list, pvc_objs, pod_objs, sc = test_fixture

        aws_obj = aws.AWS()

        # Get all the openshift-monitoring pods
        monitoring_pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE)

        # Get the worker node list
        workers = get_typed_nodes(node_type='worker')

        # Get all prometheus pods
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:

            # Get the node where the prometheus pod is hosted
            prometheus_pod_obj = pod_obj.get()
            prometheus_node = prometheus_pod_obj['spec']['nodeName']

            prometheus_node = [
                node for node in workers
                if node.get().get('metadata').get('name') == prometheus_node
            ]

            # Bring down the node where the prometheus pod is hosted
            instances = aws.get_instances_ids_and_names(prometheus_node)
            aws_obj.restart_ec2_instances(instances=instances,
                                          wait=True,
                                          force=True)

            # Validate all nodes are in READY state
            wait_for_nodes_status()

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check all the monitoring pods are up
        for pod_obj in monitoring_pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING)

        # Check for the created pvc metrics after restarting the nodes
        for pvc_obj in pvc_objs:
            assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
                f"Data for created PVC {pvc_obj.name} is not collected on the prometheus pod"
            )

        # Create projects after restarting nodes
        namespaces = helpers.create_multilpe_projects(number_of_project=1)
        namespace_list.extend(namespaces)

        # Create pvcs after restarting nodes
        pvcs = [
            helpers.create_pvc(sc_name=sc.name,
                               namespace=each_namespace.namespace)
            for each_namespace in namespaces
        ]
        for pvc_obj in pvcs:
            helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND)
            pvc_obj.reload()
        pvc_objs.extend(pvcs)

        # Create app pods after restarting nodes
        pods = [
            helpers.create_pod(interface_type=constants.CEPHBLOCKPOOL,
                               pvc_name=each_pvc.name,
                               namespace=each_pvc.namespace)
            for each_pvc in pvcs
        ]
        for pod_obj in pods:
            helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
            pod_obj.reload()
        pod_objs.extend(pods)

        # Check for the created pvc metrics on the prometheus pod after restarting nodes
        for pvc_obj in pvcs:
            assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
                f"Data for created PVC {pvc_obj.name} is not collected on the prometheus pod"
            )
Example #30
    def add_nodes(self):
        """
        Add new nodes to the cluster
        """
        # create separate directory for scale-up terraform data
        scaleup_terraform_data_dir = os.path.join(
            self.cluster_path,
            constants.TERRAFORM_DATA_DIR,
            constants.SCALEUP_TERRAFORM_DATA_DIR,
        )
        create_directory_path(scaleup_terraform_data_dir)
        logger.info(
            f"scale-up terraform data directory: {scaleup_terraform_data_dir}")

        # git clone repo from openshift-misc
        clone_repo(constants.VSPHERE_SCALEUP_REPO, self.upi_scale_up_repo_path)

        # git clone repo from cluster-launcher
        clone_repo(constants.VSPHERE_CLUSTER_LAUNCHER,
                   self.cluster_launcer_repo_path)

        helpers = VSPHEREHELPERS()
        helpers.modify_scaleup_repo()

        config.ENV_DATA["vsphere_resource_pool"] = config.ENV_DATA.get(
            "cluster_name")

        # sync guest time with host
        sync_time_with_host_file = constants.SCALEUP_VSPHERE_MACHINE_CONF
        if config.ENV_DATA["folder_structure"]:
            sync_time_with_host_file = os.path.join(
                constants.CLUSTER_LAUNCHER_VSPHERE_DIR,
                f"aos-{get_ocp_version(seperator='_')}",
                constants.CLUSTER_LAUNCHER_MACHINE_CONF,
            )
        if config.ENV_DATA.get("sync_time_with_host"):
            sync_time_with_host(sync_time_with_host_file, True)

        # get the RHCOS worker list
        rhcos_ips = get_node_ips()
        logger.info(f"RHCOS IP's: {json.dumps(rhcos_ips)}")

        # generate terraform variable for scaling nodes
        self.scale_up_terraform_var = helpers.generate_terraform_vars_for_scaleup(
            rhcos_ips)

        # choose the vsphere_dir based on OCP version
        # generate cluster_info and config yaml files
        # for OCP version greater than 4.4
        vsphere_dir = constants.SCALEUP_VSPHERE_DIR
        rhel_module = "rhel-worker"
        if Version.coerce(self.ocp_version) >= Version.coerce("4.5"):
            vsphere_dir = os.path.join(
                constants.CLUSTER_LAUNCHER_VSPHERE_DIR,
                f"aos-{get_ocp_version('_')}",
                "vsphere",
            )
            helpers.generate_cluster_info()
            helpers.generate_config_yaml()
            rhel_module = "RHEL_WORKER_LIST"

        # Add nodes using terraform
        scaleup_terraform = Terraform(vsphere_dir)
        previous_dir = os.getcwd()
        os.chdir(scaleup_terraform_data_dir)
        scaleup_terraform.initialize()
        scaleup_terraform.apply(self.scale_up_terraform_var)
        scaleup_terraform_tfstate = os.path.join(scaleup_terraform_data_dir,
                                                 "terraform.tfstate")
        out = scaleup_terraform.output(scaleup_terraform_tfstate, rhel_module)
        if config.ENV_DATA["folder_structure"]:
            rhel_worker_nodes = out.strip().replace('"', "").split(",")
        else:
            rhel_worker_nodes = json.loads(out)["value"]

        logger.info(f"RHEL worker nodes: {rhel_worker_nodes}")
        os.chdir(previous_dir)

        # Install OCP on rhel nodes
        rhel_install = OCPINSTALLRHEL(rhel_worker_nodes)
        rhel_install.prepare_rhel_nodes()
        rhel_install.execute_ansible_playbook()

        # Give the new nodes some time to settle down
        time.sleep(self.wait_time)

        # wait for nodes to be in READY state
        wait_for_nodes_status(timeout=300)
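
A short follow-up sketch (an assumed caller, reusing names from the other examples): explicitly confirming the new workers joined and reached Ready after add_nodes() returns.

        # Hypothetical post-scale-up check
        worker_nodes = get_worker_nodes()
        logger.info(f"Worker nodes after scale-up: {worker_nodes}")
        wait_for_nodes_status(
            node_names=worker_nodes, status=constants.NODE_READY, timeout=800
        )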