def test_rolling_shutdown_and_recovery(self, nodes, pvc_factory,
                                           pod_factory, bucket_factory,
                                           rgw_bucket_factory):
        """
        Test rolling shutdown and recovery of OCS worker nodes

        """
        SECONDS_TO_WAIT = 180
        # Get OCS worker node objects
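        # (Assumption about the helper: get_ocs_nodes is expected to return the worker
        # nodes that carry the OCS storage label, e.g. 'cluster.ocs.openshift.io/openshift-storage'.)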
        ocs_node_objs = get_ocs_nodes()

        # Start rolling shutdown and recovery of OCS worker nodes
        log.info("ShutDown OCS worker")
        for node_obj in ocs_node_objs:
            nodes.stop_nodes(nodes=[node_obj])
            log.info(
                f"Keeping node in stopped state for {SECONDS_TO_WAIT} seconds")
            time.sleep(SECONDS_TO_WAIT)
            nodes.start_nodes(nodes=[node_obj])
            self.sanity_helpers.health_check(cluster_check=False, tries=60)
            log.info("Checking storage pods status")
            # Validate storage pods are running
            wait_for_pods_to_be_running(timeout=600)

        # Check basic cluster functionality by creating some resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
    def test_add_capacity_node_restart(
        self,
        nodes,
        multi_pvc_factory,
        pod_factory,
        workload_storageutilization_rbd,
        num_of_nodes,
    ):
        """
        Test add capacity when one of the worker nodes is restarted in the middle of the process.
        """
        logging.info(
            "Condition 1 to start the test is met: storageutilization is completed"
        )
        # Please note: when the branch 'wip-add-capacity-e_e' is merged into master,
        # the test will include much more data both before and after calling the 'add_capacity' function.

        node_list = get_ocs_nodes(num_of_nodes=num_of_nodes)
        assert node_list, "Condition 2 to start test failed: No node to restart"

        max_osds = 15
        osd_pods_before = pod_helpers.get_osd_pods()
        assert (
            len(osd_pods_before) < max_osds
        ), "Condition 3 to start test failed: the cluster already has the maximum number of OSDs"
        logging.info("All start conditions are met!")

        osd_size = storage_cluster.get_osd_size()
        logging.info("Calling add_capacity function...")
        result = storage_cluster.add_capacity(osd_size)
        if result:
            logging.info("add capacity finished successfully")
        else:
            logging.info("add capacity failed")

        # Restart nodes while additional storage is being added
        logging.info("Restart nodes:")
        logging.info([n.name for n in node_list])
        nodes.restart_nodes(nodes=node_list, wait=True)
        logging.info("Finished restarting the node list")

        # The exit criteria verification conditions here are not complete. When the branch
        # 'wip-add-capacity-e_e' is merged into master, the functions from that branch will be used.

        pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
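        # Assumed semantics (not confirmed here): add_capacity returns the new
        # storageDeviceSets count, and with the default replica of 3 each device
        # set corresponds to 3 OSD pods - hence resource_count=result * 3 below.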
        pod.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector="app=rook-ceph-osd",
            resource_count=result * 3,
        )

        # Verify OSDs are encrypted
        if config.ENV_DATA.get("encryption_at_rest"):
            osd_encryption_verification()

        logging.info("Finished verifying add capacity osd storage with node restart")
        logging.info("Waiting for ceph health check to finished...")
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=180)
Example #3
def check_node_replacement_verification_steps(old_node_name, new_node_name,
                                              old_osd_node_names, old_osd_ids):
    """
    Check if the node replacement verification steps finished successfully.

    Args:
        old_node_name (str): The name of the old node that has been deleted
        new_node_name (str): The name of the new node that has been created
        old_osd_node_names (list): The names of the original osd nodes
        old_osd_ids (list): List of the old osd ids

    Raises:
        AssertionError: If the node replacement verification steps failed.

    """
    min_osd_nodes = 3
    num_of_old_osd_nodes = len(old_osd_node_names)
    ocs_nodes = node.get_ocs_nodes()
    num_of_old_ocs_nodes = len(ocs_nodes)

    if num_of_old_osd_nodes <= min_osd_nodes:
        log.info(
            f"We have {num_of_old_osd_nodes} osd nodes in the cluster - which is the minimum number "
            f"of osd nodes. Wait for the newly created worker node to appear in the osd nodes"
        )
        timeout = 1500
        new_osd_node_name = node.wait_for_new_osd_node(old_osd_node_names,
                                                       timeout)
        assert new_osd_node_name, (
            f"New osd node not found after the node replacement process "
            f"while waiting for {timeout} seconds")
    elif num_of_old_osd_nodes < num_of_old_ocs_nodes:
        num_of_extra_old_ocs_nodes = num_of_old_ocs_nodes - num_of_old_osd_nodes
        log.info(
            f"We have {num_of_extra_old_ocs_nodes} existing extra OCS worker nodes in the cluster. "
            f"Wait for one of the existing OCS nodes to appear in the osd nodes"
        )
        timeout = 600
        new_osd_node_name = node.wait_for_new_osd_node(old_osd_node_names,
                                                       timeout)
        assert new_osd_node_name, (
            f"New osd node not found after the node replacement process "
            f"while waiting for {timeout} seconds")
    else:
        log.info(
            f"We have more than {min_osd_nodes} osd nodes in the cluster, and we don't have "
            f"any extra OCS worker nodes in the cluster. Don't wait for a new osd node"
        )
        new_osd_node_name = None

    assert node.node_replacement_verification_steps_ceph_side(
        old_node_name, new_node_name, new_osd_node_name)
    assert node.node_replacement_verification_steps_user_side(
        old_node_name, new_node_name, new_osd_node_name, old_osd_ids)
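
# Illustrative usage sketch (hypothetical; the helper names other than
# check_node_replacement_verification_steps are assumptions and may not match
# the real ocs-ci API):
#
#     old_osd_node_names = get_osd_node_names()            # hypothetical helper
#     old_osd_ids = get_current_osd_ids()                  # hypothetical helper
#     new_node_name = replace_failed_node(old_node_name)   # hypothetical helper
#     check_node_replacement_verification_steps(
#         old_node_name, new_node_name, old_osd_node_names, old_osd_ids
#     )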
Example #4
    def get_osd_info(self):
        """
        Getting the OSD's information and update the main environment
        dictionary.

        """
        ct_pod = pod.get_ceph_tools_pod()
        osd_info = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd df")
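        # The JSON output of 'ceph osd df' roughly looks like (assumed structure):
        #   {"nodes": [{"name": "osd.0", "crush_weight": ..., "kb_avail": ..., ...}, ...],
        #    "summary": {"total_kb": ..., "total_kb_avail": ..., ...}}
        # which is what the lookups below rely on.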
        self.environment["osd_size"] = osd_info.get("nodes")[0].get("crush_weight")
        self.environment["osd_num"] = len(osd_info.get("nodes"))
        self.environment["total_capacity"] = osd_info.get("summary").get(
            "total_kb_avail"
        )
        self.environment["ocs_nodes_num"] = len(node.get_ocs_nodes())
    def test_toleration(self):
        """
        1. Check if nodes are tainted
        2. Taint ocs nodes if not tainted
        3. Check for tolerations on all pods
        4. Respin all ocs pods and check if they run on ocs nodes
        5. Untaint nodes

        """
        # taint nodes if not already tainted
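        # (Assumption: taint_nodes applies the standard OCS taint, typically
        # node.ocs.openshift.io/storage=true with effect NoSchedule, and skips
        # nodes that already carry it.)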
        nodes = get_ocs_nodes()
        taint_nodes(nodes)

        # Check tolerations on pods under openshift-storage
        check_toleration_on_pods()

        # Respin all pods and check that they are still running
        pod_list = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        for pod in pod_list:
            pod.delete(wait=False)
        assert wait_for_pods_to_be_running(timeout=300)
Example #6
    def test_check_pods_status_after_node_failure(self, nodes,
                                                  node_restart_teardown):
        """
        Test pod statuses after a node failure event.
        All the rook ceph pods should be in "Running" or "Completed"
        state after a node failure event.

        """
        ocs_nodes = get_ocs_nodes()
        if not ocs_nodes:
            pytest.skip("We don't have ocs nodes in the cluster")

        ocs_node = random.choice(ocs_nodes)
        node_name = ocs_node.name
        log.info(f"Selected node is '{node_name}'")
        # Save the rook ceph pods, the osd ids, and the mon ids before shutting down the node
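        # (These are used later to check the pods that were not on the failed node,
        # and to exclude the osd/mon pods whose ids belonged to that node.)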
        rook_ceph_pod_names_not_in_node = get_rook_ceph_pod_names_not_in_node(
            node_name)
        node_osd_ids = get_node_osd_ids(node_name)
        node_mon_ids = get_node_mon_ids(node_name)

        log.info(f"Shutting down node '{node_name}'")
        nodes.stop_nodes([ocs_node])
        wait_for_nodes_status(node_names=[node_name],
                              status=constants.NODE_NOT_READY)
        log.info(
            f"The node '{node_name}' reached '{constants.NODE_NOT_READY}' status"
        )

        log.info("Wait for a change in the rook ceph pod statuses...")
        timeout = 480
        is_rook_ceph_pods_status_changed = wait_for_change_in_rook_ceph_pods(
            node_name, timeout=timeout)
        assert (
            is_rook_ceph_pods_status_changed
        ), f"Rook Ceph pods status didn't change after {timeout} seconds"

        log.info(
            "Check the rook ceph pods are in 'Running' or 'Completed' state")
        timeout = 480
        are_pods_running = wait_for_pods_to_be_running(
            pod_names=rook_ceph_pod_names_not_in_node,
            timeout=timeout,
            sleep=30)
        assert are_pods_running, f"The pods are not 'Running' after {timeout} seconds"

        # Get the rook ceph pod names, excluding the osd and mon pods that carry the old node's ids
        osd_pods = get_osd_pods()
        new_node_osd_id_names_set = {
            p.name
            for p in osd_pods if get_osd_pod_id(p) in node_osd_ids
        }
        mon_pods = get_mon_pods()
        new_node_mon_id_names_set = {
            p.name
            for p in mon_pods if get_mon_pod_id(p) in node_mon_ids
        }

        new_node_osd_mon_id_names_set = new_node_osd_id_names_set.union(
            new_node_mon_id_names_set)
        rook_ceph_pod_names_set = set(get_rook_ceph_pod_names())
        new_rook_ceph_pod_names = list(rook_ceph_pod_names_set -
                                       new_node_osd_mon_id_names_set)

        log.info(
            "Verify that the new rook ceph pods are in 'Running' or 'Completed' state"
        )
        timeout = 300
        are_new_pods_running = wait_for_pods_to_be_running(
            pod_names=new_rook_ceph_pod_names, timeout=timeout, sleep=20)
        assert (are_new_pods_running
                ), f"The new pods are not 'Running' after {timeout} seconds"

        log.info("All the pods are in 'Running' or 'Completed' state")
        log.info(f"Starting the node '{node_name}' again...")
        nodes.start_nodes(nodes=[ocs_node])
        wait_for_nodes_status(node_names=[node_name])

        log.info(
            "Waiting for all the pods to be running and cluster health to be OK..."
        )
        wait_for_pods_to_be_running(timeout=600)
        self.sanity_helpers.health_check(tries=40)
Example #7
    def test_check_pods_status_after_node_failure(self, nodes, node_restart_teardown):
        """
        Test pod statuses after a node failure event.
        All the rook ceph pods should be in "Running" or "Completed"
        state after a node failure event.

        """
        ocs_nodes = get_ocs_nodes()
        if not ocs_nodes:
            pytest.skip("We don't have ocs nodes in the cluster")

        wnodes = get_worker_nodes()

        ocs_node = random.choice(ocs_nodes)
        node_name = ocs_node.name
        log.info(f"Selected node is '{node_name}'")
        # Save the rook ceph pods, the osd ids, and the mon ids before shutting down the node
        rook_ceph_pod_names_not_in_node = get_rook_ceph_pod_names_not_in_node(node_name)
        node_osd_ids = get_node_osd_ids(node_name)
        node_mon_ids = get_node_mon_ids(node_name)

        log.info(f"Shutting down node '{node_name}'")
        nodes.stop_nodes([ocs_node])
        wait_for_nodes_status(node_names=[node_name], status=constants.NODE_NOT_READY)
        log.info(f"The node '{node_name}' reached '{constants.NODE_NOT_READY}' status")

        log.info("Wait for a change in the rook ceph pod statuses...")
        timeout = 480
        is_rook_ceph_pods_status_changed = wait_for_change_in_rook_ceph_pods(
            node_name, timeout=timeout
        )
        assert (
            is_rook_ceph_pods_status_changed
        ), f"Rook Ceph pods status didn't change after {timeout} seconds"

        log.info("Check the rook ceph pods are in 'Running' or 'Completed' state")
        previous_timeout = 480
        timeout = 600
        are_pods_running = wait_for_pods_to_be_running(
            pod_names=rook_ceph_pod_names_not_in_node, timeout=timeout, sleep=30
        )
        assert are_pods_running, (
            f"The timeout was increased from {previous_timeout} to {timeout} seconds, "
            f"but the pods are still not 'Running' after {timeout} seconds"
        )

        # Get the rook ceph pod names, excluding the osd and mon pods that carry the old node's ids
        osd_pods = get_osd_pods()
        new_node_osd_id_names_set = {
            p.name for p in osd_pods if get_osd_pod_id(p) in node_osd_ids
        }
        mon_pods = get_mon_pods()
        new_node_mon_id_names_set = {
            p.name for p in mon_pods if get_mon_pod_id(p) in node_mon_ids
        }

        new_node_osd_mon_id_names_set = new_node_osd_id_names_set.union(
            new_node_mon_id_names_set
        )
        rook_ceph_pod_names_set = set(get_rook_ceph_pod_names())
        new_rook_ceph_pod_names = list(
            rook_ceph_pod_names_set - new_node_osd_mon_id_names_set
        )

        log.info(
            "Verify that the new rook ceph pods are in 'Running' or 'Completed' state"
        )
        timeout = 300
        are_new_pods_running = wait_for_pods_to_be_running(
            pod_names=new_rook_ceph_pod_names, timeout=timeout, sleep=20
        )
        assert (
            are_new_pods_running
        ), f"The new pods are not 'Running' after {timeout} seconds"

        log.info("All the pods are in 'Running' or 'Completed' state")

        if is_managed_service_cluster():
            log.info(
                "When we use the managed service, the worker node should recover automatically "
                "by starting the node or removing it, and creating a new one."
                "Waiting for all the worker nodes to be ready..."
            )
            wait_for_node_count_to_reach_status(node_count=len(wnodes), timeout=900)
            log.info("Waiting for all the pods to be running")
            assert check_pods_after_node_replacement(), "Not all the pods are running"
        else:
            log.info(f"Starting the node '{node_name}' again...")
            nodes.start_nodes(nodes=[ocs_node])
            wait_for_nodes_status(node_names=[node_name])
            log.info("Waiting for all the pods to be running")
            wait_for_pods_to_be_running(timeout=600)

        log.info("Checking that the cluster health is OK...")
        self.sanity_helpers.health_check(tries=40)