def test_automated_recovery_from_stopped_node_and_start(
        self, nodes, additional_node
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI

        0) A - add new node, B - don't add new node
        1) Stop node
        2) Validate result:
             A - pods should respin on the new node
             B - pods should remain in Pending state on the stopped node
        3) Start node
        4) Validate result:
             A - pods should start on the new node
             B - pods should start on the stopped node after starting it

        Args:
            nodes: Platform nodes helper; its stop_nodes/start_nodes methods
                are used to power the OSD node off and on
            additional_node: When truthy, a new storage node is added before
                the OSD node is stopped (case A)

        """
        # Record machineset details on the instance
        # (presumably consumed by teardown - confirm against the class)
        wnode_name = get_worker_nodes()[0]
        machine_name = machine.get_machine_from_node_name(wnode_name)
        self.machineset_name = machine.get_machineset_from_machine_name(machine_name)
        self.start_ready_replica_count = machine.get_ready_replica_count(
            self.machineset_name
        )

        temp_osd = get_osd_pods()[0]
        # Pod name without its last dash-separated segment; used to match the
        # replacement pod against the original one
        osd_real_name = "-".join(temp_osd.name.split("-")[:-1])
        self.osd_worker_node = [get_pod_node(temp_osd)]
        if additional_node:
            self.add_new_storage_node(self.osd_worker_node[0].name)
            self.extra_node = True
        nodes.stop_nodes(self.osd_worker_node, wait=True)
        log.info(f"Successfully powered off node: {self.osd_worker_node[0].name}")

        timeout = 420
        assert wait_for_rook_ceph_pod_status(
            temp_osd, constants.STATUS_TERMINATING, timeout
        ), (
            f"The pod {osd_real_name} didn't reach the status {constants.STATUS_TERMINATING} "
            f"after {timeout} seconds"
        )

        # Validate that the OSD in terminate state has a new OSD in Pending
        all_pod_obj = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        new_osd = next(
            (
                pod_obj
                for pod_obj in all_pod_obj
                if osd_real_name == "-".join(pod_obj.name.split("-")[:-1])
                and temp_osd.name != pod_obj.name
            ),
            None,
        )

        nodes.start_nodes(nodes=self.osd_worker_node, wait=True)
        log.info(f"Successfully powered on node: {self.osd_worker_node[0].name}")

        # Checked only after the node is powered back on so that a missing
        # replacement pod does not leave the node stopped
        assert new_osd is not None, (
            f"No replacement pod was found for the terminating OSD {osd_real_name}"
        )
        wait_for_resource_state(new_osd, constants.STATUS_RUNNING, timeout=180)
        if additional_node:
            new_osd_node = get_pod_node(new_osd)
            assert (
                new_osd_node.name != self.osd_worker_node[0].name
            ), "New OSD is expected to run on the new additional node"
Esempio n. 2
0
def noobaa_running_node_restart(pod_name):
    """
    Restart the node that hosts a noobaa pod and wait for the cluster to recover

    The first pod whose name matches ``pod_name`` in the openshift-storage
    namespace is picked; its node is restarted by stop and start, then node
    status, Ceph health and the pod's Running state are verified.

    Args:
        pod_name (str): Name of noobaa pod

    """
    namespace = constants.OPENSHIFT_STORAGE_NAMESPACE
    matching_pod_names = get_pod_name_by_pattern(
        pattern=pod_name, namespace=namespace
    )
    nb_pod_obj = pod.get_pod_obj(matching_pod_names[0], namespace=namespace)
    nb_node_name = pod.get_pod_node(nb_pod_obj).name
    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()
    # Pass a list of names (not a bare string) so the lookup matches whole
    # node names, as other callers of get_node_objs do
    nb_nodes = get_node_objs(node_names=[nb_node_name])
    log.info(f"{pod_name} is running on {nb_node_name}")
    log.info(f"Restating node: {nb_node_name}....")
    nodes.restart_nodes_by_stop_and_start(nodes=nb_nodes, force=True)

    # Validate nodes are up and running
    wait_for_nodes_status()
    ceph_health_check(tries=30, delay=60)
    helpers.wait_for_resource_state(
        nb_pod_obj, constants.STATUS_RUNNING, timeout=180
    )
    def mgr_pod_node_restart(self):
        """
        Restart the node hosting the first mgr pod and wait for the
        mgr, mon and osd pods to be Running again
        """
        mgr_node = pod.get_pod_node(pod.get_mgr_pods()[0])

        self.nodes.restart_nodes([mgr_node])

        wait_for_nodes_status()

        # Check for Ceph pods: (selector, extra wait_for_resource arguments)
        ceph_pods = ocp.OCP(
            kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
        )
        expectations = (
            ("app=rook-ceph-mgr", {}),
            ("app=rook-ceph-mon", {"resource_count": 3}),
            ("app=rook-ceph-osd", {"resource_count": 3}),
        )
        for selector, extra_kwargs in expectations:
            assert ceph_pods.wait_for_resource(
                condition="Running", selector=selector, timeout=600, **extra_kwargs
            )
    def test_monitoring_after_rebooting_node_where_mgr_is_running(self):
        """
        Verify that rebooting the node hosting the mgr pod does not lose
        the data already collected on the prometheus pod
        """

        aws_obj = aws.AWS()

        # Locate the node hosting the first mgr pod
        mgr_pods = pod.get_mgr_pods()
        mgr_node = pod.get_pod_node(mgr_pods[0])

        # Reboot that node through its EC2 instance
        ec2_instances = aws.get_instances_ids_and_names([mgr_node])
        aws_obj.restart_ec2_instances(
            instances=ec2_instances, wait=True, force=True
        )

        # All nodes must be back in READY state
        wait_for_nodes_status()

        # Cluster health must be OK
        self.sanity_helpers.health_check()

        # Ceph health metrics must reflect the new mgr pod
        wait_to_update_mgrpod_info_prometheus_pod()

        # Metrics of the previously created PVCs must still be available
        for pod_obj in self.pod_objs:
            collected = check_pvcdata_collected_on_prometheus(pod_obj.pvc.name)
            assert collected, (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
Esempio n. 5
0
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        Args:
            node_type: Type of node to put into maintenance, passed to
                get_typed_nodes
            pvc_factory: Factory passed to the sanity helpers for PVC creation
            pod_factory: Factory passed to the sanity helpers for pod creation

        """
        # Get a list of 2 nodes. Pick one of them after checking
        # which one doesn't have the rook operator running on
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=2)
        # Fail with a clear message instead of an IndexError on an empty list
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name
        # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
        rook_operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(rook_operator_pod)
        if operator_node.get().get('metadata').get('name') == typed_node_name:
            typed_node_name = typed_nodes[1].name
        # End of workaround for BZ 1778488

        # Maintenance the node (unschedule and drain)
        drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Esempio n. 6
0
    def disrupt_plugin_provisioner_pods(self, node_list):
        """
        Prepare Disruptions objects for the leader plugin-provisioner pods,
        leaving out any leader that runs on a node from node_list

        Args:
            node_list (list): list of node names to check

        Returns:
            list: list of Disruption objects

        """
        # First pass: decide which provisioner resources are eligible
        selected_resources = []
        for interface in (constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM):
            leader_pod = pod.get_plugin_provisioner_leader(interface=interface)
            if pod.get_pod_node(leader_pod).name in node_list:
                continue
            selected_resources.append(
                "rbdplugin_provisioner"
                if interface == constants.CEPHBLOCKPOOL
                else "cephfsplugin_provisioner"
            )

        # Second pass: build one Disruptions object per selected resource
        disruptions = []
        for resource_name in selected_resources:
            disruption_obj = disruption_helpers.Disruptions()
            disruption_obj.set_resource(resource=resource_name)
            disruptions.append(disruption_obj)
        return disruptions
    def test_ceph_csidriver_runs_on_non_ocs_nodes(
        self, pvc_factory, pod_factory, add_nodes
    ):
        """
        1. Add non ocs nodes
        2. Taint new nodes with app label
        3. Check if plugin pods running on new nodes
        4. Create app-pods on app_nodes

        Args:
            pvc_factory: Factory called without arguments to create a PVC
            pod_factory: Factory called with the PVC and a node name to
                create an app pod
            add_nodes: Factory used to add the non-OCS, tainted worker nodes

        """

        # Add worker nodes and tainting it as app_nodes
        add_nodes(ocs_nodes=False, taint_label="nodetype=app:NoSchedule")

        # Checks for new plugin pod respinning on new app-nodes
        app_nodes = [node.name for node in get_worker_nodes_not_in_ocs()]
        interfaces = [constants.CEPHFILESYSTEM, constants.CEPHBLOCKPOOL]
        logger.info("Checking for plugin pods on non-ocs worker nodes")
        for interface in interfaces:
            pod_objs = get_plugin_pods(interface)
            for pod_obj in pod_objs:
                node_obj = get_pod_node(pod_obj)
                try:
                    if node_obj.name in app_nodes:
                        logger.info(
                            f"The plugin pod {pod_obj.name} is running on app_node {node_obj.name}"
                        )
                except Exception as e:
                    # Use the same logger as the rest of the test rather than
                    # the root "logging" module
                    logger.info(f"Plugin pod was not found on {node_obj.name} - {e}")

        # Creates app-pods on app-nodes
        for node in app_nodes:
            pvc_obj = pvc_factory()
            pod_factory(pvc=pvc_obj, node_name=node)
Esempio n. 8
0
        def finalizer():
            """
            Teardown: recover NotReady nodes and nodes hosting crash-looping
            OSD pods, verify OSD encryption when enabled, and clear crash
            warnings and OSD removal leftovers
            """
            not_ready_nodes = [
                n for n in node.get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_NOT_READY
            ]
            if not_ready_nodes:
                # Warn only when there is something to report
                logger.warning(
                    f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

            # Restart node if the osd stays at CLBO state
            # NOTE(review): compares the first container's "state" entry
            # directly with constants.STATUS_CLBO - confirm this matches the
            # structure returned by the pod's get()
            for osd_pod in get_osd_pods():
                container_state = (
                    osd_pod.get().get("status").get("containerStatuses")[0].get("state")
                )
                if container_state == constants.STATUS_CLBO:
                    node_obj = get_pod_node(osd_pod)
                    nodes.restart_nodes([node_obj])
                    node.wait_for_nodes_status([node_obj.name])

            # Verify OSD encrypted
            if config.ENV_DATA.get("encryption_at_rest"):
                osd_encryption_verification()

            logger.info("Clear crash warnings and osd removal leftovers")
            clear_crash_warning_and_osd_removal_leftovers()
    def test_rgw_host_node_failure(
        self, nodes, node_restart_teardown, mcg_obj, bucket_factory
    ):
        """
        Test case to fail node where RGW and Noobaa-db-0 hosting
        and verify new pod spuns on healthy node

        Args:
            nodes: Platform nodes helper used to stop and start the node
            node_restart_teardown: Fixture requested for its teardown only;
                not referenced in the test body
            mcg_obj: MCG object passed to create_obc_creation
            bucket_factory: Bucket factory passed to create_obc_creation

        """
        # Get rgw pods
        rgw_pod_obj = get_rgw_pods()

        # Get noobaa pods
        noobaa_pod_obj = get_noobaa_pods()

        # Get the node where noobaa-db hosted
        noobaa_pod_node = None
        for noobaa_pod in noobaa_pod_obj:
            if noobaa_pod.name == "noobaa-db-0":
                noobaa_pod_node = get_pod_node(noobaa_pod)
        # Fail explicitly instead of hitting a NameError further down
        assert noobaa_pod_node is not None, (
            "Pod noobaa-db-0 was not found among the noobaa pods"
        )

        for rgw_pod in rgw_pod_obj:
            pod_node = rgw_pod.get().get("spec").get("nodeName")
            if pod_node == noobaa_pod_node.name:
                # Stop the node
                log.info(
                    f"Stopping node {pod_node} where"
                    f" rgw pod {rgw_pod.name} and noobaa-db-0 hosted"
                )
                node_obj = get_node_objs(node_names=[pod_node])
                nodes.stop_nodes(node_obj)

                # Validate old rgw pod went terminating state
                wait_for_resource_state(
                    resource=rgw_pod, state=constants.STATUS_TERMINATING, timeout=720
                )

                # Validate new rgw pod spun
                ocp_obj = OCP(
                    kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
                )
                ocp_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    resource_count=len(rgw_pod_obj),
                    selector=constants.RGW_APP_LABEL,
                )

                # Create OBC and read and write
                self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-1")

                # Start the node
                nodes.start_nodes(node_obj)

                # Create OBC and read and write
                self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-2")

        # Verify cluster health
        self.sanity_helpers.health_check()

        # Verify all storage pods are running
        wait_for_storage_pods()
Esempio n. 10
0
def get_osd_running_nodes():
    """
    Collect the name of the node hosting each OSD pod

    Returns:
        list: OSD node names, one entry per OSD pod

    """
    osd_node_names = []
    for osd_pod in pod.get_osd_pods():
        hosting_node = pod.get_pod_node(osd_pod)
        osd_node_names.append(hosting_node.name)
    return osd_node_names
Esempio n. 11
0
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, pvc_factory, pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI

        Args:
            pvc_factory: Factory passed to the sanity helpers for PVC creation
            pod_factory: Factory passed to the sanity helpers for pod creation

        """
        # Get the osd associated node name
        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"{osd_node_name} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(f"{osd_node_name} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Check the pods should be in running state
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
            # Skip both '-1-deploy' and 'ocs-deviceset' pods. The previous
            # "'-1-deploy' and 'ocs-deviceset' not in name" expression only
            # tested the second substring.
            if '-1-deploy' in pod_obj.name or 'ocs-deviceset' in pod_obj.name:
                continue
            try:
                wait_for_resource_state(resource=pod_obj,
                                        state=constants.STATUS_RUNNING,
                                        timeout=200)
            # 'rook-ceph-crashcollector' on the failed node stucks at pending
            # state. BZ 1810014 tracks it.
            # Ignoring 'rook-ceph-crashcollector' pod health check as WA and
            # deleting its deployment so that the pod disappears
            # Will revert this WA once the BZ is fixed
            except ResourceWrongStatusException:
                if 'rook-ceph-crashcollector' in pod_obj.name:
                    ocp_obj = ocp.OCP()
                    # Drop the last 17 characters of the pod name to get the
                    # deployment name (presumably the replicaset hash and pod
                    # suffix - TODO confirm)
                    name = pod_obj.name[:-17]
                    command = f"delete deployment {name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")
                else:
                    # Still tolerated as before, but no longer silent
                    log.warning(
                        f"Pod {pod_obj.name} did not reach "
                        f"{constants.STATUS_RUNNING} state"
                    )

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Esempio n. 12
0
def get_osds_per_node():
    """
    Group the OSD pod names by the name of the node they run on

    Returns:
        dict: {"Node name":["osd running pod name running on the node",..,]}

    """
    osds_by_node = defaultdict(list)
    for osd in pod.get_osd_pods():
        hosting_node_name = pod.get_pod_node(osd).name
        osds_by_node[hosting_node_name].append(osd.name)
    return osds_by_node
Esempio n. 13
0
    def get_pgbench_running_nodes(self):
        """
        Collect the distinct names of the nodes hosting pgbench pods

        Returns:
            list: List of pgbench running nodes

        """
        unique_node_names = set()
        for pgbench_pod in self.get_pgbench_pods():
            unique_node_names.add(get_pod_node(pgbench_pod).name)
        return list(unique_node_names)
def select_osd_node_name():
    """
    Pick one OSD pod at random and return the name of its node

    Returns:
        str: the selected osd node name

    """
    chosen_osd_pod = random.choice(pod.get_osd_pods())
    osd_node_name = pod.get_pod_node(chosen_osd_pod).name
    log.info(f"Selected OSD is {osd_node_name}")
    return osd_node_name
Esempio n. 15
0
def get_app_pod_running_nodes(pod_obj):
    """
    Collect the name of the node hosting each of the given app pods

    Args:
        pod_obj (list): List of app pod objects

    Returns:
        list: App pod running node names

    """
    node_names = []
    for app_pod in pod_obj:
        node_names.append(pod.get_pod_node(app_pod).name)
    return node_names
    def test_monitoring_after_rebooting_node_where_mgr_is_running(
            self, nodes, pods):
        """
        Test case to validate rebooting a node where mgr is running
        should not delete the data collected on prometheus pod

        Args:
            nodes: Platform nodes helper used to restart the node
            pods: Iterable of pod objects whose PVC metrics are checked on
                prometheus after the reboot

        """

        # Get the mgr pod obj
        mgr_pod_obj = pod.get_mgr_pods()

        # Get the node where the mgr pod is hosted
        mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

        # Reboot the node where the mgr pod is hosted
        nodes.restart_nodes([mgr_node_obj])

        # Validate all nodes are in READY state.
        # The function itself is handed to retry() and the wrapped result is
        # then called; calling wait_for_nodes_status() first would bypass
        # the retry entirely.
        retry((CommandFailed, ResourceWrongStatusException),
              tries=20,
              delay=15)(wait_for_nodes_status)()

        # Check for Ceph pods
        pod_obj = ocp.OCP(kind=constants.POD,
                          namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        assert pod_obj.wait_for_resource(condition="Running",
                                         selector="app=rook-ceph-mgr",
                                         timeout=600)
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-mon",
            resource_count=3,
            timeout=600,
        )
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-osd",
            resource_count=3,
            timeout=600,
        )

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check(tries=40)

        # Check for ceph health check metrics is updated with new mgr pod
        wait_to_update_mgrpod_info_prometheus_pod()

        # Check for the created pvc metrics after rebooting the node where mgr pod was running
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
Esempio n. 17
0
def get_ocs_operator_node_name():
    """
    Look up the ocs-operator pod and return the name of the node it runs on

    Returns:
        str: node's name that running ocs-operator pod

    """
    ocs_operator_pod = get_ocs_operator_pod()
    log.debug(f"ocs operator pod info: {ocs_operator_pod}")
    return get_node_name(get_pod_node(ocs_operator_pod))
Esempio n. 18
0
def node_replacement_verification_steps_ceph_side(old_node_name,
                                                  new_node_name):
    """
    Run the Ceph-side verification steps that follow a node replacement,
    as described in the docs

    Args:
        old_node_name (str): The name of the old node that has been deleted
        new_node_name (str): The name of the new node that has been created

    Returns:
        bool: True if all the verification steps passed. False otherwise

    """
    if old_node_name == new_node_name:
        log.warning("Hostname didn't change")
        return False

    wait_for_nodes_status([new_node_name])
    # The ocs pods may need a while to come up after the node replacement
    all_pods_running = pod.wait_for_pods_to_be_running()
    if not all_pods_running:
        log.warning("Not all the pods in running state")
        return False

    # 'ceph osd status' must mention the new node and not the old one
    osd_status_output = pod.get_ceph_tools_pod().exec_ceph_cmd(
        ceph_cmd="ceph osd status"
    )
    if new_node_name not in osd_status_output:
        log.warning("new node name not found in 'ceph osd status' output")
        return False
    if old_node_name in osd_status_output:
        log.warning("old node name found in 'ceph osd status' output")
        return False

    # The same must hold for the nodes hosting the OSD pods
    osd_hosting_nodes = [
        pod.get_pod_node(osd_pod).name for osd_pod in pod.get_osd_pods()
    ]
    if new_node_name not in osd_hosting_nodes:
        log.warning("the new hostname not found in osd node names")
        return False
    if old_node_name in osd_hosting_nodes:
        log.warning("the old hostname found in osd node names")
        return False

    from ocs_ci.ocs.cluster import check_ceph_osd_tree_after_node_replacement

    osd_tree_ok = check_ceph_osd_tree_after_node_replacement()
    if not osd_tree_ok:
        return False

    log.info("Verification steps from the ceph side finish successfully")
    return True
Esempio n. 19
0
    def test_nodereplacement_proactive(self, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-894 Node Replacement proactive

        Runs IO from dc pods on every worker node except the selected OSD
        node, replaces the OSD node (AWS IPI/UPI) and then checks the
        cluster with the sanity helpers.

        """

        # Get worker nodes
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        osd_node_name = pod.get_pod_node(random.choice(pod.get_osd_pods())).name
        log.info(f"Selected OSD is {osd_node_name}")

        # Worker nodes that will host the IO pods (everything but the OSD node)
        io_worker_nodes = [
            worker for worker in worker_node_list if worker != osd_node_name
        ]

        log.info("Creating dc pod backed with rbd pvc and running io in bg")
        for worker_node in io_worker_nodes:
            rbd_dc_pod = dc_pod_factory(
                interface=constants.CEPHBLOCKPOOL, node_name=worker_node, size=20
            )
            pod.run_io_in_bg(rbd_dc_pod, expect_to_fail=False, fedora_dc=True)

        log.info("Creating dc pod backed with cephfs pvc and running io in bg")
        for worker_node in io_worker_nodes:
            cephfs_dc_pod = dc_pod_factory(
                interface=constants.CEPHFILESYSTEM, node_name=worker_node, size=20
            )
            pod.run_io_in_bg(cephfs_dc_pod, expect_to_fail=False, fedora_dc=True)

        platform = config.ENV_DATA['platform'].lower()
        if platform == constants.AWS_PLATFORM:
            deployment_type = config.ENV_DATA['deployment_type']
            if deployment_type == 'ipi':
                node.delete_and_create_osd_node_aws_ipi(osd_node_name)
            elif deployment_type == 'upi':
                node.delete_and_create_osd_node_aws_upi(osd_node_name)
            else:
                pytest.fail(
                    f"ocs-ci config 'deployment_type' value '{config.ENV_DATA['deployment_type']}' is not valid, "
                    f"results of this test run are all invalid.")
        elif platform == constants.VSPHERE_PLATFORM:
            pytest.skip("Skipping add node in Vmware platform due to "
                        "https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
                        )

        # Create and then delete resources through the sanity helpers
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()
        # Final health verification
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=30)
Esempio n. 20
0
def get_node_pods(node_name, pods_to_search=None):
    """
    Return the pods that run on the given node

    Args:
        node_name (str): The node name to get the pods
        pods_to_search (list): list of pods to search for the node pods.
            If not specified, will search in all the pods.

    Returns:
        list: list of all the pods of the specified node

    """
    candidates = pods_to_search if pods_to_search else pod.get_all_pods()
    node_pods = []
    for candidate in candidates:
        if pod.get_pod_node(candidate).name == node_name:
            node_pods.append(candidate)
    return node_pods
    def restart_ocs_operator_node(self):
        """
        Restart the node hosting the OCS operator pod, then wait for the
        nodes and for the pod (looked up by its original name) to be running
        """

        operator_pod = pod.get_ocs_operator_pod()
        operator_node = pod.get_pod_node(operator_pod)

        self.nodes.restart_nodes([operator_node])

        wait_for_nodes_status()

        pod.wait_for_pods_to_be_running(
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            pod_names=[operator_pod.name],
        )
    def test_monitoring_shutdown_and_recovery_prometheus_node(
            self, nodes, pods):
        """
        Test case to validate whether shutdown and recovery of a
        node where monitoring pods running has no functional impact

        Args:
            nodes: Platform nodes helper used to stop and start the nodes
            pods: Iterable of pod objects whose PVC metrics are checked on
                prometheus after the recovery

        """
        # Get all prometheus pods
        prometheus_pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for prometheus_pod_obj in prometheus_pod_obj_list:
            # Get the node where the prometheus pod is hosted
            prometheus_node_obj = pod.get_pod_node(prometheus_pod_obj)

            # Shutdown and recovery node(i,e. restart nodes) where the prometheus pod is hosted
            nodes.stop_nodes([prometheus_node_obj])

            waiting_time = 20
            log.info(f"Waiting for {waiting_time} seconds")
            time.sleep(waiting_time)

            nodes.start_nodes(nodes=[prometheus_node_obj])

            # Validate all nodes are in READY state.
            # The function itself is handed to retry() and the wrapped result
            # is then called; calling wait_for_nodes_status() first would
            # bypass the retry entirely.
            retry((CommandFailed, ResourceWrongStatusException),
                  tries=20,
                  delay=15)(wait_for_nodes_status)()

        # Check all the prometheus pods are up
        for pod_obj in prometheus_pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check(tries=40)

        # Check for the created pvc metrics after shutdown and recovery of prometheus nodes
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
Esempio n. 23
0
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, pvc_factory, pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI

        Adds a new labeled node to the machineset of a randomly chosen OSD
        node, deletes that node's machine and verifies the cluster recovers.
        """
        # Pick a random OSD pod and resolve its node name
        osd_node_name = pod.get_pod_node(random.choice(pod.get_osd_pods())).name
        log.info(f"Selected OSD is {osd_node_name}")

        # Resolve the machine backing that node
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"{osd_node_name} associated machine is {machine_name}")

        # Resolve the machineset owning that machine
        machineset_name = machine.get_machineset_from_machine_name(machine_name)
        log.info(f"{osd_node_name} associated machineset is {machineset_name}")

        # Bring up a replacement node first, then remove the old machine
        add_new_node_and_label_it(machineset_name)
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Every pod is expected to reach Running state
        for pod_obj in pod.get_all_pods(wait=True):
            wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING, timeout=200
            )

        # Basic cluster functionality: create resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete them again
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Cluster and Ceph health checks
        self.sanity_helpers.health_check()
Esempio n. 24
0
    def test_node_maintenance_restart_activate(self, nodes, pvc_factory,
                                               pod_factory, node_type):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        Args:
            nodes: Platform nodes helper used to restart the node
            pvc_factory: Factory passed to the sanity helpers for PVC creation
            pod_factory: Factory passed to the sanity helpers for pod creation
            node_type: Type of node to put into maintenance, passed to
                get_typed_nodes

        """
        # Get a list of 2 nodes. Pick one of them after checking
        # which one doesn't have the rook operator running on
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node = typed_nodes[0]

        # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
        rook_operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(rook_operator_pod)
        if operator_node.get().get('metadata').get('name') == typed_node.name:
            typed_node = typed_nodes[1]
        # End of workaround for BZ 1778488
        typed_node_name = typed_node.name

        # Maintenance the node (unschedule and drain). The function contains logging
        drain_nodes([typed_node_name])

        # Restart only the drained node; restarting every node in typed_nodes
        # would also reboot the node the workaround above deliberately avoids
        nodes.restart_nodes(nodes=[typed_node], wait=True)

        wait_for_nodes_status(node_names=[typed_node_name],
                              status=constants.NODE_READY_SCHEDULING_DISABLED)
        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()
    def test_monitoring_when_one_of_the_prometheus_node_down(
            self, nodes, pods):
        """
        Test case to validate when the prometheus pod is down and its
        interaction with prometheus

        Args:
            nodes: Platform nodes helper used to restart the nodes
            pods: Iterable of pod objects whose PVC metrics are checked on
                prometheus after the restarts

        """

        # Get all prometheus pods
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:
            # Get the node where the prometheus pod is hosted
            pod_node_obj = pod.get_pod_node(pod_obj)

            # Make one of the node down where the prometheus pod is hosted
            nodes.restart_nodes([pod_node_obj])

            # Validate all nodes are in READY state.
            # The function itself is handed to retry() and the wrapped result
            # is then called; calling wait_for_nodes_status() first would
            # bypass the retry entirely.
            retry((CommandFailed, ResourceWrongStatusException),
                  tries=20,
                  delay=15)(wait_for_nodes_status)()

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check(tries=40)

        # Check all the prometheus pods are up
        for pod_obj in pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)

        # Check for the created pvc metrics after restarting node where prometheus pod is hosted
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
            log.info(
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is collected"
            )
Esempio n. 26
0
        def finalizer():
            """
            Recover nodes that were left in a bad state.

            Restarts every node reported as NotReady and waits on the nodes
            status, then restarts the node of any OSD pod whose first
            container is waiting in CrashLoopBackOff.
            """
            not_ready_nodes = [
                n for n in node.get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_NOT_READY
            ]
            if not_ready_nodes:
                # Warn only when there really are NotReady nodes to report
                logger.warning(
                    f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

            # Restart node if the osd stays at CLBO state
            for osd_pod in get_osd_pods():
                container_statuses = (
                    (osd_pod.get().get("status") or {}).get("containerStatuses")
                    or []
                )
                if not container_statuses:
                    # No container status reported yet, nothing to inspect
                    continue
                # Kubernetes reports the container state as a mapping such as
                # {"waiting": {"reason": "CrashLoopBackOff", ...}}, so the
                # reason has to be read from the "waiting" entry.
                # NOTE(review): assumes constants.STATUS_CLBO holds that
                # reason string - confirm
                state = container_statuses[0].get("state") or {}
                waiting_reason = (state.get("waiting") or {}).get("reason")
                if waiting_reason == constants.STATUS_CLBO:
                    node_obj = get_pod_node(osd_pod)
                    nodes.restart_nodes([node_obj])
                    node.wait_for_nodes_status([node_obj.name])
    def setup(self, interface, pvc_factory, service_account_factory,
              teardown_factory):
        """
        Create dc pod with replica 5

        Args:
            interface: Interface type, passed to ``pvc_factory`` and
                ``create_pod``
            pvc_factory: Callable creating the PVC used by the pods
            service_account_factory: Callable creating the service account
                in the PVC project
            teardown_factory: Callable registering the created
                DeploymentConfig for cleanup

        Raises:
            TimeoutExpiredError: Re-raised when the pods did not reach
                Running state and they are spread over more than one node

        """
        self.replica_count = 5
        pvc_obj = pvc_factory(interface=interface, size=3)
        sa_obj = service_account_factory(project=pvc_obj.project)
        try:
            pod1 = create_pod(
                interface_type=interface,
                pvc_name=pvc_obj.name,
                namespace=pvc_obj.namespace,
                sa_name=sa_obj.name,
                dc_deployment=True,
                replica_count=self.replica_count,
                deploy_pod_status=constants.STATUS_RUNNING,
            )
        except TimeoutExpiredError:
            # The test cannot be continued if all the pods are created on the same node
            pods = pod.get_all_pods(namespace=pvc_obj.namespace)
            pod_nodes = {pod.get_pod_node(pod_obj).name for pod_obj in pods}
            # Compare the number of distinct nodes; a set never equals an int
            if len(pod_nodes) == 1:
                pytest.skip(
                    "All pods are created on same node and reached Running state"
                )
            raise

        self.name = pod1.labels["name"]
        self.namespace = pod1.namespace

        dc_obj = OCP(
            kind=constants.DEPLOYMENTCONFIG,
            namespace=self.namespace,
            resource_name=self.name,
        )
        dc_info = dc_obj.get(resource_name=self.name,
                             selector=f"app={self.name}")["items"][0]

        # Wrap the raw DeploymentConfig data so it can be torn down later
        dc_obj = OCS(**dc_info)
        teardown_factory(dc_obj)
Esempio n. 28
0
    def test_delete_local_volume_sym_link(self):
        """
        Remove the local volume sym link on an LSO cluster and verify that
        the crashcollector pods, the storage pods and Ceph stay healthy.
        """
        # Collect the rook-ceph-crashcollector pod objects
        crashcollector_names = get_pod_name_by_pattern(
            pattern="rook-ceph-crashcollector",
            namespace=ROOK_CLUSTER_NAMESPACE)
        crashcollector_objs = [
            get_pod_obj(name=pod_name, namespace=ROOK_CLUSTER_NAMESPACE)
            for pod_name in crashcollector_names
        ]

        # The command is executed on the node of the first crashcollector pod
        target_node = get_pod_node(pod_obj=crashcollector_objs[0])

        # Resolve the local path of the PV bound to the first deviceset PVC
        first_pv_name = get_deviceset_pvcs()[0].data["spec"]["volumeName"]
        pv_data = ocp.OCP(
            namespace=ROOK_CLUSTER_NAMESPACE, kind=constants.PV
        ).get(resource_name=first_pv_name)
        sym_link_path = pv_data["spec"]["local"]["path"]

        log.info("Delete sym link")
        debug_runner = ocp.OCP(namespace=ROOK_CLUSTER_NAMESPACE)
        debug_runner.exec_oc_debug_cmd(
            node=target_node.name, cmd_list=[f"rm -rfv {sym_link_path}"])

        log.info(
            "Waiting for rook-ceph-crashcollector pods to be reach Running state"
        )
        for crashcollector_obj in crashcollector_objs:
            wait_for_resource_state(resource=crashcollector_obj,
                                    state=constants.STATUS_RUNNING)

        # All OCS pods are expected to end up Running or Completed
        wait_for_storage_pods()

        # Finally make sure Ceph reports a healthy cluster
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
Esempio n. 29
0
def measure_stop_worker_nodes(request, measurement_dir, nodes):
    """
    Stop every worker node except the one hosting the first mgr pod, run
    that as a measured operation, then start the nodes again and wait for
    the nodes and Ceph to recover.

    Args:
        request: Object providing ``addfinalizer`` for registering teardown
        measurement_dir: Directory where ``measure_stop_nodes.json`` is
            stored
        nodes: Platform nodes helper used to stop and start the nodes

    Returns:
        The result of ``measure_operation`` for the stop operation
            (described originally as a dict with information about
            `start` and `stop` time for stopping worker node)

    """
    mgr_node = pod.get_pod_node(pod.get_mgr_pods()[0])
    test_nodes = []
    for worker_node in get_nodes(node_type=constants.WORKER_MACHINE):
        if worker_node.name == mgr_node.name:
            # Leave the node of the mgr pod running
            continue
        test_nodes.append(worker_node)

    def stop_nodes():
        """
        Turn off test nodes for 5 minutes.

        Returns:
            list: Names of nodes that were turned down

        """
        # How long the nodes are kept down, in seconds
        run_time = 60 * 5
        node_names = [test_node.name for test_node in test_nodes]
        logger.info(f"Turning off nodes {node_names}")
        nodes.stop_nodes(nodes=test_nodes)
        # The nodes have to be reported NotReady before the wait starts
        wait_for_nodes_status(node_names=node_names, status=constants.NODE_NOT_READY)
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return node_names

    def finalizer():
        nodes.restart_nodes_by_stop_and_start_teardown()
        assert ceph_health_check(), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

    request.addfinalizer(finalizer)

    test_file = os.path.join(measurement_dir, "measure_stop_nodes.json")
    measure_kwargs = {}
    if config.ENV_DATA["platform"].lower() in constants.MANAGED_SERVICE_PLATFORMS:
        # It seems that it takes longer to propagate incidents to PagerDuty.
        # Adding 3 extra minutes
        measure_kwargs["minimal_time"] = 60 * 8
    measured_op = measure_operation(stop_nodes, test_file, **measure_kwargs)

    logger.info("Turning on nodes")
    try:
        nodes.start_nodes(nodes=test_nodes)
    except CommandFailed:
        logger.warning(
            "Nodes were not found: they were probably recreated. Check ceph health below"
        )

    # Keep retrying the status check until all nodes are READY and up
    wait_for_ready_nodes = retry(
        (CommandFailed, ResourceWrongStatusException), tries=60, delay=15
    )(wait_for_nodes_status)
    wait_for_ready_nodes(timeout=900)

    # wait for ceph to return into HEALTH_OK state after mgr deployment
    # is returned back to normal
    ceph_health_check(tries=20, delay=15)

    return measured_op
Esempio n. 30
0
    def test_recovery_from_volume_deletion(self, nodes, pvc_factory,
                                           pod_factory):
        """
        Test cluster recovery from disk deletion from the platform side.
        Based on documented procedure detailed in
        https://bugzilla.redhat.com/show_bug.cgi?id=1823183

        Args:
            nodes: Platform nodes helper used to look up and detach the
                backing volume of the chosen OSD PV
            pvc_factory: Passed to ``sanity_helpers.create_resources``
            pod_factory: Passed to ``sanity_helpers.create_resources``

        """
        logger.info("Picking a PV which to be deleted from the platform side")
        osd_pvs = get_deviceset_pvs()
        osd_pv = random.choice(osd_pvs)
        osd_pv_name = osd_pv.name
        # get the claim name
        logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
        claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

        # Get the backing volume name
        logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
        backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

        # Get the corresponding PVC
        logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
        osd_pvcs = get_deviceset_pvcs()
        osd_pvcs_count = len(osd_pvcs)
        osd_pvc = [
            ds for ds in osd_pvcs
            if ds.get().get("metadata").get("name") == claim_name
        ][0]

        # Get the corresponding OSD pod and ID
        logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
        osd_pods = get_osd_pods()
        osd_pods_count = len(osd_pods)
        osd_pod = [
            osd_pod for osd_pod in osd_pods
            if osd_pod.get().get("metadata").get("labels").get(
                constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        logger.info(f"OSD_POD {osd_pod.name}")
        osd_id = osd_pod.get().get("metadata").get("labels").get("ceph-osd-id")

        # Get the node that has the OSD pod running on
        logger.info(
            f"Getting the node that has the OSD pod {osd_pod.name} running on")
        osd_node = get_pod_node(osd_pod)
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            pod for pod in osd_prepare_pods if pod.get().get("metadata").get(
                "labels").get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        osd_prepare_job_name = (osd_prepare_pod.get().get("metadata").get(
            "labels").get("job-name"))
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

        # Get the corresponding OSD deployment
        logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
        osd_deployment = [
            osd_pod for osd_pod in get_osd_deployments()
            if osd_pod.get().get("metadata").get("labels").get(
                constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        osd_deployment_name = osd_deployment.name

        # Delete the volume from the platform side
        logger.info(f"Deleting {backing_volume} from the platform side")
        nodes.detach_volume(backing_volume, osd_node)

        # Scale down OSD deployment
        logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
        ocp.OCP().exec_oc_cmd(
            f"scale --replicas=0 deployment/{osd_deployment_name}")

        # Force delete OSD pod if necessary
        osd_pod_name = osd_pod.name
        logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
        try:
            osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
        except TimeoutError:
            osd_pod.delete(force=True)
            osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

        # Compare the version as (major, minor) integers: float("4.10") is
        # 4.1, which would wrongly sort below 4.6
        ocp_version = tuple(
            int(part) for part in str(get_ocp_version()).split(".")[:2])
        # From 4.6 on, the removal job takes FAILED_OSD_IDS and cleans up
        # the OSD prepare job, PVC and deployment on its own
        removal_job_cleans_up = ocp_version >= (4, 6)

        # Run ocs-osd-removal job
        if removal_job_cleans_up:
            cmd = f"process ocs-osd-removal -p FAILED_OSD_IDS={osd_id} -o yaml"
        else:
            cmd = f"process ocs-osd-removal -p FAILED_OSD_ID={osd_id} -o yaml"

        logger.info(f"Executing OSD removal job on OSD-{osd_id}")
        ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
        osd_removal_job_yaml = ocp_obj.exec_oc_cmd(cmd)
        osd_removal_job = OCS(**osd_removal_job_yaml)
        osd_removal_job.create(do_reload=False)

        # Get ocs-osd-removal pod name
        logger.info("Getting the ocs-osd-removal pod name")
        osd_removal_pod_name = get_osd_removal_pod_name(osd_id)
        osd_removal_pod_obj = get_pod_obj(osd_removal_pod_name,
                                          namespace="openshift-storage")
        osd_removal_pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_COMPLETED,
            resource_name=osd_removal_pod_name)

        # Verify OSD removal from the ocs-osd-removal pod logs
        logger.info(
            f"Verifying removal of OSD from {osd_removal_pod_name} pod logs")
        logs = get_pod_logs(osd_removal_pod_name)
        pattern = f"purged osd.{osd_id}"
        assert re.search(pattern, logs), (
            f"Pattern '{pattern}' was not found in the logs of pod "
            f"{osd_removal_pod_name}"
        )

        osd_pvc_name = osd_pvc.name

        if not removal_job_cleans_up:
            # Delete the OSD prepare job
            logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
            osd_prepare_job.delete()
            osd_prepare_job.ocp.wait_for_delete(
                resource_name=osd_prepare_job_name, timeout=120)

            # Delete the OSD PVC
            logger.info(f"Deleting OSD PVC {osd_pvc_name}")
            osd_pvc.delete()
            osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

            # Delete the OSD deployment
            logger.info(f"Deleting OSD deployment {osd_deployment_name}")
            osd_deployment.delete()
            osd_deployment.ocp.wait_for_delete(
                resource_name=osd_deployment_name, timeout=120)
        else:
            # If ocp version is '4.6' and above the osd removal job should
            # delete the OSD prepare job, OSD PVC, OSD deployment
            logger.info(
                f"Verifying deletion of OSD prepare job {osd_prepare_job_name}"
            )
            osd_prepare_job.ocp.wait_for_delete(
                resource_name=osd_prepare_job_name, timeout=30)
            logger.info(f"Verifying deletion of OSD PVC {osd_pvc_name}")
            osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name, timeout=30)
            logger.info(
                f"Verifying deletion of OSD deployment {osd_deployment_name}")
            osd_deployment.ocp.wait_for_delete(
                resource_name=osd_deployment_name, timeout=30)

        # Delete PV
        logger.info(f"Verifying deletion of PV {osd_pv_name}")
        try:
            osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
        except TimeoutError:
            # The PV was not removed on its own, delete it explicitly
            osd_pv.delete()
            osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

        if not removal_job_cleans_up:
            # Delete the rook ceph operator pod to trigger reconciliation
            rook_operator_pod = get_operator_pods()[0]
            logger.info(
                f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
            rook_operator_pod.delete()

        # Delete the OSD removal job
        logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
        osd_removal_job = get_job_obj(f"ocs-osd-removal-{osd_id}")
        osd_removal_job.delete()
        osd_removal_job.ocp.wait_for_delete(
            resource_name=f"ocs-osd-removal-{osd_id}")

        timeout = 600
        # Wait for OSD PVC to get created and reach Bound state
        logger.info(
            "Waiting for a new OSD PVC to get created and reach Bound state")
        assert osd_pvc.ocp.wait_for_resource(
            timeout=timeout,
            condition=constants.STATUS_BOUND,
            selector=constants.OSD_PVC_GENERIC_LABEL,
            resource_count=osd_pvcs_count,
        ), (f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
            f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
            )
        # Wait for OSD pod to get created and reach Running state
        logger.info(
            "Waiting for a new OSD pod to get created and reach Running state")
        assert osd_pod.ocp.wait_for_resource(
            timeout=timeout,
            condition=constants.STATUS_RUNNING,
            selector=constants.OSD_APP_LABEL,
            resource_count=osd_pods_count,
        ), (f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
            f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
            )

        # We need to silence the old osd crash warning due to BZ https://bugzilla.redhat.com/show_bug.cgi?id=1896810
        # This is a workaround - issue for tracking: https://github.com/red-hat-storage/ocs-ci/issues/3438
        if removal_job_cleans_up:
            silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
                osd_pod_name)
            if not silence_osd_crash:
                logger.info("Didn't find ceph osd crash warning")

        # Validate cluster is still functional
        self.sanity_helpers.health_check(tries=100)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)