Code Example #1
    def test_add_node(self):
        """
        Test for adding worker nodes to the cluster while IOs are running
        """
        new_nodes = 3
        if config.ENV_DATA['platform'].lower() in constants.CLOUD_PLATFORMS:
            dt = config.ENV_DATA['deployment_type']
            if dt == 'ipi':
                machines = machine_utils.get_machinesets()
                logger.info(
                    f'The number of worker nodes before expansion: {len(helpers.get_worker_nodes())}'
                )
                for machine in machines:
                    add_new_node_and_label_it(machine)
                logger.info(
                    f'The number of worker nodes after expansion: {len(helpers.get_worker_nodes())}'
                )

            else:
                logger.info(
                    f'The number of worker nodes before expansion: {len(helpers.get_worker_nodes())}'
                )
                if config.ENV_DATA.get('rhel_workers'):
                    node_type = constants.RHEL_OS
                else:
                    node_type = constants.RHCOS
                assert add_new_node_and_label_upi(node_type,
                                                  new_nodes), "Add node failed"
                logger.info(
                    f'The number of worker nodes after expansion: {len(helpers.get_worker_nodes())}'
                )

        elif config.ENV_DATA['platform'].lower() == constants.VSPHERE_PLATFORM:
            pytest.skip("Skipping add node in Vmware platform due to "
                        "https://bugzilla.redhat.com/show_bug.cgi?id=1844521")
Code Example #2
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, pvc_factory, pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get the osd associated node name
        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"{osd_node_name} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(f"{osd_node_name} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Check that the pods are in Running state
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
            # Skip deploy pods and OSD-prepare ('ocs-deviceset') pods
            if ('-1-deploy' not in pod_obj.name
                    and 'ocs-deviceset' not in pod_obj.name):
                try:
                    wait_for_resource_state(resource=pod_obj,
                                            state=constants.STATUS_RUNNING,
                                            timeout=200)
                # 'rook-ceph-crashcollector' on the failed node gets stuck in
                # Pending state; BZ 1810014 tracks it.
                # As a workaround, ignore the 'rook-ceph-crashcollector' pod
                # health check and delete its deployment so that the pod
                # disappears. This WA will be reverted once the BZ is fixed.
                except ResourceWrongStatusException:
                    if 'rook-ceph-crashcollector' in pod_obj.name:
                        ocp_obj = ocp.OCP()
                        # Strip the 17-char ReplicaSet-hash and pod suffix to
                        # recover the deployment name
                        name = pod_obj.name[:-17]
                        command = f"delete deployment {name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Code Example #3
    def add_new_storage_node(self, node_name):
        """
        Add a new storage node from the machineset backing the given node
        """
        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(node_name)
        log.info(f"{node_name} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(machine_name)
        log.info(f"{node_name} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)
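
A hypothetical call site for this helper; the node name is illustrative only:

    # Hypothetical usage: scale up the machineset backing an existing worker node.
    self.add_new_storage_node("compute-0")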
Code Example #4
    def test_add_node(self):
        """
        Test for adding worker nodes to the cluster while IOs are running
        """
        new_nodes = 3
        if config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS:
            dt = config.ENV_DATA["deployment_type"]
            if dt == "ipi":
                machines = machine_utils.get_machinesets()
                logger.info(
                    f"The number of worker nodes before expansion: {len(node.get_worker_nodes())}"
                )
                for machine in machines:
                    add_new_node_and_label_it(machine)
                logger.info(
                    f"The number of worker nodes after expansion: {len(node.get_worker_nodes())}"
                )

            else:
                logger.info(
                    f"The number of worker nodes before expansion: {len(node.get_worker_nodes())}"
                )
                if config.ENV_DATA.get("rhel_workers"):
                    node_type = constants.RHEL_OS
                else:
                    node_type = constants.RHCOS
                assert add_new_node_and_label_upi(node_type,
                                                  new_nodes), "Add node failed"
                logger.info(
                    f"The number of worker nodes after expansion: {len(node.get_worker_nodes())}"
                )

        elif config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
            pytest.skip("Skipping add node in Vmware platform due to "
                        "https://bugzilla.redhat.com/show_bug.cgi?id=1844521")
            # Issue to remove skip code https://github.com/red-hat-storage/ocs-ci/issues/2403
            # logger.info(f'The worker nodes number before expansion {len(node.get_worker_nodes())}')
            # if config.ENV_DATA.get('rhel_user'):
            #     pytest.skip("Skipping add RHEL node, code unavailable")
            # node_type = constants.RHCOS
            # assert add_new_node_and_label_upi(node_type, new_nodes), "Add node failed"
            # logger.info(f'The worker nodes number after expansion {len(node.get_worker_nodes())}')
        ceph_cluster_obj = CephCluster()
        assert ceph_cluster_obj.wait_for_rebalance(
            timeout=3600), "Data re-balance failed to complete"
Code Example #5
 def factory(additional_nodes=3):
     """
     Args:
         additional_nodes (int): Number of additional nodes to be added (default=3).
     """
     log.info("Creating machineset")
     machineset_name.append(
         machine.create_custom_machineset(instance_type="m5.4xlarge",
                                          zone="a"))
     machine.wait_for_new_node_to_be_ready(machineset_name[0])
     log.info(
         f"Adding {additional_nodes} more nodes to machineset {machineset_name[0]}"
     )
     node.add_new_node_and_label_it(
         machineset_name=machineset_name[0],
         num_nodes=additional_nodes,
         mark_for_ocs_label=False,
     )
     machine.wait_for_new_node_to_be_ready(machineset_name[0])
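
A hypothetical invocation from a test body, assuming the enclosing fixture exposes `factory` and tears down the machineset afterwards:

    # Hypothetical usage: one initial node from the new machineset plus two
    # additional nodes.
    factory(additional_nodes=2)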
Code Example #6
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, pvc_factory, pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get the osd associated node name
        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"{osd_node_name} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(f"{osd_node_name} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Check that the pods are in Running state
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=200)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Code Example #7
File: test_node_expansion.py  Project: wusui/ocs-ci
    def test_add_node_aws(self):
        """
        Test for adding worker nodes to the cluster while IOs are running
        """
        dt = config.ENV_DATA['deployment_type']
        if dt == 'ipi':
            machines = machine_utils.get_machinesets()
            logger.info(f'The number of worker nodes before expansion: {len(helpers.get_worker_nodes())}')
            for machine in machines:
                add_new_node_and_label_it(machine)
            logger.info(f'The number of worker nodes after expansion: {len(helpers.get_worker_nodes())}')

        else:
            new_nodes = 3
            logger.info(f'The number of worker nodes before expansion: {len(helpers.get_worker_nodes())}')
            if config.ENV_DATA.get('rhel_workers'):
                node_type = constants.RHEL_OS
            else:
                node_type = constants.RHCOS
            assert add_new_node_and_label_upi(node_type, new_nodes), "Add node failed"
            logger.info(f'The number of worker nodes after expansion: {len(helpers.get_worker_nodes())}')
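
The RHEL-vs-RHCOS selection in the UPI branch recurs across these examples (see also Example #8); a hedged helper that centralizes it might look like this (the name is illustrative):

    def get_upi_node_type():
        # Hypothetical helper mirroring the recurring branch: RHEL workers
        # are requested via 'rhel_workers' (or 'rhel_user' on some platforms).
        is_rhel = (config.ENV_DATA.get('rhel_workers')
                   or config.ENV_DATA.get('rhel_user'))
        return constants.RHEL_OS if is_rhel else constants.RHCOS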
Code Example #8
    def identify_and_add_nodes(self, scenario, num_of_nodes):
        """
        Fetches info about the worker nodes and adds nodes (if required)

        Args:
            scenario (str): Scenario of app pods running on OCS or dedicated nodes
                (e.g., 'colocated', 'dedicated')
            num_of_nodes (int): number of nodes required for running test

        Returns:
            tuple: tuple containing:
                list: list of OCS nodes name
                list: list of non-OCS nodes name

        """
        nodes_to_add = 0
        initial_worker_nodes = node.get_worker_nodes()
        ocs_nodes = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
        non_ocs_nodes = list(set(initial_worker_nodes) - set(ocs_nodes))

        if "colocated" in scenario and len(ocs_nodes) < num_of_nodes:
            nodes_to_add = num_of_nodes - len(initial_worker_nodes)

        if "dedicated" in scenario and len(non_ocs_nodes) < num_of_nodes:
            nodes_to_add = num_of_nodes - len(non_ocs_nodes)

        if nodes_to_add > 0:
            logger.info(f"{nodes_to_add} extra workers nodes needed")

            if config.ENV_DATA["deployment_type"] == "ipi":
                machine_name = random.choice(
                    machine.get_machines(
                        machine_type=constants.WORKER_MACHINE)).name
                machineset_name = machine.get_machineset_from_machine_name(
                    machine_name)
                node.add_new_node_and_label_it(
                    machineset_name=machineset_name,
                    num_nodes=nodes_to_add,
                    mark_for_ocs_label=False,
                )
            else:
                is_rhel = config.ENV_DATA.get(
                    "rhel_workers") or config.ENV_DATA.get("rhel_user")
                node_type = constants.RHEL_OS if is_rhel else constants.RHCOS
                node.add_new_node_and_label_upi(
                    node_type=node_type,
                    num_nodes=nodes_to_add,
                    mark_for_ocs_label=False,
                )

            new_worker_nodes = node.get_worker_nodes()
            new_nodes_added = list(
                set(new_worker_nodes) - set(initial_worker_nodes))
            assert (len(new_nodes_added) == nodes_to_add
                    ), "Extra nodes not added in the cluster"
            non_ocs_nodes += new_nodes_added

        if "colocated" in scenario and len(ocs_nodes) < num_of_nodes:
            logger.info("Adding OCS storage label to Non-OCS workers")
            node_obj = ocp.OCP(kind=constants.NODE)
            nodes_to_label = non_ocs_nodes[0:(num_of_nodes - len(ocs_nodes))]
            for node_name in nodes_to_label:
                node_obj.add_label(resource_name=node_name,
                                   label=constants.OPERATOR_NODE_LABEL)
                ocs_nodes.append(node_name)
            non_ocs_nodes = list(set(non_ocs_nodes) - set(ocs_nodes))

        logger.info(f"The OCS nodes are : {ocs_nodes}")
        logger.info(f"The Non-OCS nodes are: {non_ocs_nodes}")
        return ocs_nodes, non_ocs_nodes
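
A hypothetical call, asking for at least four dedicated (non-OCS) workers:

    # Hypothetical usage: returns the OCS and non-OCS node-name lists, adding
    # workers first if the 'dedicated' pool is short.
    ocs_nodes, non_ocs_nodes = self.identify_and_add_nodes("dedicated", 4)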
Code Example #9
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, interface, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == 'rbd' else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        msg = "Common OSD and app running node(s) NOT found"
        assert (len(common_nodes) > 0), msg
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods on the failed node will get automatically created on
        # another running node. Waiting for all DC app pods to reach Running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        # Check all OCS pods status, they should be in running state
        all_pod_obj = pod.get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        for pod_obj in all_pod_obj:
            if ('-1-deploy' not in pod_obj.name
                    and 'ocs-deviceset' not in pod_obj.name):
                try:
                    helpers.wait_for_resource_state(
                        resource=pod_obj,
                        state=constants.STATUS_RUNNING,
                        timeout=200)
                except ResourceWrongStatusException:
                    # 'rook-ceph-crashcollector' on the failed node gets stuck
                    # in Pending state; BZ 1810014 tracks it.
                    # As a workaround, ignore the 'rook-ceph-crashcollector'
                    # pod health check and delete its deployment so that the
                    # pod disappears. This WA will be reverted once the BZ is
                    # fixed.
                    if 'rook-ceph-crashcollector' in pod_obj.name:
                        ocp_obj = ocp.OCP(
                            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                        pod_name = pod_obj.name
                        deployment_name = '-'.join(pod_name.split("-")[:-2])
                        command = f"delete deployment {deployment_name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")
                    else:
                        raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
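
The workaround above recovers the crashcollector deployment name by stripping the ReplicaSet hash and pod suffix from the pod name; a worked example with a hypothetical pod name:

    # Hypothetical pod name, shown only to illustrate the derivation used above.
    pod_name = "rook-ceph-crashcollector-worker-0-7c9d8f5b6d-x2k4q"
    deployment_name = "-".join(pod_name.split("-")[:-2])
    # deployment_name == "rook-ceph-crashcollector-worker-0"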
Code Example #10
    def test_simultaneous_drain_of_two_ocs_nodes(
        self,
        pvc_factory,
        pod_factory,
        dc_pod_factory,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        OCS-2128/OCS-2129:
        - Create PVCs and start IO on DC based app pods
        - Add one extra node in two of the AZs and label the nodes
          with OCS storage label
        - Maintenance (mark as unschedulable and drain) 2 worker nodes
          simultaneously
        - Confirm that OCS and DC pods are in running state
        - Remove unscheduled nodes
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Check cluster and Ceph health

        """
        # Get OSD running nodes
        osd_running_worker_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_worker_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_worker_nodes,
                          label_key="dc",
                          label_value="fedora")
        log.info("Successfully labeled worker nodes with {dc:fedora}")

        # Create DC app pods
        log.info("Creating DC based app pods and starting IO in background")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == "rbd" else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={"dc": "fedora"})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get the machine name using the node name
        machine_names = [
            machine.get_machine_from_node_name(osd_running_worker_node)
            for osd_running_worker_node in osd_running_worker_nodes[:2]
        ]
        log.info(f"{osd_running_worker_nodes} associated "
                 f"machine are {machine_names}")

        # Get the machineset name using machine name
        machineset_names = [
            machine.get_machineset_from_machine_name(machine_name)
            for machine_name in machine_names
        ]
        log.info(f"{osd_running_worker_nodes} associated machineset "
                 f"is {machineset_names}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_names[0])
        add_new_node_and_label_it(machineset_names[1])

        # Drain 2 nodes
        drain_nodes(osd_running_worker_nodes[:2])

        # Check that the pods are in Running state
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
            if ("-1-deploy" or "ocs-deviceset") not in pod_obj.name:
                try:
                    helpers.wait_for_resource_state(
                        resource=pod_obj,
                        state=constants.STATUS_RUNNING,
                        timeout=200)
                except ResourceWrongStatusException:
                    # 'rook-ceph-crashcollector' on the failed node gets stuck
                    # in Pending state; BZ 1810014 tracks it.
                    # As a workaround, ignore the 'rook-ceph-crashcollector'
                    # pod health check and delete its deployment so that the
                    # pod disappears. This WA will be reverted once the BZ is
                    # fixed.
                    if "rook-ceph-crashcollector" in pod_obj.name:
                        ocp_obj = ocp.OCP(
                            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                        pod_name = pod_obj.name
                        deployment_name = "-".join(pod_name.split("-")[:-2])
                        command = f"delete deployment {deployment_name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")

        # DC app pods on the drained node will get automatically created on
        # another running node in the same AZ. Waiting for all DC app pods to
        # reach Running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                        timeout=1200)
        log.info("All the dc pods reached running state")

        # Remove unschedulable nodes
        # In scenarios where the drain is attempted on a setup with more than
        # 3 workers, the unschedulable nodes are removed after the drain
        # completes so that 3 worker nodes are maintained.
        log.info(f"Removing unschedulable nodes {osd_running_worker_nodes[:2]}")
        remove_node_objs = get_node_objs(osd_running_worker_nodes[:2])
        remove_nodes(remove_node_objs)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Code Example #11
File: conftest.py  Project: liranmauda/ocs-ci
    def factory(ocs_nodes=False, node_count=3, taint_label=None):
        """
        Args:
            ocs_nodes (bool): True if new nodes are OCS, False otherwise
            node_count (int): Number of nodes to be added
            taint_label (str): Taint label to be added

        """

        new_nodes = []
        if config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS:
            dt = config.ENV_DATA["deployment_type"]
            if dt == "ipi":
                machines = machine_utils.get_machinesets()
                log.info(
                    f"The number of worker nodes before expansion: {len(get_worker_nodes())}"
                )
                for machine in machines:
                    new_nodes.append(
                        add_new_node_and_label_it(
                            machine, mark_for_ocs_label=ocs_nodes))

                log.info(
                    f"The number of worker nodes after expansion: {len(get_worker_nodes())}"
                )

            else:
                log.info(
                    f"The number of worker nodes before expansion: {len(get_worker_nodes())}"
                )
                if config.ENV_DATA.get("rhel_workers"):
                    node_type = constants.RHEL_OS
                else:
                    node_type = constants.RHCOS

                new_nodes.append(
                    add_new_node_and_label_upi(node_type,
                                               node_count,
                                               mark_for_ocs_label=ocs_nodes))
                log.info(
                    f"The number of worker nodes after expansion: {len(get_worker_nodes())}"
                )

        elif config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
            log.info(
                f"The number of worker nodes before expansion: {len(get_worker_nodes())}"
            )
            if config.ENV_DATA.get("rhel_user"):
                node_type = constants.RHEL_OS
            else:
                node_type = constants.RHCOS

            new_nodes.append(
                add_new_node_and_label_upi(node_type,
                                           node_count,
                                           mark_for_ocs_label=ocs_nodes))
            log.info(
                f"The number of worker nodes after expansion: {len(get_worker_nodes())}"
            )

        nodes = [node for sublist in new_nodes for node in sublist]

        if taint_label:
            taint_nodes(nodes=nodes, taint_label=taint_label)
            log.info(f"Successfully tainted nodes {nodes} with {taint_label}")
Code Example #12
    def setup(
        self,
        request,
        scenario,
        nodes,
        multi_pvc_factory,
        service_account_factory,
        dc_pod_factory,
    ):
        """
        Identify the nodes and start multiple dc pods for the test

        Args:
            scenario (str): Scenario of app pods running on OCS or dedicated nodes
                (e.g., 'colocated', 'dedicated')
            nodes: A fixture to get instance of the relevant platform nodes class
            multi_pvc_factory: A fixture to create a set of new PVCs
            service_account_factory: A fixture to create a service account
            dc_pod_factory: A fixture to create dc pod

        Returns:
            list: dc pod objs

        """
        worker_nodes = node.get_worker_nodes()
        ocs_nodes = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
        non_ocs_nodes = list(set(worker_nodes) - set(ocs_nodes))

        def finalizer():
            helpers.remove_label_from_worker_node(node_list=worker_nodes,
                                                  label_key="nodetype")

            # Check ceph health
            ceph_health_check(tries=80)

        request.addfinalizer(finalizer)

        if (scenario == "dedicated") and len(non_ocs_nodes) == 0:
            if config.ENV_DATA.get("deployment_type").lower() == "ipi":
                machines = machine.get_machinesets()
                node.add_new_node_and_label_it(machines[0],
                                               num_nodes=1,
                                               mark_for_ocs_label=False)
            else:
                if (config.ENV_DATA.get("platform").lower() ==
                        constants.VSPHERE_PLATFORM):
                    pytest.skip(
                        "Skipping add node in VSPHERE due to https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
                    )
                is_rhel = config.ENV_DATA.get(
                    "rhel_workers") or config.ENV_DATA.get("rhel_user")
                node_type = constants.RHEL_OS if is_rhel else constants.RHCOS
                node.add_new_node_and_label_upi(node_type=node_type,
                                                num_nodes=1,
                                                mark_for_ocs_label=False)
            non_ocs_nodes = list(set(node.get_worker_nodes()) - set(ocs_nodes))

        app_pod_nodes = ocs_nodes if scenario == "colocated" else non_ocs_nodes

        # Label nodes to be able to run app pods
        helpers.label_worker_node(node_list=app_pod_nodes,
                                  label_key="nodetype",
                                  label_value="app-pod")

        access_modes_rbd = [
            constants.ACCESS_MODE_RWO,
            f"{constants.ACCESS_MODE_RWX}-Block",
        ]

        access_modes_cephfs = [
            constants.ACCESS_MODE_RWO, constants.ACCESS_MODE_RWX
        ]

        pvcs_rbd = multi_pvc_factory(
            interface=constants.CEPHBLOCKPOOL,
            size=self.pvc_size,
            access_modes=access_modes_rbd,
            status=constants.STATUS_BOUND,
            num_of_pvc=len(access_modes_rbd),
        )

        project = pvcs_rbd[0].project

        pvcs_cephfs = multi_pvc_factory(
            interface=constants.CEPHFILESYSTEM,
            project=project,
            size=self.pvc_size,
            access_modes=access_modes_cephfs,
            status=constants.STATUS_BOUND,
            num_of_pvc=len(access_modes_cephfs),
        )

        pvcs = pvcs_cephfs + pvcs_rbd
        # Set volume mode on PVC objects
        for pvc_obj in pvcs:
            pvc_info = pvc_obj.get()
            setattr(pvc_obj, "volume_mode", pvc_info["spec"]["volumeMode"])

        sa_obj = service_account_factory(project=project)
        pods = []

        # Create pods
        for pvc_obj in pvcs:
            if constants.CEPHFS_INTERFACE in pvc_obj.storageclass.name:
                interface = constants.CEPHFILESYSTEM
            else:
                interface = constants.CEPHBLOCKPOOL

            num_pods = 2 if pvc_obj.access_mode == constants.ACCESS_MODE_RWX else 1
            logger.info("Creating app pods")
            for _ in range(num_pods):
                pods.append(
                    dc_pod_factory(
                        interface=interface,
                        pvc=pvc_obj,
                        node_selector={"nodetype": "app-pod"},
                        raw_block_pv=pvc_obj.volume_mode == "Block",
                        sa_obj=sa_obj,
                    ))

        logger.info(
            f"Created {len(pods)} pods using {len(pvcs_cephfs)} cephfs, {len(pvcs_rbd)} rbd PVCs."
        )

        return pods
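
In practice this `setup` is consumed as a pytest fixture; a hypothetical direct call, for illustration only:

    # Hypothetical usage: start DC app pods on dedicated (non-OCS) nodes.
    dc_pods = self.setup(request, "dedicated", nodes, multi_pvc_factory,
                         service_account_factory, dc_pod_factory)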
Code Example #13
    def test_automated_recovery_from_stopped_node_and_start(
            self, nodes, additional_node):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI

        0) A - add new node, B - don't add new node
        1) Stop node
        2) Validate result:
             A - pods should respin on the new node
             B - pods should remain in Pending state on the stopped node
        3) Start node
        4) Validate result:
             A - pods should start on the new node
             B - pods should start on the stopped node after starting it
        """
        wnode_name = get_worker_nodes()[0]
        machine_name = machine.get_machine_from_node_name(wnode_name)
        self.machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        self.start_ready_replica_count = machine.get_ready_replica_count(
            self.machineset_name)

        if additional_node:
            new_ocs_node_names = add_new_node_and_label_it(
                self.machineset_name)
            failure_domain = get_failure_domain()
            log.info("Wait for the nodes racks or zones to appear...")
            wait_for_nodes_racks_or_zones(failure_domain, new_ocs_node_names)

            new_ocs_node = get_node_objs(new_ocs_node_names)[0]
            log.info(
                f"Successfully created a new OCS node '{new_ocs_node.name}'")
            self.extra_node = True
            log.info("Get another OSD node in the same rack or zone...")
            self.osd_worker_node = get_another_osd_node_in_same_rack_or_zone(
                failure_domain, new_ocs_node)
            assert (self.osd_worker_node
                    ), "Didn't find another osd node in the same rack or zone"
        else:
            osd_node_names = get_osd_running_nodes()
            self.osd_worker_node = get_node_objs(osd_node_names)[0]

        osd_pods = get_osd_pods()
        temp_osd = get_node_pods(self.osd_worker_node.name,
                                 pods_to_search=osd_pods)[0]
        osd_real_name = "-".join(temp_osd.name.split("-")[:-1])

        nodes.stop_nodes([self.osd_worker_node], wait=True)
        log.info(f"Successfully powered off node: {self.osd_worker_node.name}")

        timeout = 420
        assert pod.wait_for_pods_to_be_in_statuses(
            [constants.STATUS_TERMINATING], [temp_osd.name], timeout=timeout
        ), (f"The pod {temp_osd.name} didn't reach the status {constants.STATUS_TERMINATING} "
            f"after {timeout} seconds")

        # Validate that the OSD pod in Terminating state is replaced by a new
        # OSD pod in Pending state
        all_pod_obj = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        new_osd = None
        for pod_obj in all_pod_obj:
            if osd_real_name == "-".join(pod_obj.name.split("-")[:-1]) and (
                    temp_osd.name != pod_obj.name):
                new_osd = pod_obj
                break
        assert new_osd, f"New OSD pod replacing {osd_real_name} was not found"

        nodes.start_nodes(nodes=[self.osd_worker_node], wait=True)
        log.info(f"Successfully powered on node: {self.osd_worker_node.name}")
        wait_for_resource_state(new_osd, constants.STATUS_RUNNING, timeout=180)
        if additional_node:
            new_osd_node = get_pod_node(new_osd)
            assert (new_osd_node.name != self.osd_worker_node.name
                    ), "New OSD is expected to run on the new additional node"
Code Example #14
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        failure,
        dc_pod_factory,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key="dc",
                          label_value="fedora")

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == "rbd":
            interface = constants.CEPHBLOCKPOOL
        elif interface == "cephfs":
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={"dc": "fedora"})
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        new_ocs_node_names = add_new_node_and_label_it(machineset_name)
        failure_domain = get_failure_domain()
        log.info("Wait for the nodes racks or zones to appear...")
        wait_for_nodes_racks_or_zones(failure_domain, new_ocs_node_names)

        new_ocs_node = get_node_objs(new_ocs_node_names)[0]
        osd_node_in_same_rack_or_zone = get_another_osd_node_in_same_rack_or_zone(
            failure_domain, new_ocs_node, common_nodes)
        # Get the failure node obj
        failure_node_obj = get_node_objs([osd_node_in_same_rack_or_zone.name])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: "
                     f"{failure_node_obj[0].name}")
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully terminated node : "
                     f"{failure_node_obj[0].name} instance")

        try:
            # DC app pods on the failed node will get automatically created on
            # another running node. Waiting for all DC app pods to reach
            # Running state
            pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                            timeout=720)
            log.info("All the dc pods reached running state")
            pod.wait_for_storage_pods(timeout=300)

        except ResourceWrongStatusException:
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(f"Successfully terminated node : "
                         f"{failure_node_obj[0].name} instance")
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        if config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
            tries = 200
        else:
            tries = 40

        self.sanity_helpers.health_check(tries=tries)
Code Example #15
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
            self, nodes, pvc_factory, pod_factory, failure, dc_pod_factory,
            interface):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)
        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: "
                     f"{failure_node_obj[0].name}")
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully terminated node : "
                     f"{failure_node_obj[0].name} instance")

        try:
            # DC app pods on the failed node will get automatically created on
            # another running node. Waiting for all DC app pods to reach
            # Running state
            pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                            timeout=720)
            log.info("All the dc pods reached running state")
            pod.wait_for_storage_pods()

        except ResourceWrongStatusException:
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(f"Successfully terminated node : "
                         f"{failure_node_obj[0].name} instance")
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Code Example #16
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, interface, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == 'rbd' else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        msg = "Common OSD and app running node(s) NOT found"
        assert (len(common_nodes) > 0), msg
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods on the failed node will get automatically created on
        # another running node. Waiting for all DC app pods to reach Running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        pod.wait_for_storage_pods()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Code Example #17
File: conftest.py  Project: rachael-george/ocs-ci
    def factory(ocs_nodes=False, node_count=3, taint_label=None):
        """
        Args:
            ocs_nodes (bool): True if new nodes are OCS, False otherwise
            node_count (int): Number of nodes to be added
            taint_label (str): Taint label to be added

        """

        new_nodes = []
        if config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS:
            dt = config.ENV_DATA["deployment_type"]
            if dt == "ipi":
                machines = machine_utils.get_machinesets()
                log.info(
                    f"The number of worker nodes before expansion: {len(get_worker_nodes())}"
                )
                for machine in machines:
                    new_nodes.append(
                        add_new_node_and_label_it(
                            machine, mark_for_ocs_label=ocs_nodes))

                log.info(
                    f"The number of worker nodes after expansion: {len(get_worker_nodes())}"
                )

            else:
                log.info(
                    f"The number of worker nodes before expansion: {len(get_worker_nodes())}"
                )
                if config.ENV_DATA.get("rhel_workers"):
                    node_type = constants.RHEL_OS
                else:
                    node_type = constants.RHCOS

                new_nodes.append(
                    add_new_node_and_label_upi(node_type,
                                               node_count,
                                               mark_for_ocs_label=ocs_nodes))
                log.info(
                    f"The number of worker nodes after expansion: {len(get_worker_nodes())}"
                )

        elif config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
            pytest.skip("Skipping add node in Vmware platform due to "
                        "https://bugzilla.redhat.com/show_bug.cgi?id=1844521")
            # Issue to remove skip code https://github.com/red-hat-storage/ocs-ci/issues/2403
            # log.info(
            #     f"The worker nodes number before expansion {len(get_worker_nodes())}"
            # )
            # if config.ENV_DATA.get("rhel_user"):
            #     pytest.skip("Skipping add RHEL node, code unavailable")
            # node_type = constants.RHCOS
            # new_nodes.append(
            #     add_new_node_and_label_upi(node_type, num_nodes=node_count)
            # )
            # log.info(
            #     f"The worker nodes number after expansion {len(get_worker_nodes())}"
            # )

        nodes = [node for sublist in new_nodes for node in sublist]

        if taint_label:
            taint_nodes(nodes=nodes, taint_label=taint_label)
            log.info(f"Successfully tainted nodes {nodes} with {taint_label}")