Example No. 1
class TestNodeReplacement(ManageTest):
    """
    Knip-894 Node replacement proactive

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive(self):
        """
        Knip-894 Node Replacement proactive (without IO running)

        """
        osd_node_name = select_osd_node_name()
        delete_and_create_osd_node(osd_node_name)

        # Verify everything running fine
        log.info("Verifying all resources are running and match the expected state")
        self.sanity_helpers.health_check(tries=120)

        # Verify OSD encrypted
        if config.ENV_DATA.get("encryption_at_rest"):
            osd_encryption_verification()

        ceph_cluster_obj = CephCluster()
        assert ceph_cluster_obj.wait_for_rebalance(
            timeout=1800), "Data re-balance failed to complete"
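

# A minimal sketch (not the ocs-ci implementation) of how an OSD node could be
# selected, assuming the get_osd_running_nodes() helper that appears in the later
# examples; the import path and the name pick_osd_node_name are assumptions.
import random

from ocs_ci.ocs.node import get_osd_running_nodes  # assumed import path


def pick_osd_node_name():
    """Return the name of a random worker node that currently hosts an OSD pod."""
    osd_nodes = get_osd_running_nodes()
    return random.choice(osd_nodes)

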
class TestNodeReplacementWithIO(ManageTest):
    """
    Knip-894 Node replacement proactive with IO

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive_with_io_running(
        self, pvc_factory, pod_factory, dc_pod_factory
    ):
        """
        Knip-894 Node Replacement proactive when IO running in the background

        """

        # Get worker nodes
        worker_node_list = node.get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        osd_node_name = select_osd_node_name()

        log.info("Creating dc pod backed with rbd pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                rbd_dc_pod = dc_pod_factory(
                    interface=constants.CEPHBLOCKPOOL, node_name=worker_node, size=20
                )
                pod.run_io_in_bg(rbd_dc_pod, expect_to_fail=False, fedora_dc=True)

        log.info("Creating dc pod backed with cephfs pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                cephfs_dc_pod = dc_pod_factory(
                    interface=constants.CEPHFILESYSTEM, node_name=worker_node, size=20
                )
                pod.run_io_in_bg(cephfs_dc_pod, expect_to_fail=False, fedora_dc=True)

        delete_and_create_osd_node(osd_node_name)

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()

        # Verify everything running fine
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)
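
The two per-interface loops above differ only in the interface constant. A minimal sketch of the same background-IO setup collapsed into one helper, assuming the dc_pod_factory fixture and the constants and pod modules already used in this example:

def start_bg_io_on_surviving_workers(dc_pod_factory, worker_node_list, osd_node_name):
    """Start background IO on every worker except the node scheduled for replacement."""
    for interface in (constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM):
        for worker_node in worker_node_list:
            if worker_node == osd_node_name:
                continue
            dc_pod = dc_pod_factory(interface=interface, node_name=worker_node, size=20)
            pod.run_io_in_bg(dc_pod, expect_to_fail=False, fedora_dc=True)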
Example No. 3
class TestNoobaaBackupAndRecovery(E2ETest):
    """
    Test to verify noobaa backup and recovery

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_noobaa_db_backup_and_recovery(
        self,
        pvc_factory,
        pod_factory,
        snapshot_factory,
        bucket_factory,
        rgw_bucket_factory,
        noobaa_db_backup_and_recovery,
    ):
        """
        Test case to verify noobaa backup and recovery

        1. Take a snapshot of the db-noobaa-db-0 PVC and restore it to a new PVC
        2. Scale down the noobaa-db statefulset
        3. Get the yaml of the current PVC, db-noobaa-db-0, and change the
           persistentVolumeReclaimPolicy parameter to Retain for the restored PVC
        4. Delete both PVCs; the PV for the original claim db-noobaa-db-0 will be removed.
           The PV for claim db-noobaa-db-0-snapshot-restore will move to 'Released'
        5. Edit the restored PV again and remove the claimRef section.
           The volume will transition to Available.
        6. Edit the yaml db-noobaa-db-0.yaml and change the volumeName setting to the restored PV.
        7. Scale the statefulset up again; the pod should be running

        """
        noobaa_db_backup_and_recovery(snapshot_factory=snapshot_factory)

        # Verify all storage pods are running
        wait_for_storage_pods()

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        # Deleting Resources
        self.sanity_helpers.delete_resources()

        # Verify everything running fine
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)
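
The noobaa_db_backup_and_recovery fixture drives the seven steps listed in the docstring. As a rough, hedged illustration of steps 2 and 7 only, reusing the ocp.OCP wrapper and the exec_oc_cmd pattern shown in the other examples (the helper name is illustrative):

def scale_noobaa_db_statefulset(replicas):
    """Scale the noobaa-db statefulset down (replicas=0) or back up (replicas=1)."""
    ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
    ocp_obj.exec_oc_cmd(f"scale statefulset noobaa-db --replicas={replicas}")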
Example No. 4
class TestRollingWorkerNodeShutdownAndRecovery(ManageTest):
    """
    Test rolling shutdown and recovery of OCS pods running worker nodes
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Make sure all nodes are up again

        """

        def finalizer():
            nodes.restart_nodes_by_stop_and_start_teardown()

        request.addfinalizer(finalizer)

    def test_rolling_shutdown_and_recovery(
        self, nodes, pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    ):
        """
        Test rolling shutdown and recovery of OCS worker nodes

        """
        SECONDS_TO_WAIT = 180
        # Get OCS worker node objects
        ocs_node_objs = get_ocs_nodes()

        # Start rolling shutdown and recovery of OCS worker nodes
        log.info("ShutDown OCS worker")
        for node_obj in ocs_node_objs:
            nodes.stop_nodes(nodes=[node_obj])
            log.info(f"Keeping node in stopped state for {SECONDS_TO_WAIT} mins")
            time.sleep(SECONDS_TO_WAIT)
            nodes.start_nodes(nodes=[node_obj])
            self.sanity_helpers.health_check(cluster_check=False, tries=60)
            log.info("Checking storage pods status")
            # Validate storage pods are running
            wait_for_pods_to_be_running(timeout=600)

        # Check basic cluster functionality by creating some resources
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
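
A small variation on the rolling loop above, assuming the same nodes fixture and helpers, that also records how long each node takes to become healthy again after it is started (illustrative only):

def rolling_restart_with_timing(nodes, ocs_node_objs, sanity_helpers, seconds_to_wait=180):
    """Stop/start each OCS worker in turn and log the recovery time per node."""
    for node_obj in ocs_node_objs:
        nodes.stop_nodes(nodes=[node_obj])
        time.sleep(seconds_to_wait)
        started = time.monotonic()
        nodes.start_nodes(nodes=[node_obj])
        sanity_helpers.health_check(cluster_check=False, tries=60)
        wait_for_pods_to_be_running(timeout=600)
        log.info(f"{node_obj.name} recovered in {time.monotonic() - started:.0f} seconds")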
Example No. 5
def test_deployment(pvc_factory, pod_factory):
    deploy = config.RUN["cli_params"].get("deploy")
    teardown = config.RUN["cli_params"].get("teardown")
    if not teardown or deploy:
        log.info("Verifying OCP cluster is running")
        assert is_cluster_running(config.ENV_DATA["cluster_path"])
        if not config.ENV_DATA["skip_ocs_deployment"]:
            if config.multicluster:
                restore_ctx_index = config.cur_index
                for cluster in get_non_acm_cluster_config():
                    config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
                    log.info(
                        f"Sanity check for cluster: {cluster.ENV_DATA['cluster_name']}"
                    )
                    sanity_helpers = Sanity()
                    sanity_helpers.health_check()
                    sanity_helpers.delete_resources()
                config.switch_ctx(restore_ctx_index)
            else:
                ocs_registry_image = config.DEPLOYMENT.get("ocs_registry_image")
                if config.ENV_DATA["mcg_only_deployment"]:
                    mcg_only_install_verification(ocs_registry_image=ocs_registry_image)
                    return
                else:
                    ocs_install_verification(ocs_registry_image=ocs_registry_image)

                # Check basic cluster functionality by creating resources
                # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
                # run IO and delete the resources
                if config.DEPLOYMENT["external_mode"]:
                    sanity_helpers = SanityExternalCluster()
                else:
                    sanity_helpers = Sanity()
                sanity_helpers.health_check()
                sanity_helpers.delete_resources()
                # Verify ceph health
                log.info("Verifying ceph health after deployment")
                assert ceph_health_check(tries=10, delay=30)

    if teardown:
        log.info("Cluster will be destroyed during teardown part of this test.")
Example No. 6
class TestCouchBaseNodeDrain(E2ETest):
    """
    Deploy a CouchBase workload using the operator
    """

    @pytest.fixture()
    def cb_setup(self, couchbase_factory_fixture):
        """
        Creates couchbase workload
        """
        self.cb = couchbase_factory_fixture(
            replicas=3, run_in_bg=True, skip_analyze=True
        )

        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

    def test_run_couchbase_node_drain(self, cb_setup, node_type="master"):
        """
        Test couchbase workload with node drain
        """
        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker", print_table=True)

        # Node drain with specific node type
        typed_nodes = node.get_nodes(node_type=node_type, num_of_nodes=1)
        typed_node_name = typed_nodes[0].name

        # Node maintenance - to gracefully terminate all pods on the node
        node.drain_nodes([typed_node_name])

        # Make the node schedulable again
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        bg_handler = flowtest.BackgroundOps()
        bg_ops = [self.cb.result]
        bg_handler.wait_for_bg_operations(bg_ops, timeout=3600)
        self.sanity_helpers.health_check()
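

# A hedged sketch of the drain/schedule pair used above as a context manager, so the
# node is always marked schedulable again even if the checks in between fail;
# "drained" is an illustrative name, not an ocs-ci helper.
from contextlib import contextmanager


@contextmanager
def drained(node_name):
    """Drain a node for the duration of the block, then make it schedulable again."""
    node.drain_nodes([node_name])
    try:
        yield node_name
    finally:
        node.schedule_nodes([node_name])

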
class TestCouchBasePodRespin(E2ETest):
    """
    Deploy a CouchBase workload using the operator
    """
    @pytest.fixture()
    def cb_setup(self, couchbase_new_factory_fixture):
        """
        Creates couchbase workload
        """
        self.cb = couchbase_new_factory_fixture(replicas=3,
                                                run_in_bg=True,
                                                skip_analyze=True)
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["pod_name"],
        argvalues=[
            pytest.param(*["osd"], marks=pytest.mark.polarion_id("OCS-780")),
            pytest.param(*["mon"], marks=pytest.mark.polarion_id("OCS-779")),
            pytest.param(*["mgr"], marks=pytest.mark.polarion_id("OCS-781")),
            pytest.param(*["couchbase"],
                         marks=pytest.mark.polarion_id("OCS-786")),
        ],
    )
    def test_run_couchbase_respin_pod(self, cb_setup, pod_name):
        log.info(f"Respin Ceph pod {pod_name}")

        if pod_name == "couchbase":
            self.cb.respin_couchbase_app_pod()
        else:
            disruption = Disruptions()
            disruption.set_resource(resource=f"{pod_name}")
            disruption.delete_resource()

        bg_handler = flowtest.BackgroundOps()
        bg_ops = [self.cb.result]
        bg_handler.wait_for_bg_operations(bg_ops, timeout=3600)
        self.sanity_helpers.health_check(tries=40)
Example No. 8
class TestRegistryPodRespin(E2ETest):
    """
    Test to run svt workload for pushing
    images to registry and with Ceph pods respin
    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def setup(self, request):
        """
        Setup and clean up the namespace
        """

        self.project_name = "test"
        ocp_obj = ocp.OCP(kind=constants.NAMESPACES)
        ocp_obj.new_project(project_name=self.project_name)

        def finalizer():
            log.info("Clean up and remove namespace")
            ocp_obj.exec_oc_cmd(command=f"delete project {self.project_name}")

            # Reset namespace to default
            ocp.switch_to_default_rook_cluster_project()
            ocp_obj.wait_for_delete(resource_name=self.project_name)

        request.addfinalizer(finalizer)

    @pytest.mark.parametrize(
        argnames=["pod_name"],
        argvalues=[
            pytest.param(*["mon"], marks=pytest.mark.polarion_id("OCS-1797")),
            pytest.param(*["osd"], marks=pytest.mark.polarion_id("OCS-1798")),
            pytest.param(*["mgr"], marks=pytest.mark.polarion_id("OCS-1799")),
            pytest.param(*["mds"], marks=pytest.mark.polarion_id("OCS-1790")),
        ],
    )
    def test_registry_respin_pod(self, pod_name):
        """
        Test registry workload when backed by OCS respin of ceph pods
        """

        # Respin relevant pod
        log.info(f"Respin Ceph pod {pod_name}")
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=f"{pod_name}")
        disruption.delete_resource()

        # Pull and push images to registries
        log.info("Pull and push images to registries")
        image_pull_and_push(project_name=self.project_name)

        # Validate image exists in registries path
        validate_image_exists()

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check(tries=40)
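
The setup fixture above pairs new_project() with an addfinalizer cleanup. The same behaviour expressed as a yield fixture, using only the calls already present in the example (the fixture name is illustrative):

@pytest.fixture()
def registry_test_project():
    """Create the 'test' project for the registry workload and clean it up afterwards."""
    project_name = "test"
    ocp_obj = ocp.OCP(kind=constants.NAMESPACES)
    ocp_obj.new_project(project_name=project_name)
    yield project_name
    log.info("Clean up and remove namespace")
    ocp_obj.exec_oc_cmd(command=f"delete project {project_name}")
    ocp.switch_to_default_rook_cluster_project()
    ocp_obj.wait_for_delete(resource_name=project_name)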
Example No. 9
class TestPgSQLPodRespin(E2ETest):
    """
    Test running PGSQL and with Ceph pods respin
    """
    @pytest.fixture()
    def pgsql_setup(self, pgsql):
        """
        PGSQL test setup
        """
        # Deployment of postgres database
        pgsql.setup_postgresql(replicas=3)

        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["transactions", "pod_name"],
        argvalues=[
            pytest.param(*[3600, "mon"],
                         marks=pytest.mark.polarion_id("OCS-802")),
            pytest.param(*[3600, "osd"],
                         marks=pytest.mark.polarion_id("OCS-803")),
            pytest.param(*[3600, "mgr"],
                         marks=pytest.mark.polarion_id("OCS-804")),
            pytest.param(*[3600, "postgres"],
                         marks=pytest.mark.polarion_id("OCS-809")),
        ],
    )
    @pytest.mark.usefixtures(pgsql_setup.__name__)
    def test_run_pgsql_respin_pod(self, pgsql, transactions, pod_name):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=3,
                                       transactions=transactions,
                                       clients=3)
        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Check worker node utilization(adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)

        # Respin relevant pod
        if pod_name == "postgres":
            pgsql.respin_pgsql_app_pod()
        else:
            log.info(f"Respin Ceph pod {pod_name}")
            disruption = disruption_helpers.Disruptions()
            disruption.set_resource(resource=f"{pod_name}")
            disruption.delete_resource()

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached completed state after {diff_time.seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=40)
Example No. 10
class TestPgSQLNodeReboot(E2ETest):
    """
    Test running PGSQL and with Ceph pods respin
    """
    @pytest.fixture()
    def pgsql_setup(self, pgsql):
        """
        PGSQL test setup
        """
        # Deployment of postgres database
        pgsql.setup_postgresql(replicas=3)

        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["transactions", "pod_name"],
        argvalues=[
            pytest.param(*[3600, "osd"],
                         marks=pytest.mark.polarion_id("OCS-801"))
        ],
    )
    @pytest.mark.usefixtures(pgsql_setup.__name__)
    def test_run_pgsql_reboot_node(self, pgsql, nodes, transactions, pod_name):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=3,
                                       transactions=transactions,
                                       clients=3)

        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Select a node where pgbench is not running and reboot
        osd_nodes_list = get_osd_running_nodes()
        node_list = pgsql.filter_pgbench_nodes_from_nodeslist(osd_nodes_list)

        node_1 = get_node_objs(random.choice(node_list))
        log.info(f"Selected node {node_1} for reboot operation")

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)

        # Restart relevant node
        nodes.restart_nodes(node_1)

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached completed state after {diff_time.seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=40)
class TestMonitoringBackedByOCS(E2ETest):
    """
    Test cases to validate monitoring backed by OCS
    """

    num_of_pvcs = 5
    pvc_size = 5

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in status NotReady or unschedulable,
        for situations in which the test failed in between restarting
        or scheduling those nodes

        """
        def finalizer():

            # Validate all nodes are schedulable
            scheduling_disabled_nodes = [
                n.name for n in get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_READY_SCHEDULING_DISABLED
            ]
            if scheduling_disabled_nodes:
                schedule_nodes(scheduling_disabled_nodes)

            # Validate all nodes are in READY state
            not_ready_nodes = [
                n for n in get_node_objs() if n.ocp.get_resource_status(n.name)
                == constants.NODE_NOT_READY
            ]
            if not_ready_nodes:
                log.warning(
                    f"Nodes in NotReady status found: "
                    f"{[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes_by_stop_and_start(not_ready_nodes)
                wait_for_nodes_status()

            log.info("All nodes are in Ready status")

            assert prometheus_health_check(), "Prometheus health is degraded"

        request.addfinalizer(finalizer)

    @pytest.fixture()
    def pods(self, multi_pvc_factory, dc_pod_factory):
        """
        Prepare multiple dc pods for the test

        Returns:
            list: Pod instances

        """
        sc = default_storage_class(interface_type=constants.CEPHBLOCKPOOL)

        pvc_objs = multi_pvc_factory(
            interface=constants.CEPHBLOCKPOOL,
            storageclass=sc,
            size=self.pvc_size,
            num_of_pvc=self.num_of_pvcs,
        )

        pod_objs = []
        for pvc_obj in pvc_objs:
            pod_objs.append(dc_pod_factory(pvc=pvc_obj))

        # Check for the created pvc metrics on prometheus pod
        for pod_obj in pod_objs:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"Data for created PVC {pod_obj.pvc.name} is not collected on the Prometheus pod"
        return pod_objs

    @pytest.mark.polarion_id("OCS-576")
    def test_monitoring_after_restarting_prometheus_pod(self, pods):
        """
        Test case to validate prometheus pod restart
        should not have any functional impact

        """

        # Get the prometheus pod
        prometheus_pod_obj = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=["prometheus"])

        for pod_object in prometheus_pod_obj:
            # Get the pvc which mounted on prometheus pod
            pod_info = pod_object.get()
            pvc_name = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"][
                "claimName"]

            # Restart the prometheus pod
            pod_object.delete(force=True)
            pod_obj = ocp.OCP(kind=constants.POD,
                              namespace=defaults.OCS_MONITORING_NAMESPACE)
            assert pod_obj.wait_for_resource(condition="Running",
                                             selector="app=prometheus",
                                             timeout=60)

            # Check the same pvc is mounted on new pod
            pod_info = pod_object.get()
            assert (
                pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]
                ["claimName"] in pvc_name
            ), f"Old pvc not found after restarting the prometheus pod {pod_object.name}"

        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"Data for created PVC {pod_obj.pvc.name} is not collected on the Prometheus pod"

    @pytest.mark.polarion_id("OCS-579")
    def test_monitoring_after_draining_node_where_prometheus_hosted(
            self, pods):
        """
        Test case to validate that when the node hosting prometheus is drained,
        the prometheus pod re-spins on a new healthy node
        and there is no data/metrics loss

        """

        # Get the prometheus pod
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=["prometheus"])

        for pod_obj in pod_obj_list:
            # Get the pvc which mounted on prometheus pod
            pod_info = pod_obj.get()
            pvc_name = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"][
                "claimName"]

            # Get the node where the prometheus pod is hosted
            prometheus_pod_obj = pod_obj.get()
            prometheus_node = prometheus_pod_obj["spec"]["nodeName"]

            # Drain node where the prometheus pod hosted
            drain_nodes([prometheus_node])

            # Validate node is in SchedulingDisabled state
            wait_for_nodes_status(
                [prometheus_node],
                status=constants.NODE_READY_SCHEDULING_DISABLED)

            # Validate all prometheus pods are running
            POD = ocp.OCP(kind=constants.POD,
                          namespace=defaults.OCS_MONITORING_NAMESPACE)
            assert POD.wait_for_resource(
                condition="Running", selector="app=prometheus", timeout=180
            ), "One or more prometheus pods are not in running state"

            # Validate the prometheus pod was re-spun on a new healthy node
            pod_info = pod_obj.get()
            new_node = pod_info["spec"]["nodeName"]
            assert (
                new_node not in prometheus_node
            ), "Prometheus pod was not re-spun on a new node"
            log.info(f"Prometheus pod re-spun on new node {new_node}")

            # Validate same pvc is mounted on prometheus pod
            assert (
                pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]
                ["claimName"] in pvc_name
            ), f"Old pvc not found after restarting the prometheus pod {pod_obj.name}"

            # Validate the prometheus health is ok
            assert prometheus_health_check(
            ), "Prometheus cluster health is not OK"

            # Mark the nodes back to schedulable
            schedule_nodes([prometheus_node])

            # Wait some time after node scheduling back
            waiting_time = 30
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate node is in Ready State
            wait_for_nodes_status([prometheus_node],
                                  status=constants.NODE_READY)

            # Validate ceph health OK
            ceph_health_check(tries=40, delay=30)

        # Check the nodes are in Ready state and the cluster health is ok
        self.sanity_helpers.health_check()

        # Check for the created pvc metrics after draining the node where prometheus was hosted
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"Data for created PVC {pod_obj.pvc.name} is not collected on the Prometheus pod"

    @pytest.mark.polarion_id("OCS-580")
    def test_monitoring_after_respinning_ceph_pods(self, pods):
        """
        Test case to validate respinning the ceph pods and
        its interaction with prometheus pod

        """

        # Re-spin the ceph pods (i.e. mgr, mon, osd) one by one
        resource_to_delete = ["mgr", "mon", "osd"]
        disruption = Disruptions()
        for res_to_del in resource_to_delete:
            disruption.set_resource(resource=res_to_del)
            disruption.delete_resource()

        # Check for the created pvc metrics on prometheus pod
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"Data for created PVC {pod_obj.pvc.name} is not collected on the Prometheus pod"

    @pytest.mark.polarion_id("OCS-605")
    def test_monitoring_when_osd_down(self, pods):
        """
        Test case to validate monitoring when osd is down

        """

        # Get osd pods
        osd_pod_list = pod.get_osd_pods()

        # Bring one of the osds down (the first one)
        resource_name = osd_pod_list[0].get().get("metadata").get("name")
        assert modify_osd_replica_count(resource_name=resource_name,
                                        replica_count=0)

        # Validate the osd pod is deleted
        pod_obj = ocp.OCP(kind=constants.POD,
                          namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        assert pod_obj.wait_for_delete(
            resource_name=resource_name
        ), f"Resource {resource_name} is not deleted"

        # Check for the created pvc metrics when osd is down
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"Data for created PVC {pod_obj.pvc.name} is not collected on the Prometheus pod"

        # Bring the osd that was down back up
        assert modify_osd_replica_count(resource_name=resource_name,
                                        replica_count=1)

        # Validate osd is up and ceph health is ok
        self.sanity_helpers.health_check()

    @pytest.mark.polarion_id("OCS-606")
    def test_monitoring_when_one_of_the_prometheus_node_down(
            self, nodes, pods):
        """
        Test case to validate monitoring when a node hosting a
        prometheus pod goes down and is restarted

        """

        # Get all prometheus pods
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=["prometheus"])

        for pod_obj in pod_obj_list:
            # Get the node where the prometheus pod is hosted
            pod_node_obj = pod.get_pod_node(pod_obj)

            # Make one of the node down where the prometheus pod is hosted
            nodes.restart_nodes([pod_node_obj])

            # Validate all nodes are in READY state
            retry((CommandFailed, ResourceWrongStatusException),
                  tries=20,
                  delay=15)(wait_for_nodes_status)()

        # Check the nodes are in Ready state and the cluster health is ok
        self.sanity_helpers.health_check(tries=40)

        # Check all the prometheus pods are up
        for pod_obj in pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)

        # Check for the created pvc metrics after restarting node where prometheus pod is hosted
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"Data for created PVC {pod_obj.pvc.name} is not collected on the Prometheus pod"
            log.info(
                f"Data for created PVC {pod_obj.pvc.name} is collected on the Prometheus pod"
            )

    @pytest.mark.polarion_id("OCS-709")
    def test_monitoring_after_rebooting_master_node(self, nodes, pods):
        """
        Test case to validate rebooting master node shouldn't delete
        the data collected on prometheus pod

        """

        # Get the master node list
        master_nodes = get_nodes(node_type="master")

        # Reboot one after one master nodes
        for node in master_nodes:
            nodes.restart_nodes([node], wait=False)

            # Wait some time after rebooting master
            waiting_time = 40
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            wait_for_nodes_status_and_prometheus_health_check(pods)

        # Check the nodes are in Ready state and the cluster health is ok
        self.sanity_helpers.health_check()

    @pytest.mark.polarion_id("OCS-710")
    def test_monitoring_after_rebooting_node_where_mgr_is_running(
            self, nodes, pods):
        """
        Test case to validate rebooting a node where mgr is running
        should not delete the data collected on prometheus pod

        """

        # Get the mgr pod obj
        mgr_pod_obj = pod.get_mgr_pods()

        # Get the node where the mgr pod is hosted
        mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

        # Reboot the node where the mgr pod is hosted
        nodes.restart_nodes([mgr_node_obj])

        # Validate all nodes are in READY state
        retry((CommandFailed, ResourceWrongStatusException),
              tries=20,
              delay=15)(wait_for_nodes_status)()

        # Check for Ceph pods
        pod_obj = ocp.OCP(kind=constants.POD,
                          namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        assert pod_obj.wait_for_resource(condition="Running",
                                         selector="app=rook-ceph-mgr",
                                         timeout=600)
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-mon",
            resource_count=3,
            timeout=600,
        )
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-osd",
            resource_count=3,
            timeout=600,
        )

        # Check the nodes are in Ready state and the cluster health is ok
        self.sanity_helpers.health_check(tries=40)

        # Check for ceph health check metrics is updated with new mgr pod
        wait_to_update_mgrpod_info_prometheus_pod()

        # Check for the created pvc metrics after rebooting the node where mgr pod was running
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"Data for created PVC {pod_obj.pvc.name} is not collected on the Prometheus pod"

    @pytest.mark.polarion_id("OCS-711")
    @skipif_aws_i3
    def test_monitoring_shutdown_and_recovery_prometheus_node(
            self, nodes, pods):
        """
        Test case to validate that shutdown and recovery of a
        node where monitoring pods are running has no functional impact

        """
        # Get all prometheus pods
        prometheus_pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=["prometheus"])

        for prometheus_pod_obj in prometheus_pod_obj_list:
            # Get the node where the prometheus pod is hosted
            prometheus_node_obj = pod.get_pod_node(prometheus_pod_obj)

            # Shut down and recover the node (i.e. restart it) where the prometheus pod is hosted
            nodes.stop_nodes([prometheus_node_obj])

            waiting_time = 20
            log.info(f"Waiting for {waiting_time} seconds")
            time.sleep(waiting_time)

            nodes.start_nodes(nodes=[prometheus_node_obj])

            # Validate all nodes are in READY state
            retry((CommandFailed, ResourceWrongStatusException),
                  tries=20,
                  delay=15)(wait_for_nodes_status)()

        # Check all the prometheus pods are up
        for pod_obj in prometheus_pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)

        # Check the nodes are in Ready state and the cluster health is ok
        self.sanity_helpers.health_check(tries=40)

        # Check for the created pvc metrics after shutdown and recovery of prometheus nodes
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"Data for created PVC {pod_obj.pvc.name} is not collected on the Prometheus pod"

    @pytest.mark.polarion_id("OCS-638")
    def test_monitoring_delete_pvc(self):
        """
        Test case to validate that deleting the monitoring PVCs and the
        cluster-monitoring-config configmap and recreating them has no functional impact

        """
        # Get 'cluster-monitoring-config' configmap
        ocp_configmap = ocp.OCP(namespace=constants.MONITORING_NAMESPACE,
                                kind="configmap")
        configmap_dict = ocp_configmap.get(
            resource_name="cluster-monitoring-config")
        dir_configmap = tempfile.mkdtemp(prefix="configmap_")
        yaml_file = f"{dir_configmap}/configmap.yaml"
        templating.dump_data_to_temp_yaml(configmap_dict, yaml_file)

        # Get prometheus and alertmanager pods
        prometheus_alertmanager_pods = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=["prometheus", "alertmanager"],
        )

        # Get all pvc on monitoring namespace
        pvc_objs_list = pvc.get_all_pvc_objs(
            namespace=constants.MONITORING_NAMESPACE)

        # Delete configmap
        ocp_configmap.delete(resource_name="cluster-monitoring-config")

        # Delete all pvcs on monitoring namespace
        pvc.delete_pvcs(pvc_objs=pvc_objs_list)

        # Check all the prometheus and alertmanager pods are up
        for pod_obj in prometheus_alertmanager_pods:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)

        # Create configmap
        ocp_configmap.create(yaml_file=dir_configmap)

        # Check all the PVCs are up
        for pvc_obj in pvc_objs_list:
            wait_for_resource_state(resource=pvc_obj,
                                    state=constants.STATUS_BOUND,
                                    timeout=180)

        # Check all the prometheus and alertmanager pods are up
        # and pvc are mounted on monitoring pods
        for pod_obj in prometheus_alertmanager_pods:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)
            mount_point = pod_obj.exec_cmd_on_pod(
                command="df -kh",
                out_yaml_format=False,
            )
            assert "/dev/rbd" in mount_point, f"pvc is not mounted on pod {pod.name}"
        log.info("Verified all pvc are mounted on monitoring pods")

        # Validate the prometheus health is ok
        assert prometheus_health_check(), "Prometheus cluster health is not OK"

    @pytest.mark.polarion_id("OCS-1535")
    def test_monitoring_shutdown_mgr_pod(self, pods):
        """
        Monitoring backed by OCS: bring the mgr down (replicas: 0) for some time
        and check Ceph related metrics
        """
        # Check ceph metrics available
        assert (
            check_ceph_metrics_available()
        ), "failed to get results for some metrics before Downscaling deployment mgr to 0"

        # Get the mgr pod name and mgr deployment
        oc_deployment = ocp.OCP(kind=constants.DEPLOYMENT,
                                namespace=ROOK_CLUSTER_NAMESPACE)
        mgr_deployments = oc_deployment.get(
            selector=constants.MGR_APP_LABEL)["items"]
        mgr = mgr_deployments[0]["metadata"]["name"]
        pod_mgr_name = get_pod_name_by_pattern(
            pattern=mgr, namespace=ROOK_CLUSTER_NAMESPACE)

        log.info(f"Downscaling deployment {mgr} to 0")
        oc_deployment.exec_oc_cmd(f"scale --replicas=0 deployment/{mgr}")

        log.info(f"Wait for a mgr pod {pod_mgr_name[0]} to be deleted")
        oc_pod = ocp.OCP(kind=constants.POD, namespace=ROOK_CLUSTER_NAMESPACE)
        oc_pod.wait_for_delete(resource_name=pod_mgr_name[0])

        log.info(f"Upscaling deployment {mgr} back to 1")
        oc_deployment.exec_oc_cmd(f"scale --replicas=1 deployment/{mgr}")

        log.info("Waiting for mgr pod to be reach Running state")
        oc_pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MGR_APP_LABEL)

        # Check ceph metrics available
        assert (
            check_ceph_metrics_available()
        ), "failed to get results for some metrics after Downscaling and Upscaling deployment mgr"
Example No. 12
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes - Reactive
    """

    threads = []

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")
            for thread in self.threads:
                thread.join()
            ceph_health_check()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(
                *["rbd", "shutdown"],
                marks=[
                    pytest.mark.polarion_id("OCS-2102"),
                    pytest.mark.bugzilla("1845666"),
                ],
            ),
            pytest.param(*["rbd", "terminate"],
                         marks=pytest.mark.polarion_id("OCS-2103")),
            pytest.param(
                *["cephfs", "shutdown"],
                marks=[
                    pytest.mark.polarion_id("OCS-2104"),
                    pytest.mark.bugzilla("1845666"),
                ],
            ),
            pytest.param(*["cephfs", "terminate"],
                         marks=pytest.mark.polarion_id("OCS-2105")),
        ],
    )
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        failure,
        dc_pod_factory,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key="dc",
                          label_value="fedora")

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == "rbd":
            interface = constants.CEPHBLOCKPOOL
        elif interface == "cephfs":
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={"dc": "fedora"})
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)
        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: "
                     f"{failure_node_obj[0].name}")
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully terminated node : "
                     f"{failure_node_obj[0].name} instance")

        try:
            # DC app pods on the failed node will get automatically created on other
            # running node. Waiting for all dc app pod to reach running state
            pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                            timeout=720)
            log.info("All the dc pods reached running state")
            pod.wait_for_storage_pods()

        except ResourceWrongStatusException:
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(f"Successfully terminated node : "
                         f"{failure_node_obj[0].name} instance")
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
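
The teardown above removes the "dc" label that the test adds. Bundling the label/unlabel pair into a single yield fixture, using only helpers already referenced in this example (the fixture name is illustrative):

@pytest.fixture()
def fedora_labeled_osd_nodes():
    """Label the OSD-hosting workers for fedora DC pods and remove the label afterwards."""
    osd_nodes = get_osd_running_nodes()
    label_worker_node(osd_nodes, label_key="dc", label_value="fedora")
    yield osd_nodes
    remove_label_from_worker_node(get_worker_nodes(), label_key="dc")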
Example No. 13
class TestJenkinsNodeDrain(E2ETest):
    """
    Test running Jenkins and Node Drain
    """

    @pytest.fixture()
    def jenkins_setup(self, jenkins):
        """
        JENKINS test setup
        """
        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

        # Deployment of jenkins
        jenkins.create_ocs_jenkins_template()

    @pytest.mark.parametrize(
        argnames=["node_type", "num_projects", "num_of_builds"],
        argvalues=[
            pytest.param(
                *[WORKER_MACHINE, 4, 3], marks=pytest.mark.polarion_id("OCS-2252")
            ),
            pytest.param(
                *[MASTER_MACHINE, 3, 6], marks=pytest.mark.polarion_id("OCS-2176")
            ),
        ],
    )
    @pytest.mark.usefixtures(jenkins_setup.__name__)
    def test_run_jenkins_drain_node(
        self, jenkins, node_type, num_projects, num_of_builds
    ):
        """

        Test Node Drain jenkins
        """
        # Init number of projects
        jenkins.number_projects = num_projects

        # Create app jenkins
        jenkins.create_app_jenkins()

        # Create jenkins pvc
        jenkins.create_jenkins_pvc()

        # Create jenkins build config
        jenkins.create_jenkins_build_config()

        # Wait jenkins deploy pod reach to completed state
        jenkins.wait_for_jenkins_deploy_status(status=STATUS_COMPLETED)

        # Get relevant node
        nodes_drain = jenkins.get_node_name_where_jenkins_pod_not_hosted(
            node_type=node_type, num_of_nodes=1
        )

        # Init number of builds per project
        jenkins.number_builds_per_project = num_of_builds

        # Start Builds
        jenkins.start_build()

        if len(nodes_drain) > 0:
            # Node maintenance - to gracefully terminate all pods on the node
            drain_nodes(nodes_drain)
            # Make the node schedulable again
            schedule_nodes(nodes_drain)

        # Wait build reach 'Complete' state
        jenkins.wait_for_build_to_complete()

        # Print table of builds
        jenkins.print_completed_builds_results()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=40)
class TestNodeReplacement(ManageTest):
    """
    Knip-894 Node replacement - AWS-IPI-Reactive
    """
    @pytest.fixture(autouse=True)
    def teardown(self, request):
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")
            # Verify OSD encrypted
            if config.ENV_DATA.get("encryption_at_rest"):
                osd_encryption_verification()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(*["rbd", "power off"],
                         marks=pytest.mark.polarion_id("OCS-2118")),
            pytest.param(*["rbd", "network failure"],
                         marks=pytest.mark.polarion_id("OCS-2120")),
            pytest.param(*["cephfs", "power off"],
                         marks=pytest.mark.polarion_id("OCS-2119")),
            pytest.param(
                *["cephfs", "network failure"],
                marks=pytest.mark.polarion_id("OCS-2121"),
            ),
        ],
    )
    def test_node_replacement_reactive_aws_ipi(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        dc_pod_factory,
        failure,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Knip-894 Node replacement - AWS-IPI-Reactive

        """
        # Get worker nodes
        initial_nodes = get_worker_nodes()

        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key="dc",
                          label_value="fedora")

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == "rbd":
            interface = constants.CEPHBLOCKPOOL
        elif interface == "cephfs":
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={"dc": "fedora"})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "power off":
            # Power off AWS worker node instance
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully powered off node: {failure_node_obj[0].name}")
        elif failure == "network failure":
            # Induce Network failure
            node_network_failure([failure_node_obj[0].name])

        # Add annotation to the failed node
        annotation = "machine.openshift.io/exclude-node-draining=''"
        machine.add_annotation_to_machine(annotation=annotation,
                                          machine_name=machine_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Wait for the new machine to spin
        log.info("Waiting for the new node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)

        # Get the node name of new spun node
        nodes_after_new_spun_node = get_worker_nodes()
        new_spun_node = list(
            set(nodes_after_new_spun_node) - set(initial_nodes))
        log.info(f"New spun node is {new_spun_node}")

        # Label it
        node_obj = ocp.OCP(kind="node")
        node_obj.add_label(resource_name=new_spun_node[0],
                           label=constants.OPERATOR_NODE_LABEL)
        log.info(
            f"Successfully labeled {new_spun_node} with OCS storage label")

        # DC app pods on the failed node will get automatically created on other
        # running node. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                        timeout=1200)
        log.info("All the dc pods reached running state")

        pod.wait_for_storage_pods()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
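
The "find the newly spun worker" step above relies on a simple set difference; extracted as a helper (illustrative name), using the get_worker_nodes call from the example:

def get_new_worker_names(initial_nodes):
    """Return worker node names that did not exist before the machine was replaced."""
    return list(set(get_worker_nodes()) - set(initial_nodes))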
Example No. 15
def osd_device_replacement(nodes):
    """
    Replacing randomly picked osd device
    Args:
        node (OCS): The OCS object representing the node
    """
    logger.info("Picking a PV which to be deleted from the platform side")
    osd_pvs = get_deviceset_pvs()
    osd_pv = random.choice(osd_pvs)
    osd_pv_name = osd_pv.name
    # get the claim name
    logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
    claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

    # Get the backing volume name
    logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
    backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

    # Get the corresponding PVC
    logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
    osd_pvcs = get_deviceset_pvcs()
    osd_pvcs_count = len(osd_pvcs)
    osd_pvc = [
        ds for ds in osd_pvcs
        if ds.get().get("metadata").get("name") == claim_name
    ][0]

    # Get the corresponding OSD pod and ID
    logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
    osd_pods = get_osd_pods()
    osd_pods_count = len(osd_pods)
    osd_pod = [
        osd_pod for osd_pod in osd_pods if osd_pod.get().get("metadata").get(
            "labels").get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
    ][0]
    logger.info(f"OSD_POD {osd_pod.name}")
    osd_id = get_osd_pod_id(osd_pod)

    # Get the node that has the OSD pod running on
    logger.info(
        f"Getting the node that the OSD pod {osd_pod.name} is running on")
    osd_node = get_pod_node(osd_pod)
    ocp_version = get_ocp_version()
    if Version.coerce(ocp_version) < Version.coerce("4.6"):
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            pod for pod in osd_prepare_pods if pod.get().get("metadata").get(
                "labels").get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        osd_prepare_job_name = (osd_prepare_pod.get().get("metadata").get(
            "labels").get("job-name"))
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

    # Get the corresponding OSD deployment
    logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
    osd_deployment = [
        osd_pod for osd_pod in get_osd_deployments()
        if osd_pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
    ][0]
    osd_deployment_name = osd_deployment.name

    # Delete the volume from the platform side
    logger.info(f"Deleting {backing_volume} from the platform side")
    nodes.detach_volume(backing_volume, osd_node)

    # Scale down OSD deployment
    logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
    ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
    ocp_obj.exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}")

    # Force delete OSD pod if necessary
    osd_pod_name = osd_pod.name
    logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
    try:
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
    except TimeoutError:
        osd_pod.delete(force=True)
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

    # Run ocs-osd-removal job
    osd_removal_job = run_osd_removal_job([osd_id])
    assert osd_removal_job, "ocs-osd-removal failed to create"
    is_completed = verify_osd_removal_job_completed_successfully(osd_id)
    assert is_completed, "ocs-osd-removal-job is not in status 'completed'"
    logger.info("ocs-osd-removal-job completed successfully")

    osd_pvc_name = osd_pvc.name

    if Version.coerce(ocp_version) < Version.coerce("4.6"):
        # Delete the OSD prepare job
        logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(resource_name=osd_prepare_job_name,
                                            timeout=120)

        # Delete the OSD PVC
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Delete the OSD deployment
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(resource_name=osd_deployment_name,
                                           timeout=120)
    else:
        # If ocp version is '4.6' and above the osd removal job should
        # delete the OSD prepare job, OSD PVC, OSD deployment
        # We just need to verify the old PV is in the expected status
        logger.info(
            f"Verify that the old PV '{osd_pv_name}' is in the expected status"
        )
        if cluster.is_lso_cluster():
            expected_old_pv_statuses = [constants.STATUS_RELEASED]
        else:
            expected_old_pv_statuses = [
                constants.STATUS_RELEASED,
                constants.STATUS_FAILED,
            ]

        assert (osd_pv.ocp.get_resource_status(osd_pv_name)
                in expected_old_pv_statuses), (
            f"The old PV '{osd_pv_name}' is not in "
            f"the expected statuses: {expected_old_pv_statuses}")

    # Delete PV
    logger.info(f"Verifying deletion of PV {osd_pv_name}")
    try:
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
    except TimeoutError:
        osd_pv.delete()
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

    # If we use LSO, we need to create and attach a new disk manually
    if cluster.is_lso_cluster():
        node.add_disk_to_node(osd_node)

    if Version.coerce(ocp_version) < Version.coerce("4.6"):
        # Delete the rook ceph operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(
            f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
        rook_operator_pod.delete()

    # Delete the OSD removal job
    logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
    is_deleted = delete_osd_removal_job(osd_id)
    assert is_deleted, "Failed to delete ocs-osd-removal-job"
    logger.info("ocs-osd-removal-job deleted successfully")

    timeout = 600
    # Wait for OSD PVC to get created and reach Bound state
    logger.info(
        "Waiting for a new OSD PVC to get created and reach Bound state")
    assert osd_pvc.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_BOUND,
        selector=constants.OSD_PVC_GENERIC_LABEL,
        resource_count=osd_pvcs_count,
    ), (f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
        f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
        )
    # Wait for OSD pod to get created and reach Running state
    logger.info(
        "Waiting for a new OSD pod to get created and reach Running state")
    assert osd_pod.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=osd_pods_count,
    ), (f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
        f"{[pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
        )

    # We need to silence the old osd crash warning due to BZ https://bugzilla.redhat.com/show_bug.cgi?id=1896810
    # This is a workaround - issue for tracking: https://github.com/red-hat-storage/ocs-ci/issues/3438
    if Version.coerce(ocp_version) >= Version.coerce("4.6"):
        silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
            osd_pod_name)
        if not silence_osd_crash:
            logger.info("Didn't find ceph osd crash warning")
    sanity_helpers = Sanity()
    sanity_helpers.health_check(tries=120)
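A later example on this page drives this same procedure through osd_operations.osd_device_replacement(nodes). Below is a minimal, hypothetical sketch of wiring it into a helper; the import paths (ocs_ci.ocs.osd_operations, ocs_ci.helpers.sanity_helpers) are assumptions, and only the osd_device_replacement(nodes) call itself is taken from the example further down.
# Hypothetical sketch; the import paths are assumptions and may differ between
# ocs-ci versions. "nodes" is the platform nodes object used throughout this page.
from ocs_ci.helpers.sanity_helpers import Sanity
from ocs_ci.ocs import osd_operations


def replace_osd_device_and_verify(nodes):
    """Run the OSD device replacement flow above, then re-check cluster health."""
    osd_operations.osd_device_replacement(nodes)
    Sanity().health_check(tries=120)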
Ejemplo n.º 16
0
class TestDiskFailures(ManageTest):
    """
    Test class for detach and attach worker volume

    """
    def detach_volume_and_wait_for_attach(self, nodes, data_volume,
                                          worker_node):
        """
        Detach an EBS volume from an AWS instance and wait for the volume
        to be re-attached

        Args:
            nodes (NodesBase): The nodes object representing the platform nodes
            data_volume (Volume): The EBS volume to detach
            worker_node (OCS): The OCS object of the EC2 instance

        """
        try:
            # Detach volume (logging is done inside the function)
            nodes.detach_volume(data_volume, worker_node)
        except AWSTimeoutException as e:
            if "Volume state: in-use" in str(e):
                logger.info(
                    f"Volume {data_volume} re-attached successfully to worker"
                    f" node {worker_node}")
            else:
                raise
        else:
            # Wait for worker volume to be re-attached automatically
            # to the node
            assert nodes.wait_for_volume_attach(data_volume), (
                f"Volume {data_volume} failed to be re-attached to worker "
                f"node {worker_node}")

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in NotReady status, to cover situations in
        which the test failed before restarting the node after a volume detach,
        leaving the node in NotReady status

        """
        def finalizer():
            not_ready_nodes = [
                n for n in node.get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_NOT_READY
            ]
            if not_ready_nodes:
                logger.warning(
                    f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

            # Restart node if the osd stays at CLBO state
            osd_pods_obj_list = get_osd_pods()
            for pod in osd_pods_obj_list:
                if (pod.get().get("status").get("containerStatuses")[0].get(
                        "state") == constants.STATUS_CLBO):
                    node_obj = get_pod_node(pod)
                    nodes.restart_nodes([node_obj])
                    node.wait_for_nodes_status([node_obj.name])

            # Verify OSD encrypted
            if config.ENV_DATA.get("encryption_at_rest"):
                osd_encryption_verification()

            logger.info("Clear crash warnings and osd removal leftovers")
            clear_crash_warning_and_osd_removal_leftovers()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @cloud_platform_required
    @pytest.mark.polarion_id("OCS-1085")
    @bugzilla("1825675")
    def test_detach_attach_worker_volume(self, nodes, pvc_factory, pod_factory,
                                         bucket_factory, rgw_bucket_factory):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Wait for the volumes to be re-attached back to the worker node
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be
          unhealthy) by creating resources and running IO
        - Restart the node so the volume will get re-mounted

        """
        # Get a data volume
        data_volume = nodes.get_data_volumes()[0]
        # Get the worker node according to the volume attachment
        worker = nodes.get_node_by_attached_volume(data_volume)

        # Detach volume and wait for the volume to attach
        self.detach_volume_and_wait_for_attach(nodes, data_volume, worker)

        # Validate cluster is still functional
        # In case the selected node that its volume disk was detached was the one
        # running the ceph tools pod, we'll need to wait for a new ct pod to start.
        # For that, a function that connects to the ct pod is being used to check if
        # it's alive
        assert (wait_for_ct_pod_recovery()
                ), "Ceph tools pod failed to come up on another node"

        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)

        # Restart the instance so the volume will get re-mounted
        nodes.restart_nodes([worker])

        # Cluster health check
        # W/A: For the investigation of BZ 1825675, timeout is increased to see if cluster
        # becomes healthy eventually
        # TODO: Remove 'tries=100'
        self.sanity_helpers.health_check(tries=100)

    @cloud_platform_required
    @pytest.mark.polarion_id("OCS-1086")
    @skipif_ibm_cloud
    def test_detach_attach_2_data_volumes(self, nodes, pvc_factory,
                                          pod_factory, bucket_factory,
                                          rgw_bucket_factory):
        """
        Detach and attach disk from 2 worker nodes

        - Detach the data 2 of the data volumes from their worker nodes
        - Wait for the volumes to be re-attached back to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Get 2 data volumes
        data_volumes = nodes.get_data_volumes()[:2]
        workers_and_volumes = [{
            "worker": nodes.get_node_by_attached_volume(vol),
            "volume": vol
        } for vol in data_volumes]
        for worker_and_volume in workers_and_volumes:
            # Detach volume and wait for the volume to attach
            self.detach_volume_and_wait_for_attach(nodes,
                                                   worker_and_volume["volume"],
                                                   worker_and_volume["worker"])
        # Restart the instances so the volume will get re-mounted
        nodes.restart_nodes([
            worker_and_volume["worker"]
            for worker_and_volume in workers_and_volumes
        ])

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)

    @bugzilla("1830702")
    @vsphere_platform_required
    @pytest.mark.polarion_id("OCS-2172")
    def test_recovery_from_volume_deletion(self, nodes, pvc_factory,
                                           pod_factory, bucket_factory,
                                           rgw_bucket_factory):
        """
        Test cluster recovery from disk deletion from the platform side.
        Based on documented procedure detailed in
        https://bugzilla.redhat.com/show_bug.cgi?id=1823183

        """
        osd_operations.osd_device_replacement(nodes)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
Ejemplo n.º 17
0
class TestCheckPodsAfterNodeFailure(ManageTest):
    """
    Test check pods status after a node failure event.

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """

        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Make sure all nodes are up again

        """
        def finalizer():
            not_ready_nodes = get_nodes_in_statuses([constants.NODE_NOT_READY])
            if not_ready_nodes:
                not_ready_node_names = [n.name for n in not_ready_nodes]
                log.warning(
                    f"We have nodes in not ready statuses: {not_ready_node_names}. "
                    f"Starting the nodes that are not ready...")
                nodes.restart_nodes(not_ready_nodes)
                wait_for_nodes_status(node_names=not_ready_node_names)
            else:
                log.info("All the nodes are in 'Ready' state")

        request.addfinalizer(finalizer)

    def test_check_pods_status_after_node_failure(self, nodes,
                                                  node_restart_teardown):
        """
        Test check pods status after a node failure event.
        All the rook ceph pods should be in "Running" or "Completed"
        state after a node failure event.

        """
        ocs_nodes = get_ocs_nodes()
        if not ocs_nodes:
            pytest.skip("We don't have ocs nodes in the cluster")

        ocs_node = random.choice(ocs_nodes)
        node_name = ocs_node.name
        log.info(f"Selected node is '{node_name}'")
        # Save the rook ceph pods, the osd ids, and the mon ids before shutting down the node
        rook_ceph_pod_names_not_in_node = get_rook_ceph_pod_names_not_in_node(
            node_name)
        node_osd_ids = get_node_osd_ids(node_name)
        node_mon_ids = get_node_mon_ids(node_name)

        log.info(f"Shutting down node '{node_name}'")
        nodes.stop_nodes([ocs_node])
        wait_for_nodes_status(node_names=[node_name],
                              status=constants.NODE_NOT_READY)
        log.info(
            f"The node '{node_name}' reached '{constants.NODE_NOT_READY}' status"
        )

        log.info("Wait for a change in the rook ceph pod statuses...")
        timeout = 480
        is_rook_ceph_pods_status_changed = wait_for_change_in_rook_ceph_pods(
            node_name, timeout=timeout)
        assert (
            is_rook_ceph_pods_status_changed
        ), f"Rook Ceph pods status didn't change after {timeout} seconds"

        log.info(
            "Check the rook ceph pods are in 'Running' or 'Completed' state")
        timeout = 480
        are_pods_running = wait_for_pods_to_be_running(
            pod_names=rook_ceph_pod_names_not_in_node,
            timeout=timeout,
            sleep=30)
        assert are_pods_running, f"The pods are not 'Running' after {timeout} seconds"

        # Collect the osd and mon pods that still carry the old node's ids, so they can be excluded below
        osd_pods = get_osd_pods()
        new_node_osd_id_names_set = {
            p.name
            for p in osd_pods if get_osd_pod_id(p) in node_osd_ids
        }
        mon_pods = get_mon_pods()
        new_node_mon_id_names_set = {
            p.name
            for p in mon_pods if get_mon_pod_id(p) in node_mon_ids
        }

        new_node_osd_mon_id_names_set = new_node_osd_id_names_set.union(
            new_node_mon_id_names_set)
        rook_ceph_pod_names_set = set(get_rook_ceph_pod_names())
        new_rook_ceph_pod_names = list(rook_ceph_pod_names_set -
                                       new_node_osd_mon_id_names_set)

        log.info(
            "Verify that the new rook ceph pods are in 'Running' or 'Completed' state"
        )
        timeout = 300
        are_new_pods_running = wait_for_pods_to_be_running(
            pod_names=new_rook_ceph_pod_names, timeout=timeout, sleep=20)
        assert (are_new_pods_running
                ), f"The new pods are not 'Running' after {timeout} seconds"

        log.info("All the pods are in 'Running' or 'Completed' state")
        log.info(f"Starting the node '{node_name}' again...")
        nodes.start_nodes(nodes=[ocs_node])
        wait_for_nodes_status(node_names=[node_name])

        log.info(
            "Waiting for all the pods to be running and cluster health to be OK..."
        )
        wait_for_pods_to_be_running(timeout=600)
        self.sanity_helpers.health_check(tries=40)
class TestOCSWorkerNodeShutdown(ManageTest):
    """
    Test case to validate that the MDS pods and the rbd and cephfs plugin
    provisioner pods are not running on the same node after shutdown and recovery

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """

        self.sanity_helpers = Sanity()

    @pytest.mark.polarion_id("OCS-2315")
    @skipif_ibm_cloud
    @skipif_managed_service
    def test_check_pod_status_after_two_nodes_shutdown_recovery(
            self, nodes, node_restart_teardown):
        """
        Test case to check that MDS pods and rbd and cephfs plugin provisioner
        pods are not running on the same node after node shutdown and recovery

        """

        # Get MDS, rbd, cephfs plugin provisioner pods running nodes
        # before shutdown

        log.info("Check pod nodes before nodes shutdown")
        list_of_nodes_running_pods(selector="rook-ceph-mds")

        list_of_nodes_running_pods(selector="csi-rbdplugin-provisioner")

        list_of_nodes_running_pods(selector="csi-cephfsplugin-provisioner")

        # Get the node list
        node = get_nodes(node_type="worker", num_of_nodes=2)

        # Shutdown 2 worker nodes for 10 mins
        nodes.stop_nodes(nodes=node)

        waiting_time = 600
        log.info(f"Waiting for {waiting_time} seconds")
        time.sleep(waiting_time)

        nodes.start_nodes(nodes=node)

        # Validate all nodes are in READY state and up
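        # retry(...) returns a decorator: wrap the callable first, then invoke
        # the wrapped callable with its arguments so failed attempts are retried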
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=30,
            delay=15,
        )(wait_for_nodes_status)(timeout=1800)

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check()
        wait_for_storage_pods()

        # Get MDS, rbd & cephfs plugin provisioner pods running
        # nodes post-recovery
        mds_running_nodes_after_recovery = list_of_nodes_running_pods(
            selector="rook-ceph-mds")

        rbd_provisioner_running_nodes_after_recovery = list_of_nodes_running_pods(
            selector="csi-rbdplugin-provisioner")

        cephfs_provisioner_running_nodes_after_recovery = list_of_nodes_running_pods(
            selector="csi-cephfsplugin-provisioner")

        assert len(set(mds_running_nodes_after_recovery)) == len(
            mds_running_nodes_after_recovery
        ), "MDS pods are running on the same node, which is not expected"
        log.info("MDS pods not running on same node")

        assert len(set(rbd_provisioner_running_nodes_after_recovery)) == len(
            rbd_provisioner_running_nodes_after_recovery
        ), "rbd plugin provisioner pods are running on the same node, which is not expected"
        log.info("RBD plugin provisioner pods not running on same node")

        assert len(
            set(cephfs_provisioner_running_nodes_after_recovery)
        ) == len(
            cephfs_provisioner_running_nodes_after_recovery
        ), "cephfs plugin provisioner pods are running on the same node, which is not expected"
        log.info("CEPHFS plugin provisioner pods not running on same node")
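The retry helper used throughout these examples is a decorator factory: retry(exceptions, tries, delay) wraps the callable first, and the wrapped callable is then invoked with its own arguments so that failed attempts are re-run. A simplified, self-contained stand-in that illustrates the calling convention (a sketch, not the ocs-ci implementation) follows.
# Simplified stand-in for the retry decorator factory used in these tests; the
# real helper lives in ocs-ci. It only illustrates the call shape
# retry(exceptions, tries, delay)(callable)(args...).
import time


def retry(exceptions, tries=3, delay=1):
    def decorator(func):
        def wrapper(*args, **kwargs):
            for attempt in range(1, tries + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    if attempt == tries:
                        raise
                    time.sleep(delay)
        return wrapper
    return decorator


# Example: re-run a flaky check up to three times before giving up.
retry((TimeoutError,), tries=3, delay=1)(print)("nodes are back in Ready state")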
Ejemplo n.º 19
0
class TestRegistryRebootNode(E2ETest):
    """
    Test to run svt workload for pushing
    images to registry when node is rebooted
    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def setup(self, project_factory, node_restart_teardown):
        """
        Setup and clean up
        """

        self.project_name = "test"
        project_factory(project_name=self.project_name)

    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*[MASTER_MACHINE],
                         marks=pytest.mark.polarion_id("OCS-1803")),
            pytest.param(*[WORKER_MACHINE],
                         marks=pytest.mark.polarion_id("OCS-1795")),
        ],
    )
    def test_registry_reboot_node(self, node_type, nodes):
        """
        Test registry workload when backed by OCS and reboot node
        """

        # Get the node list
        node = get_nodes(node_type, num_of_nodes=1)

        # Pull and push images to registries
        log.info("Pull and push images to registries")
        image_pull_and_push(project_name=self.project_name)

        # Validate image exists in registries path
        validate_image_exists()

        # Reboot one node
        nodes.restart_nodes(node, wait=False)

        # Validate all nodes and services are in READY state and up
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_cluster_connectivity)(tries=400)
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_nodes_status)(timeout=900)

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check(tries=40)

        # Validate storage pods are running
        wait_for_storage_pods()

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate image exists in registries path
        validate_image_exists()

    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*[MASTER_MACHINE],
                         marks=pytest.mark.polarion_id("OCS-1802")),
            pytest.param(*[WORKER_MACHINE],
                         marks=pytest.mark.polarion_id("OCS-1804")),
        ],
    )
    def test_registry_rolling_reboot_node(self, node_type, nodes):
        """
        Test registry workload when backed by OCS and reboot node one by one
        """

        # Get the node list
        node_list = get_nodes(node_type)

        # Pull and push images to registries
        log.info("Pull and push images to registries")
        image_pull_and_push(project_name=self.project_name)

        # Validate image exists in registries path
        validate_image_exists()

        for node in node_list:

            # Reboot node
            log.info(node.name)
            nodes.restart_nodes([node], wait=False)

            # Wait some time after rebooting node
            waiting_time = 40
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate all nodes and services are in READY state and up
            retry(
                (
                    CommandFailed,
                    TimeoutError,
                    AssertionError,
                    ResourceWrongStatusException,
                ),
                tries=60,
                delay=15,
            )(wait_for_cluster_connectivity)(tries=400)
            retry(
                (
                    CommandFailed,
                    TimeoutError,
                    AssertionError,
                    ResourceWrongStatusException,
                ),
                tries=60,
                delay=15,
            )(wait_for_nodes_status)(timeout=900)

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check(tries=40)

        # Validate storage pods are running
        wait_for_storage_pods()

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate image exists in registries path
        validate_image_exists()
class TestPvcCreationAfterDelMonService(E2ETest):
    """
    Tests to verify PVC creation after deleting
    mon services manually
    """

    @bugzilla("1858195")
    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(
                constants.CEPHBLOCKPOOL, marks=pytest.mark.polarion_id("OCS-2495")
            ),
            pytest.param(
                constants.CEPHFILESYSTEM, marks=pytest.mark.polarion_id("OCS-2494")
            ),
        ],
    )
    def test_pvc_creation_after_del_mon_services(self, interface, pod_factory):
        """
        1. Delete one mon service
        2. Edit the configmap rook-ceph-mon-endpoints and
           remove all entries of the deleted mon service
        3. Delete the deployment and pvc of the deleted mon service
        4. Restart rook-ceph-operator
        5. Make sure all mon pods are running
        6. Make sure ceph health is OK and storage pods are running
        7. Sleep for 300 seconds before deleting another mon
        8. Repeat the above steps for all mons; at the
           end each mon should have a different endpoint
        9. Create PVC; it should succeed.

        """

        pod_obj = pod_factory(interface=interface)
        run_io_in_bg(pod_obj)

        # Get all mon services
        mon_svc = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )

        # Get all mon pods
        mon_pods = get_mon_pods()
        mon_count = len(mon_pods)

        list_old_svc = []
        for svc in mon_svc:

            # Get rook-ceph-operator pod obj
            operator_pod_obj = get_operator_pods()
            operator_name = operator_pod_obj[0].name

            # Scale down rook-ceph-operator
            log.info("Scale down rook-ceph-operator")
            assert modify_deployment_replica_count(
                deployment_name="rook-ceph-operator", replica_count=0
            ), "Failed to scale down rook-ceph-operator to 0"
            log.info("Successfully scaled down rook-ceph-operator to 0")

            # Validate rook-ceph-operator pod not running
            POD_OBJ.wait_for_delete(resource_name=operator_name)

            svc_name = svc["metadata"]["name"]
            cluster_ip = svc["spec"]["clusterIP"]
            port = svc["spec"]["ports"][0]["port"]
            mon_endpoint = f"{cluster_ip}:{port}"
            mon_id = svc["spec"]["selector"]["mon"]
            list_old_svc.append(cluster_ip)

            # Delete deployment
            log.info("Delete mon deployments")
            del_obj = OCP(
                kind=constants.DEPLOYMENT,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            mon_info = del_obj.get(resource_name=svc_name)
            del_obj.delete(resource_name=svc_name)

            # Delete pvc
            if is_lso_cluster():
                mon_data_path = f"/var/lib/rook/mon-{mon_id}"
                mon_node = mon_info["spec"]["template"]["spec"]["nodeSelector"][
                    "kubernetes.io/hostname"
                ]
                log.info(f"Delete the directory `{mon_data_path}` from {mon_node}")
                cmd = f"rm -rf {mon_data_path}"
                ocp_obj = OCP(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
                ocp_obj.exec_oc_debug_cmd(node=mon_node, cmd_list=[cmd])
            else:
                log.info("Delete mon PVC")
                pvc_name = svc["metadata"]["labels"]["pvc_name"]
                pvc_obj = OCP(
                    kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
                )
                pvc_obj.delete(resource_name=pvc_name)

            # Delete the mon service
            log.info("Delete mon service")
            svc_obj = OCP(
                kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            svc_obj.delete(resource_name=svc_name)

            # Edit the cm
            log.info(f"Edit the configmap {constants.ROOK_CEPH_MON_ENDPOINTS}")
            configmap_obj = OCP(
                kind=constants.CONFIGMAP,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            output_get = configmap_obj.get(
                resource_name=constants.ROOK_CEPH_MON_ENDPOINTS
            )
            new_data = output_get["data"]
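            # The configmap data typically holds three keys: "data" with
            # "<mon-id>=<ip>:<port>" pairs, "mapping" with per-mon node
            # assignments, and "csi-cluster-config-json" with the monitor
            # endpoints handed to the CSI drivers. The deleted mon's entry
            # is stripped from each of them below.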
            new_data["csi-cluster-config-json"] = (
                new_data["csi-cluster-config-json"].replace(f'"{mon_endpoint}",', "")
                if new_data["csi-cluster-config-json"].find(f'"{mon_endpoint}",') != -1
                else new_data["csi-cluster-config-json"].replace(
                    f',"{mon_endpoint}"', ""
                )
            )
            new_data["data"] = ",".join(
                [
                    value
                    for value in new_data["data"].split(",")
                    if f"{mon_id}=" not in value
                ]
            )
            new_data["mapping"] = (
                new_data["mapping"].replace(f'"{mon_id}":null,', "")
                if new_data["mapping"].find(f'"{mon_id}":null,') != -1
                else new_data["mapping"].replace(f',"{mon_id}":null', "")
            )
            params = f'{{"data": {json.dumps(new_data)}}}'
            log.info(f"Removing {mon_id} entries from configmap")
            configmap_obj.patch(
                resource_name=constants.ROOK_CEPH_MON_ENDPOINTS,
                params=params,
                format_type="strategic",
            )
            log.info(
                f"Configmap {constants.ROOK_CEPH_MON_ENDPOINTS} edited successfully"
            )

            # Scale up rook-ceph-operator
            log.info("Scale up rook-ceph-operator")
            assert modify_deployment_replica_count(
                deployment_name="rook-ceph-operator", replica_count=1
            ), "Failed to scale up rook-ceph-operator to 1"
            log.info("Successfully scaled up rook-ceph-operator to 1")
            log.info("Validate rook-ceph-operator pod is running")
            POD_OBJ.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.OPERATOR_LABEL,
                resource_count=1,
                timeout=600,
                sleep=5,
            )

            # Validate all mons are running
            log.info("Validate all mons are up and running")
            POD_OBJ.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.MON_APP_LABEL,
                resource_count=mon_count,
                timeout=1200,
                sleep=5,
            )
            log.info("All mons are up and running")

            # Check the ceph health OK
            ceph_health_check(tries=90, delay=15)

            # Validate all storage pods are running
            wait_for_storage_pods()

            # Sleep for some seconds before deleting another mon
            sleep_time = 300
            log.info(f"Waiting for {sleep_time} seconds before deleting another mon")
            time.sleep(sleep_time)

        # Check the endpoints are different
        log.info("Validate the mon endpoints are changed")
        new_mon_svc = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        list_new_svc = []
        for new_svc in new_mon_svc:
            cluster_ip = new_svc["spec"]["clusterIP"]
            list_new_svc.append(cluster_ip)
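        # The symmetric difference matches the combined length only when the two
        # sets share no IP, i.e. every mon received a new endpoint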
        diff = set(list_new_svc) ^ set(list_old_svc)
        assert len(diff) == len(list_old_svc + list_new_svc), (
            f"Not all endpoints are changed. Set of old "
            f"endpoints {list_old_svc} and new endpoints {list_new_svc}"
        )
        log.info(f"All new mon endpoints are created {list_new_svc}")

        # Create PVC and pods
        log.info(f"Create {interface} PVC")
        pod_obj = pod_factory(interface=interface)
        pod_obj.run_io(storage_type="fs", size="500M")

    @pytest.fixture()
    def validate_all_mon_svc_are_up_at_teardown(self, request):
        """
        Verifies all mon services are running

        """

        # Get all mon services
        mon_svc_list = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )

        # Get all mon pods
        mon_pods_list = get_mon_pods()

        def finalizer():

            # Validate all mon services are running
            if len(mon_svc_list) != len(
                get_services_by_label(
                    label=constants.MON_APP_LABEL,
                    namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
                )
            ):

                # Restart the rook-operator pod
                operator_pod_obj = get_operator_pods()
                delete_pods(pod_objs=operator_pod_obj)
                POD_OBJ.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=constants.OPERATOR_LABEL,
                )

                # Wait till all mon services are up
                for svc_list in TimeoutSampler(
                    1200,
                    len(mon_svc_list),
                    get_services_by_label,
                    constants.MON_APP_LABEL,
                    constants.OPENSHIFT_STORAGE_NAMESPACE,
                ):
                    try:
                        if len(svc_list) == len(mon_svc_list):
                            log.info("All expected mon services are up")
                            break
                    except IndexError:
                        log.error(
                            f"Not all expected mon services are up; only found: {svc_list}. "
                            f"Expected: {mon_svc_list}"
                        )

                # Wait till all mon pods running
                POD_OBJ.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=constants.MON_APP_LABEL,
                    resource_count=len(mon_pods_list),
                    timeout=600,
                    sleep=3,
                )

                # Check the ceph health OK
                ceph_health_check(tries=90, delay=15)

        request.addfinalizer(finalizer)

    @bugzilla("1969733")
    @pytest.mark.polarion_id("OCS-2611")
    def test_del_mon_svc(
        self, multi_pvc_factory, validate_all_mon_svc_are_up_at_teardown
    ):
        """
        Test to verify same mon comes up and running
        after deleting mon services manually and joins the quorum

        1. Delete the mon services
        2. Restart the rook operator
        3. Make sure all mon pods are running
           and the same services and endpoints are recreated
        4. Make sure ceph health is OK and storage pods are running
        5. Create PVC; it should succeed.

        """

        self.sanity_helpers = Sanity()

        # Get all mon services
        mon_svc_before = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )

        # Get all mon pods
        mon_pods = get_mon_pods()

        # Delete the mon services one by one
        svc_obj = OCP(
            kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        mon_svc_ip_before = []
        for svc in mon_svc_before:
            svc_name = svc["metadata"]["name"]
            mon_svc_ip_before.append(svc["spec"]["clusterIP"])
            log.info(f"Delete mon service {svc_name}")
            svc_obj.delete(resource_name=svc_name)
            # Verify mon services deleted
            svc_obj.wait_for_delete(resource_name=svc_name)

        # Restart the rook-operator pod
        operator_pod_obj = get_operator_pods()
        delete_pods(pod_objs=operator_pod_obj)
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING, selector=constants.OPERATOR_LABEL
        )

        # Verify same mon services are created again
        for svc in mon_svc_before:
            svc_name = svc["metadata"]["name"]
            svc_obj.check_resource_existence(
                should_exist=True, timeout=300, resource_name=svc_name
            )
        log.info("Same old mon services are recreated")

        # Validate all mons are running
        log.info("Validate all mons are up and running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=len(mon_pods),
            timeout=600,
            sleep=3,
        )

        # Validate same mon services are running
        log.info("Validate same mon services are running")
        mon_svc_after = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        mon_svc_ip_after = [svc["spec"]["clusterIP"] for svc in mon_svc_after]
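        # An empty symmetric difference means the set of cluster IPs is unchanged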
        assert len(set(mon_svc_ip_after) ^ set(mon_svc_ip_before)) == 0, (
            "Different mon services are running. "
            f"Before mon services list: {mon_svc_ip_before}, "
            f"After mon services list: {mon_svc_ip_after}"
        )
        log.info("Same old mon services are running and all mons are in running state")

        # Verify everything running fine
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Create and delete resources
        self.sanity_helpers.create_pvc_delete(multi_pvc_factory=multi_pvc_factory)
class TestCouchBaseNodeReboot(E2ETest):
    """
    Deploy a CouchBase workload using the operator
    """
    @pytest.fixture()
    def cb_setup(self, couchbase_factory_fixture):
        """
        Creates couchbase workload
        """
        self.cb = couchbase_factory_fixture(replicas=3,
                                            run_in_bg=True,
                                            skip_analyze=True)

        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["pod_name_of_node"],
        argvalues=[
            pytest.param(*["osd"], marks=pytest.mark.polarion_id("OCS-776")),
            pytest.param(*["master"],
                         marks=pytest.mark.polarion_id("OCS-783")),
            pytest.param(*["couchbase"],
                         marks=pytest.mark.polarion_id("OCS-776")),
        ],
    )
    def test_run_couchbase_node_reboot(self, cb_setup, nodes,
                                       pod_name_of_node):
        """
        Test couchbase workload with node reboot
        """
        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)
        get_node_resource_utilization_from_adm_top(node_type="master",
                                                   print_table=True)

        if pod_name_of_node == "couchbase":
            node_list = self.cb.get_couchbase_nodes()
        elif pod_name_of_node == "osd":
            node_list = get_osd_running_nodes()
        elif pod_name_of_node == "master":
            master_node = get_nodes(pod_name_of_node, num_of_nodes=1)

        # Restart relevant node
        if pod_name_of_node == "master":
            nodes.restart_nodes(master_node, wait=False)
            waiting_time = 40
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)
        else:
            restart_node = get_node_objs(node_list[random.randint(
                0,
                len(node_list) - 1)])
            nodes.restart_nodes(restart_node)

        # Validate all nodes and services are in READY state and up

        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(ocp.wait_for_cluster_connectivity)(tries=400)
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_nodes_status)(timeout=1800)
        bg_handler = flowtest.BackgroundOps()
        bg_ops = [self.cb.result]
        retry((CommandFailed), tries=60,
              delay=15)(bg_handler.wait_for_bg_operations)(bg_ops, timeout=3600)
        self.sanity_helpers.health_check(tries=40)
Ejemplo n.º 22
0
class TestNodesRestart(ManageTest):
    """
    Test ungraceful cluster shutdown
    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Make sure all nodes are up again

        """
        def finalizer():
            nodes.restart_nodes_by_stop_and_start_teardown()

        request.addfinalizer(finalizer)

    @pytest.mark.parametrize(
        argnames=["force"],
        argvalues=[
            pytest.param(*[True], marks=pytest.mark.polarion_id("OCS-894")),
            pytest.param(
                *[False],
                marks=[
                    pytest.mark.polarion_id("OCS-895"), cloud_platform_required
                ],
            ),
        ],
    )
    def test_nodes_restart(self, nodes, pvc_factory, pod_factory, force,
                           bucket_factory, rgw_bucket_factory):
        """
        Test nodes restart (from the platform layer, i.e, EC2 instances, VMWare VMs)

        """
        ocp_nodes = get_node_objs()
        nodes.restart_nodes_by_stop_and_start(nodes=ocp_nodes, force=force)
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)

    @bugzilla("1754287")
    @pytest.mark.polarion_id("OCS-2015")
    def test_rolling_nodes_restart(self, nodes, pvc_factory, pod_factory,
                                   bucket_factory, rgw_bucket_factory):
        """
        Test restart nodes one after the other and check health status in between

        """
        ocp_nodes = get_node_objs()
        for node in ocp_nodes:
            nodes.restart_nodes(nodes=[node], wait=False)
            self.sanity_helpers.health_check(cluster_check=False, tries=60)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)

    @pytest.mark.parametrize(
        argnames=["interface", "operation"],
        argvalues=[
            pytest.param(*["rbd", "create_resources"],
                         marks=pytest.mark.polarion_id("OCS-1138")),
            pytest.param(*["rbd", "delete_resources"],
                         marks=pytest.mark.polarion_id("OCS-1241")),
            pytest.param(
                *["cephfs", "create_resources"],
                marks=pytest.mark.polarion_id("OCS-1139"),
            ),
            pytest.param(
                *["cephfs", "delete_resources"],
                marks=pytest.mark.polarion_id("OCS-1242"),
            ),
        ],
    )
    def test_pv_provisioning_under_degraded_state_stop_provisioner_pod_node(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        interface,
        operation,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Test PV provisioning under degraded state -
        stop the node that has the provisioner pod running on

        OCS-1138:
        - Stop 1 worker node that has the RBD provisioner
          pod running on
        - Wait for the RBD pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1241:
        - Stop 1 worker node that has the RBD provisioner
          pod running on
        - Wait for the RBD pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1139:
        - Stop 1 worker node that has the CephFS provisioner
          pod running on
        - Wait for the CephFS pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1242:
        - Stop 1 worker node that has the CephFS provisioner
          pod running on
        - Wait for the CephFS pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        """
        if operation == "delete_resources":
            # Create resources whose deletion will be tested later
            self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                                 bucket_factory,
                                                 rgw_bucket_factory)

        provisioner_pods = None
        # Get the provisioner pod according to the interface
        if interface == "rbd":
            provisioner_pods = pod.get_rbdfsplugin_provisioner_pods()
        elif interface == "cephfs":
            provisioner_pods = pod.get_cephfsplugin_provisioner_pods()
        provisioner_pod = provisioner_pods[0]

        # Making sure that the node is not running the rook operator pod:
        provisioner_node = pod.get_pod_node(provisioner_pod)
        rook_operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(rook_operator_pod)
        if operator_node.get().get("metadata").get(
                "name") == provisioner_node.get().get("metadata").get("name"):
            provisioner_pod = provisioner_pods[1]

        provisioner_pod_name = provisioner_pod.name
        logger.info(
            f"{interface} provisioner pod found: {provisioner_pod_name}")

        # Get the node name that has the provisioner pod running on
        provisioner_node = pod.get_pod_node(provisioner_pod)
        provisioner_node_name = provisioner_node.get().get("metadata").get(
            "name")
        logger.info(
            f"{interface} provisioner pod is running on node {provisioner_node_name}"
        )

        # Stopping the nodes
        nodes.stop_nodes(nodes=[provisioner_node])

        # Wait for the provisioner pod to get to running status
        selector = (constants.CSI_RBDPLUGIN_PROVISIONER_LABEL if
                    (interface == "rbd") else
                    constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL)

        # Wait for the provisioner pod to reach Terminating status
        logger.info(
            f"Waiting for pod {provisioner_pod_name} to reach status Terminating"
        )
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=provisioner_pod.name,
            condition=constants.STATUS_TERMINATING,
        ), f"{interface} provisioner pod failed to reach status Terminating"
        logger.info(
            f"Pod {provisioner_pod_name} has reached status Terminating")

        # Wait for the provisioner pod to be started and reach running status
        logger.info(
            f"Waiting for {interface} provisioner pod to reach status Running")
        # After this change https://github.com/rook/rook/pull/3642/, there are
        # 2 provisioners for each interface
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=2,
        ), f"{interface} provisioner pod failed to reach status Running"

        logger.info(f"{interface} provisioner pod has reached status Running")
        if operation == "create_resources":
            # Cluster validation (resources creation and IO running)
            self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                                 bucket_factory,
                                                 rgw_bucket_factory)
        elif operation == "delete_resources":
            # Cluster validation (resources deletion)
            self.sanity_helpers.delete_resources()

        # Starting the nodes
        nodes.start_nodes(nodes=[provisioner_node])

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()

    @pytest.mark.parametrize(
        argnames=["operation"],
        argvalues=[
            pytest.param(*["create_resources"],
                         marks=[pytest.mark.polarion_id("OCS-2016")]),
            pytest.param(*["delete_resources"],
                         marks=[pytest.mark.polarion_id("OCS-2017")]),
        ],
    )
    def test_pv_provisioning_under_degraded_state_stop_rook_operator_pod_node(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        operation,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Test PV provisioning under degraded state -
        stop the node that has the rook operator pod running on

        OCS-2016:
        - Stop 1 worker node that has the rook ceph operator pod running on
        - Wait for the rook ceph operator pod to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-2017:
        - Stop 1 worker node that has the rook ceph operator pod running on
        - Wait for the rook ceph operator pod to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources
        - Start the worker node
        - Check cluster and Ceph health
        """
        if operation == "delete_resources":
            # Create resources whose deletion will be tested later
            self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                                 bucket_factory,
                                                 rgw_bucket_factory)

        rook_operator_pods = pod.get_operator_pods()
        rook_operator_pod = rook_operator_pods[0]

        rook_operator_pod_name = rook_operator_pod.name
        logger.info(f"rook operator pod found: {rook_operator_pod_name}")

        # Get the node name that has the rook operator pod running on
        operator_node = pod.get_pod_node(rook_operator_pod)
        operator_node_name = operator_node.get().get("metadata").get("name")
        logger.info(
            f"{rook_operator_pod_name} pod is running on node {operator_node_name}"
        )

        # Stopping the node
        nodes.stop_nodes(nodes=[operator_node])

        # Wait for the rook operator pod to get to running status
        selector = constants.OPERATOR_LABEL

        # Wait for the rook operator pod to reach Terminating status
        logger.info(
            f"Waiting for pod {rook_operator_pod_name} to reach status Terminating"
        )
        assert rook_operator_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=rook_operator_pod_name,
            condition=constants.STATUS_TERMINATING,
        ), "rook operator pod failed to reach status Terminating"
        logger.info(
            f"Pod {rook_operator_pod_name} has reached status Terminating")

        # Wait for the rook operator pod to be started and reach running status
        logger.info(
            f"Waiting for pod {rook_operator_pod_name} to reach status Running"
        )

        assert rook_operator_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=1,
        ), "rook operator pod failed to reach status Running"
        logger.info("rook operator pod has reached status Running")

        assert (wait_for_ct_pod_recovery()
                ), "Ceph tools pod failed to come up on another node"

        if operation == "create_resources":
            # Cluster validation (resources creation and IO running)

            self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                                 bucket_factory,
                                                 rgw_bucket_factory)
        elif operation == "delete_resources":
            # Cluster validation (resources deletion)
            self.sanity_helpers.delete_resources()

        # Starting the nodes
        nodes.start_nodes(nodes=[operator_node])

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()

    @skipif_no_lso
    @bugzilla("1873938")
    @pytest.mark.polarion_id("OCS-2448")
    def test_pv_after_reboot_node(self, nodes):
        """
        Verify unexpected PV is not created after node reboot on LSO cluster

        """
        pv_before_reset = get_pv_names()
        worker_nodes = get_nodes(node_type=constants.WORKER_MACHINE,
                                 num_of_nodes=3)
        ocp_obj = OCP(kind=constants.PV)
        for worker_node in worker_nodes:
            # Restart one worker node
            nodes.restart_nodes(nodes=[worker_node], wait=True)
            self.sanity_helpers.health_check(cluster_check=False, tries=60)
            logger.info(f"Verify PV after reboot {worker_node}")
            pv_after_reset = get_pv_names()
            pv_diff = set(pv_after_reset) - set(pv_before_reset)
            pv_new = []
            for pv in pv_diff:
                pv_obj = ocp_obj.get(resource_name=pv)
                if pv_obj["spec"]["storageClassName"] == "localblock":
                    pv_new.append(pv)
            assert (
                not pv_new
            ), f"Unexpected PV {pv_new} created after reboot {worker_node}"
        logger.info("SUCCESS - No new PV was created.")
Ejemplo n.º 23
0
class TestPgSQLNodeReboot(E2ETest):
    """
    Test running PGSQL workload with node drain
    """
    @pytest.fixture()
    def pgsql_setup(self, pgsql):
        """
        PGSQL test setup
        """
        # Deployment of postgres database
        pgsql.setup_postgresql(replicas=3)

        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

    @pytest.mark.usefixtures(pgsql_setup.__name__)
    def test_run_pgsql_node_drain(self,
                                  pgsql,
                                  transactions=5600,
                                  node_type="worker"):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=3,
                                       transactions=transactions,
                                       clients=3)

        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)

        # Select a node where pgbench is not running for drain
        typed_nodes = [
            node1.name for node1 in node.get_nodes(node_type=node_type)
        ]
        filter_list = pgsql.filter_pgbench_nodes_from_nodeslist(typed_nodes)
        typed_node_name = filter_list[random.randint(0, len(filter_list) - 1)]
        log.info(f"Selected node {typed_node_name} for node drain operation")

        # Node maintenance - to gracefully terminate all pods on the node
        node.drain_nodes([typed_node_name])

        # Make the node schedulable again
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=40)

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached to completed state after {diff_time.seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)
class TestNonOCSTaintAndTolerations(E2ETest):
    """
    Test non-OCS taints on OCS nodes
    and tolerations
    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Make sure all nodes are untainted

        """
        def finalizer():
            assert untaint_nodes(
                taint_label="xyz=true:NoSchedule", ), "Failed to untaint"

        request.addfinalizer(finalizer)

    def test_non_ocs_taint_and_tolerations(self):
        """
        Test runs the following steps
        1. Taint ocs nodes with non-ocs taint
        2. Set tolerations on storagecluster, subscription, configmap and ocsinit
        3. Respin all ocs pods and check if it runs on ocs nodes with tolerations
        4. Add Capacity

        """

        # Taint all nodes with non-ocs taint
        ocs_nodes = get_worker_nodes()
        taint_nodes(nodes=ocs_nodes, taint_label="xyz=true:NoSchedule")

        # Add tolerations to the storagecluster
        storagecluster_obj = ocp.OCP(
            resource_name=constants.DEFAULT_CLUSTERNAME,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            kind=constants.STORAGECLUSTER,
        )
        tolerations = (
            '{"tolerations": [{"effect": "NoSchedule", "key": "xyz",'
            '"operator": "Equal", "value": "true"}, '
            '{"effect": "NoSchedule", "key": "node.ocs.openshift.io/storage", '
            '"operator": "Equal", "value": "true"}]}')
        param = (
            f'{{"spec": {{"placement": {{"all": {tolerations}, "mds": {tolerations}, '
            f'"noobaa-core": {tolerations}, "rgw": {tolerations}}}}}}}')
        storagecluster_obj.patch(params=param, format_type="merge")
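        # Rough sketch (assumption) of the placement section the merge patch
        # above should produce on the StorageCluster, e.g. for "all":
        #   placement:
        #     all:
        #       tolerations:
        #       - effect: NoSchedule
        #         key: xyz
        #         operator: Equal
        #         value: "true"
        #       - effect: NoSchedule
        #         key: node.ocs.openshift.io/storage
        #         operator: Equal
        #         value: "true"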

        # Add tolerations to the subscription
        sub_list = ocp.get_all_resource_names_of_a_kind(
            kind=constants.SUBSCRIPTION)
        param = (
            '{"spec": {"config":  {"tolerations": '
            '[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
            '"value": "true"}]}}}')
        for sub in sub_list:
            sub_obj = ocp.OCP(
                resource_name=sub,
                namespace=defaults.ROOK_CLUSTER_NAMESPACE,
                kind=constants.SUBSCRIPTION,
            )
            sub_obj.patch(params=param, format_type="merge")

        # Add tolerations to the ocsinitializations.ocs.openshift.io
        param = (
            '{"spec":  {"tolerations": '
            '[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
            '"value": "true"}]}}')

        ocsini_obj = ocp.OCP(
            resource_name=constants.OCSINIT,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            kind=constants.OCSINITIALIZATION,
        )
        ocsini_obj.patch(params=param, format_type="merge")

        # Add tolerations to the configmap rook-ceph-operator-config
        configmap_obj = ocp.OCP(
            kind=constants.CONFIGMAP,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            resource_name=constants.ROOK_OPERATOR_CONFIGMAP,
        )
        toleration = configmap_obj.get().get("data").get(
            "CSI_PLUGIN_TOLERATIONS")
        toleration += (
            '\n- key: xyz\n  operator: Equal\n  value: "true"\n  effect: NoSchedule'
        )
        toleration = toleration.replace('"', '\\"').replace("\n", "\\n")
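        # The toleration snippet is embedded as a string value inside a JSON
        # patch document, so the literal double quotes and newlines have to be
        # escaped (\" and \n) for the patch itself to remain valid JSON.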
        param_cmd = (
            f'[{{"op": "replace", "path": "/data/CSI_PLUGIN_TOLERATIONS", "value": "{toleration}" }}, '
            f'{{"op": "replace", "path": "/data/CSI_PROVISIONER_TOLERATIONS", "value": "{toleration}" }}]'
        )
        configmap_obj.patch(params=param_cmd, format_type="json")

        # A few pod respins are expected after the edit
        assert wait_for_pods_to_be_running(timeout=600, sleep=15)

        # Respin all pods and check if they are still running
        pod_list = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        for pod in pod_list:
            pod.delete(wait=False)

        assert wait_for_pods_to_be_running(timeout=600, sleep=15)
        self.sanity_helpers.health_check()

        # Add capacity to check if the new OSDs have the toleration
        osd_size = storage_cluster.get_osd_size()
        count = storage_cluster.add_capacity(osd_size)
        pod = ocp.OCP(kind=constants.POD,
                      namespace=config.ENV_DATA["cluster_namespace"])
        if is_flexible_scaling_enabled():
            replica_count = 1
        else:
            replica_count = 3
        assert pod.wait_for_resource(
            timeout=300,
            condition=constants.STATUS_RUNNING,
            selector=constants.OSD_APP_LABEL,
            resource_count=count * replica_count,
        ), "New OSDs failed to reach running state"
        check_ceph_health_after_add_capacity(ceph_rebalance_timeout=2500)
Example No. 25
class TestDiskFailures(ManageTest):
    """
    Test class for detach and attach worker volume

    """
    def detach_volume_and_wait_for_attach(self, nodes, data_volume,
                                          worker_node):
        """
        Detach an EBS volume from an AWS instance and wait for the volume
        to be re-attached

        Args:
            nodes: The platform nodes object used for volume operations
            data_volume (Volume): The EC2 volume to detach
            worker_node (OCS): The OCS node object the volume is attached to

        """
        try:
            # Detach volume (logging is done inside the function)
            nodes.detach_volume(data_volume, worker_node)
        except AWSTimeoutException as e:
            if "Volume state: in-use" in e:
                logger.info(
                    f"Volume {data_volume} re-attached successfully to worker"
                    f" node {worker_node}")
            else:
                raise
        else:
            """
            Wait for worker volume to be re-attached automatically
            to the node
            """
            assert nodes.wait_for_volume_attach(data_volume), (
                f"Volume {data_volume} failed to be re-attached to worker "
                f"node {worker_node}")

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in NotReady status, to handle cases where the
        test failed before restarting a node after detaching its volume,
        leaving the node in NotReady status

        """
        def finalizer():
            not_ready_nodes = [
                n for n in node.get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_NOT_READY
            ]
            if not_ready_nodes:
                logger.warning(
                    f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

            # Restart the node if an OSD pod is stuck in CLBO state
            osd_pods_obj_list = get_osd_pods()
            for osd_pod in osd_pods_obj_list:
                container_state = (
                    osd_pod.get().get("status").get("containerStatuses")[0].get("state")
                )
                if (
                    container_state.get("waiting", {}).get("reason")
                    == constants.STATUS_CLBO
                ):
                    node_obj = get_pod_node(osd_pod)
                    nodes.restart_nodes([node_obj])
                    node.wait_for_nodes_status([node_obj.name])

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @cloud_platform_required
    @pytest.mark.polarion_id("OCS-1085")
    @bugzilla("1825675")
    def test_detach_attach_worker_volume(self, nodes, pvc_factory,
                                         pod_factory):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Wait for the volumes to be re-attached back to the worker node
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be
          unhealthy) by creating resources and running IO
        - Restart the node so the volume will get re-mounted

        """
        # Get a data volume
        data_volume = nodes.get_data_volumes()[0]
        # Get the worker node according to the volume attachment
        worker = nodes.get_node_by_attached_volume(data_volume)

        # Detach volume and wait for the volume to attach
        self.detach_volume_and_wait_for_attach(nodes, data_volume, worker)

        # Validate the cluster is still functional.
        # If the node whose volume was detached was the one running the ceph
        # tools pod, we need to wait for a new tools pod to start on another
        # node. wait_for_ct_pod_recovery() connects to the tools pod to verify
        # it is alive.
        assert (wait_for_ct_pod_recovery()
                ), "Ceph tools pod failed to come up on another node"

        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Restart the instance so the volume will get re-mounted
        nodes.restart_nodes([worker])

        # Cluster health check
        # W/A: For the investigation of BZ 1825675, timeout is increased to see if cluster
        # becomes healthy eventually
        # TODO: Remove 'tries=100'
        self.sanity_helpers.health_check(tries=100)

    @cloud_platform_required
    @pytest.mark.polarion_id("OCS-1086")
    def test_detach_attach_2_data_volumes(self, nodes, pvc_factory,
                                          pod_factory):
        """
        Detach and attach disk from 2 worker nodes

        - Detach 2 of the data volumes from their worker nodes
        - Wait for the volumes to be re-attached back to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Get 2 data volumes
        data_volumes = nodes.get_data_volumes()[:2]
        workers_and_volumes = [{
            "worker": nodes.get_node_by_attached_volume(vol),
            "volume": vol
        } for vol in data_volumes]
        for worker_and_volume in workers_and_volumes:
            # Detach volume and wait for the volume to attach
            self.detach_volume_and_wait_for_attach(nodes,
                                                   worker_and_volume["volume"],
                                                   worker_and_volume["worker"])
        # Restart the instances so the volume will get re-mounted
        nodes.restart_nodes([
            worker_and_volume["worker"]
            for worker_and_volume in workers_and_volumes
        ])

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @bugzilla("1830702")
    @vsphere_platform_required
    @pytest.mark.polarion_id("OCS-2172")
    def test_recovery_from_volume_deletion(self, nodes, pvc_factory,
                                           pod_factory):
        """
        Test cluster recovery from disk deletion from the platform side.
        Based on documented procedure detailed in
        https://bugzilla.redhat.com/show_bug.cgi?id=1823183

        """
        logger.info("Picking a PV which to be deleted from the platform side")
        osd_pvs = get_deviceset_pvs()
        osd_pv = random.choice(osd_pvs)
        osd_pv_name = osd_pv.name
        # get the claim name
        logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
        claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

        # Get the backing volume name
        logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
        backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

        # Get the corresponding PVC
        logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
        osd_pvcs = get_deviceset_pvcs()
        osd_pvcs_count = len(osd_pvcs)
        osd_pvc = [
            ds for ds in osd_pvcs
            if ds.get().get("metadata").get("name") == claim_name
        ][0]

        # Get the corresponding OSD pod and ID
        logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
        osd_pods = get_osd_pods()
        osd_pods_count = len(osd_pods)
        osd_pod = [
            osd_pod for osd_pod in osd_pods
            if osd_pod.get().get("metadata").get("labels").get(
                constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        logger.info(f"OSD_POD {osd_pod.name}")
        osd_id = osd_pod.get().get("metadata").get("labels").get("ceph-osd-id")

        # Get the node that has the OSD pod running on
        logger.info(
            f"Getting the node that has the OSD pod {osd_pod.name} running on")
        osd_node = get_pod_node(osd_pod)
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            pod for pod in osd_prepare_pods if pod.get().get("metadata").get(
                "labels").get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        osd_prepare_job_name = (osd_prepare_pod.get().get("metadata").get(
            "labels").get("job-name"))
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

        # Get the corresponding OSD deployment
        logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
        osd_deployment = [
            osd_pod for osd_pod in get_osd_deployments()
            if osd_pod.get().get("metadata").get("labels").get(
                constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        osd_deployment_name = osd_deployment.name

        # Delete the volume from the platform side
        logger.info(f"Deleting {backing_volume} from the platform side")
        nodes.detach_volume(backing_volume, osd_node)

        # Scale down OSD deployment
        logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
        ocp.OCP().exec_oc_cmd(
            f"scale --replicas=0 deployment/{osd_deployment_name}")

        # Force delete OSD pod if necessary
        osd_pod_name = osd_pod.name
        logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
        try:
            osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
        except TimeoutError:
            osd_pod.delete(force=True)
            osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

        # Run ocs-osd-removal job
        ocp_version = float(get_ocp_version())
        if ocp_version >= 4.6:
            cmd = f"process ocs-osd-removal -p FAILED_OSD_IDS={osd_id} -o yaml"
        else:
            cmd = f"process ocs-osd-removal -p FAILED_OSD_ID={osd_id} -o yaml"

        logger.info(f"Executing OSD removal job on OSD-{osd_id}")
        ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
        osd_removal_job_yaml = ocp_obj.exec_oc_cmd(cmd)
        osd_removal_job = OCS(**osd_removal_job_yaml)
        osd_removal_job.create(do_reload=False)

        # Get ocs-osd-removal pod name
        logger.info("Getting the ocs-osd-removal pod name")
        osd_removal_pod_name = get_osd_removal_pod_name(osd_id)
        osd_removal_pod_obj = get_pod_obj(osd_removal_pod_name,
                                          namespace="openshift-storage")
        osd_removal_pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_COMPLETED,
            resource_name=osd_removal_pod_name)

        # Verify OSD removal from the ocs-osd-removal pod logs
        logger.info(
            f"Verifying removal of OSD from {osd_removal_pod_name} pod logs")
        logs = get_pod_logs(osd_removal_pod_name)
        pattern = f"purged osd.{osd_id}"
        assert re.search(pattern, logs)

        osd_pvc_name = osd_pvc.name

        if ocp_version < 4.6:
            # Delete the OSD prepare job
            logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
            osd_prepare_job.delete()
            osd_prepare_job.ocp.wait_for_delete(
                resource_name=osd_prepare_job_name, timeout=120)

            # Delete the OSD PVC
            logger.info(f"Deleting OSD PVC {osd_pvc_name}")
            osd_pvc.delete()
            osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

            # Delete the OSD deployment
            logger.info(f"Deleting OSD deployment {osd_deployment_name}")
            osd_deployment.delete()
            osd_deployment.ocp.wait_for_delete(
                resource_name=osd_deployment_name, timeout=120)
        else:
            # On OCP 4.6 and above, the OSD removal job should delete the
            # OSD prepare job, the OSD PVC and the OSD deployment
            logger.info(
                f"Verifying deletion of OSD prepare job {osd_prepare_job_name}"
            )
            osd_prepare_job.ocp.wait_for_delete(
                resource_name=osd_prepare_job_name, timeout=30)
            logger.info(f"Verifying deletion of OSD PVC {osd_pvc_name}")
            osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name, timeout=30)
            logger.info(
                f"Verifying deletion of OSD deployment {osd_deployment_name}")
            osd_deployment.ocp.wait_for_delete(
                resource_name=osd_deployment_name, timeout=30)

        # Delete PV
        logger.info(f"Verifying deletion of PV {osd_pv_name}")
        try:
            osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
        except TimeoutError:
            osd_pv.delete()
            osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

        if ocp_version < 4.6:
            # Delete the rook ceph operator pod to trigger reconciliation
            rook_operator_pod = get_operator_pods()[0]
            logger.info(
                f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
            rook_operator_pod.delete()

        # Delete the OSD removal job
        logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
        osd_removal_job = get_job_obj(f"ocs-osd-removal-{osd_id}")
        osd_removal_job.delete()
        osd_removal_job.ocp.wait_for_delete(
            resource_name=f"ocs-osd-removal-{osd_id}")

        timeout = 600
        # Wait for OSD PVC to get created and reach Bound state
        logger.info(
            "Waiting for a new OSD PVC to get created and reach Bound state")
        assert osd_pvc.ocp.wait_for_resource(
            timeout=timeout,
            condition=constants.STATUS_BOUND,
            selector=constants.OSD_PVC_GENERIC_LABEL,
            resource_count=osd_pvcs_count,
        ), (f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
            f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
            )
        # Wait for OSD pod to get created and reach Running state
        logger.info(
            "Waiting for a new OSD pod to get created and reach Running state")
        assert osd_pod.ocp.wait_for_resource(
            timeout=timeout,
            condition=constants.STATUS_RUNNING,
            selector=constants.OSD_APP_LABEL,
            resource_count=osd_pods_count,
        ), (f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
            f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
            )

        # We need to silence the old osd crash warning due to BZ https://bugzilla.redhat.com/show_bug.cgi?id=1896810
        # This is a workaround - issue for tracking: https://github.com/red-hat-storage/ocs-ci/issues/3438
        if ocp_version >= 4.6:
            silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
                osd_pod_name)
            if not silence_osd_crash:
                logger.info("Didn't find ceph osd crash warning")

        # Validate cluster is still functional
        self.sanity_helpers.health_check(tries=100)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
Example No. 26
class TestRGWAndNoobaaDBHostNodeFailure(ManageTest):
    """
    Test to verify the impact of failing the node hosting
    the RGW pods and the NooBaa DB pod

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def create_obc_creation(self, bucket_factory, mcg_obj, key):
        """"""
        # Create a bucket then read & write
        bucket_name = bucket_factory(amount=1, interface="OC")[0].name
        obj_data = "A random string data"
        assert s3_put_object(mcg_obj, bucket_name, key,
                             obj_data), f"Failed: Put object, {key}"
        assert s3_get_object(mcg_obj, bucket_name,
                             key), f"Failed: Get object, {key}"

    def test_rgw_host_node_failure(self, nodes, node_restart_teardown, mcg_obj,
                                   bucket_factory):
        """
        Test case to fail the node where the RGW and NooBaa DB pods are hosted
        and verify the new pods spin up on a healthy node

        """
        # Get rgw pods
        rgw_pod_obj = get_rgw_pods()

        # Get noobaa pods
        noobaa_pod_obj = get_noobaa_pods()

        # Get the node hosting the noobaa-db pod
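        # The two name constants are assumed to cover the DB pod name before
        # and after NooBaa moved its DB to PostgreSQL (e.g. "noobaa-db-0" vs
        # "noobaa-db-pg-0"); whichever one exists identifies the DB pod.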
        noobaa_pod_node = None
        for noobaa_pod in noobaa_pod_obj:
            if noobaa_pod.name in [
                    constants.NB_DB_NAME_46_AND_BELOW,
                    constants.NB_DB_NAME_47_AND_ABOVE,
            ]:
                noobaa_pod_node = get_pod_node(noobaa_pod)
        assert noobaa_pod_node is not None, "Could not find the NooBaa DB pod"

        for rgw_pod in rgw_pod_obj:
            pod_node = rgw_pod.get().get("spec").get("nodeName")
            if pod_node == noobaa_pod_node.name:
                # Stop the node
                log.info(f"Stopping node {pod_node} where"
                         f" rgw pod {rgw_pod.name} and NooBaa DB are hosted")
                node_obj = get_node_objs(node_names=[pod_node])
                nodes.stop_nodes(node_obj)

                # Validate the old rgw pod went into Terminating state
                wait_for_resource_state(resource=rgw_pod,
                                        state=constants.STATUS_TERMINATING,
                                        timeout=720)

                # Validate new rgw pod spun
                ocp_obj = OCP(kind=constants.POD,
                              namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                ocp_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    resource_count=len(rgw_pod_obj),
                    selector=constants.RGW_APP_LABEL,
                )

                # Create an OBC, then read and write an object
                self.create_obc_creation(bucket_factory, mcg_obj,
                                         "Object-key-1")

                # Start the node
                nodes.start_nodes(node_obj)

                # Create an OBC, then read and write an object
                self.create_obc_creation(bucket_factory, mcg_obj,
                                         "Object-key-2")

        # Verify cluster health
        self.sanity_helpers.health_check()

        # Verify all storage pods are running
        wait_for_storage_pods()
Example No. 27
class TestMonitorRecovery(E2ETest):
    """
    Test to verify monitor recovery

    """

    @pytest.fixture(autouse=True)
    def mon_recovery_setup(
        self,
        dc_pod_factory,
        mcg_obj,
        bucket_factory,
    ):
        """
        Creates project, pvcs, dc-pods and obcs

        """
        self.filename = "sample_file.txt"
        self.object_key = "obj-key"
        self.object_data = "string data"
        self.dd_cmd = f"dd if=/dev/urandom of=/mnt/{self.filename} bs=5M count=1"
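        # The dd command writes 5 MiB of random data into each dc pod's mounted
        # volume; its md5sum is recorded below and compared again after the
        # monitor recovery to detect data corruption.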

        self.sanity_helpers = Sanity()
        # Create project, pvc, dc pods
        self.dc_pods = []
        self.dc_pods.append(
            dc_pod_factory(
                interface=constants.CEPHBLOCKPOOL,
            )
        )
        self.dc_pods.append(
            dc_pod_factory(
                interface=constants.CEPHFILESYSTEM,
                access_mode=constants.ACCESS_MODE_RWX,
            )
        )
        self.md5sum = []
        for pod_obj in self.dc_pods:
            pod_obj.exec_cmd_on_pod(command=self.dd_cmd)
            # Calculate md5sum
            self.md5sum.append(pod.cal_md5sum(pod_obj, self.filename))
        logger.info(f"Md5sum calculated before recovery: {self.md5sum}")

        self.bucket_name = bucket_factory(interface="OC")[0].name
        logger.info(f"Putting object on: {self.bucket_name}")
        assert bucket_utils.s3_put_object(
            s3_obj=mcg_obj,
            bucketname=self.bucket_name,
            object_key=self.object_key,
            data=self.object_data,
        ), "Failed: PutObject"

    def test_monitor_recovery(
        self,
        dc_pod_factory,
        mcg_obj,
        bucket_factory,
    ):
        """
        Verifies Monitor recovery procedure as per:
        https://access.redhat.com/documentation/en-us/red_hat_openshift_container_storage/4.8/html/troubleshooting_openshift_container_storage/restoring-the-monitor-pods-in-openshift-container-storage_rhocs

        """
        # Initialize mon recovery class
        mon_recovery = MonitorRecovery()

        logger.info("Corrupting ceph monitors by deleting store.db")
        corrupt_ceph_monitors()

        logger.info("Backing up all the deployments")
        mon_recovery.backup_deployments()
        dep_revert, mds_revert = mon_recovery.deployments_to_revert()

        logger.info("Starting the monitor recovery procedure")
        logger.info("Scaling down rook and ocs operators")
        mon_recovery.scale_rook_ocs_operators(replica=0)

        logger.info(
            "Preparing script and patching OSDs to remove LivenessProbe and sleep to infinity"
        )
        mon_recovery.prepare_monstore_script()
        mon_recovery.patch_sleep_on_osds()
        switch_to_project(constants.OPENSHIFT_STORAGE_NAMESPACE)

        logger.info("Getting mon-store from OSDs")
        mon_recovery.run_mon_store()

        logger.info("Patching MONs to sleep infinitely")
        mon_recovery.patch_sleep_on_mon()

        logger.info("Updating initial delay on all monitors")
        update_mon_initial_delay()

        logger.info("Generating monitor map command using the IPs")
        mon_map_cmd = generate_monmap_cmd()
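        # Sketch (assumption) of what the generated command is expected to
        # resemble, with one --addv entry per monitor IP:
        #   monmaptool --create --addv a [v2:<ip>:3300,v1:<ip>:6789] ... \
        #     --enable-all-features --clobber /tmp/monmap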

        logger.info("Getting ceph keyring from ocs secrets")
        mon_recovery.get_ceph_keyrings()

        logger.info("Rebuilding Monitors to recover store db")
        mon_recovery.monitor_rebuild(mon_map_cmd)

        logger.info("Reverting mon, osd and mgr deployments")
        mon_recovery.revert_patches(dep_revert)

        logger.info("Scaling back rook and ocs operators")
        mon_recovery.scale_rook_ocs_operators(replica=1)

        logger.info("Recovering CephFS")
        mon_recovery.scale_rook_ocs_operators(replica=0)
        logger.info(
            "Patching MDSs to remove LivenessProbe and setting sleep to infinity"
        )
        mon_recovery.patch_sleep_on_mds()
        logger.info("Resetting the fs")
        ceph_fs_recovery()
        logger.info("Reverting MDS deployments")
        mon_recovery.revert_patches(mds_revert)
        logger.info("Scaling back rook and ocs operators")
        mon_recovery.scale_rook_ocs_operators(replica=1)
        logger.info("Recovering mcg by re-spinning the pods")
        recover_mcg()
        remove_global_id_reclaim()
        for pod_obj in self.dc_pods:
            pod_obj.delete(force=True)
        new_md5_sum = []
        logger.info("Verifying md5sum of files after recovery")
        for pod_obj in get_spun_dc_pods(self.dc_pods):
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=600,
                sleep=10,
            )
            new_md5_sum.append(pod.cal_md5sum(pod_obj, self.filename))
        logger.info(f"Md5sum calculated after recovery: {new_md5_sum}")
        if collections.Counter(new_md5_sum) == collections.Counter(self.md5sum):
            logger.info(
                f"Verified: md5sum of {self.filename} on pods matches with the original md5sum"
            )
        else:
            assert False, f"Data corruption found {new_md5_sum} and {self.md5sum}"
        logger.info("Getting object after recovery")
        assert bucket_utils.s3_get_object(
            s3_obj=mcg_obj,
            bucketname=self.bucket_name,
            object_key=self.object_key,
        ), "Failed: GetObject"

        # New pvc, dc pods, obcs
        new_dc_pods = [
            dc_pod_factory(
                interface=constants.CEPHBLOCKPOOL,
            ),
            dc_pod_factory(
                interface=constants.CEPHFILESYSTEM,
            ),
        ]
        for pod_obj in new_dc_pods:
            pod_obj.exec_cmd_on_pod(command=self.dd_cmd)
        logger.info("Creating new bucket and write object")
        new_bucket = bucket_factory(interface="OC")[0].name
        assert bucket_utils.s3_put_object(
            s3_obj=mcg_obj,
            bucketname=new_bucket,
            object_key=self.object_key,
            data=self.object_data,
        ), "Failed: PutObject"
        wait_for_storage_pods()
        logger.info("Archiving the ceph crash warnings")
        tool_pod = get_ceph_tools_pod()
        tool_pod.exec_ceph_cmd(ceph_cmd="ceph crash archive-all", format=None)
        self.sanity_helpers.health_check(tries=10)
Example No. 28
class TestNodesMaintenance(ManageTest):
    """
    Test basic flows of maintenance (unschedule and drain) and
    activate operations, followed by cluster functionality and health checks

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        if storagecluster_independent_check():
            self.sanity_helpers = SanityExternalCluster()
        else:
            self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def health_checker(self):
        """
        Check Ceph health

        """
        try:
            status = ceph_health_check_base()
            if status:
                log.info("Health check passed")
        except CephHealthException as e:
            # skip because ceph is not in good health
            pytest.skip(str(e))

    @tier1
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*["worker"],
                         marks=pytest.mark.polarion_id("OCS-1269")),
            pytest.param(*["master"],
                         marks=pytest.mark.polarion_id("OCS-1272")),
        ],
    )
    def test_node_maintenance(
        self,
        reduce_and_resume_cluster_load,
        node_type,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain)
        drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=90)

    @tier4
    @tier4b
    @skipif_bm
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*["worker"],
                         marks=pytest.mark.polarion_id("OCS-1292")),
            pytest.param(*["master"],
                         marks=pytest.mark.polarion_id("OCS-1293")),
        ],
    )
    def test_node_maintenance_restart_activate(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        node_type,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        reboot_events_cmd = (
            f"get events -A --field-selector involvedObject.name="
            f"{typed_node_name},reason=Rebooted -o yaml")

        # Find the number of reboot events in 'typed_node_name'
        num_events = len(
            typed_nodes[0].ocp.exec_oc_cmd(reboot_events_cmd)["items"])
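        # The event count is taken before the drain/restart; if the node comes
        # back too quickly for the NotReady state to be observed, a higher
        # "Rebooted" event count afterwards is used as proof that the node
        # really did restart.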

        # Maintenance the node (unschedule and drain). The function contains logging
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=False)

        try:
            wait_for_nodes_status(
                node_names=[typed_node_name],
                status=constants.NODE_NOT_READY_SCHEDULING_DISABLED,
            )
        except ResourceWrongStatusException:
            # Sometimes, the node will be back to running state quickly so
            # that the status change won't be detected. Verify the node was
            # actually restarted by checking the reboot events count
            new_num_events = len(
                typed_nodes[0].ocp.exec_oc_cmd(reboot_events_cmd)["items"])
            assert new_num_events > num_events, (
                "Reboot event not found. "
                f"Node {typed_node_name} did not restart.")

        wait_for_nodes_status(
            node_names=[typed_node_name],
            status=constants.NODE_READY_SCHEDULING_DISABLED,
        )

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

    @tier3
    @pytest.mark.parametrize(
        argnames=["nodes_type"],
        argvalues=[
            pytest.param(*["worker"],
                         marks=pytest.mark.polarion_id("OCS-1273")),
            pytest.param(*["master"],
                         marks=pytest.mark.polarion_id("OCS-1271")),
        ],
    )
    def test_2_nodes_maintenance_same_type(self, nodes_type):
        """
        OCS-1273/OCS-1271:
        - Try draining 2 nodes from the same type - should fail
        - Check cluster and Ceph health

        """
        # Get 2 nodes
        typed_nodes = get_nodes(node_type=nodes_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {nodes_type} node for the test"

        typed_node_names = [typed_node.name for typed_node in typed_nodes]

        # Try draining 2 nodes - should fail
        try:
            drain_nodes(typed_node_names)
        except TimeoutExpired:
            log.info(
                f"Draining of nodes {typed_node_names} failed as expected")

        schedule_nodes(typed_node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier2
    @pytest.mark.polarion_id("OCS-1274")
    def test_2_nodes_different_types(self, pvc_factory, pod_factory,
                                     bucket_factory, rgw_bucket_factory):
        """
        OCS-1274:
        - Maintenance (mark as unschedulable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node from each type
        nodes = [
            get_nodes(node_type=node_type, num_of_nodes=1)[0]
            for node_type in ["worker", "master"]
        ]
        assert nodes, "Failed to find nodes for the test"

        node_names = [typed_node.name for typed_node in nodes]

        # Maintenance the nodes (unschedule and drain)
        drain_nodes(node_names)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

        # Mark the nodes back to schedulable
        schedule_nodes(node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier4
    @tier4b
    @aws_platform_required
    @ipi_deployment_required
    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(*["rbd"], marks=pytest.mark.polarion_id("OCS-2128")),
            pytest.param(*["cephfs"],
                         marks=pytest.mark.polarion_id("OCS-2129")),
        ],
    )
    def test_simultaneous_drain_of_two_ocs_nodes(
        self,
        pvc_factory,
        pod_factory,
        dc_pod_factory,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        OCS-2128/OCS-2129:
        - Create PVCs and start IO on DC based app pods
        - Add one extra node in two of the AZs and label the nodes
          with OCS storage label
        - Maintenance (mark as unschedulable and drain) 2 worker nodes
          simultaneously
        - Confirm that OCS and DC pods are in running state
        - Remove unscheduled nodes
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Check cluster and Ceph health

        """
        # Get OSD running nodes
        osd_running_worker_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_worker_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_worker_nodes,
                          label_key="dc",
                          label_value="fedora")
        log.info("Successfully labeled worker nodes with {dc:fedora}")

        # Create DC app pods
        log.info("Creating DC based app pods and starting IO in background")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == "rbd" else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={"dc": "fedora"})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get the machine name using the node name
        machine_names = [
            machine.get_machine_from_node_name(osd_running_worker_node)
            for osd_running_worker_node in osd_running_worker_nodes[:2]
        ]
        log.info(f"{osd_running_worker_nodes} associated "
                 f"machine are {machine_names}")

        # Get the machineset name using machine name
        machineset_names = [
            machine.get_machineset_from_machine_name(machine_name)
            for machine_name in machine_names
        ]
        log.info(f"{osd_running_worker_nodes} associated machineset "
                 f"is {machineset_names}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_names[0])
        add_new_node_and_label_it(machineset_names[1])

        # Drain 2 nodes
        drain_nodes(osd_running_worker_nodes[:2])

        # Check the pods should be in running state
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
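            # Skip deployer ("-1-deploy") pods and OSD prepare ("ocs-deviceset")
            # pods, which are expected to be Completed rather than Running.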
            if ("-1-deploy" or "ocs-deviceset") not in pod_obj.name:
                try:
                    helpers.wait_for_resource_state(
                        resource=pod_obj,
                        state=constants.STATUS_RUNNING,
                        timeout=200)
                except ResourceWrongStatusException:
                    # The 'rook-ceph-crashcollector' pod on the drained node gets
                    # stuck in Pending state. BZ 1810014 tracks it.
                    # As a workaround, the 'rook-ceph-crashcollector' pod health
                    # check is ignored and its deployment is deleted so that the
                    # pod disappears. This WA will be reverted once the BZ is fixed.
                    if "rook-ceph-crashcollector" in pod_obj.name:
                        ocp_obj = ocp.OCP(
                            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                        pod_name = pod_obj.name
                        deployment_name = "-".join(pod_name.split("-")[:-2])
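                        # Crashcollector pod names end with a replicaset hash and
                        # a pod suffix, so dropping the last two "-" separated
                        # tokens yields the owning deployment's name.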
                        command = f"delete deployment {deployment_name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")

        # DC app pods on the drained node will get automatically created on other
        # running node in same AZ. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                        timeout=1200)
        log.info("All the dc pods reached running state")

        # Remove the unscheduled (drained) nodes
        # When the drain is attempted on a setup with more than 3 workers,
        # the unscheduled nodes are removed after the drain completes so
        # that 3 worker nodes are maintained.
        log.info(f"Removing the drained nodes {osd_running_worker_nodes[:2]}")
        remove_node_objs = get_node_objs(osd_running_worker_nodes[:2])
        remove_nodes(remove_node_objs)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @bugzilla("1861104")
    @pytest.mark.polarion_id("OCS-2524")
    @tier4b
    def test_pdb_check_simultaneous_node_drains(
        self,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
        node_drain_teardown,
    ):
        """
        - Check for OSD PDBs before drain
        - Maintenance (mark as unschedulable and drain) 2 worker nodes with a delay of 30 secs
        - Drain will be completed on worker node A
        - Drain will be pending on worker node B due to blocking PDBs
        - Check the OSD PDBs
        - Mark the node A as schedulable
        - Let drain finish on Node B
        - Mark the node B as schedulable
        - Check cluster and Ceph health

        """

        # Validate OSD PDBs before drain operation
        assert (not validate_existence_of_blocking_pdb()
                ), "Blocking PDBs exist, Can't perform drain"

        # Get 2 worker nodes to drain
        typed_nodes = get_nodes(num_of_nodes=2)
        assert len(
            typed_nodes) == 2, "Failed to find worker nodes for the test"
        node_A = typed_nodes[0].name
        node_B = typed_nodes[1].name

        # Drain Node A and validate blocking PDBs
        drain_nodes([node_A])
        assert (validate_existence_of_blocking_pdb()
                ), "Blocking PDBs not created post drain"

        # Inducing delay between 2 drains
        # Node-B drain expected to be in pending due to blocking PDBs
        time.sleep(30)
        try:
            drain_nodes([node_B])
        except TimeoutExpired:
            # Mark the node-A back to schedulable and let drain finish in Node-B
            schedule_nodes([node_A])

        time.sleep(40)

        # Validate OSD PDBs
        assert (validate_existence_of_blocking_pdb()
                ), "Blocking PDBs not created post second drain"

        # Mark the node-B back to schedulable and recover the cluster
        schedule_nodes([node_B])

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=validate_existence_of_blocking_pdb,
        )
        if not sample.wait_for_func_status(result=False):
            log.error("Blocking PDBs still exist")

        # wait for storage pods
        pod.wait_for_storage_pods()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=50)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()
Example No. 29
class TestRegistryShutdownAndRecoveryNode(E2ETest):
    """
    Test node shutdown and recovery and
    its impact on the registry
    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def setup(self, project_factory, node_restart_teardown):
        """
        Setup and clean up the namespace
        """

        self.project_name = "test"
        project_factory(project_name=self.project_name)

    @pytest.mark.polarion_id("OCS-1800")
    @skipif_ibm_cloud
    def test_registry_shutdown_and_recovery_node(self, nodes):
        """
        Test the registry workload when backed by OCS and
        its impact when a node is shut down and recovered

        """

        # Pull and push images to registries
        log.info("Pull and push images to registries")
        image_pull_and_push(
            project_name=self.project_name,
            template="eap-cd-basic-s2i",
            image="registry.redhat.io/jboss-eap-7-tech-preview/eap-cd-openshift-rhel8:latest",
            pattern="eap-app",
        )

        # Get the node list
        node_list = get_nodes(node_type="worker")

        for node in node_list:

            # Stop node
            nodes.stop_nodes(nodes=[node])

            # Validate node reached NotReady state
            wait_for_nodes_status(node_names=[node.name],
                                  status=constants.NODE_NOT_READY)

            # Start node
            nodes.start_nodes(nodes=[node])

            # Validate all nodes are in READY state and up
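            # retry(...) builds a decorator: wrapping wait_for_nodes_status and
            # calling it immediately re-runs the status check up to 60 times,
            # 15 seconds apart, until all nodes report Ready or the retries
            # are exhausted.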
            retry(
                (
                    CommandFailed,
                    TimeoutError,
                    AssertionError,
                    ResourceWrongStatusException,
                ),
                tries=60,
                delay=15,
            )(wait_for_nodes_status)(timeout=900)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check(tries=40)

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate image exists in registries path
        validate_image_exists(namespace=self.project_name)
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes
    """

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(*["rbd"], marks=pytest.mark.polarion_id("OCS-2100")),
            pytest.param(*["cephfs"], marks=pytest.mark.polarion_id("OCS-2101")),
        ],
    )
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self, interface, pvc_factory, pod_factory, dc_pod_factory
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes, label_key="dc", label_value="fedora")

        # Create DC app pods
        log.info("Creating DC based app pods")
        interface = (
            constants.CEPHBLOCKPOOL if interface == "rbd" else constants.CEPHFILESYSTEM
        )
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface, node_selector={"dc": "fedora"})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        msg = "Common OSD and app running node(s) NOT found"
        assert len(common_nodes) > 0, msg
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(machine_name)
        log.info(f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods on the failed node will get automatically created on
        # other running node. Waiting for all dc app pod to reach running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        pod.wait_for_storage_pods()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()