Example #1
def test_deployment(pvc_factory, pod_factory):
    deploy = config.RUN['cli_params'].get('deploy')
    teardown = config.RUN['cli_params'].get('teardown')
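    # Verify the cluster only when this is not a teardown-only run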
    if not teardown or deploy:
        log.info("Verifying OCP cluster is running")
        assert is_cluster_running(config.ENV_DATA['cluster_path'])
        if not config.ENV_DATA['skip_ocs_deployment']:
            ocs_registry_image = config.DEPLOYMENT.get('ocs_registry_image')
            ocs_install_verification(ocs_registry_image=ocs_registry_image)

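            # Scale out NooBaa endpoints when the config requests more than one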
            nb_eps = config.DEPLOYMENT.get('noobaa_endpoints', 1)
            if nb_eps > 1:
                change_noobaa_endpoints_count(nb_eps)

            # Check basic cluster functionality by creating resources
            # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
            # run IO and delete the resources
            sanity_helpers = Sanity()
            sanity_helpers.health_check()
            sanity_helpers.create_resources(pvc_factory, pod_factory)
            sanity_helpers.delete_resources()

    if teardown:
        log.info(
            "Cluster will be destroyed during teardown part of this test.")
Example #2
class TestNodeReplacement(ManageTest):
    """
    Knip-894 Node replacement proactive

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive(self):
        """
        Knip-894 Node Replacement proactive(without IO running)

        """
        osd_node_name = select_osd_node_name()
        delete_and_create_osd_node(osd_node_name)

        # Verify everything running fine
        log.info(
            "Verifying all resources are running and match the expected result")
        self.sanity_helpers.health_check(tries=90)
        ceph_cluster_obj = CephCluster()
        assert ceph_cluster_obj.wait_for_rebalance(
            timeout=1800), ("Data re-balance failed to complete")
Example #3
class TestNodeReplacementWithIO(ManageTest):
    """
    Knip-894 Node replacement proactive with IO

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive_with_io_running(self, pvc_factory,
                                                       pod_factory,
                                                       dc_pod_factory):
        """
        Knip-894 Node Replacement proactive when IO running in the background

        """

        # Get worker nodes
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        osd_node_name = select_osd_node_name()

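        # Create app pods on every worker except the node selected for
        # replacement, so background IO keeps running during the replacement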
        log.info("Creating dc pod backed with rbd pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                rbd_dc_pod = dc_pod_factory(interface=constants.CEPHBLOCKPOOL,
                                            node_name=worker_node,
                                            size=20)
                pod.run_io_in_bg(rbd_dc_pod,
                                 expect_to_fail=False,
                                 fedora_dc=True)

        log.info("Creating dc pod backed with cephfs pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                cephfs_dc_pod = dc_pod_factory(
                    interface=constants.CEPHFILESYSTEM,
                    node_name=worker_node,
                    size=20)
                pod.run_io_in_bg(cephfs_dc_pod,
                                 expect_to_fail=False,
                                 fedora_dc=True)

        delete_and_create_osd_node(osd_node_name)

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()

        # Verify everything running fine
        log.info(
            "Verifying all resources are running and match the expected result")
        self.sanity_helpers.health_check(tries=120)
Example #4
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes
    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, pvc_factory, pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get the node hosting a randomly selected OSD pod
        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD node is {osd_node_name}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"{osd_node_name} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(f"{osd_node_name} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Check that all pods are in Running state
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=200)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #5
class TestCouchBaseNodeReboot(E2ETest):
    """
    Deploy a CouchBase workload using the operator
    """
    @pytest.fixture()
    def cb_setup(self, couchbase_factory_fixture):
        """
        Creates couchbase workload
        """
        self.cb = couchbase_factory_fixture(replicas=3,
                                            run_in_bg=True,
                                            skip_analyze=True)

        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["pod_name_of_node"],
        argvalues=[
            pytest.param(*['osd'], marks=pytest.mark.polarion_id("OCS-776")),
            pytest.param(*['master'],
                         marks=pytest.mark.polarion_id("OCS-783")),
            pytest.param(*['couchbase'],
                         marks=pytest.mark.polarion_id("OCS-776"))
        ])
    def test_run_couchbase_node_reboot(self, cb_setup, nodes,
                                       pod_name_of_node):
        """
        Test couchbase workload with node reboot
        """
        if pod_name_of_node == 'couchbase':
            node_list = self.cb.get_couchbase_nodes()
        elif pod_name_of_node == 'osd':
            node_list = get_osd_running_nodes()
        elif pod_name_of_node == 'master':
            node_list = get_master_nodes()

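        # Pick a random node from the relevant list to restart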
        node_1 = get_node_objs(node_list[random.randint(0,
                                                        len(node_list) - 1)])

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type='worker',
                                                   print_table=True)
        get_node_resource_utilization_from_adm_top(node_type='master',
                                                   print_table=True)
        # Restart relevant node
        nodes.restart_nodes(node_1)
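        # Poll every 5 seconds (up to 300 seconds) until the background
        # couchbase task reports that it is done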
        for sample in TimeoutSampler(300, 5, self.cb.result.done):
            if sample:
                break
            else:
                logging.info(
                    "#### ....Waiting for couchbase threads to complete...")
        self.sanity_helpers.health_check()
Example #6
class TestPgSQLNodeReboot(E2ETest):
    """
    Test running PGSQL workload with node drain
    """
    @pytest.fixture()
    def pgsql_setup(self, pgsql):
        """
        PGSQL test setup
        """
        # Deployment of postgres database
        pgsql.setup_postgresql(replicas=3)

        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

    @pytest.mark.usefixtures(pgsql_setup.__name__)
    def test_run_pgsql_node_drain(
        self, pgsql, transactions=900, node_type='master'
    ):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(
            replicas=3, transactions=transactions, clients=3
        )

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Node drain with specific node type
        typed_nodes = node.get_typed_nodes(
            node_type=node_type, num_of_nodes=1
        )
        typed_node_name = typed_nodes[0].name

        # Node maintenance - to gracefully terminate all pods on the node
        node.drain_nodes([typed_node_name])

        # Make the node schedulable again
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)
Example #7
class TestCouchBaseNodeDrain(E2ETest):
    """
    Deploy a CouchBase workload using the operator
    """
    @pytest.fixture()
    def cb_setup(self, couchbase_factory_fixture):
        """
        Creates couchbase workload
        """
        self.cb = couchbase_factory_fixture(
            replicas=3, run_in_bg=True, skip_analyze=True
        )

        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

    def test_run_couchbase_node_drain(self, cb_setup, node_type='master'):
        """
        Test couchbase workload with node drain
        """
        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(
            node_type='worker', print_table=True
        )

        # Node drain with specific node type
        typed_nodes = node.get_typed_nodes(
            node_type=node_type, num_of_nodes=1
        )
        typed_node_name = typed_nodes[0].name

        # Node maintenance - to gracefully terminate all pods on the node
        node.drain_nodes([typed_node_name])

        # Make the node schedulable again
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

        for sample in TimeoutSampler(300, 5, self.cb.result.done):
            if sample:
                break
            else:
                logging.info(
                    "#### ....Waiting for couchbase threads to complete..."
                )
        utils.ceph_health_check()
Example #8
class TestCouchBaseNodeDrain(E2ETest):
    """
    Deploy a CouchBase workload using the operator
    """
    @pytest.fixture()
    def cb_setup(self, couchbase_factory_fixture):
        """
        Creates couchbase workload
        """
        self.cb = couchbase_factory_fixture(replicas=3,
                                            run_in_bg=True,
                                            skip_analyze=True)

        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

    def test_run_couchbase_node_drain(self, cb_setup, node_type='master'):
        """
        Test couchbase workload with node drain
        """
        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type='worker',
                                                   print_table=True)

        # Node drain with specific node type
        typed_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
        typed_node_name = typed_nodes[0].name

        # Node maintenance - to gracefully terminate all pods on the node
        node.drain_nodes([typed_node_name])

        # Make the node schedulable again
        node.schedule_nodes([typed_node_name])

        # Wait for the background couchbase load to finish, then perform
        # cluster and Ceph health checks
        bg_handler = flowtest.BackgroundOps()
        bg_ops = [self.cb.result]
        bg_handler.wait_for_bg_operations(bg_ops, timeout=3600)
        self.sanity_helpers.health_check()
Example #9
class TestCouchBasePodRespin(E2ETest):
    """
    Deploy a CouchBase workload using the operator
    """
    @pytest.fixture()
    def cb_setup(self, couchbase_factory_fixture):
        """
        Creates couchbase workload
        """
        self.cb = couchbase_factory_fixture(replicas=3,
                                            run_in_bg=True,
                                            skip_analyze=True)
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["pod_name"],
        argvalues=[
            pytest.param(*['osd'], marks=pytest.mark.polarion_id("OCS-780")),
            pytest.param(*['mon'], marks=pytest.mark.polarion_id("OCS-779")),
            pytest.param(*['mgr'], marks=pytest.mark.polarion_id("OCS-781")),
            pytest.param(*['couchbase'],
                         marks=pytest.mark.polarion_id("OCS-786")),
        ])
    def test_run_couchbase_respin_pod(self, cb_setup, pod_name):
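        """
        Test couchbase workload while respinning Ceph or couchbase pods
        """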
        log.info(f"Respin Ceph pod {pod_name}")

        if pod_name == 'couchbase':
            self.cb.respin_couchbase_app_pod()
        else:
            disruption = Disruptions()
            disruption.set_resource(resource=f'{pod_name}')
            disruption.delete_resource()

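        # Wait (up to one hour) for the background couchbase load to finish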
        bg_handler = flowtest.BackgroundOps()
        bg_ops = [self.cb.result]
        bg_handler.wait_for_bg_operations(bg_ops, timeout=3600)
        self.sanity_helpers.health_check()
Example #10
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes
    """
    @pytest.fixture(autouse=True)
    def teardown(self, request):
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(*['rbd'], marks=pytest.mark.polarion_id("OCS-2100")),
            pytest.param(*['cephfs'],
                         marks=pytest.mark.polarion_id("OCS-2101")),
        ])
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, interface, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == 'rbd' else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        msg = "Common OSD and app running node(s) NOT found"
        assert (len(common_nodes) > 0), msg
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods on the failed node will get automatically recreated on
        # other running nodes. Waiting for all dc app pods to reach Running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        pod.wait_for_storage_pods()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #11
class TestJenkinsNodeDrain(E2ETest):
    """
    Test running Jenkins and Node Drain
    """
    @pytest.fixture()
    def jenkins_setup(self, jenkins):
        """
        JENKINS test setup
        """
        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

        # Deployment of jenkins
        jenkins.create_ocs_jenkins_template()

    @pytest.mark.parametrize(
        argnames=['node_type', 'num_projects', 'num_of_builds'],
        argvalues=[
            pytest.param(*[WORKER_MACHINE, 4, 3],
                         marks=pytest.mark.polarion_id("OCS-2177")),
            pytest.param(*[MASTER_MACHINE, 3, 6],
                         marks=pytest.mark.polarion_id("OCS-2176")),
        ])
    @pytest.mark.usefixtures(jenkins_setup.__name__)
    def test_run_jenkins_drain_node(self, jenkins, node_type, num_projects,
                                    num_of_builds):
        """

        Test Node Drain jenkins
        """
        # Init number of projects
        jenkins.number_projects = num_projects

        # Create app jenkins
        jenkins.create_app_jenkins()

        # Create jenkins pvc
        jenkins.create_jenkins_pvc()

        # Create jenkins build config
        jenkins.create_jenkins_build_config()

        # Wait for the jenkins deploy pod to reach Completed state
        jenkins.wait_for_jenkins_deploy_status(status=STATUS_COMPLETED)

        # Get relevant node
        nodes_drain = jenkins.get_node_name_where_jenkins_pod_not_hosted(
            node_type=node_type, num_of_nodes=1)

        # Init number of builds per project
        jenkins.number_builds_per_project = num_of_builds

        # Start Builds
        jenkins.start_build()

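        # Drain only if a candidate node (not hosting jenkins pods) was found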
        if len(nodes_drain) > 0:
            # Node maintenance - to gracefully terminate all pods on the node
            drain_nodes(nodes_drain)
            # Make the node schedulable again
            schedule_nodes(nodes_drain)

        # Wait for the builds to reach 'Complete' state
        jenkins.wait_for_build_to_complete()

        # Print table of builds
        jenkins.print_completed_builds_results()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #12
class TestPgSQLNodeReboot(E2ETest):
    """
    Test running PGSQL and with Ceph pods respin
    """
    @pytest.fixture()
    def pgsql_setup(self, pgsql):
        """
        PGSQL test setup
        """
        # Deployment of postgres database
        pgsql.setup_postgresql(replicas=3)

        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["transactions", "pod_name"],
        argvalues=[
            pytest.param(*[600, 'osd'],
                         marks=pytest.mark.polarion_id("OCS-801")),
            pytest.param(*[600, 'postgres'],
                         marks=pytest.mark.polarion_id("OCS-799"))
        ])
    @pytest.mark.usefixtures(pgsql_setup.__name__)
    def test_run_pgsql_reboot_node(self, pgsql, nodes, transactions, pod_name):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=3,
                                       transactions=transactions,
                                       clients=3)

        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Choose a node based on the pod it hosts
        if pod_name == 'postgres':
            node_list = pgsql.get_pgsql_nodes()
        elif pod_name == 'osd':
            node_list = get_osd_running_nodes()
        node_1 = get_node_objs(node_list[random.randint(0,
                                                        len(node_list) - 1)])

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type='worker',
                                                   print_table=True)

        # Restart relevant node
        nodes.restart_nodes(node_1)

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached to completed state after {diff_time.seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #13
class TestDetachAttachWorkerVolume(ManageTest):
    """
    Test class for detach and attach worker volume

    """
    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in NotReady status, covering situations in
        which the test failed before restarting the node after detaching a
        volume, which leaves the node in NotReady status

        """
        def finalizer():
            not_ready_nodes = [
                n for n in node.get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_NOT_READY
            ]
            logger.warning(
                f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
            )
            if not_ready_nodes:
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.polarion_id("OCS-1085")
    def test_detach_attach_worker_volume(self, nodes, pvc_factory,
                                         pod_factory):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be unhealthy)
          by creating resources and running IO
        - Wait for the volumes to be re-attached back to the worker node
        - Restart the node so the volume will get re-mounted

        """
        # Get a data volume
        data_volume = nodes.get_data_volumes()[0]

        # Get the worker node according to the volume attachment
        worker = nodes.get_node_by_attached_volume(data_volume)

        # Detach volume (logging is done inside the function)
        nodes.detach_volume(data_volume)

        # Validate cluster is still functional
        try:
            # In case the node whose data volume was detached was the one
            # running the ceph tools pod, we need to wait for a new tools pod
            # to start. get_admin_key() connects to the tools pod, so it acts
            # as a liveness check here
            _ = get_admin_key()
        except CommandFailed as ex:
            if "connection timed out" in str(ex):
                logger.info(
                    "Ceph tools box was running on the node that its data "
                    "volume has been detached. Hence, waiting for a new "
                    "Ceph tools box pod to spin up")
                wait_for_resource_count_change(
                    func_to_use=get_all_pods,
                    previous_num=1,
                    namespace=config.ENV_DATA['cluster_namespace'],
                    timeout=120,
                    selector='app=rook-ceph-tools')
            else:
                raise
        finally:
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Wait for worker volume to be re-attached automatically to the node
        assert nodes.wait_for_volume_attach(data_volume), (
            f"Volume {data_volume.id} failed to be re-attached to a worker node"
        )

        # Restart the instance so the volume will get re-mounted
        nodes.restart_nodes([worker])

        # Cluster health check
        self.sanity_helpers.health_check()

    @pytest.mark.polarion_id("OCS-1086")
    def test_detach_attach_2_data_volumes(self, nodes, pvc_factory,
                                          pod_factory):
        """
        Detach and attach disk from 2 worker nodes

        - Detach 2 of the data volumes from their worker nodes
        - Wait for the volumes to be re-attached back to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Get 2 data volumes
        data_volumes = nodes.get_data_volumes()[:2]
        workers_and_volumes = [{
            'worker': nodes.get_node_by_attached_volume(vol),
            'volume': vol
        } for vol in data_volumes]

        for worker_and_volume in workers_and_volumes:
            # Detach the volume (logging is done inside the function)
            nodes.detach_volume(worker_and_volume['volume'])

        for worker_and_volume in workers_and_volumes:
            # Wait for worker volume to be re-attached automatically to the node
            assert nodes.wait_for_volume_attach(worker_and_volume['volume']), (
                f"Volume {worker_and_volume['volume']} "
                f"failed to be re-attached to a worker node")

        # Restart the instances so the volume will get re-mounted
        nodes.restart_nodes([
            worker_and_volume['worker']
            for worker_and_volume in workers_and_volumes
        ])

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
Example #14
class TestNodesMaintenance(ManageTest):
    """
    Test basic flows of maintenance (unschedule and drain) and
    activate operations, followed by cluster functionality and health checks

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @tier1
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1269")),
            pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1272"))
        ]
    )
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node
        typed_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain)
        node.drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier2
    @pytest.mark.skipif(
        condition=config.ENV_DATA['platform'] != 'AWS',
        reason="Tests are not running on AWS deployed cluster"
    )
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1292")),
            pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1293"))
        ]
    )
    def test_node_maintenance_restart_activate(
        self, ec2_instances, aws_obj, pvc_factory, pod_factory, node_type
    ):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node's ec2 instance
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node
        typed_node = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_node, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_node[0].name

        # Maintenance the node (unschedule and drain). The function contains logging
        node.drain_nodes([typed_node_name])

        instance = aws.get_instances_ids_and_names(typed_node)
        assert instance, f"Failed to get ec2 instances for node {typed_node_name}"

        # Restarting ec2 instance
        aws_obj.restart_ec2_instances(instances=instance, wait=True)

        node.wait_for_nodes_status(
            node_names=[typed_node_name], status=constants.NODE_READY_SCHEDULING_DISABLED
        )
        # Mark the node back to schedulable
        node.schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    @tier2
    @pytest.mark.parametrize(
        argnames=["nodes_type"],
        argvalues=[
            pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1273")),
            pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1271"))
        ]
    )
    def test_2_nodes_maintenance_same_type(
        self, pvc_factory, pod_factory, nodes_type
    ):
        """
        OCS-1273/OCS-1271:
        - Maintenance (mark as unschedulable and drain) 2 worker/master nodes
        - Mark the nodes as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 2 nodes
        typed_nodes = node.get_typed_nodes(node_type=nodes_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {nodes_type} node for the test"

        typed_node_names = [typed_node.name for typed_node in typed_nodes]

        # Maintenance the nodes (unschedule and drain)
        node.drain_nodes(typed_node_names)

        # Mark the nodes back to schedulable
        node.schedule_nodes(typed_node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    @tier2
    @pytest.mark.polarion_id("OCS-1274")
    def test_2_nodes_different_types(self, pvc_factory, pod_factory):
        """
        OCS-1274:
        - Maintenance (mark as unschedulable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node from each type
        nodes = [
            node.get_typed_nodes(
                node_type=node_type, num_of_nodes=1
            )[0] for node_type in ['worker', 'master']
        ]
        assert nodes, f"Failed to find a nodes for the test"

        node_names = [typed_node.name for typed_node in nodes]

        # Maintenance the nodes (unschedule and drain)
        node.drain_nodes(node_names)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the nodes back to schedulable
        node.schedule_nodes(node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #15
class TestJenkinsNodeReboot(E2ETest):
    """
    Test running Jenkins and Node Reboot
    """
    @pytest.fixture()
    def jenkins_setup(self, jenkins):
        """
        JENKINS test setup
        """
        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

        # Deployment of jenkins
        jenkins.create_ocs_jenkins_template()

    @pytest.mark.parametrize(
        argnames=['node_type', 'num_projects', 'num_of_builds'],
        argvalues=[
            pytest.param(*[MASTER_MACHINE, 2, 15],
                         marks=pytest.mark.polarion_id("OCS-2202")),
            pytest.param(*[WORKER_MACHINE, 2, 15],
                         marks=pytest.mark.polarion_id("OCS-2178")),
        ])
    @pytest.mark.usefixtures(jenkins_setup.__name__)
    def test_run_jenkins_node_reboot(self, jenkins, nodes, node_type,
                                     num_projects, num_of_builds):
        """

        Test Node Reboot jenkins
        """
        # Init number of projects
        jenkins.number_projects = num_projects

        # Create app jenkins
        jenkins.create_app_jenkins()

        # Create jenkins pvc
        jenkins.create_jenkins_pvc()

        # Create jenkins build config
        jenkins.create_jenkins_build_config()

        # Wait for the jenkins deploy pod to reach Completed state
        jenkins.wait_for_jenkins_deploy_status(status=STATUS_COMPLETED)

        # Get relevant node
        nodes_reboot = jenkins.get_node_name_where_jenkins_pod_not_hosted(
            node_type=node_type, num_of_nodes=1)

        # Init number of builds per project
        jenkins.number_builds_per_project = num_of_builds

        # Start Builds
        jenkins.start_build()

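        # Reboot only if a candidate node (not hosting jenkins pods) was found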
        if len(nodes_reboot) > 0:
            # Restart Node
            nodes.restart_nodes(get_node_objs(nodes_reboot))
        else:
            log.info('No node was rebooted')

        # Wait for the builds to reach 'Complete' state
        jenkins.wait_for_build_to_complete()

        # Print table of builds
        jenkins.print_completed_builds_results()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #16
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes
    """
    @pytest.fixture(autouse=True)
    def teardown(self, request):
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(*['rbd'], marks=pytest.mark.polarion_id("OCS-2100")),
            pytest.param(*['cephfs'],
                         marks=pytest.mark.polarion_id("OCS-2101")),
        ])
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
            self, interface, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == 'rbd' else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        msg = "Common OSD and app running node(s) NOT found"
        assert (len(common_nodes) > 0), msg
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods on the failed node will get automatically recreated on
        # other running nodes. Waiting for all dc app pods to reach Running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        # Check all OCS pod statuses; they should be in Running state
        all_pod_obj = pod.get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        for pod_obj in all_pod_obj:
            if not any(s in pod_obj.name
                       for s in ('-1-deploy', 'ocs-deviceset')):
                try:
                    helpers.wait_for_resource_state(
                        resource=pod_obj,
                        state=constants.STATUS_RUNNING,
                        timeout=200)
                except ResourceWrongStatusException:
                    # The 'rook-ceph-crashcollector' pod on the failed node
                    # gets stuck in Pending state. BZ 1810014 tracks it.
                    # As a workaround, skip the health check for that pod and
                    # delete its deployment so the pod disappears. Revert this
                    # workaround once the BZ is fixed.
                    if 'rook-ceph-crashcollector' in pod_obj.name:
                        ocp_obj = ocp.OCP(
                            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                        pod_name = pod_obj.name
                        deployment_name = '-'.join(pod_name.split("-")[:-2])
                        command = f"delete deployment {deployment_name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")
                    else:
                        raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #17
class TestMonitoringBackedByOCS(E2ETest):
    """
    Test cases to validate monitoring backed by OCS
    """
    num_of_pvcs = 5
    pvc_size = 5

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in NotReady status and re-schedule nodes that
        are unschedulable, covering situations in which the test failed
        before restarting or re-scheduling those nodes

        """
        def finalizer():

            # Validate all nodes are schedulable
            scheduling_disabled_nodes = [
                n.name for n in get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_READY_SCHEDULING_DISABLED
            ]
            if scheduling_disabled_nodes:
                schedule_nodes(scheduling_disabled_nodes)

            # Validate all nodes are in READY state
            not_ready_nodes = [
                n for n in get_node_objs() if n.ocp.get_resource_status(n.name)
                == constants.NODE_NOT_READY
            ]
            log.warning(
                f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
            )
            if not_ready_nodes:
                nodes.restart_nodes(not_ready_nodes)
                wait_for_nodes_status()

            log.info("All nodes are in Ready status")

            assert prometheus_health_check(), "Prometheus health is degraded"

        request.addfinalizer(finalizer)

    @pytest.fixture()
    def pods(self, multi_pvc_factory, dc_pod_factory):
        """
        Prepare multiple dc pods for the test

        Returns:
            list: Pod instances

        """
        sc = default_storage_class(interface_type=constants.CEPHBLOCKPOOL)

        pvc_objs = multi_pvc_factory(interface=constants.CEPHBLOCKPOOL,
                                     storageclass=sc,
                                     size=self.pvc_size,
                                     num_of_pvc=self.num_of_pvcs)

        pod_objs = []
        for pvc_obj in pvc_objs:
            pod_objs.append(dc_pod_factory(pvc=pvc_obj))

        # Check for the created pvc metrics on prometheus pod
        for pod_obj in pod_objs:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
        return pod_objs

    @pytest.mark.polarion_id("OCS-576")
    def test_monitoring_after_restarting_prometheus_pod(self, pods):
        """
        Test case to validate that restarting the prometheus pod
        has no functional impact

        """

        # Get the prometheus pod
        prometheus_pod_obj = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_object in prometheus_pod_obj:
            # Get the pvc mounted on the prometheus pod
            pod_info = pod_object.get()
            pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName']

            # Restart the prometheus pod
            pod_object.delete(force=True)
            pod_obj = ocp.OCP(kind=constants.POD,
                              namespace=defaults.OCS_MONITORING_NAMESPACE)
            assert pod_obj.wait_for_resource(condition='Running',
                                             selector='app=prometheus',
                                             timeout=60)

            # Check the same pvc is mounted on new pod
            pod_info = pod_object.get()
            assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName'] in pvc_name, (
                    f"Old pvc not found after restarting the prometheus pod {pod_object.name}"
                )

        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )

    @pytest.mark.polarion_id("OCS-579")
    def test_monitoring_after_draining_node_where_prometheus_hosted(
            self, pods):
        """
        Test case to validate that when the node hosting prometheus is
        drained, the prometheus pod re-spins on a new healthy node
        without any data/metrics loss

        """

        # Get the prometheus pod
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:
            # Get the pvc mounted on the prometheus pod
            pod_info = pod_obj.get()
            pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName']

            # Get the node where the prometheus pod is hosted
            prometheus_pod_obj = pod_obj.get()
            prometheus_node = prometheus_pod_obj['spec']['nodeName']

            # Drain the node where the prometheus pod is hosted
            drain_nodes([prometheus_node])

            # Validate node is in SchedulingDisabled state
            wait_for_nodes_status(
                [prometheus_node],
                status=constants.NODE_READY_SCHEDULING_DISABLED)

            # Validate all prometheus pods are running
            POD = ocp.OCP(kind=constants.POD,
                          namespace=defaults.OCS_MONITORING_NAMESPACE)
            assert POD.wait_for_resource(
                condition='Running', selector='app=prometheus', timeout=180), (
                    "One or more prometheus pods are not in running state")

            # Validate prometheus pod is re-spun on a new healthy node
            pod_info = pod_obj.get()
            new_node = pod_info['spec']['nodeName']
            assert new_node not in prometheus_node, (
                'Prometheus pod not re-spun on a new node')
            log.info(f"Prometheus pod re-spinned on new node {new_node}")

            # Validate same pvc is mounted on prometheus pod
            assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName'] in pvc_name, (
                    f"Old pvc not found after restarting the prometheus pod {pod_obj.name}"
                )

            # Validate the prometheus health is ok
            assert prometheus_health_check(), (
                "Prometheus cluster health is not OK")

            # Mark the nodes back to schedulable
            schedule_nodes([prometheus_node])

        # Check the nodes are in Ready state and cluster health is OK
        self.sanity_helpers.health_check()

        # Check for the created pvc metrics after draining the node
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )

    @pytest.mark.polarion_id("OCS-580")
    def test_monitoring_after_respinning_ceph_pods(self, pods):
        """
        Test case to validate respinning the ceph pods and
        their interaction with the prometheus pod

        """

        # Re-spin the ceph pods (i.e. mgr, mon, osd) one by one
        resource_to_delete = ['mgr', 'mon', 'osd']
        disruption = Disruptions()
        for res_to_del in resource_to_delete:
            disruption.set_resource(resource=res_to_del)
            disruption.delete_resource()

        # Check for the created pvc metrics on prometheus pod
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )

    @pytest.mark.polarion_id("OCS-605")
    def test_monitoring_when_osd_down(self, pods):
        """
        Test case to validate monitoring when osd is down

        """

        # Get osd pods
        osd_pod_list = pod.get_osd_pods()

        # Bring the first OSD down by scaling its replica count to 0
        resource_name = osd_pod_list[0].get().get('metadata').get('name')
        assert modify_osd_replica_count(resource_name=resource_name,
                                        replica_count=0)

        # Validate osd is down
        pod_obj = ocp.OCP(kind=constants.POD,
                          namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        assert pod_obj.wait_for_delete(resource_name=resource_name), (
            f"Resource {resource_name} is not deleted")

        # Check for the created pvc metrics when osd is down
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )

        # Bring the OSD that was down back up
        assert modify_osd_replica_count(resource_name=resource_name,
                                        replica_count=1)

        # Validate osd is up and ceph health is ok
        self.sanity_helpers.health_check()

    @pytest.mark.polarion_id("OCS-606")
    def test_monitoring_when_one_of_the_prometheus_node_down(
            self, nodes, pods):
        """
        Test case to validate monitoring when a node hosting a prometheus
        pod goes down and comes back up

        """

        # Get all prometheus pods
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:
            # Get the node where the prometheus pod is hosted
            pod_node_obj = pod.get_pod_node(pod_obj)

            # Bring down (restart) the node where the prometheus pod is hosted
            nodes.restart_nodes([pod_node_obj])

            # Validate all nodes are in READY state
            wait_for_nodes_status()

        # Check the nodes are in Ready state and cluster health is OK
        self.sanity_helpers.health_check()

        # Check all the prometheus pods are up
        for pod_obj in pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)

        # Check for the created pvc metrics after restarting node where prometheus pod is hosted
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
            log.info(
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is collected"
            )

    @pytest.mark.polarion_id("OCS-709")
    def test_monitoring_after_rebooting_master_node(self, nodes, pods):
        """
        Test case to validate that rebooting the master nodes doesn't delete
        the data collected on the prometheus pod

        """

        # Get the master node list
        master_nodes = get_typed_nodes(node_type='master')

        # Reboot the master nodes one by one
        for node in master_nodes:
            nodes.restart_nodes([node])

            wait_for_nodes_status_and_prometheus_health_check(pods)

        # Check the nodes are in Ready state and cluster health is OK
        self.sanity_helpers.health_check()

    @pytest.mark.polarion_id("OCS-710")
    def test_monitoring_after_rebooting_node_where_mgr_is_running(
            self, nodes, pods):
        """
        Test case to validate that rebooting the node where mgr is running
        does not delete the data collected on the prometheus pod

        """

        # Get the mgr pod obj
        mgr_pod_obj = pod.get_mgr_pods()

        # Get the node where the mgr pod is hosted
        mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

        # Reboot the node where the mgr pod is hosted
        nodes.restart_nodes([mgr_node_obj])

        # Validate all nodes are in READY state
        wait_for_nodes_status()

        # Check for Ceph pods
        pod_obj = ocp.OCP(kind=constants.POD,
                          namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        assert pod_obj.wait_for_resource(condition='Running',
                                         selector='app=rook-ceph-mgr',
                                         timeout=600)
        assert pod_obj.wait_for_resource(condition='Running',
                                         selector='app=rook-ceph-mon',
                                         resource_count=3,
                                         timeout=600)
        assert pod_obj.wait_for_resource(condition='Running',
                                         selector='app=rook-ceph-osd',
                                         resource_count=3,
                                         timeout=600)

        # Check the nodes are in Ready state and cluster health is OK
        self.sanity_helpers.health_check()

        # Check for ceph health check metrics is updated with new mgr pod
        wait_to_update_mgrpod_info_prometheus_pod()

        # Check for the created pvc metrics after rebooting the node where mgr pod was running
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )

    @pytest.mark.polarion_id("OCS-711")
    def test_monitoring_shutdown_and_recovery_prometheus_node(
            self, nodes, pods):
        """
        Test case to validate that shutdown and recovery of a node where
        monitoring pods are running has no functional impact

        """
        # Get all prometheus pods
        prometheus_pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for prometheus_pod_obj in prometheus_pod_obj_list:
            # Get the node where the prometheus pod is hosted
            prometheus_node_obj = pod.get_pod_node(prometheus_pod_obj)

            # Shut down and recover (i.e. restart) the node where the prometheus pod is hosted
            nodes.stop_nodes([prometheus_node_obj])

            waiting_time = 20
            log.info(f"Waiting for {waiting_time} seconds")
            time.sleep(waiting_time)

            nodes.start_nodes([prometheus_node_obj])

            # Validate all nodes are in READY state
            wait_for_nodes_status()

        # Check all the prometheus pods are up
        for pod_obj in prometheus_pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check for the created pvc metrics after shutdown and recovery of prometheus nodes
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
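
        # NOTE (hedged): check_pvcdata_collected_on_prometheus() presumably queries
        # the cluster Prometheus for kubelet volume metrics filtered by PVC name,
        # along the lines of
        #     kubelet_volume_stats_used_bytes{persistentvolumeclaim="<pvc-name>"}
        # The exact query lives in the ocs_ci monitoring helpers; the metric name
        # above is an assumption, not shown in this example.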
Example #18
0
class FlowOperations:
    """
    Flow based operations class

    """
    def __init__(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def validate_cluster(self,
                         cluster_check=False,
                         node_status=False,
                         pod_status=False,
                         operation_name=""):
        """
        Validates various ceph and ocs cluster checks

        Args:
            cluster_check (bool): Runs the full Sanity health check
            node_status (bool): Verifies the nodes are in Ready state
            pod_status (bool): Verifies StorageCluster pods are in expected state
            operation_name (str): Name of the operation, used to tag log messages

        """
        logger.info(f"{operation_name}: Verifying cluster health")
        assert ceph_health_check(
            defaults.ROOK_CLUSTER_NAMESPACE,
            tries=100), "Entry criteria FAILED: Cluster is Unhealthy"
        if cluster_check:
            self.sanity_helpers.health_check(tries=100)
        if node_status:
            logger.info(f"{operation_name}: Verifying whether node is ready")
            wait_for_nodes_status(status=constants.NODE_READY, timeout=300)
        if pod_status:
            logger.info(
                f"{operation_name}: Verifying StorageCluster pods are in running/completed state"
            )
            assert check_pods_in_running_state(
            ), 'Some pods were not in expected state'

    def node_operations_entry_criteria(self,
                                       node_type,
                                       number_of_nodes,
                                       operation_name="Node Operation",
                                       network_fail_time=None):
        """
        Entry criteria function for node related operations

        Args:
            node_type (str): Type of node
            number_of_nodes (int): Number of nodes
            operation_name (str): Name of the node operation
            network_fail_time (int): Total time to fail the network in a node

        Returns:
            tuple: containing the params used in Node operations

        """
        self.validate_cluster(node_status=True, operation_name=operation_name)

        logger.info(f"Getting parameters related to: {operation_name}")
        typed_nodes = node.get_typed_nodes(node_type=node_type,
                                           num_of_nodes=number_of_nodes)
        if network_fail_time:
            return typed_nodes, network_fail_time
        else:
            return typed_nodes

    def add_capacity_entry_criteria(self):
        """
        Entry criteria verification function for add capacity operation

        Returns:
            tuple: containing the params used in add capacity exit operation

        """
        self.validate_cluster(operation_name="Add Capacity")

        logger.info(
            "Add capacity: Getting restart count of pods before adding capacity"
        )
        restart_count_before = pod_helpers.get_pod_restarts_count(
            defaults.ROOK_CLUSTER_NAMESPACE)

        logger.info(
            "Add capacity entry: Getting OSD pod count before adding capacity")
        osd_pods_before = pod_helpers.get_osd_pods()

        return osd_pods_before, restart_count_before

    def add_capacity_exit_criteria(self, restart_count_before,
                                   osd_pods_before):
        """
        Exit criteria function for Add capacity operation

        Args:
            restart_count_before (dict): Restart counts of pods
            osd_pods_before (list): List of OSD pods before

        """
        self.validate_cluster(operation_name="Add Capacity")

        logger.info(
            "Add capacity: Getting restart count of pods after adding capacity"
        )
        restart_count_after = pod_helpers.get_pod_restarts_count(
            defaults.ROOK_CLUSTER_NAMESPACE)
        logger.info(
            f"Sum of restart count before = {sum(restart_count_before.values())}"
        )
        logger.info(
            f"Sum of restart count after = {sum(restart_count_after.values())}"
        )
        assert sum(restart_count_before.values()) == sum(
            restart_count_after.values()
        ), "Exit criteria verification FAILED: One or more pods got restarted"

        osd_pods_after = pod_helpers.get_osd_pods()
        number_of_osds_added = len(osd_pods_after) - len(osd_pods_before)
        logger.info(
            f"Number of OSDs added = {number_of_osds_added}, "
            f"before = {len(osd_pods_before)}, after = {len(osd_pods_after)}")
        assert number_of_osds_added == 3, "Exit criteria verification FAILED: osd count mismatch"

        logger.info("Add capacity: Exit criteria verification: Success")
Example #19
0
class TestDiskFailures(ManageTest):
    """
    Test class for detach and attach worker volume

    """
    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in NotReady status, for situations in which
        the test failed before restarting the node after detaching a volume,
        which leaves nodes in NotReady state

        """
        def finalizer():
            not_ready_nodes = [
                n for n in node.get_node_objs() if n
                .ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
            ]
            logger.warning(
                f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
            )
            if not_ready_nodes:
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()
        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @aws_platform_required
    @pytest.mark.polarion_id("OCS-1085")
    def test_detach_attach_worker_volume(self, nodes, pvc_factory, pod_factory):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be unhealthy)
          by creating resources and running IO
        - Wait for the volumes to be re-attached back to the worker node
        - Restart the node so the volume will get re-mounted

        """
        # Get a data volume
        data_volume = nodes.get_data_volumes()[0]
        # Get the worker node according to the volume attachment
        worker = nodes.get_node_by_attached_volume(data_volume)

        # Detach volume (logging is done inside the function)
        nodes.detach_volume(data_volume, worker)

        # Validate cluster is still functional
        # In case the node whose volume was detached was the one running the
        # ceph tools pod, we need to wait for a new ct pod to start. For that,
        # a function that connects to the ct pod is used to check if it's alive
        assert wait_for_ct_pod_recovery(), "Ceph tools pod failed to come up on another node"

        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Wait for worker volume to be re-attached automatically to the node
        assert nodes.wait_for_volume_attach(data_volume), (
            "Volume failed to be re-attached to a worker node"
        )

        # Restart the instance so the volume will get re-mounted
        nodes.restart_nodes([worker])

        # Cluster health check
        # W/A: For the investigation of BZ 1825675, timeout is increased to see if cluster
        # becomes healthy eventually
        # TODO: Remove 'tries=100'
        self.sanity_helpers.health_check(tries=100)

    @aws_platform_required
    @pytest.mark.polarion_id("OCS-1086")
    def test_detach_attach_2_data_volumes(self, nodes, pvc_factory, pod_factory):
        """
        Detach and attach disk from 2 worker nodes

        - Detach 2 of the data volumes from their worker nodes
        - Wait for the volumes to be re-attached back to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Get 2 data volumes
        data_volumes = nodes.get_data_volumes()[:2]
        workers_and_volumes = [
            {'worker': nodes.get_node_by_attached_volume(vol), 'volume': vol}
            for vol in data_volumes
        ]

        for worker_and_volume in workers_and_volumes:
            # Detach the volume (logging is done inside the function)
            nodes.detach_volume(
                worker_and_volume['volume'], worker_and_volume['worker']
            )

        for worker_and_volume in workers_and_volumes:
            # Wait for worker volume to be re-attached automatically to the node
            assert nodes.wait_for_volume_attach(worker_and_volume['volume']), (
                f"Volume {worker_and_volume['volume']} "
                f"failed to be re-attached to a worker node"
            )

        # Restart the instances so the volume will get re-mounted
        nodes.restart_nodes(
            [worker_and_volume['worker'] for worker_and_volume in workers_and_volumes]
        )

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @bugzilla('1830702')
    @vsphere_platform_required
    @pytest.mark.polarion_id("OCS-2172")
    def test_recovery_from_volume_deletion(self, nodes, pvc_factory, pod_factory):
        """
        Test cluster recovery from disk deletion from the platform side.
        Based on documented procedure detailed in
        https://bugzilla.redhat.com/show_bug.cgi?id=1787236#c16

        """
        logger.info("Picking a PV which will be deleted from the platform side")
        osd_pvs = get_deviceset_pvs()
        osd_pv = random.choice(osd_pvs)
        osd_pv_name = osd_pv.name
        # get the claim name
        logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
        claim_name = osd_pv.get().get('spec').get('claimRef').get('name')

        # Get the backing volume name
        logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
        backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

        # Get the corresponding PVC
        logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
        osd_pvcs = get_deviceset_pvcs()
        osd_pvcs_count = len(osd_pvcs)
        osd_pvc = [ds for ds in osd_pvcs if ds.get().get('metadata').get('name') == claim_name][0]

        # Get the corresponding OSD pod
        logger.info(f"Getting the corresponding OSD pod of PVC {osd_pvc.name}")
        osd_pods = get_osd_pods()
        osd_pods_count = len(osd_pods)
        osd_pod = [
            osd_pod for osd_pod in osd_pods if osd_pod.get()
            .get('metadata').get('labels').get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]

        # Get the node that has the OSD pod running on
        logger.info(f"Getting the node that has the OSD pod {osd_pod.name} running on")
        osd_node = get_pod_node(osd_pod)
        volume_size = osd_pvc.size
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            pod for pod in osd_prepare_pods if pod.get().get('metadata')
            .get('labels').get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        osd_prepare_job_name = osd_prepare_pod.get().get('metadata').get('labels').get('job-name')
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

        # Get the corresponding OSD deployment
        logger.info(f"Getting the corresponding OSD deployment for OSD PVC {claim_name}")
        osd_deployment = [
            osd_pod for osd_pod in get_osd_deployments() if osd_pod.get()
            .get('metadata').get('labels').get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]

        # Delete the volume from the platform side
        logger.info(f"Deleting volume {backing_volume} from the platform side")
        nodes.detach_volume(backing_volume, osd_node)

        # Delete the OSD deployment
        osd_deployment_name = osd_deployment.name
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(resource_name=osd_deployment_name, timeout=120)

        # Delete the OSD prepare job
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(resource_name=osd_prepare_job_name, timeout=120)

        # Delete the OSD PVC
        osd_pvc_name = osd_pvc.name
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Recreate a volume from the platform side
        logger.info("Creating a replacing volume from the platform side")
        nodes.create_and_attach_volume(osd_node, volume_size)

        # Delete the rook ceph operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
        rook_operator_pod.delete()

        timeout = 600
        # Wait for OSD PVC to get created and reach Bound state
        logger.info("Waiting for a new OSD PVC to get created and reach Bound state")
        assert osd_pvc.ocp.wait_for_resource(
            timeout=timeout, condition=constants.STATUS_BOUND, selector=constants.OSD_PVC_GENERIC_LABEL,
            resource_count=osd_pvcs_count
        ), (
            f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
            f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
        )
        # Wait for OSD pod to get created and reach Running state
        logger.info("Waiting for a new OSD pod to get created and reach Running state")
        assert osd_pod.ocp.wait_for_resource(
            timeout=timeout, condition=constants.STATUS_RUNNING, selector=constants.OSD_APP_LABEL,
            resource_count=osd_pods_count
        ), (
            f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
            f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
        )

        # Validate cluster is still functional
        self.sanity_helpers.health_check(tries=80)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
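
        # In short, the recovery procedure exercised above is: detach/delete the
        # backing volume on the platform side, delete the matching OSD deployment,
        # osd-prepare job and OSD PVC, attach a fresh volume of the same size to
        # the node, then delete the rook-ceph operator pod so reconciliation
        # recreates the OSD PVC and OSD pod.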
class TestMonitoringBackedByOCS(E2ETest):
    """
    Test cases to validate monitoring backed by OCS
    """
    num_of_pvcs = 5
    pvc_size = 5

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in status NotReady or unschedulable,
        for situations in which the test failed in between restarting
        or scheduling those nodes

        """
        def finalizer():

            # Validate all nodes are schedulable
            scheduling_disabled_nodes = [
                n.name for n in get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_READY_SCHEDULING_DISABLED
            ]
            if scheduling_disabled_nodes:
                schedule_nodes(scheduling_disabled_nodes)

            # Validate all nodes are in READY state
            not_ready_nodes = [
                n for n in get_node_objs() if n.ocp.get_resource_status(n.name)
                == constants.NODE_NOT_READY
            ]
            log.warning(
                f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
            )
            if not_ready_nodes:
                nodes.restart_nodes_by_stop_and_start(not_ready_nodes)
                wait_for_nodes_status()

            log.info("All nodes are in Ready status")

            assert prometheus_health_check(), "Prometheus health is degraded"

        request.addfinalizer(finalizer)

    @pytest.fixture()
    def pods(self, multi_pvc_factory, dc_pod_factory):
        """
        Prepare multiple dc pods for the test

        Returns:
            list: Pod instances

        """
        sc = default_storage_class(interface_type=constants.CEPHBLOCKPOOL)

        pvc_objs = multi_pvc_factory(interface=constants.CEPHBLOCKPOOL,
                                     storageclass=sc,
                                     size=self.pvc_size,
                                     num_of_pvc=self.num_of_pvcs)

        pod_objs = []
        for pvc_obj in pvc_objs:
            pod_objs.append(dc_pod_factory(pvc=pvc_obj))

        # Check for the created pvc metrics on prometheus pod
        for pod_obj in pod_objs:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
        return pod_objs

    @pytest.mark.polarion_id("OCS-576")
    def test_monitoring_after_restarting_prometheus_pod(self, pods):
        """
        Test case to validate prometheus pod restart
        should not have any functional impact

        """

        # Get the prometheus pod
        prometheus_pod_obj = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_object in prometheus_pod_obj:
            # Get the pvc which mounted on prometheus pod
            pod_info = pod_object.get()
            pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName']

            # Restart the prometheus pod
            pod_object.delete(force=True)
            pod_obj = ocp.OCP(kind=constants.POD,
                              namespace=defaults.OCS_MONITORING_NAMESPACE)
            assert pod_obj.wait_for_resource(condition='Running',
                                             selector='app=prometheus',
                                             timeout=60)

            # Check the same pvc is mounted on new pod
            pod_info = pod_object.get()
            assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName'] in pvc_name, (
                    f"Old pvc not found after restarting the prometheus pod {pod_object.name}"
                )

        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )

    @pytest.mark.polarion_id("OCS-579")
    def test_monitoring_after_draining_node_where_prometheus_hosted(
            self, pods):
        """
        Test case to validate that when the node hosting prometheus
        is drained, the prometheus pod re-spins on a new healthy node
        without any data/metrics loss

        """

        # Get the prometheus pod
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:
            # Get the pvc which mounted on prometheus pod
            pod_info = pod_obj.get()
            pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName']

            # Get the node where the prometheus pod is hosted
            prometheus_pod_obj = pod_obj.get()
            prometheus_node = prometheus_pod_obj['spec']['nodeName']

            # Drain node where the prometheus pod hosted
            drain_nodes([prometheus_node])

            # Validate node is in SchedulingDisabled state
            wait_for_nodes_status(
                [prometheus_node],
                status=constants.NODE_READY_SCHEDULING_DISABLED)

            # Validate all prometheus pods are running
            POD = ocp.OCP(kind=constants.POD,
                          namespace=defaults.OCS_MONITORING_NAMESPACE)
            assert POD.wait_for_resource(
                condition='Running', selector='app=prometheus', timeout=180), (
                    "One or more prometheus pods are not in running state")

            # Validate the prometheus pod is re-spun on a new healthy node
            pod_info = pod_obj.get()
            new_node = pod_info['spec']['nodeName']
            assert new_node not in prometheus_node, (
                'Prometheus pod not re-spun on a new node')
            log.info(f"Prometheus pod re-spinned on new node {new_node}")

            # Validate same pvc is mounted on prometheus pod
            assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName'] in pvc_name, (
                    f"Old pvc not found after restarting the prometheus pod {pod_obj.name}"
                )

            # Validate the prometheus health is ok
            assert prometheus_health_check(), (
                "Prometheus cluster health is not OK")

            # Mark the node as schedulable again
            schedule_nodes([prometheus_node])

            # Wait some time after node scheduling back
            waiting_time = 30
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate node is in Ready State
            wait_for_nodes_status([prometheus_node],
                                  status=constants.NODE_READY)

            # Validate ceph health OK
            ceph_health_check(tries=40, delay=30)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check for the created pvc metrics after draining the nodes where prometheus was hosted
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )

    @pytest.mark.polarion_id("OCS-580")
    def test_monitoring_after_respinning_ceph_pods(self, pods):
        """
        Test case to validate re-spinning the ceph pods and
        their interaction with the prometheus pod

        """

        # Re-spin the ceph pods (i.e. mgr, mon, osd) one by one
        resource_to_delete = ['mgr', 'mon', 'osd']
        disruption = Disruptions()
        for res_to_del in resource_to_delete:
            disruption.set_resource(resource=res_to_del)
            disruption.delete_resource()
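
        # The Disruptions helper deletes one pod of the selected type per call and
        # (presumably, as used throughout this suite) waits for its replacement,
        # so each iteration above exercises a full respin of that ceph component.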

        # Check for the created pvc metrics on prometheus pod
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )

    @pytest.mark.polarion_id("OCS-605")
    def test_monitoring_when_osd_down(self, pods):
        """
        Test case to validate monitoring when osd is down

        """

        # Get osd pods
        osd_pod_list = pod.get_osd_pods()

        # Bring one of the OSDs down (the first one)
        resource_name = osd_pod_list[0].get().get('metadata').get('name')
        assert modify_osd_replica_count(resource_name=resource_name,
                                        replica_count=0)

        # Validate osd is down
        pod_obj = ocp.OCP(kind=constants.POD,
                          namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        assert pod_obj.wait_for_delete(resource_name=resource_name), (
            f"Resource {resource_name} is not deleted")

        # Check for the created pvc metrics when osd is down
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )

        # Bring the OSD that was down back up
        assert modify_osd_replica_count(resource_name=resource_name,
                                        replica_count=1)

        # Validate osd is up and ceph health is ok
        self.sanity_helpers.health_check()

    @pytest.mark.polarion_id("OCS-606")
    def test_monitoring_when_one_of_the_prometheus_node_down(
            self, nodes, pods):
        """
        Test case to validate monitoring when one of the nodes
        hosting a prometheus pod is down

        """

        # Get all prometheus pods
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:
            # Get the node where the prometheus pod is hosted
            pod_node_obj = pod.get_pod_node(pod_obj)

            # Bring down the node where the prometheus pod is hosted
            nodes.restart_nodes([pod_node_obj])

            # Validate all nodes are in READY state
            retry((CommandFailed, ResourceWrongStatusException),
                  tries=20,
                  delay=15)(wait_for_nodes_status)()

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check(tries=40)

        # Check all the prometheus pods are up
        for pod_obj in pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)

        # Check for the created pvc metrics after restarting node where prometheus pod is hosted
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
            log.info(
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is collected"
            )

    @pytest.mark.polarion_id("OCS-709")
    def test_monitoring_after_rebooting_master_node(self, nodes, pods):
        """
        Test case to validate that rebooting the master nodes doesn't delete
        the data collected on the prometheus pod

        """

        # Get the master node list
        master_nodes = get_typed_nodes(node_type='master')

        # Reboot the master nodes one by one
        for node in master_nodes:
            nodes.restart_nodes([node], wait=False)

            # Wait some time after rebooting master
            waiting_time = 40
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            wait_for_nodes_status_and_prometheus_health_check(pods)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

    @pytest.mark.polarion_id("OCS-710")
    def test_monitoring_after_rebooting_node_where_mgr_is_running(
            self, nodes, pods):
        """
        Test case to validate rebooting a node where mgr is running
        should not delete the data collected on prometheus pod

        """

        # Get the mgr pod obj
        mgr_pod_obj = pod.get_mgr_pods()

        # Get the node where the mgr pod is hosted
        mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

        # Reboot the node where the mgr pod is hosted
        nodes.restart_nodes([mgr_node_obj])

        # Validate all nodes are in READY state
        retry((CommandFailed, ResourceWrongStatusException),
              tries=20,
              delay=15)(wait_for_nodes_status)()

        # Check for Ceph pods
        pod_obj = ocp.OCP(kind=constants.POD,
                          namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        assert pod_obj.wait_for_resource(condition='Running',
                                         selector='app=rook-ceph-mgr',
                                         timeout=600)
        assert pod_obj.wait_for_resource(condition='Running',
                                         selector='app=rook-ceph-mon',
                                         resource_count=3,
                                         timeout=600)
        assert pod_obj.wait_for_resource(condition='Running',
                                         selector='app=rook-ceph-osd',
                                         resource_count=3,
                                         timeout=600)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check(tries=40)

        # Check that the ceph health metrics are updated with the new mgr pod
        wait_to_update_mgrpod_info_prometheus_pod()

        # Check for the created pvc metrics after rebooting the node where mgr pod was running
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )

    @pytest.mark.polarion_id("OCS-711")
    @skipif_aws_i3
    def test_monitoring_shutdown_and_recovery_prometheus_node(
            self, nodes, pods):
        """
        Test case to validate that shutdown and recovery of a node
        where monitoring pods are running has no functional impact

        """
        # Get all prometheus pods
        prometheus_pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for prometheus_pod_obj in prometheus_pod_obj_list:
            # Get the node where the prometheus pod is hosted
            prometheus_node_obj = pod.get_pod_node(prometheus_pod_obj)

            # Shut down and recover (i.e. restart) the node where the prometheus pod is hosted
            nodes.stop_nodes([prometheus_node_obj])

            waiting_time = 20
            log.info(f"Waiting for {waiting_time} seconds")
            time.sleep(waiting_time)

            nodes.start_nodes(nodes=[prometheus_node_obj])

            # Validate all nodes are in READY state
            retry((CommandFailed, ResourceWrongStatusException),
                  tries=20,
                  delay=15)(wait_for_nodes_status)()

        # Check all the prometheus pods are up
        for pod_obj in prometheus_pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check(tries=40)

        # Check for the created pvc metrics after shutdown and recovery of prometheus nodes
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )

    @pytest.mark.polarion_id("OCS-638")
    def test_monitoring_delete_pvc(self):
        """
        Test case to validate that deleting the monitoring PVCs and the
        cluster-monitoring-config configmap, then recreating them, has no
        functional impact

        """
        # Get 'cluster-monitoring-config' configmap
        ocp_configmap = ocp.OCP(namespace=constants.MONITORING_NAMESPACE,
                                kind='configmap')
        configmap_dict = ocp_configmap.get(
            resource_name='cluster-monitoring-config')
        dir_configmap = tempfile.mkdtemp(prefix='configmap_')
        yaml_file = f'{dir_configmap}/configmap.yaml'
        templating.dump_data_to_temp_yaml(configmap_dict, yaml_file)

        # Get prometheus and alertmanager pods
        prometheus_alertmanager_pods = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus', 'alertmanager'])

        # Get all pvc on monitoring namespace
        pvc_objs_list = pvc.get_all_pvc_objs(
            namespace=constants.MONITORING_NAMESPACE)

        # Delete configmap
        ocp_configmap.delete(resource_name='cluster-monitoring-config')

        # Delete all pvcs on monitoring namespace
        pvc.delete_pvcs(pvc_objs=pvc_objs_list)

        # Check all the prometheus and alertmanager pods are up
        for pod_obj in prometheus_alertmanager_pods:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)

        # Create configmap
        ocp_configmap.create(yaml_file=dir_configmap)

        # Check all the PVCs are up
        for pvc_obj in pvc_objs_list:
            wait_for_resource_state(resource=pvc_obj,
                                    state=constants.STATUS_BOUND,
                                    timeout=180)

        # Check all the prometheus and alertmanager pods are up
        # and pvc are mounted on monitoring pods
        for pod_obj in prometheus_alertmanager_pods:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING,
                                    timeout=180)
            mount_point = pod_obj.exec_cmd_on_pod(
                command="df -kh",
                out_yaml_format=False,
            )
            assert "/dev/rbd" in mount_point, f"pvc is not mounted on pod {pod.name}"
        log.info("Verified all pvc are mounted on monitoring pods")

        # Validate the prometheus health is ok
        assert prometheus_health_check(), (
            "Prometheus cluster health is not OK")

    @pytest.mark.polarion_id("OCS-1535")
    def test_monitoring_shutdown_mgr_pod(self, pods):
        """
        Monitoring backed by OCS: bring the mgr down (replicas: 0) for some time
        and check ceph-related metrics
        """
        # Check ceph metrics available
        assert check_ceph_metrics_available(), (
            "failed to get results for some metrics before Downscaling deployment mgr to 0"
        )

        # Get the mgr pod name and mgr deployment
        oc_deployment = ocp.OCP(kind=constants.DEPLOYMENT,
                                namespace=ROOK_CLUSTER_NAMESPACE)
        mgr_deployments = oc_deployment.get(
            selector=constants.MGR_APP_LABEL)['items']
        mgr = mgr_deployments[0]['metadata']['name']
        pod_mgr_name = get_pod_name_by_pattern(
            pattern=mgr, namespace=ROOK_CLUSTER_NAMESPACE)

        log.info(f"Downscaling deployment {mgr} to 0")
        oc_deployment.exec_oc_cmd(f"scale --replicas=0 deployment/{mgr}")

        log.info(f"Wait for a mgr pod {pod_mgr_name[0]} to be deleted")
        oc_pod = ocp.OCP(kind=constants.POD, namespace=ROOK_CLUSTER_NAMESPACE)
        oc_pod.wait_for_delete(resource_name=pod_mgr_name[0])

        log.info(f"Upscaling deployment {mgr} back to 1")
        oc_deployment.exec_oc_cmd(f"scale --replicas=1 deployment/{mgr}")

        log.info("Waiting for mgr pod to be reach Running state")
        oc_pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MGR_APP_LABEL)

        # Check ceph metrics available
        assert check_ceph_metrics_available(), (
            "failed to get results for some metrics after Downscaling and Upscaling deployment mgr"
        )
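
        # For reference, the downscale/upscale above is a plain `oc scale` on the
        # mgr deployment; assuming the default rook namespace it is equivalent to:
        #     oc -n openshift-storage scale --replicas=0 deployment/<mgr-deployment>
        #     oc -n openshift-storage scale --replicas=1 deployment/<mgr-deployment>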
class TestRegistryPodRespin(E2ETest):
    """
    Test to run the svt workload that pushes images to the registry,
    combined with Ceph pod respins
    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def setup(self, request):
        """
        Setup and clean up the namespace
        """

        self.project_name = 'test'
        ocp_obj = ocp.OCP(kind=constants.NAMESPACES)
        ocp_obj.new_project(project_name=self.project_name)

        def finalizer():
            log.info("Clean up and remove namespace")
            ocp_obj.exec_oc_cmd(command=f'delete project {self.project_name}')

            # Reset namespace to default
            ocp.switch_to_default_rook_cluster_project()
            ocp_obj.wait_for_delete(resource_name=self.project_name)

        request.addfinalizer(finalizer)

    @pytest.mark.parametrize(
        argnames=["pod_name"],
        argvalues=[
            pytest.param(*['mon'], marks=pytest.mark.polarion_id("OCS-1797")),
            pytest.param(*['osd'], marks=pytest.mark.polarion_id("OCS-1798")),
            pytest.param(*['mgr'], marks=pytest.mark.polarion_id("OCS-1799")),
            pytest.param(*['mds'], marks=pytest.mark.polarion_id("OCS-1790"))
        ])
    def test_registry_respin_pod(self, pod_name):
        """
        Test registry workload backed by OCS while re-spinning ceph pods
        """

        # Respin relevant pod
        log.info(f"Respin Ceph pod {pod_name}")
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=f'{pod_name}')
        disruption.delete_resource()

        # Pull and push images to registries
        log.info("Pull and push images to registries")
        image_pull_and_push(
            project_name=self.project_name,
            template='eap-cd-basic-s2i',
            image='registry.redhat.io/jboss-eap-7-tech-preview/eap-cd-openshift-rhel8:latest',
            pattern='eap-app')

        # Validate image exists in registries path
        validate_image_exists(namespace=self.project_name)

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check()
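
        # In short: the test respins one ceph pod of the requested type, runs an
        # s2i build that pushes an image into the OCS-backed internal registry,
        # then verifies the image, the registry pods and overall cluster health.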
Example #22
0
class TestNodeReplacement(ManageTest):
    """
    Knip-894 Node replacement - AWS-IPI-Proactive

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive(self, pvc_factory, pod_factory,
                                       dc_pod_factory):
        """
        Knip-894 Node Replacement proactive

        """

        # Get worker nodes
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        log.info("Creating dc pod backed with rbd pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                rbd_dc_pod = dc_pod_factory(interface=constants.CEPHBLOCKPOOL,
                                            node_name=worker_node,
                                            size=20)
                pod.run_io_in_bg(rbd_dc_pod,
                                 expect_to_fail=False,
                                 fedora_dc=True)

        log.info("Creating dc pod backed with cephfs pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                cephfs_dc_pod = dc_pod_factory(
                    interface=constants.CEPHFILESYSTEM,
                    node_name=worker_node,
                    size=20)
                pod.run_io_in_bg(cephfs_dc_pod,
                                 expect_to_fail=False,
                                 fedora_dc=True)

        # Unscheduling node
        node.unschedule_nodes([osd_node_name])
        # Draining Node
        node.drain_nodes([osd_node_name])
        log.info("Getting machine name from specified node name")
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"Node {osd_node_name} associated machine is {machine_name}")
        log.info(
            f"Deleting machine {machine_name} and waiting for new machine to come up"
        )
        machine.delete_machine_and_check_state_of_new_spinned_machine(
            machine_name)
        new_machine_list = machine.get_machines()
        for machines in new_machine_list:
            # Trimming is done to get just machine name
            # eg:- machine_name:- prsurve-40-ocs-43-kbrvf-worker-us-east-2b-nlgkr
            # After trimming:- prsurve-40-ocs-43-kbrvf-worker-us-east-2b
            if re.match(machines.name[:-6], machine_name):
                new_machine_name = machines.name
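
        # Hedged alternative to the prefix match above: compare the machineset
        # prefixes directly, e.g.
        #     machines.name.rsplit('-', 1)[0] == machine_name.rsplit('-', 1)[0]
        # which avoids assuming the random suffix is always 6 characters long.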
        machineset_name = machine.get_machineset_from_machine_name(
            new_machine_name)
        log.info("Waiting for new worker node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)
        new_node_name = node.get_node_from_machine_name(new_machine_name)
        log.info("Adding ocs label to newly created worker node")
        node_obj = ocp.OCP(kind='node')
        node_obj.add_label(resource_name=new_node_name,
                           label=constants.OPERATOR_NODE_LABEL)
        log.info(
            f"Successfully labeled {new_node_name} with OCS storage label")
        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()
        # Verify everything running fine
        log.info(
            "Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check()
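
        # Node replacement flow in short: unschedule and drain the OSD node,
        # delete its backing machine, wait for the machineset to spin up a
        # replacement, label the new node with the OCS storage label, then re-run
        # the Sanity create/delete resources and health checks.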
Example #23
0
class TestNodesMaintenance(ManageTest):
    """
    Test basic flows of maintenance (unschedule and drain) and
    activate operations, followed by cluster functionality and health checks

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def health_checker(self):
        """
        Check Ceph health

        """
        try:
            status = ceph_health_check_base()
            if status:
                log.info("Health check passed")
        except CephHealthException as e:
            # skip because ceph is not in good health
            pytest.skip(str(e))

    @tier1
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'],
                         marks=pytest.mark.polarion_id("OCS-1269")),
            pytest.param(*['master'],
                         marks=pytest.mark.polarion_id("OCS-1272"))
        ])
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain)
        drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier4
    @tier4b
    @aws_platform_required
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'],
                         marks=pytest.mark.polarion_id("OCS-1292")),
            pytest.param(*['master'],
                         marks=[
                             pytest.mark.polarion_id("OCS-1293"),
                             bugzilla('1754287')
                         ])
        ])
    def test_node_maintenance_restart_activate(self, nodes, pvc_factory,
                                               pod_factory, node_type):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain). The function contains logging
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=True)

        wait_for_nodes_status(node_names=[typed_node_name],
                              status=constants.NODE_READY_SCHEDULING_DISABLED)
        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    @tier3
    @pytest.mark.parametrize(
        argnames=["nodes_type"],
        argvalues=[
            pytest.param(*['worker'],
                         marks=pytest.mark.polarion_id("OCS-1273")),
            pytest.param(*['master'],
                         marks=pytest.mark.polarion_id("OCS-1271"))
        ])
    def test_2_nodes_maintenance_same_type(self, nodes_type):
        """
        OCS-1273/OCS-1271:
        - Try draining 2 nodes of the same type - should fail
        - Check cluster and Ceph health

        """
        # Get 2 nodes
        typed_nodes = get_typed_nodes(node_type=nodes_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {nodes_type} node for the test"

        typed_node_names = [typed_node.name for typed_node in typed_nodes]

        # Try draining 2 nodes - should fail
        try:
            drain_nodes(typed_node_names)
        except TimeoutExpired:
            log.info(
                f"Draining of nodes {typed_node_names} failed as expected")

        schedule_nodes(typed_node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier2
    @pytest.mark.polarion_id("OCS-1274")
    def test_2_nodes_different_types(self, pvc_factory, pod_factory):
        """
        OCS-1274:
        - Maintenance (mark as unschedulable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node from each type
        nodes = [
            get_typed_nodes(node_type=node_type, num_of_nodes=1)[0]
            for node_type in ['worker', 'master']
        ]
        assert nodes, f"Failed to find a nodes for the test"

        node_names = [typed_node.name for typed_node in nodes]

        # Maintenance the nodes (unschedule and drain)
        drain_nodes(node_names)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the nodes back to schedulable
        schedule_nodes(node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier4
    @tier4b
    @aws_platform_required
    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(*['rbd'], marks=pytest.mark.polarion_id("OCS-2128")),
            pytest.param(*['cephfs'],
                         marks=pytest.mark.polarion_id("OCS-2129")),
        ])
    def test_simultaneous_drain_of_two_ocs_nodes(self, pvc_factory,
                                                 pod_factory, dc_pod_factory,
                                                 interface):
        """
        OCS-2128/OCS-2129:
        - Create PVCs and start IO on DC based app pods
        - Add one extra node in two of the AZs and label the nodes
          with OCS storage label
        - Maintenance (mark as unschedulable and drain) 2 worker nodes
          simultaneously
        - Confirm that OCS and DC pods are in running state
        - Remove unscheduled nodes
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Check cluster and Ceph health

        """
        # Get OSD running nodes
        osd_running_worker_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_worker_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_worker_nodes,
                          label_key='dc',
                          label_value='fedora')
        log.info("Successfully labeled worker nodes with {dc:fedora}")

        # Create DC app pods
        log.info("Creating DC based app pods and starting IO in background")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == 'rbd' else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get the machine name using the node name
        machine_names = [
            machine.get_machine_from_node_name(osd_running_worker_node)
            for osd_running_worker_node in osd_running_worker_nodes[:2]
        ]
        log.info(f"{osd_running_worker_nodes} associated "
                 f"machine are {machine_names}")

        # Get the machineset name using machine name
        machineset_names = [
            machine.get_machineset_from_machine_name(machine_name)
            for machine_name in machine_names
        ]
        log.info(f"{osd_running_worker_nodes} associated machineset "
                 f"is {machineset_names}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_names[0])
        add_new_node_and_label_it(machineset_names[1])

        # Drain 2 nodes
        drain_nodes(osd_running_worker_nodes[:2])

        # Check the pods should be in running state
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
            if all(s not in pod_obj.name for s in ('-1-deploy', 'ocs-deviceset')):
                try:
                    helpers.wait_for_resource_state(
                        resource=pod_obj,
                        state=constants.STATUS_RUNNING,
                        timeout=200)
                except ResourceWrongStatusException:
                    # 'rook-ceph-crashcollector' on the failed node gets stuck
                    # in Pending state. BZ 1810014 tracks it.
                    # Ignoring 'rook-ceph-crashcollector' pod health check as
                    # WA and deleting its deployment so that the pod
                    # disappears. Will revert this WA once the BZ is fixed
                    if 'rook-ceph-crashcollector' in pod_obj.name:
                        ocp_obj = ocp.OCP(
                            namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                        pod_name = pod_obj.name
                        deployment_name = '-'.join(pod_name.split("-")[:-2])
                        command = f"delete deployment {deployment_name}"
                        ocp_obj.exec_oc_cmd(command=command)
                        log.info(f"Deleted deployment for pod {pod_obj.name}")

        # DC app pods on the drained node will get automatically created on
        # another running node in the same AZ. Waiting for all dc app pods to
        # reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        # Remove unscheduled nodes
        # In scenarios where the drain is attempted on >3 worker setup,
        # post completion of drain we are removing the unscheduled nodes so
        # that we maintain 3 worker nodes.
        log.info(f"Removing scheduled nodes {osd_running_worker_nodes[:2]}")
        remove_node_objs = get_node_objs(osd_running_worker_nodes[:2])
        remove_nodes(remove_node_objs)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
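
        # Flow in short: run DC app pods with background IO, add and label one
        # extra node in two of the AZs, drain two OSD nodes simultaneously, verify
        # OCS and DC pods recover, remove the drained nodes, and finish with the
        # usual Sanity resource creation and health checks.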
Example #24
0
class TestDetachAttachWorkerVolumeAWS(ManageTest):
    """
    Test class for detach and attach worker volume

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.polarion_id("OCS-1085")
    def test_detach_attach_worker_volume(self, aws_obj, pvc_factory, pod_factory):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be unhealthy)
          by creating resources and running IO
        - Attach back the volume to the node
        - Restart the node so the volume will get re-mounted

        """
        # Requesting 1 worker node for the test as this case includes detach and
        # attach of data volume of 1 worker node
        worker = node.get_typed_nodes(num_of_nodes=1)
        assert worker, "Failed to find a worker node for the test"
        worker = worker[0]

        # Get the worker node's ec2 instance ID and name
        instance = aws.get_instances_ids_and_names([worker])
        assert instance, f"Failed to get ec2 instances for node {worker.name}"

        instance_id = [*instance][0]

        # Get the ec2 instance data volume Volume instance
        ec2_volume = aws.get_data_volumes(instance_id)[0]

        # Detach volume (logging is done inside the function)
        aws_obj.detach_volume(ec2_volume)

        # Validate cluster is still functional
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Attach volume (logging is done inside the function)
        aws_obj.attach_volume(ec2_volume, instance_id)

        # Restart the instance so the volume will get re-mounted
        aws_obj.restart_ec2_instances(instances=instance, wait=True)

        # Cluster health check
        self.sanity_helpers.health_check()

    @pytest.mark.polarion_id("OCS-1086")
    def test_detach_attach_2_workers_volumes(self, aws_obj, pvc_factory, pod_factory):
        """
        Detach and attach disk from 2 worker nodes

        - Detach the data volume from 2 of the worker nodes
        - Attach back the volume to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Requesting 2 worker nodes for the test as this case includes
        # detach and attach of data volumes of 2 worker nodes
        workers = node.get_typed_nodes(num_of_nodes=2)
        assert workers, "Failed to find worker nodes for the test"

        # Get the worker nodes ec2 instance IDs and names
        instances = aws.get_instances_ids_and_names(workers)
        assert instances, (
            f"Failed to get ec2 instances for node {[w.name for w in workers]}"
        )

        for instance_id in instances:

            # Get the data volume object of the ec2 instance
            ec2_volume = aws.get_data_volumes(instance_id)[0]

            # Detach volume (logging is done inside the function)
            aws_obj.detach_volume(ec2_volume)

            # Attach volume (logging is done inside the function)
            aws_obj.attach_volume(ec2_volume, instance_id)

        # Restart the instances so the volumes will get re-mounted
        aws_obj.restart_ec2_instances(instances=instances, wait=True)

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
Example #25
0
class TestWhenOneOfThePrometheusNodeDown(E2ETest):
    """
    When one of the nodes hosting a Prometheus pod goes down, there should be
    no functional impact on the monitoring pods. All data/metrics should still
    be collected correctly.
    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @workloads
    def test_monitoring_when_one_of_the_prometheus_node_down(
            self, test_fixture):
        """
        Test case to validate that when a node hosting a Prometheus pod goes
        down, interaction with Prometheus keeps working
        """
        namespace_list, pvc_objs, pod_objs, sc = test_fixture

        aws_obj = aws.AWS()

        # Get all the openshift-monitoring pods
        monitoring_pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE)

        # Get the worker node list
        workers = get_typed_nodes(node_type='worker')

        # Get all prometheus pods
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:

            # Get the node where the prometheus pod is hosted
            prometheus_pod_obj = pod_obj.get()
            prometheus_node = prometheus_pod_obj['spec']['nodeName']

            prometheus_node_objs = [
                node for node in workers
                if node.get().get('metadata').get('name') == prometheus_node
            ]

            # Forcefully restart the node where the prometheus pod is hosted
            instances = aws.get_instances_ids_and_names(prometheus_node_objs)
            aws_obj.restart_ec2_instances(instances=instances,
                                          wait=True,
                                          force=True)

            # Validate all nodes are in READY state
            wait_for_nodes_status()

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check all the monitoring pods are up
        for pod_obj in monitoring_pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING)

        # Check for the created PVC metrics after restarting nodes
        for pvc_obj in pvc_objs:
            assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
                f"Data for created PVC {pvc_obj.name} is not collected on Prometheus"
            )

        # Create projects after restarting nodes
        namespaces = helpers.create_multilpe_projects(number_of_project=1)
        namespace_list.extend(namespaces)

        # Create pvcs after restarting nodes
        pvcs = [
            helpers.create_pvc(sc_name=sc.name,
                               namespace=each_namespace.namespace)
            for each_namespace in namespaces
        ]
        for pvc_obj in pvcs:
            helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND)
            pvc_obj.reload()
        pvc_objs.extend(pvcs)

        # Create app pods after restarting nodes
        pods = [
            helpers.create_pod(interface_type=constants.CEPHBLOCKPOOL,
                               pvc_name=each_pvc.name,
                               namespace=each_pvc.namespace)
            for each_pvc in pvcs
        ]
        for pod_obj in pods:
            helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
            pod_obj.reload()
        pod_objs.extend(pods)

        # Check for the created PVC metrics on the Prometheus pod after restarting nodes
        for pvc_obj in pvcs:
            assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
                f"Data for created PVC {pvc_obj.name} is not collected on Prometheus"
            )
class TestAMQNodeReboot(E2ETest):
    """
    Test cases to reboot, or shut down and recover, a node
    while an AMQ workload is running

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in NotReady status, covering situations
        in which the test failed before completing

        """
        def finalizer():

            # Validate all nodes are in READY state
            not_ready_nodes = [
                n for n in get_node_objs() if n.ocp.get_resource_status(n.name)
                == constants.NODE_NOT_READY
            ]
            if not_ready_nodes:
                log.warning(
                    f"Nodes found in NotReady status: "
                    f"{[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes_by_stop_and_start(not_ready_nodes)
                wait_for_nodes_status()

            log.info("All nodes are in Ready status")

        request.addfinalizer(finalizer)

    @pytest.fixture()
    def amq_setup(self, amq_factory_fixture):
        """
        Creates an AMQ cluster and runs benchmarks
        """
        sc_name = default_storage_class(interface_type=constants.CEPHBLOCKPOOL)
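        # (amq_factory_fixture deploys the AMQ cluster and returns the AMQ
        # object along with the benchmark threads checked at the end of each test)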
        self.amq, self.threads = amq_factory_fixture(sc_name=sc_name.name)

    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'],
                         marks=pytest.mark.polarion_id("OCS-1282")),
            pytest.param(*['master'],
                         marks=pytest.mark.polarion_id("OCS-1281"))
        ])
    def test_amq_after_rebooting_node(self, node_type, nodes, amq_setup):
        """
        Test case to validate that rebooting a node shouldn't affect
        AMQ workloads running in the background

        """
        # Get all amq pods
        pod_obj_list = get_all_pods(namespace=constants.AMQ_NAMESPACE)

        # Get one node of the requested type
        node = get_typed_nodes(node_type, num_of_nodes=1)

        # Reboot one node of the given type
        nodes.restart_nodes(node, wait=False)

        # Wait some time after rebooting the node
        waiting_time = 40
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)

        # Validate all nodes and services are in READY state and up
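        # (retry() is a decorator factory: wrap the callable first, then call
        # the wrapped function so the retries actually apply)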
        retry((CommandFailed, TimeoutError, AssertionError,
               ResourceWrongStatusException),
              tries=60,
              delay=15)(ocp.wait_for_cluster_connectivity)(tries=400)
        retry((CommandFailed, TimeoutError, AssertionError,
               ResourceWrongStatusException),
              tries=60,
              delay=15)(wait_for_nodes_status)(timeout=1800)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check all amq pods are up and running
        assert POD.wait_for_resource(condition='Running',
                                     resource_count=len(pod_obj_list),
                                     timeout=300)

        # Validate the results
        log.info("Validate message run completely")
        for thread in self.threads:
            thread.result(timeout=1800)

    @pytest.mark.polarion_id("OCS-1278")
    def test_amq_after_shutdown_and_recovery_worker_node(
            self, nodes, amq_setup):
        """
        Test case to validate that shutting down and recovering a worker node
        shouldn't affect AMQ workloads running in the background

        """
        # Get all amq pods
        pod_obj_list = get_all_pods(namespace=constants.AMQ_NAMESPACE)

        # Get one worker node
        node = get_typed_nodes(node_type='worker', num_of_nodes=1)

        # Stop one worker node
        nodes.stop_nodes(nodes=node)

        waiting_time = 20
        log.info(f"Waiting for {waiting_time} seconds")
        time.sleep(waiting_time)

        nodes.start_nodes(nodes=node)

        # Validate all nodes are in READY state and up
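        # (wrap the callable with retry() before invoking it, as above)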
        retry((CommandFailed, TimeoutError, AssertionError,
               ResourceWrongStatusException),
              tries=30,
              delay=15)(wait_for_nodes_status)(timeout=1800)

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check all amq pods are up and running
        assert POD.wait_for_resource(condition='Running',
                                     resource_count=len(pod_obj_list),
                                     timeout=300)

        # Validate the results
        log.info("Validate message run completely")
        for thread in self.threads:
            thread.result(timeout=1800)
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes - Reactive
    """
    threads = []

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")
            for thread in self.threads:
                thread.join()
            ceph_health_check()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(*['rbd', 'shutdown'],
                         marks=[
                             pytest.mark.polarion_id("OCS-2102"),
                             pytest.mark.bugzilla("1830015")
                         ]),
            pytest.param(*['rbd', 'terminate'],
                         marks=pytest.mark.polarion_id("OCS-2103")),
            pytest.param(*['cephfs', 'shutdown'],
                         marks=[
                             pytest.mark.polarion_id("OCS-2104"),
                             pytest.mark.bugzilla("1830015")
                         ]),
            pytest.param(*['cephfs', 'terminate'],
                         marks=pytest.mark.polarion_id("OCS-2105")),
        ])
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
            self, nodes, pvc_factory, pod_factory, failure, dc_pod_factory,
            interface):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key='dc',
                          label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for i in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={'dc': 'fedora'})
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get the nodes running both an OSD and an app pod
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pods are running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)
        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: "
                     f"{failure_node_obj[0].name}")
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully terminated node : "
                     f"{failure_node_obj[0].name} instance")

        try:
            # DC app pods on the failed node will get automatically created on
            # another running node. Wait for all DC app pods to reach the
            # Running state
            pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                            timeout=720)
            log.info("All the dc pods reached running state")
            pod.wait_for_storage_pods()

        except ResourceWrongStatusException:
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(f"Successfully terminated node : "
                         f"{failure_node_obj[0].name} instance")
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Example #28
0
class TestNodesRestart(ManageTest):
    """
    Test nodes restart and PV provisioning under degraded state
    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Make sure all nodes are up again

        """
        def finalizer():
            nodes.restart_nodes_by_stop_and_start_teardown()

        request.addfinalizer(finalizer)

    @pytest.mark.parametrize(
        argnames=["force"],
        argvalues=[
            pytest.param(*[True], marks=pytest.mark.polarion_id("OCS-894")),
            pytest.param(*[False],
                         marks=[
                             pytest.mark.polarion_id("OCS-895"),
                             aws_platform_required
                         ])
        ])
    def test_nodes_restart(self, nodes, pvc_factory, pod_factory, force):
        """
        Test nodes restart (from the platform layer, i.e., EC2 instances, VMware VMs)

        """
        ocp_nodes = get_node_objs()
        nodes.restart_nodes_by_stop_and_start(nodes=ocp_nodes, force=force)
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @bugzilla('1754287')
    @pytest.mark.polarion_id("OCS-2015")
    def test_rolling_nodes_restart(self, nodes, pvc_factory, pod_factory):
        """
        Test restart nodes one after the other and check health status in between

        """
        ocp_nodes = get_node_objs()
        for node in ocp_nodes:
            nodes.restart_nodes(nodes=[node], wait=False)
            self.sanity_helpers.health_check(cluster_check=False, tries=60)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @pytest.mark.parametrize(
        argnames=["interface", "operation"],
        argvalues=[
            pytest.param(*['rbd', 'create_resources'],
                         marks=pytest.mark.polarion_id("OCS-1138")),
            pytest.param(*['rbd', 'delete_resources'],
                         marks=pytest.mark.polarion_id("OCS-1241")),
            pytest.param(*['cephfs', 'create_resources'],
                         marks=pytest.mark.polarion_id("OCS-1139")),
            pytest.param(*['cephfs', 'delete_resources'],
                         marks=pytest.mark.polarion_id("OCS-1242"))
        ])
    def test_pv_provisioning_under_degraded_state_stop_provisioner_pod_node(
            self, nodes, pvc_factory, pod_factory, interface, operation):
        """
        Test PV provisioning under degraded state -
        stop the node that the provisioner pod is running on

        OCS-1138:
        - Stop 1 worker node that has the RBD provisioner
          pod running on it
        - Wait for the RBD provisioner pod to come up again to Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1241:
        - Stop 1 worker node that has the RBD provisioner
          pod running on it
        - Wait for the RBD provisioner pod to come up again to Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1139:
        - Stop 1 worker node that has the CephFS provisioner
          pod running on it
        - Wait for the CephFS provisioner pod to come up again to Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1242:
        - Stop 1 worker node that has the CephFS provisioner
          pod running on it
        - Wait for the CephFS provisioner pod to come up again to Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        """
        if operation == 'delete_resources':
            # Create resources whose deletion will be tested later
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        provisioner_pods = None
        # Get the provisioner pod according to the interface
        if interface == 'rbd':
            provisioner_pods = pod.get_rbdfsplugin_provisioner_pods()
        elif interface == 'cephfs':
            provisioner_pods = pod.get_cephfsplugin_provisioner_pods()
        provisioner_pod = provisioner_pods[0]

        # Make sure the selected provisioner pod is not running on the same
        # node as the rook operator pod:
        provisioner_node = pod.get_pod_node(provisioner_pod)
        rook_operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(rook_operator_pod)
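        # If they share a node, fall back to the other provisioner pod so that
        # stopping its node does not also take down the rook operator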
        if operator_node.get().get('metadata').get(
                'name') == provisioner_node.get().get('metadata').get('name'):
            provisioner_pod = provisioner_pods[1]

        provisioner_pod_name = provisioner_pod.name
        logger.info(
            f"{interface} provisioner pod found: {provisioner_pod_name}")

        # Get the node name that has the provisioner pod running on
        provisioner_node = pod.get_pod_node(provisioner_pod)
        provisioner_node_name = provisioner_node.get().get('metadata').get(
            'name')
        logger.info(
            f"{interface} provisioner pod is running on node {provisioner_node_name}"
        )

        # Stopping the node
        nodes.stop_nodes(nodes=[provisioner_node])

        # Wait for the provisioner pod to get to running status
        selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL if (
            interface
            == 'rbd') else constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL

        # Wait for the provisioner pod to reach Terminating status
        logger.info(
            f"Waiting for pod {provisioner_pod_name} to reach status Terminating"
        )
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=provisioner_pod.name,
            condition=constants.STATUS_TERMINATING
        ), f"{interface} provisioner pod failed to reach status Terminating"
        logger.info(
            f"Pod {provisioner_pod_name} has reached status Terminating")

        # Wait for the provisioner pod to be started and reach running status
        logger.info(
            f"Waiting for {interface} provisioner pod to reach status Running")
        # After this change https://github.com/rook/rook/pull/3642/, there are
        # 2 provisioners for each interface
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=2
        ), f"{interface} provisioner pod failed to reach status Running"

        logger.info(f"{interface} provisioner pod has reached status Running")
        if operation == 'create_resources':
            # Cluster validation (resources creation and IO running)
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        elif operation == 'delete_resources':
            # Cluster validation (resources deletion)
            self.sanity_helpers.delete_resources()

        # Starting the node
        nodes.start_nodes(nodes=[provisioner_node])

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()

    @pytest.mark.parametrize(
        argnames=["operation"],
        argvalues=[
            pytest.param(*['create_resources'],
                         marks=[pytest.mark.polarion_id("OCS-2016")]),
            pytest.param(*['delete_resources'],
                         marks=[pytest.mark.polarion_id("OCS-2017")]),
        ])
    def test_pv_provisioning_under_degraded_state_stop_rook_operator_pod_node(
            self, nodes, pvc_factory, pod_factory, operation):
        """
        Test PV provisioning under degraded state -
        stop the node that the rook operator pod is running on

        OCS-2016:
        - Stop 1 worker node that has the rook ceph operator pod running on it
        - Wait for the rook ceph operator pod to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-2017:
        - Stop 1 worker node that has the rook ceph operator pod running on it
        - Wait for the rook ceph operator pod to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources
        - Start the worker node
        - Check cluster and Ceph health
        """
        if operation == 'delete_resources':
            # Create resources whose deletion will be tested later
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        rook_operator_pods = pod.get_operator_pods()
        rook_operator_pod = rook_operator_pods[0]

        rook_operator_pod_name = rook_operator_pod.name
        logger.info(f"rook operator pod found: {rook_operator_pod_name}")

        # Get the node name that has the rook operator pod running on
        operator_node = pod.get_pod_node(rook_operator_pod)
        operator_node_name = operator_node.get().get('metadata').get('name')
        logger.info(
            f"{rook_operator_pod_name} pod is running on node {operator_node_name}"
        )

        # Stopping the node
        nodes.stop_nodes(nodes=[operator_node])

        # Wait for the rook operator pod to get to running status
        selector = constants.OPERATOR_LABEL

        # Wait for the rook operator pod to reach Terminating status
        logger.info(
            f"Waiting for pod {rook_operator_pod_name} to reach status Terminating"
        )
        assert rook_operator_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=rook_operator_pod_name,
            condition=constants.STATUS_TERMINATING
        ), "rook operator pod failed to reach status Terminating"
        logger.info(
            f"Pod {rook_operator_pod_name} has reached status Terminating")

        # Wait for the rook operator pod to be started and reach running status
        logger.info(
            f"Waiting for pod {rook_operator_pod_name} to reach status Running"
        )

        assert rook_operator_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=1
        ), "rook operator pod failed to reach status Running"
        logger.info("rook operator pod has reached status Running")

        assert wait_for_ct_pod_recovery(
        ), "Ceph tools pod failed to come up on another node"

        if operation == 'create_resources':
            # Cluster validation (resources creation and IO running)
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        elif operation == 'delete_resources':
            # Cluster validation (resources deletion)
            self.sanity_helpers.delete_resources()

        # Starting the node
        nodes.start_nodes(nodes=[operator_node])

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()
Example #29
0
class TestNodeReplacement(ManageTest):
    """
    Knip-894 Node replacement - AWS-IPI-Proactive

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive(self, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-894 Node Replacement proactive

        """

        # Get worker nodes
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        osd_pods_obj = pod.get_osd_pods()
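        # Pick a random OSD pod and use the node hosting it as the node to replace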
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        log.info("Creating dc pod backed with rbd pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                rbd_dc_pod = dc_pod_factory(interface=constants.CEPHBLOCKPOOL, node_name=worker_node, size=20)
                pod.run_io_in_bg(rbd_dc_pod, expect_to_fail=False, fedora_dc=True)

        log.info("Creating dc pod backed with cephfs pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                cephfs_dc_pod = dc_pod_factory(interface=constants.CEPHFILESYSTEM, node_name=worker_node, size=20)
                pod.run_io_in_bg(cephfs_dc_pod, expect_to_fail=False, fedora_dc=True)

        if config.ENV_DATA['platform'].lower() == constants.AWS_PLATFORM:
            if config.ENV_DATA['deployment_type'] == 'ipi':
                node.delete_and_create_osd_node_aws_ipi(osd_node_name)

            elif config.ENV_DATA['deployment_type'] == 'upi':
                node.delete_and_create_osd_node_aws_upi(osd_node_name)
            else:
                pytest.fail(
                    f"ocs-ci config 'deployment_type' value '{config.ENV_DATA['deployment_type']}' is not valid, "
                    f"results of this test run are all invalid.")

        elif config.ENV_DATA['platform'].lower() == constants.VSPHERE_PLATFORM:
            pytest.skip("Skipping add node in Vmware platform due to "
                        "https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
                        )

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()
        # Verify everything running fine
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=30)
class TestPgSQLNodeReboot(E2ETest):
    """
    Test running PGSQL workload while a node is drained
    """
    @pytest.fixture()
    def pgsql_setup(self, pgsql):
        """
        PGSQL test setup
        """
        # Deployment of postgres database
        pgsql.setup_postgresql(replicas=3)

        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

    @pytest.mark.usefixtures(pgsql_setup.__name__)
    def test_run_pgsql_node_drain(self,
                                  pgsql,
                                  transactions=900,
                                  node_type='master'):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=3,
                                       transactions=transactions,
                                       clients=3)

        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type='worker',
                                                   print_table=True)

        # Node drain with specific node type
        typed_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
        typed_node_name = typed_nodes[0].name

        # Node maintenance - to gracefully terminate all pods on the node
        node.drain_nodes([typed_node_name])

        # Make the node schedulable again
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached to completed state after {diff_time.seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)