class TestNodeReplacementWithIO(ManageTest):
    """
    Knip-894: proactive node replacement while IO runs in the background.
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Attach a fresh Sanity helper to the test instance.
        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive_with_io_running(
        self, pvc_factory, pod_factory, dc_pod_factory
    ):
        """
        Knip-894 Node Replacement proactive when IO running in the background

        Starts background IO from DC pods (RBD first, then CephFS) on every
        worker node except the selected OSD node, replaces that OSD node,
        and finishes with the sanity create/delete cycle and a health check.
        """
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        osd_node_name = select_osd_node_name()

        # One pass per storage interface; the order (RBD, then CephFS) matters
        # only for the sequence in which pods are created and logged.
        workloads = (
            (
                "Creating dc pod backed with rbd pvc and running io in bg",
                constants.CEPHBLOCKPOOL,
            ),
            (
                "Creating dc pod backed with cephfs pvc and running io in bg",
                constants.CEPHFILESYSTEM,
            ),
        )
        for banner, storage_interface in workloads:
            log.info(banner)
            for worker_node in worker_node_list:
                # The OSD node is about to be replaced, keep workloads off it
                if worker_node == osd_node_name:
                    continue
                dc_pod = dc_pod_factory(
                    interface=storage_interface,
                    node_name=worker_node,
                    size=20,
                )
                pod.run_io_in_bg(dc_pod, expect_to_fail=False, fedora_dc=True)

        delete_and_create_osd_node(osd_node_name)

        # Exercise the cluster: create resources, then remove them again
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Final health verification
        log.info(
            "Verifying All resources are Running and matches expected result"
        )
        self.sanity_helpers.health_check(tries=120)
def test_deployment(pvc_factory, pod_factory):
    """
    Verify the deployed cluster and run a basic sanity cycle on it.

    Unless the run is teardown-only, asserts that the OCP cluster is running.
    When OCS deployment is not skipped, it additionally verifies the OCS
    installation, optionally raises the NooBaa endpoint count, and runs the
    Sanity health check plus a create/delete resources cycle.

    Args:
        pvc_factory: Factory passed through to ``Sanity.create_resources``.
        pod_factory: Factory passed through to ``Sanity.create_resources``.

    """
    cli_params = config.RUN['cli_params']
    deploy = cli_params.get('deploy')
    teardown = cli_params.get('teardown')

    if not teardown or deploy:
        log.info("Verifying OCP cluster is running")
        assert is_cluster_running(config.ENV_DATA['cluster_path'])
        if not config.ENV_DATA['skip_ocs_deployment']:
            ocs_registry_image = config.DEPLOYMENT.get('ocs_registry_image')
            ocs_install_verification(ocs_registry_image=ocs_registry_image)

            # 'noobaa_endpoints' is read with .get(), so it may be None when
            # the key is not configured; comparing None > 1 raises TypeError
            # on Python 3, hence the explicit None guard.
            nb_eps = config.DEPLOYMENT.get('noobaa_endpoints')
            if nb_eps is not None and nb_eps > 1:
                change_noobaa_endpoints_count(nb_eps)

            # Check basic cluster functionality by creating resources
            # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
            # run IO and delete the resources
            sanity_helpers = Sanity()
            sanity_helpers.health_check()
            sanity_helpers.create_resources(pvc_factory, pod_factory)
            sanity_helpers.delete_resources()

    if teardown:
        log.info(
            "Cluster will be destroyed during teardown part of this test."
        )
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678: automated recovery from failed nodes.
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Attach a fresh Sanity helper to the test instance.
        """
        self.sanity_helpers = Sanity()

    def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self, pvc_factory, pod_factory
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI

        Picks the node of a randomly chosen OSD pod, adds a new labeled node
        through that node's machineset, deletes the original machine, waits
        for the pods returned by ``pod.get_all_pods(wait=True)`` to be
        Running, then runs the sanity create/delete cycle and a health check.
        """
        # Node hosting a randomly chosen OSD pod
        chosen_osd_pod = random.choice(pod.get_osd_pods())
        osd_node_name = pod.get_pod_node(chosen_osd_pod).name
        log.info(f"Selected OSD is {osd_node_name}")

        # Resolve node -> machine -> machineset
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"{osd_node_name} associated machine is {machine_name}")
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(f"{osd_node_name} associated machineset is {machineset_name}")

        # Bring up the replacement before removing the old machine
        add_new_node_and_label_it(machineset_name)
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Every pod is expected to reach Running
        for current_pod in pod.get_all_pods(wait=True):
            wait_for_resource_state(
                resource=current_pod,
                state=constants.STATUS_RUNNING,
                timeout=200,
            )

        # Basic functionality check: create resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete them again
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestNodeReplacement(ManageTest):
    """
    Knip-894: proactive node replacement (AWS IPI/UPI).
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Attach a fresh Sanity helper to the test instance.
        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive(
        self, pvc_factory, pod_factory, dc_pod_factory
    ):
        """
        Knip-894 Node Replacement proactive

        Starts background IO from DC pods (RBD first, then CephFS) on every
        worker node except the node of a randomly chosen OSD pod, replaces
        that node using the AWS IPI or UPI helper, then runs the sanity
        create/delete cycle and a health check. The test is skipped on
        vSphere and fails on an unknown AWS deployment type.
        """
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        # Node hosting a randomly chosen OSD pod
        chosen_osd_pod = random.choice(pod.get_osd_pods())
        osd_node_name = pod.get_pod_node(chosen_osd_pod).name
        log.info(f"Selected OSD is {osd_node_name}")

        workloads = (
            (
                "Creating dc pod backed with rbd pvc and running io in bg",
                constants.CEPHBLOCKPOOL,
            ),
            (
                "Creating dc pod backed with cephfs pvc and running io in bg",
                constants.CEPHFILESYSTEM,
            ),
        )
        for banner, storage_interface in workloads:
            log.info(banner)
            for worker_node in worker_node_list:
                # Keep workloads off the node that is going to be replaced
                if worker_node == osd_node_name:
                    continue
                dc_pod = dc_pod_factory(
                    interface=storage_interface,
                    node_name=worker_node,
                    size=20,
                )
                pod.run_io_in_bg(dc_pod, expect_to_fail=False, fedora_dc=True)

        platform = config.ENV_DATA['platform'].lower()
        if platform == constants.AWS_PLATFORM:
            # The deployment type is only consulted on AWS
            deployment_type = config.ENV_DATA['deployment_type']
            if deployment_type == 'ipi':
                node.delete_and_create_osd_node_aws_ipi(osd_node_name)
            elif deployment_type == 'upi':
                node.delete_and_create_osd_node_aws_upi(osd_node_name)
            else:
                pytest.fail(
                    f"ocs-ci config 'deployment_type' value '{deployment_type}' is not valid, "
                    f"results of this test run are all invalid."
                )
        elif platform == constants.VSPHERE_PLATFORM:
            pytest.skip(
                "Skipping add node in Vmware platform due to "
                "https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
            )

        # Exercise the cluster: create resources, then remove them again
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Final health verification
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=30)
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678: automated recovery from failed nodes.
    """

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that strips the "dc" label from worker nodes.
        """
        def _remove_dc_labels():
            # The label is removed from all workers, not only the labeled ones
            remove_label_from_worker_node(get_worker_nodes(), label_key="dc")

        request.addfinalizer(_remove_dc_labels)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Attach a fresh Sanity helper to the test instance.
        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param('rbd', marks=pytest.mark.polarion_id("OCS-2100")),
            pytest.param('cephfs', marks=pytest.mark.polarion_id("OCS-2101")),
        ],
    )
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self, interface, pvc_factory, pod_factory, dc_pod_factory
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI

        Labels the OSD nodes, starts two DC app pods with IO on them, picks a
        node that runs both an OSD and an app pod, adds a node through its
        machineset and deletes its machine. Then waits for app and storage
        pods before the sanity create/delete cycle and a health check.
        """
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # The node selector below targets this label
        label_worker_node(
            osd_running_nodes, label_key='dc', label_value='fedora'
        )

        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        else:
            interface = constants.CEPHFILESYSTEM

        dc_pod_obj = []
        for _ in range(2):
            app_pod = dc_pod_factory(
                interface=interface, node_selector={'dc': 'fedora'}
            )
            pod.run_io_in_bg(app_pod, fedora_dc=True)
            dc_pod_obj.append(app_pod)

        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Nodes hosting both an OSD and an app pod
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        assert (len(common_nodes) > 0), (
            "Common OSD and app running node(s) NOT found"
        )
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # Resolve node -> machine -> machineset for the first common node
        target_node = common_nodes[0]
        machine_name = machine.get_machine_from_node_name(target_node)
        log.info(f"{target_node} associated machine is {machine_name}")
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(f"{target_node} associated machineset is {machineset_name}")

        # Bring up the replacement before removing the old machine
        add_new_node_and_label_it(machineset_name)
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods from the removed node get recreated on another running
        # node; wait until all of them are Running again
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        pod.wait_for_storage_pods()

        # Basic functionality check: create resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete them again
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes - Reactive
    """
    # Class-level list, therefore shared by all parametrized cases; the
    # teardown finalizer joins the collected IO threads and then empties it.
    threads = []

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that removes the "dc" label from all worker
        nodes, joins the background IO threads and runs a Ceph health check.
        """
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")
            for thread in self.threads:
                thread.join()
            # Drop joined threads so later cases do not re-join them and the
            # shared list does not grow across the parametrized run
            del self.threads[:]
            ceph_health_check()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(
                *['rbd', 'shutdown'],
                marks=[
                    pytest.mark.polarion_id("OCS-2102"),
                    pytest.mark.bugzilla("1830015")
                ]
            ),
            pytest.param(
                *['rbd', 'terminate'],
                marks=pytest.mark.polarion_id("OCS-2103")
            ),
            pytest.param(
                *['cephfs', 'shutdown'],
                marks=[
                    pytest.mark.polarion_id("OCS-2104"),
                    pytest.mark.bugzilla("1830015")
                ]
            ),
            pytest.param(
                *['cephfs', 'terminate'],
                marks=pytest.mark.polarion_id("OCS-2105")
            ),
        ]
    )
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
        self, nodes, pvc_factory, pod_factory, failure, dc_pod_factory,
        interface
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI

        Labels the OSD nodes, starts two DC app pods with IO on them, picks a
        node that runs both an OSD and an app pod, adds a node through its
        machineset and then powers off or terminates the picked node
        (``failure`` is "shutdown" or "terminate"). Afterwards it waits for
        app and storage pods, runs the sanity create/delete cycle and a
        health check.

        Raises:
            ResourceWrongStatusException: re-raised when the pods do not
                recover in time; for "shutdown" the node is terminated first.

        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(
            osd_running_nodes, label_key='dc', label_value='fedora'
        )

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(
                interface=interface, node_selector={'dc': 'fedora'}
            )
            # Keep what run_io_in_bg returns so teardown can join it
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        # Fail with a clear message instead of an IndexError on
        # common_nodes[0] when no node hosts both an OSD and an app pod
        assert common_nodes, "Common OSD and app running node(s) NOT found"
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}"
        )

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully powered off node: "
                f"{failure_node_obj[0].name}"
            )
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully terminated node : "
                f"{failure_node_obj[0].name} instance"
            )

        try:
            # DC app pods on the failed node will get automatically created
            # on other running node. Waiting for all dc app pod to reach
            # running state
            pod.wait_for_dc_app_pods_to_reach_running_state(
                dc_pod_obj, timeout=720
            )
            log.info("All the dc pods reached running state")
            pod.wait_for_storage_pods()
        except ResourceWrongStatusException:
            # A powered-off node is terminated before the failure is
            # propagated; the exception is always re-raised
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(
                    f"Successfully terminated node : "
                    f"{failure_node_obj[0].name} instance"
                )
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestNodesMaintenance(ManageTest):
    """
    Test basic flows of maintenance (unschedule and drain) and activate
    operations, followed by cluster functionality and health checks
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @tier1
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1269")),
            pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1272"))
        ]
    )
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health
        """
        # Get 1 node
        typed_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
        # Same guard as the sibling tests: fail clearly rather than with an
        # IndexError when no node of the requested type is returned
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain)
        node.drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier2
    @pytest.mark.skipif(
        condition=config.ENV_DATA['platform'] != 'AWS',
        reason="Tests are not running on AWS deployed cluster"
    )
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1292")),
            pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1293"))
        ]
    )
    def test_node_maintenance_restart_activate(
        self, ec2_instances, aws_obj, pvc_factory, pod_factory, node_type
    ):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node's ec2 instance
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        """
        # Get 1 node
        typed_node = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_node, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_node[0].name

        # Maintenance the node (unschedule and drain). The function contains logging
        node.drain_nodes([typed_node_name])

        instance = aws.get_instances_ids_and_names(typed_node)
        assert instance, f"Failed to get ec2 instances for node {typed_node_name}"

        # Restarting ec2 instance
        aws_obj.restart_ec2_instances(instances=instance, wait=True)

        # The node was drained before the restart, so it is expected to come
        # back Ready but still unschedulable
        node.wait_for_nodes_status(
            node_names=[typed_node_name],
            status=constants.NODE_READY_SCHEDULING_DISABLED
        )

        # Mark the node back to schedulable
        node.schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    @tier2
    @pytest.mark.parametrize(
        argnames=["nodes_type"],
        argvalues=[
            pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1273")),
            pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1271"))
        ]
    )
    def test_2_nodes_maintenance_same_type(
        self, pvc_factory, pod_factory, nodes_type
    ):
        """
        OCS-1273/OCS-1271:
        - Maintenance (mark as unschedulable and drain) 2 worker/master nodes
        - Mark the nodes as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        """
        # Get 2 nodes
        typed_nodes = node.get_typed_nodes(node_type=nodes_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {nodes_type} node for the test"

        typed_node_names = [typed_node.name for typed_node in typed_nodes]

        # Maintenance the nodes (unschedule and drain)
        node.drain_nodes(typed_node_names)

        # Mark the nodes back to schedulable
        node.schedule_nodes(typed_node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    @tier2
    @pytest.mark.polarion_id("OCS-1274")
    def test_2_nodes_different_types(self, pvc_factory, pod_factory):
        """
        OCS-1274:
        - Maintenance (mark as unschedulable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as schedulable
        - Check cluster and Ceph health
        """
        # Get 1 node from each type. The check happens per type and before
        # indexing, so a missing node type is reported by the assertion
        # instead of surfacing as an IndexError.
        nodes = []
        for node_type in ['worker', 'master']:
            typed_nodes = node.get_typed_nodes(
                node_type=node_type, num_of_nodes=1
            )
            assert typed_nodes, (
                f"Failed to find a {node_type} node for the test"
            )
            nodes.append(typed_nodes[0])

        node_names = [typed_node.name for typed_node in nodes]

        # Maintenance the nodes (unschedule and drain)
        node.drain_nodes(node_names)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the nodes back to schedulable
        node.schedule_nodes(node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes
    """

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that removes the "dc" label from all worker
        nodes.
        """
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(*['rbd'], marks=pytest.mark.polarion_id("OCS-2100")),
            pytest.param(*['cephfs'], marks=pytest.mark.polarion_id("OCS-2101")),
        ]
    )
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self, interface, pvc_factory, pod_factory, dc_pod_factory
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI

        Labels the OSD nodes, starts two DC app pods with IO on them, picks a
        node that runs both an OSD and an app pod, adds a node through its
        machineset and deletes its machine. Then waits for the app pods and
        for the pods in the rook cluster namespace to be Running, applying
        the crashcollector workaround for BZ 1810014, before the sanity
        create/delete cycle and a health check.
        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(
            osd_running_nodes, label_key='dc', label_value='fedora'
        )

        # Create DC app pods
        log.info("Creating DC based app pods")
        interface = (
            constants.CEPHBLOCKPOOL if interface == 'rbd'
            else constants.CEPHFILESYSTEM
        )
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(
                interface=interface, node_selector={'dc': 'fedora'}
            )
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        msg = "Common OSD and app running node(s) NOT found"
        assert (len(common_nodes) > 0), msg
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}"
        )

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods on the failed node will get automatically created on
        # other running node. Waiting for all dc app pod to reach running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        # Check all OCS pods status, they should be in running state.
        # Pods whose name contains one of these markers are skipped.
        # NOTE: the previous condition
        #   ('-1-deploy' or 'ocs-deviceset') not in pod_obj.name
        # only ever tested '-1-deploy', because `or` returns its first
        # truthy operand; each marker is now tested on its own.
        skipped_name_markers = ('-1-deploy', 'ocs-deviceset')
        all_pod_obj = pod.get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE
        )
        for pod_obj in all_pod_obj:
            if any(marker in pod_obj.name for marker in skipped_name_markers):
                continue
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj, state=constants.STATUS_RUNNING,
                    timeout=200
                )
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node stucks at
                # pending state. BZ 1810014 tracks it.
                # Ignoring 'rook-ceph-crashcollector' pod health check as
                # WA and deleting its deployment so that the pod
                # disappears. Will revert this WA once the BZ is fixed
                if 'rook-ceph-crashcollector' not in pod_obj.name:
                    raise
                ocp_obj = ocp.OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                pod_name = pod_obj.name
                # Deployment name = pod name without its last two
                # dash-separated segments
                deployment_name = '-'.join(pod_name.split("-")[:-2])
                command = f"delete deployment {deployment_name}"
                ocp_obj.exec_oc_cmd(command=command)
                log.info(f"Deleted deployment for pod {pod_obj.name}")

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestNodesRestart(ManageTest):
    """
    Test ungraceful cluster shutdown: node restarts and node stops, each
    followed by cluster validation.
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Attach a fresh Sanity helper to the test instance.
        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Register a finalizer that makes sure all nodes are up again.
        """
        def _bring_nodes_back():
            nodes.restart_nodes_by_stop_and_start_teardown()

        request.addfinalizer(_bring_nodes_back)

    @pytest.mark.parametrize(
        argnames=["force"],
        argvalues=[
            pytest.param(True, marks=pytest.mark.polarion_id("OCS-894")),
            pytest.param(
                False,
                marks=[
                    pytest.mark.polarion_id("OCS-895"),
                    aws_platform_required,
                ],
            ),
        ],
    )
    def test_nodes_restart(self, nodes, pvc_factory, pod_factory, force):
        """
        Test nodes restart (from the platform layer, i.e, EC2 instances,
        VMWare VMs), then check health and create resources.
        """
        all_nodes = get_node_objs()
        nodes.restart_nodes_by_stop_and_start(nodes=all_nodes, force=force)

        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @bugzilla('1754287')
    @pytest.mark.polarion_id("OCS-2015")
    def test_rolling_nodes_restart(self, nodes, pvc_factory, pod_factory):
        """
        Restart the nodes one after the other, checking health status after
        each restart, and create resources at the end.
        """
        for ocp_node in get_node_objs():
            nodes.restart_nodes(nodes=[ocp_node], wait=False)
            self.sanity_helpers.health_check(cluster_check=False, tries=60)

        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @pytest.mark.parametrize(
        argnames=["interface", "operation"],
        argvalues=[
            pytest.param(
                'rbd', 'create_resources',
                marks=pytest.mark.polarion_id("OCS-1138"),
            ),
            pytest.param(
                'rbd', 'delete_resources',
                marks=pytest.mark.polarion_id("OCS-1241"),
            ),
            pytest.param(
                'cephfs', 'create_resources',
                marks=pytest.mark.polarion_id("OCS-1139"),
            ),
            pytest.param(
                'cephfs', 'delete_resources',
                marks=pytest.mark.polarion_id("OCS-1242"),
            ),
        ],
    )
    def test_pv_provisioning_under_degraded_state_stop_provisioner_pod_node(
        self, nodes, pvc_factory, pod_factory, interface, operation
    ):
        """
        Test PV provisioning under degraded state - stop the node that has
        the provisioner pod running on

        Common flow (RBD for OCS-1138/OCS-1241, CephFS for
        OCS-1139/OCS-1242):
        - Stop 1 worker node that has the provisioner pod running on
        - Wait for the provisioner pod to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by creating resources and running IO
          (OCS-1138/OCS-1139) or by deleting resources
          (OCS-1241/OCS-1242)
        - Start the worker node
        - Check cluster and Ceph health
        """
        # For the deletion flavor, the resources to delete must exist first
        if operation == 'delete_resources':
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Provisioner pods of the requested interface
        if interface == 'rbd':
            provisioner_pods = pod.get_rbdfsplugin_provisioner_pods()
        elif interface == 'cephfs':
            provisioner_pods = pod.get_cephfsplugin_provisioner_pods()
        else:
            provisioner_pods = None
        provisioner_pod = provisioner_pods[0]

        # Avoid the node that runs the rook operator pod: when the first
        # provisioner shares its node with the operator, use the second one
        provisioner_node = pod.get_pod_node(provisioner_pod)
        rook_operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(rook_operator_pod)
        operator_node_name = operator_node.get().get('metadata').get('name')
        first_provisioner_node_name = provisioner_node.get().get(
            'metadata'
        ).get('name')
        if operator_node_name == first_provisioner_node_name:
            provisioner_pod = provisioner_pods[1]

        provisioner_pod_name = provisioner_pod.name
        logger.info(
            f"{interface} provisioner pod found: {provisioner_pod_name}"
        )

        # Node of the provisioner pod that was finally chosen
        provisioner_node = pod.get_pod_node(provisioner_pod)
        provisioner_node_name = provisioner_node.get().get('metadata').get(
            'name'
        )
        logger.info(
            f"{interface} provisioner pod is running on node {provisioner_node_name}"
        )

        # Stop that node
        nodes.stop_nodes(nodes=[provisioner_node])

        # Label selector used below to wait for the Running provisioners
        if interface == 'rbd':
            selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        else:
            selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL

        # The pod on the stopped node is expected to go Terminating first
        logger.info(
            f"Waiting for pod {provisioner_pod_name} to reach status Terminating"
        )
        reached_terminating = provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=provisioner_pod.name,
            condition=constants.STATUS_TERMINATING,
        )
        assert reached_terminating, (
            f"{interface} provisioner pod failed to reach status Terminating"
        )
        logger.info(
            f"Pod {provisioner_pod_name} has reached status Terminating"
        )

        # Then the provisioners must be Running again.
        # After this change https://github.com/rook/rook/pull/3642/, there are
        # 2 provisioners for each interface
        logger.info(
            f"Waiting for {interface} provisioner pod to reach status Running"
        )
        reached_running = provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=2,
        )
        assert reached_running, (
            f"{interface} provisioner pod failed to reach status Running"
        )
        logger.info(f"{interface} provisioner pod has reached status Running")

        # Cluster validation while the node is still down
        if operation == 'create_resources':
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        elif operation == 'delete_resources':
            self.sanity_helpers.delete_resources()

        # Start the node again and check cluster and Ceph health
        nodes.start_nodes(nodes=[provisioner_node])
        self.sanity_helpers.health_check()

    @pytest.mark.parametrize(
        argnames=["operation"],
        argvalues=[
            pytest.param(
                'create_resources',
                marks=[pytest.mark.polarion_id("OCS-2016")],
            ),
            pytest.param(
                'delete_resources',
                marks=[pytest.mark.polarion_id("OCS-2017")],
            ),
        ],
    )
    def test_pv_provisioning_under_degraded_state_stop_rook_operator_pod_node(
        self, nodes, pvc_factory, pod_factory, operation
    ):
        """
        Test PV provisioning under degraded state - stop the node that has
        the rook operator pod running on

        OCS-2016 / OCS-2017:
        - Stop 1 worker node that has the rook ceph operator pod running on
        - Wait for the rook ceph operator pod to come up again to running
          status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by creating resources and running IO (OCS-2016) or by
          deleting resources (OCS-2017)
        - Start the worker node
        - Check cluster and Ceph health
        """
        # For the deletion flavor, the resources to delete must exist first
        if operation == 'delete_resources':
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        rook_operator_pod = pod.get_operator_pods()[0]
        rook_operator_pod_name = rook_operator_pod.name
        logger.info(f"rook operator pod found: {rook_operator_pod_name}")

        # Node that currently hosts the rook operator pod
        operator_node = pod.get_pod_node(rook_operator_pod)
        operator_node_name = operator_node.get().get('metadata').get('name')
        logger.info(
            f"{rook_operator_pod_name} pod is running on node {operator_node_name}"
        )

        # Stop that node
        nodes.stop_nodes(nodes=[operator_node])

        # The pod on the stopped node is expected to go Terminating first
        logger.info(
            f"Waiting for pod {rook_operator_pod_name} to reach status Terminating"
        )
        reached_terminating = rook_operator_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=rook_operator_pod_name,
            condition=constants.STATUS_TERMINATING,
        )
        assert reached_terminating, (
            "rook operator pod failed to reach status Terminating"
        )
        logger.info(
            f"Pod {rook_operator_pod_name} has reached status Terminating"
        )

        # Then one operator pod must be Running again
        logger.info(
            f"Waiting for pod {rook_operator_pod_name} to reach status Running"
        )
        reached_running = rook_operator_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=constants.OPERATOR_LABEL,
            resource_count=1,
        )
        assert reached_running, (
            "rook operator pod failed to reach status Running"
        )
        logger.info("rook operator pod has reached status Running")

        tools_pod_recovered = wait_for_ct_pod_recovery()
        assert tools_pod_recovered, (
            "Ceph tools pod failed to come up on another node"
        )

        # Cluster validation while the node is still down
        if operation == 'create_resources':
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        elif operation == 'delete_resources':
            self.sanity_helpers.delete_resources()

        # Start the node again and check cluster and Ceph health
        nodes.start_nodes(nodes=[operator_node])
        self.sanity_helpers.health_check()
class TestNodeReplacement(ManageTest):
    """
    Knip-894 Node replacement - AWS-IPI-Proactive

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive(
        self, pvc_factory, pod_factory, dc_pod_factory
    ):
        """
        Knip-894 Node Replacement proactive

        Steps:
            - Pick the node of a randomly chosen OSD pod
            - Start background IO from RBD and CephFS backed DC pods on every
              other worker node
            - Unschedule and drain the selected node, then delete its machine
            - Wait for the replacement machine/node and label the new node
            - Create and delete resources, then run the health check

        """
        # Get worker nodes
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        log.info("Creating dc pod backed with rbd pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                rbd_dc_pod = dc_pod_factory(
                    interface=constants.CEPHBLOCKPOOL,
                    node_name=worker_node,
                    size=20
                )
                pod.run_io_in_bg(
                    rbd_dc_pod, expect_to_fail=False, fedora_dc=True
                )

        log.info("Creating dc pod backed with cephfs pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                cephfs_dc_pod = dc_pod_factory(
                    interface=constants.CEPHFILESYSTEM,
                    node_name=worker_node,
                    size=20
                )
                pod.run_io_in_bg(
                    cephfs_dc_pod, expect_to_fail=False, fedora_dc=True
                )

        # Unscheduling node
        node.unschedule_nodes([osd_node_name])
        # Draining Node
        node.drain_nodes([osd_node_name])

        log.info("Getting machine name from specified node name")
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"Node {osd_node_name} associated machine is {machine_name}")
        log.info(
            f"Deleting machine {machine_name} and waiting for new machine to come up"
        )
        machine.delete_machine_and_check_state_of_new_spinned_machine(
            machine_name
        )

        # Find the replacement machine: it shares the deleted machine's name
        # up to the trailing 6-character random suffix.
        # eg:- machine_name:- prsurve-40-ocs-43-kbrvf-worker-us-east-2b-nlgkr
        # After trimming:- prsurve-40-ocs-43-kbrvf-worker-us-east-2b
        # A literal prefix comparison is used so that characters in the name
        # are never interpreted as regex syntax. As before, the last matching
        # machine in the list wins.
        new_machine_name = None
        for candidate in machine.get_machines():
            if machine_name.startswith(candidate.name[:-6]):
                new_machine_name = candidate.name
        assert new_machine_name, (
            f"Failed to find a new machine replacing {machine_name}"
        )

        machineset_name = machine.get_machineset_from_machine_name(
            new_machine_name
        )
        log.info("Waiting for new worker node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)
        new_node_name = node.get_node_from_machine_name(new_machine_name)

        log.info("Adding ocs label to newly created worker node")
        node_obj = ocp.OCP(kind='node')
        node_obj.add_label(
            resource_name=new_node_name,
            label=constants.OPERATOR_NODE_LABEL
        )
        log.info(
            f"Successfully labeled {new_node_name} with OCS storage label"
        )

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()

        # Verify everything running fine
        log.info(
            "Verifying All resources are Running and matches expected result"
        )
        self.sanity_helpers.health_check()
class TestNodesMaintenance(ManageTest):
    """
    Test basic flows of maintenance (unschedule and drain) and
    activate operations, followed by cluster functionality and health checks

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def health_checker(self):
        """
        Check Ceph health before the test and skip the test when
        CephHealthException is raised

        """
        try:
            status = ceph_health_check_base()
            if status:
                log.info("Health check passed")
        except CephHealthException as e:
            # skip because ceph is not in good health
            pytest.skip(str(e))

    @tier1
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(
                *['worker'], marks=pytest.mark.polarion_id("OCS-1269")
            ),
            pytest.param(
                *['master'], marks=pytest.mark.polarion_id("OCS-1272")
            )
        ]
    )
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain)
        drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier4
    @tier4b
    @aws_platform_required
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(
                *['worker'], marks=pytest.mark.polarion_id("OCS-1292")
            ),
            pytest.param(
                *['master'],
                marks=[
                    pytest.mark.polarion_id("OCS-1293"),
                    bugzilla('1754287')
                ]
            )
        ]
    )
    def test_node_maintenance_restart_activate(
        self, nodes, pvc_factory, pod_factory, node_type
    ):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain). The function contains logging
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=True)

        # After the restart the node is expected to come back Ready but still
        # cordoned, since it has not been marked schedulable yet
        wait_for_nodes_status(
            node_names=[typed_node_name],
            status=constants.NODE_READY_SCHEDULING_DISABLED
        )

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    @tier3
    @pytest.mark.parametrize(
        argnames=["nodes_type"],
        argvalues=[
            pytest.param(
                *['worker'], marks=pytest.mark.polarion_id("OCS-1273")
            ),
            pytest.param(
                *['master'], marks=pytest.mark.polarion_id("OCS-1271")
            )
        ]
    )
    def test_2_nodes_maintenance_same_type(self, nodes_type):
        """
        OCS-1273/OCS-1271:
        - Try draining 2 nodes from the same type - should fail
        - Check cluster and Ceph health

        """
        # Get 2 nodes
        typed_nodes = get_typed_nodes(node_type=nodes_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {nodes_type} node for the test"

        typed_node_names = [typed_node.name for typed_node in typed_nodes]

        # Try draining 2 nodes - should fail
        # NOTE(review): a drain that unexpectedly succeeds is not reported
        # as a failure here - confirm whether that is intended
        try:
            drain_nodes(typed_node_names)
        except TimeoutExpired:
            log.info(
                f"Draining of nodes {typed_node_names} failed as expected"
            )

        schedule_nodes(typed_node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier2
    @pytest.mark.polarion_id("OCS-1274")
    def test_2_nodes_different_types(self, pvc_factory, pod_factory):
        """
        OCS-1274:
        - Maintenance (mark as unschedulable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node from each type
        nodes = [
            get_typed_nodes(node_type=node_type, num_of_nodes=1)[0]
            for node_type in ['worker', 'master']
        ]
        assert nodes, "Failed to find a nodes for the test"

        node_names = [typed_node.name for typed_node in nodes]

        # Maintenance the nodes (unschedule and drain)
        drain_nodes(node_names)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the nodes back to schedulable
        schedule_nodes(node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier4
    @tier4b
    @aws_platform_required
    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(
                *['rbd'], marks=pytest.mark.polarion_id("OCS-2128")
            ),
            pytest.param(
                *['cephfs'], marks=pytest.mark.polarion_id("OCS-2129")
            ),
        ]
    )
    def test_simultaneous_drain_of_two_ocs_nodes(
        self, pvc_factory, pod_factory, dc_pod_factory, interface
    ):
        """
        OCS-2128/OCS-2129:
        - Create PVCs and start IO on DC based app pods
        - Add one extra node in two of the AZs and label the nodes
          with OCS storage label
        - Maintenance (mark as unschedulable and drain) 2 worker nodes
          simultaneously
        - Confirm that OCS and DC pods are in running state
        - Remove unscheduled nodes
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Check cluster and Ceph health

        """
        # Get OSD running nodes
        osd_running_worker_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_worker_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(
            osd_running_worker_nodes, label_key='dc', label_value='fedora'
        )
        log.info("Successfully labeled worker nodes with {dc:fedora}")

        # Create DC app pods
        log.info("Creating DC based app pods and starting IO in background")
        interface = (
            constants.CEPHBLOCKPOOL if interface == 'rbd'
            else constants.CEPHFILESYSTEM
        )
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(
                interface=interface, node_selector={'dc': 'fedora'}
            )
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Only the first two OSD nodes are drained and later removed
        nodes_to_drain = osd_running_worker_nodes[:2]

        # Get the machine name using the node name
        machine_names = [
            machine.get_machine_from_node_name(osd_running_worker_node)
            for osd_running_worker_node in nodes_to_drain
        ]
        log.info(f"{osd_running_worker_nodes} associated "
                 f"machine are {machine_names}")

        # Get the machineset name using machine name
        machineset_names = [
            machine.get_machineset_from_machine_name(machine_name)
            for machine_name in machine_names
        ]
        log.info(f"{osd_running_worker_nodes} associated machineset "
                 f"is {machineset_names}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_names[0])
        add_new_node_and_label_it(machineset_names[1])

        # Drain 2 nodes
        drain_nodes(nodes_to_drain)

        # Check the pods should be in running state
        all_pod_obj = pod.get_all_pods(wait=True)
        # Pods whose name contains one of these substrings are not checked.
        # The previous form, ('-1-deploy' or 'ocs-deviceset') not in name,
        # evaluated to a test of '-1-deploy' only.
        skipped_name_parts = ('-1-deploy', 'ocs-deviceset')
        for pod_obj in all_pod_obj:
            if any(part in pod_obj.name for part in skipped_name_parts):
                continue
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj,
                    state=constants.STATUS_RUNNING,
                    timeout=200
                )
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node stucks at
                # pending state. BZ 1810014 tracks it.
                # Ignoring 'rook-ceph-crashcollector' pod health check as
                # WA and deleting its deployment so that the pod
                # disappears. Will revert this WA once the BZ is fixed.
                # For any other pod the exception is not re-raised here.
                if 'rook-ceph-crashcollector' in pod_obj.name:
                    ocp_obj = ocp.OCP(
                        namespace=defaults.ROOK_CLUSTER_NAMESPACE
                    )
                    pod_name = pod_obj.name
                    # Drop the last two dash-separated fields of the pod
                    # name to get the deployment name
                    deployment_name = '-'.join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")

        # DC app pods on the drained node will get automatically created on other
        # running node in same AZ. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        # Remove unscheduled nodes
        # In scenarios where the drain is attempted on >3 worker setup,
        # post completion of drain we are removing the unscheduled nodes so
        # that we maintain 3 worker nodes.
        log.info(f"Removing scheduled nodes {osd_running_worker_nodes[:2]}")
        remove_node_objs = get_node_objs(nodes_to_drain)
        remove_nodes(remove_node_objs)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestNodeReplacement(ManageTest):
    """
    Knip-894 Node replacement - AWS-IPI-Reactive

    """

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that removes the 'dc' label from all worker
        nodes

        """
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(
                *['rbd', 'power off'],
                marks=pytest.mark.polarion_id("OCS-2118")
            ),
            pytest.param(
                *['rbd', 'network failure'],
                marks=pytest.mark.polarion_id("OCS-2120")
            ),
            pytest.param(
                *['cephfs', 'power off'],
                marks=pytest.mark.polarion_id("OCS-2119")
            ),
            pytest.param(
                *['cephfs', 'network failure'],
                marks=pytest.mark.polarion_id("OCS-2121")
            ),
        ]
    )
    def test_node_replacement_reactive_aws_ipi(
        self, nodes, pvc_factory, pod_factory, dc_pod_factory,
        failure, interface
    ):
        """
        Knip-894 Node replacement - AWS-IPI-Reactive

        Steps:
            - Label the OSD nodes and start IO from two DC app pods on them
            - Pick a node running both an OSD and an app pod and induce the
              requested failure ('power off' or 'network failure') on it
            - Annotate and delete the node's machine, wait for the new node
              and label it with the OCS storage label
            - Wait for the DC app pods and the OCS pods to be running
            - Create and delete resources, then run the health check

        """
        # Get worker nodes
        initial_nodes = get_worker_nodes()

        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(
            osd_running_nodes, label_key='dc', label_value='fedora'
        )

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(
                interface=interface, node_selector={'dc': 'fedora'}
            )
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}"
        )

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "power off":
            # Power off AWS worker node instance
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully powered off node: {failure_node_obj[0].name}"
            )
        elif failure == "network failure":
            # Induce Network failure
            node_network_failure([failure_node_obj[0].name])

        # Add annotation to the failed node
        annotation = "machine.openshift.io/exclude-node-draining=''"
        machine.add_annotation_to_machine(
            annotation=annotation, machine_name=machine_name
        )

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Wait for the new machine to spin
        log.info("Waiting for the new node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)

        # Get the node name of new spun node: any worker that was not
        # present before the failure was induced
        nodes_after_new_spun_node = get_worker_nodes()
        new_spun_node = list(
            set(nodes_after_new_spun_node) - set(initial_nodes)
        )
        log.info(f"New spun node is {new_spun_node}")
        # Fail with a clear message instead of an IndexError below
        assert new_spun_node, (
            "Failed to find a new worker node after machine deletion"
        )

        # Label it
        node_obj = ocp.OCP(kind='node')
        node_obj.add_label(
            resource_name=new_spun_node[0],
            label=constants.OPERATOR_NODE_LABEL
        )
        log.info(
            f"Successfully labeled {new_spun_node} with OCS storage label"
        )

        # DC app pods on the failed node will get automatically created on other
        # running node. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(
            dc_pod_obj, timeout=1200
        )
        log.info("All the dc pods reached running state")

        # Check all OCS pods status, they should be in running state
        all_pod_obj = pod.get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE
        )
        # Pods whose name contains one of these substrings are not checked.
        # The previous form, '-1-deploy' and 'ocs-deviceset' not in name,
        # only ever tested 'ocs-deviceset'.
        skipped_name_parts = ('-1-deploy', 'ocs-deviceset')
        for pod_obj in all_pod_obj:
            if any(part in pod_obj.name for part in skipped_name_parts):
                continue
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj,
                    state=constants.STATUS_RUNNING,
                    timeout=1800
                )
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node stucks at
                # pending state. BZ 1810014 tracks it.
                # Ignoring 'rook-ceph-crashcollector' pod health check as
                # WA and deleting its deployment so that the pod
                # disappears. Will revert this WA once the BZ is fixed.
                # For any other pod the exception is not re-raised here.
                if 'rook-ceph-crashcollector' in pod_obj.name:
                    ocp_obj = ocp.OCP(
                        namespace=defaults.ROOK_CLUSTER_NAMESPACE
                    )
                    pod_name = pod_obj.name
                    # Drop the last two dash-separated fields of the pod
                    # name to get the deployment name
                    deployment_name = '-'.join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestNodesRestart(ManageTest):
    """
    Test ungraceful cluster shutdown

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["force"],
        argvalues=[
            pytest.param(*[True], marks=pytest.mark.polarion_id("OCS-894")),
            pytest.param(*[False], marks=pytest.mark.polarion_id("OCS-895"))
        ]
    )
    def test_nodes_restart_aws(
        self, ec2_instances, aws_obj, pvc_factory, pod_factory, force
    ):
        """
        Test ungraceful cluster shutdown - AWS

        Restarts the given ec2 instances (forced or not, per the 'force'
        parameter), then runs the health check and creates resources.

        """
        aws_obj.restart_ec2_instances(
            instances=ec2_instances, wait=True, force=force
        )
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @pytest.mark.parametrize(
        argnames=["interface", "operation"],
        argvalues=[
            pytest.param(
                *['rbd', 'create_resources'],
                marks=pytest.mark.polarion_id("OCS-1138")
            ),
            pytest.param(
                *['rbd', 'delete_resources'],
                marks=pytest.mark.polarion_id("OCS-1241")
            ),
            pytest.param(
                *['cephfs', 'create_resources'],
                marks=pytest.mark.polarion_id("OCS-1139")
            ),
            pytest.param(
                *['cephfs', 'delete_resources'],
                marks=pytest.mark.polarion_id("OCS-1242")
            )
        ]
    )
    def test_pv_provisioning_under_degraded_state(
        self, ec2_instances, aws_obj, pvc_factory, pod_factory,
        interface, operation
    ):
        """
        Test PV provisioning under degraded state

        OCS-1138:
        - Stop 1 ec2 instance worker node that has the RBD provisioner
          pod running on
        - Wait for the RBD pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        OCS-1241:
        - Stop 1 ec2 instance worker node that has the RBD provisioner
          pod running on
        - Wait for the RBD pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        OCS-1139:
        - Stop 1 ec2 instance worker node that has the CephFS provisioner
          pod running on
        - Wait for the CephFS pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        OCS-1242:
        - Stop 1 ec2 instance worker node that has the CephFS provisioner
          pod running on
        - Wait for the CephFS pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        """
        if operation == 'delete_resources':
            # Create resources that their deletion will be tested later
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        provisioner_pod = None

        # Get the provisioner pod according to the interface
        if interface == 'rbd':
            provisioner_pod = pod.get_rbdfsplugin_provisioner_pods()[0]
        elif interface == 'cephfs':
            provisioner_pod = pod.get_cephfsplugin_provisioner_pods()[0]
        provisioner_pod_name = provisioner_pod.name
        logger.info(
            f"{interface} provisioner pod found: {provisioner_pod_name}"
        )

        # Get the node name that has the provisioner pod running on
        provisioner_node = pod.get_pod_node(provisioner_pod)
        provisioner_node_name = provisioner_node.get('metadata').get('name')
        logger.info(
            f"{interface} provisioner pod is running on node {provisioner_node_name}"
        )

        # Get the ec2 instance of the node
        instances = aws.get_instances_ids_and_names([provisioner_node])
        assert instances, (
            f"Failed to get ec2 instances for node {provisioner_node_name}"
        )

        # Stopping the nodes
        aws_obj.stop_ec2_instances(instances=instances, wait=True)

        # Label selector used below to wait for the provisioner pods
        selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL if (
            interface == 'rbd'
        ) else constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL

        # Wait for the provisioner pod to reach Terminating status
        logger.info(
            f"Waiting for pod {provisioner_pod_name} to reach status Terminating"
        )
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=provisioner_pod_name,
            condition=constants.STATUS_TERMINATING
        ), f"{interface} provisioner pod failed to reach status Terminating"
        logger.info(
            f"Pod {provisioner_pod_name} has reached status Terminating"
        )

        # Wait for the provisioner pod to be started and reach running status
        logger.info(
            f"Waiting for pod {provisioner_pod_name} to reach status Running"
        )
        # After this change https://github.com/rook/rook/pull/3642/, there are
        # 2 provisioners for each interface
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=2
        ), f"{interface} provisioner pod failed to reach status Running"
        # Log success only once the wait above has confirmed it
        logger.info(f"Pod {provisioner_pod_name} has reached status Running")

        if operation == 'create_resources':
            # Cluster validation (resources creation and IO running)
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        elif operation == 'delete_resources':
            # Cluster validation (deletion of the resources created earlier)
            self.sanity_helpers.delete_resources()

        # Starting the nodes
        aws_obj.start_ec2_instances(instances=instances, wait=True)

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()
class TestNodesMaintenance(ManageTest):
    """
    Node maintenance scenarios: nodes are unscheduled and drained, brought
    back to service, and the cluster is validated through resource
    creation/deletion and health checks

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Provide every test with its own Sanity helper instance

        """
        self.sanity_helpers = Sanity()

    @tier1
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param('worker', marks=pytest.mark.polarion_id("OCS-1269")),
            pytest.param('master', marks=pytest.mark.polarion_id("OCS-1272")),
        ]
    )
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Drain a single worker/master node
        - Create and delete resources while the node is drained
        - Make the node schedulable again
        - Run the cluster and Ceph health check

        """
        # Pick one node of the requested type
        selected = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert selected, f"Failed to find a {node_type} node for the test"
        target_name = selected[0].name

        # Put the node into maintenance
        drain_nodes([target_name])

        # Cluster functionality check with the node drained
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Bring the node back
        schedule_nodes([target_name])

        # Final cluster and Ceph health check
        self.sanity_helpers.health_check()

    @tier4
    @tier4b
    @aws_platform_required
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param('worker', marks=pytest.mark.polarion_id("OCS-1292")),
            pytest.param(
                'master',
                marks=[
                    pytest.mark.polarion_id("OCS-1293"),
                    bugzilla('1754287'),
                ]
            ),
        ]
    )
    def test_node_maintenance_restart_activate(
        self, nodes, pvc_factory, pod_factory, node_type
    ):
        """
        OCS-1292/OCS-1293:
        - Drain a single worker/master node
        - Restart that node and wait for it to be Ready with scheduling
          disabled
        - Make the node schedulable again
        - Run the cluster and Ceph health check
        - Create and delete resources

        """
        # Pick one node of the requested type
        selected = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert selected, f"Failed to find a {node_type} node for the test"
        target_name = selected[0].name

        # Put the node into maintenance (drain_nodes does its own logging)
        drain_nodes([target_name])

        # Restart it and wait until it is Ready but still cordoned
        nodes.restart_nodes(nodes=selected, wait=True)
        wait_for_nodes_status(
            node_names=[target_name],
            status=constants.NODE_READY_SCHEDULING_DISABLED
        )

        # Bring the node back
        schedule_nodes([target_name])

        # Health check first, then the cluster functionality check
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    @tier3
    @pytest.mark.parametrize(
        argnames=["nodes_type"],
        argvalues=[
            pytest.param('worker', marks=pytest.mark.polarion_id("OCS-1273")),
            pytest.param('master', marks=pytest.mark.polarion_id("OCS-1271")),
        ]
    )
    def test_2_nodes_maintenance_same_type(self, nodes_type):
        """
        OCS-1273/OCS-1271:
        - Attempt to drain two nodes of the same type, which is expected
          to time out
        - Run the cluster and Ceph health check

        """
        # Pick two nodes of the requested type
        selected = get_typed_nodes(node_type=nodes_type, num_of_nodes=2)
        assert selected, f"Failed to find a {nodes_type} node for the test"

        typed_node_names = []
        for selected_node in selected:
            typed_node_names.append(selected_node.name)

        # The drain of both nodes is expected to time out
        try:
            drain_nodes(typed_node_names)
        except TimeoutExpired:
            logger.info(
                f"Draining of nodes {typed_node_names} failed as expected"
            )

        schedule_nodes(typed_node_names)

        # Final cluster and Ceph health check
        self.sanity_helpers.health_check()

    @tier2
    @pytest.mark.polarion_id("OCS-1274")
    def test_2_nodes_different_types(self, pvc_factory, pod_factory):
        """
        OCS-1274:
        - Drain one worker node and one master node together
        - Create and delete resources while the nodes are drained
        - Make both nodes schedulable again
        - Run the cluster and Ceph health check

        """
        # Pick one worker and one master
        picked = []
        for wanted_type in ('worker', 'master'):
            picked.append(
                get_typed_nodes(node_type=wanted_type, num_of_nodes=1)[0]
            )
        assert picked, f"Failed to find a nodes for the test"

        picked_names = [picked_node.name for picked_node in picked]

        # Put both nodes into maintenance
        drain_nodes(picked_names)

        # Cluster functionality check with the nodes drained
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Bring the nodes back
        schedule_nodes(picked_names)

        # Final cluster and Ceph health check
        self.sanity_helpers.health_check()
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self, pvc_factory, pod_factory
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI

        Steps:
            - Pick the node of a randomly chosen OSD pod
            - Add and label a new node in the same machineset
            - Delete the machine backing the selected node
            - Wait for the pods to be running
            - Create and delete resources, then run the health check

        """
        # Get the osd associated node name
        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"{osd_node_name} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(f"{osd_node_name} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Check the pods should be in running state
        all_pod_obj = pod.get_all_pods(wait=True)
        # Pods whose name contains one of these substrings are not checked.
        # The previous form, '-1-deploy' and 'ocs-deviceset' not in name,
        # only ever tested 'ocs-deviceset'.
        skipped_name_parts = ('-1-deploy', 'ocs-deviceset')
        for pod_obj in all_pod_obj:
            if any(part in pod_obj.name for part in skipped_name_parts):
                continue
            try:
                wait_for_resource_state(
                    resource=pod_obj,
                    state=constants.STATUS_RUNNING,
                    timeout=200
                )
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node stucks at
                # pending state. BZ 1810014 tracks it.
                # Ignoring 'rook-ceph-crashcollector' pod health check as WA
                # and deleting its deployment so that the pod disappears.
                # Will revert this WA once the BZ is fixed.
                # For any other pod the exception is not re-raised here.
                if 'rook-ceph-crashcollector' in pod_obj.name:
                    # NOTE(review): no namespace is passed to OCP() here -
                    # confirm its default targets the namespace that holds
                    # the crashcollector deployment
                    ocp_obj = ocp.OCP()
                    # Drop the last two dash-separated fields of the pod
                    # name to get the deployment name, rather than slicing
                    # off a fixed number of characters
                    name = '-'.join(pod_obj.name.split("-")[:-2])
                    command = f"delete deployment {name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes - Reactive
    """
    # NOTE(review): a class with this exact name is also defined earlier in
    # this file; if both live in the same module this definition rebinds the
    # name - confirm whether one of them should be renamed.

    # IO threads started by the test. This is a class-level list, so it is
    # shared by every instance; the teardown finalizer joins and clears it.
    threads = []

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that removes the 'dc' label from the worker
        nodes, joins the threads collected in ``self.threads`` and runs a
        Ceph health check.

        Args:
            request: pytest request object used to register the finalizer

        """
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")
            for thread in self.threads:
                thread.join()
            # The list is shared at class level; drop the joined threads so
            # they do not pile up across parametrized runs.
            self.threads.clear()
            ceph_health_check()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(
                'rbd', 'shutdown',
                marks=[
                    pytest.mark.polarion_id("OCS-2102"),
                    pytest.mark.bugzilla("1830015"),
                ],
            ),
            pytest.param(
                'rbd', 'terminate',
                marks=pytest.mark.polarion_id("OCS-2103"),
            ),
            pytest.param(
                'cephfs', 'shutdown',
                marks=[
                    pytest.mark.polarion_id("OCS-2104"),
                    pytest.mark.bugzilla("1830015"),
                ],
            ),
            pytest.param(
                'cephfs', 'terminate',
                marks=pytest.mark.polarion_id("OCS-2105"),
            ),
        ],
    )
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
        self, nodes, pvc_factory, pod_factory, failure, dc_pod_factory,
        interface
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI

        Creates DC app pods on the labelled OSD nodes with IO running in the
        background, induces a failure on a node that runs both an OSD and an
        app pod, and verifies that the pods and the cluster recover.

        Args:
            nodes: Platform nodes object used to stop or terminate the node
            pvc_factory: Fixture handed to the sanity helpers for PVC creation
            pod_factory: Fixture handed to the sanity helpers for pod creation
            failure (str): Failure to induce, 'shutdown' or 'terminate'
            dc_pod_factory: Fixture used to create the DC based app pods
            interface (str): 'rbd' or 'cephfs'; mapped to the matching
                interface constant before the app pods are created

        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(
            osd_running_nodes, label_key='dc', label_value='fedora'
        )

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(
                interface=interface, node_selector={'dc': 'fedora'}
            )
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Fail with a clear message instead of an IndexError further down
        # when no node hosts both an OSD and an app pod.
        assert common_nodes, (
            "No node is running both an OSD and a DC app pod"
        )
        failure_node_name = common_nodes[0]

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(failure_node_name)
        log.info(f"{failure_node_name} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(
            f"{failure_node_name} associated machineset is {machineset_name}"
        )

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[failure_node_name])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully powered off node: "
                f"{failure_node_obj[0].name}"
            )
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully terminated node : "
                f"{failure_node_obj[0].name} instance"
            )

        try:
            # DC app pods on the failed node will get automatically created
            # on other running node. Waiting for all dc app pod to reach
            # running state
            pod.wait_for_dc_app_pods_to_reach_running_state(
                dc_pod_obj, timeout=720
            )
            log.info("All the dc pods reached running state")

            # Check all OCS pods status, they should be in running state
            all_pod_obj = pod.get_all_pods(
                namespace=defaults.ROOK_CLUSTER_NAMESPACE
            )
            for pod_obj in all_pod_obj:
                pod_name = pod_obj.name

                # Skip pods whose name carries either marker. Each marker
                # needs its own membership test: "'-1-deploy' and X not in
                # name" only evaluates the second operand because a
                # non-empty string literal is always truthy.
                if '-1-deploy' in pod_name or 'ocs-deviceset' in pod_name:
                    continue

                # 'rook-ceph-crashcollector' on the failed node stucks at
                # pending state. BZ 1810014 tracks it.
                # Ignoring 'rook-ceph-crashcollector' pod health check as
                # WA and deleting its deployment so that the pod
                # disappears. Will revert this WA once the BZ is fixed
                if 'rook-ceph-crashcollector' in pod_name:
                    ocp_obj = ocp.OCP(
                        namespace=defaults.ROOK_CLUSTER_NAMESPACE
                    )
                    # Deployment name is the pod name without its last two
                    # dash-separated segments
                    deployment_name = '-'.join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")
                    continue

                helpers.wait_for_resource_state(
                    resource=pod_obj,
                    state=constants.STATUS_RUNNING,
                    timeout=240,
                )
        except ResourceWrongStatusException:
            # In the shutdown case the powered-off node is terminated before
            # the failure is propagated to the caller.
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(
                    f"Successfully terminated node : "
                    f"{failure_node_obj[0].name} instance"
                )
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()