def test_deployment(pvc_factory, pod_factory): deploy = config.RUN['cli_params'].get('deploy') teardown = config.RUN['cli_params'].get('teardown') if not teardown or deploy: log.info("Verifying OCP cluster is running") assert is_cluster_running(config.ENV_DATA['cluster_path']) if not config.ENV_DATA['skip_ocs_deployment']: ocs_registry_image = config.DEPLOYMENT.get('ocs_registry_image') ocs_install_verification(ocs_registry_image=ocs_registry_image) nb_eps = config.DEPLOYMENT.get('noobaa_endpoints') if nb_eps > 1: change_noobaa_endpoints_count(nb_eps) # Check basic cluster functionality by creating resources # (pools, storageclasses, PVCs, pods - both CephFS and RBD), # run IO and delete the resources sanity_helpers = Sanity() sanity_helpers.health_check() sanity_helpers.create_resources(pvc_factory, pod_factory) sanity_helpers.delete_resources() if teardown: log.info( "Cluster will be destroyed during teardown part of this test.")
class TestNodeReplacement(ManageTest): """ Knip-894 Node replacement proactive """ @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() def test_nodereplacement_proactive(self): """ Knip-894 Node Replacement proactive(without IO running) """ osd_node_name = select_osd_node_name() delete_and_create_osd_node(osd_node_name) # Verify everything running fine log.info( "Verifying All resources are Running and matches expected result") self.sanity_helpers.health_check(tries=90) ceph_cluster_obj = CephCluster() assert ceph_cluster_obj.wait_for_rebalance( timeout=1800), ("Data re-balance failed to complete")
class TestNodeReplacementWithIO(ManageTest): """ Knip-894 Node replacement proactive with IO """ @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() def test_nodereplacement_proactive_with_io_running(self, pvc_factory, pod_factory, dc_pod_factory): """ Knip-894 Node Replacement proactive when IO running in the background """ # Get worker nodes worker_node_list = get_worker_nodes() log.info(f"Current available worker nodes are {worker_node_list}") osd_node_name = select_osd_node_name() log.info("Creating dc pod backed with rbd pvc and running io in bg") for worker_node in worker_node_list: if worker_node != osd_node_name: rbd_dc_pod = dc_pod_factory(interface=constants.CEPHBLOCKPOOL, node_name=worker_node, size=20) pod.run_io_in_bg(rbd_dc_pod, expect_to_fail=False, fedora_dc=True) log.info("Creating dc pod backed with cephfs pvc and running io in bg") for worker_node in worker_node_list: if worker_node != osd_node_name: cephfs_dc_pod = dc_pod_factory( interface=constants.CEPHFILESYSTEM, node_name=worker_node, size=20) pod.run_io_in_bg(cephfs_dc_pod, expect_to_fail=False, fedora_dc=True) delete_and_create_osd_node(osd_node_name) # Creating Resources log.info("Creating Resources using sanity helpers") self.sanity_helpers.create_resources(pvc_factory, pod_factory) # Deleting Resources self.sanity_helpers.delete_resources() # Verify everything running fine log.info( "Verifying All resources are Running and matches expected result") self.sanity_helpers.health_check(tries=120)
class TestAutomatedRecoveryFromFailedNodes(ManageTest): """ Knip-678 Automated recovery from failed nodes """ @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() def test_automated_recovery_from_failed_nodes_IPI_proactive( self, pvc_factory, pod_factory): """ Knip-678 Automated recovery from failed nodes Proactive case - IPI """ # Get the osd associated node name osd_pods_obj = pod.get_osd_pods() osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name log.info(f"Selected OSD is {osd_node_name}") # Get the machine name using the node name machine_name = machine.get_machine_from_node_name(osd_node_name) log.info(f"{osd_node_name} associated machine is {machine_name}") # Get the machineset name using machine name machineset_name = machine.get_machineset_from_machine_name( machine_name) log.info(f"{osd_node_name} associated machineset is {machineset_name}") # Add a new node and label it add_new_node_and_label_it(machineset_name) # Delete the machine machine.delete_machine(machine_name) log.info(f"Successfully deleted machine {machine_name}") # Check the pods should be in running state all_pod_obj = pod.get_all_pods(wait=True) for pod_obj in all_pod_obj: wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING, timeout=200) # Check basic cluster functionality by creating resources # (pools, storageclasses, PVCs, pods - both CephFS and RBD), # run IO and delete the resources self.sanity_helpers.create_resources(pvc_factory, pod_factory) self.sanity_helpers.delete_resources() # Perform cluster and Ceph health checks self.sanity_helpers.health_check()
class TestCouchBaseNodeReboot(E2ETest): """ Deploy an CouchBase workload using operator """ @pytest.fixture() def cb_setup(self, couchbase_factory_fixture): """ Creates couchbase workload """ self.cb = couchbase_factory_fixture(replicas=3, run_in_bg=True, skip_analyze=True) # Initialize Sanity instance self.sanity_helpers = Sanity() @pytest.mark.parametrize( argnames=["pod_name_of_node"], argvalues=[ pytest.param(*['osd'], marks=pytest.mark.polarion_id("OCS-776")), pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-783")), pytest.param(*['couchbase'], marks=pytest.mark.polarion_id("OCS-776")) ]) def test_run_couchbase_node_reboot(self, cb_setup, nodes, pod_name_of_node): """ Test couchbase workload with node reboot """ if pod_name_of_node == 'couchbase': node_list = self.cb.get_couchbase_nodes() elif pod_name_of_node == 'osd': node_list = get_osd_running_nodes() elif pod_name_of_node == 'master': node_list = get_master_nodes() node_1 = get_node_objs(node_list[random.randint(0, len(node_list) - 1)]) # Check worker node utilization (adm_top) get_node_resource_utilization_from_adm_top(node_type='worker', print_table=True) get_node_resource_utilization_from_adm_top(node_type='master', print_table=True) # Restart relevant node nodes.restart_nodes(node_1) for sample in TimeoutSampler(300, 5, self.cb.result.done): if sample: break else: logging.info( "#### ....Waiting for couchbase threads to complete...") self.sanity_helpers.health_check()
class TestPgSQLNodeReboot(E2ETest): """ Test running PGSQL and with Ceph pods respin """ @pytest.fixture() def pgsql_setup(self, pgsql): """ PGSQL test setup """ # Deployment of postgres database pgsql.setup_postgresql(replicas=3) # Initialize Sanity instance self.sanity_helpers = Sanity() @pytest.mark.usefixtures(pgsql_setup.__name__) def test_run_pgsql_node_drain( self, pgsql, transactions=900, node_type='master' ): """ Test pgsql workload """ # Create pgbench benchmark pgsql.create_pgbench_benchmark( replicas=3, transactions=transactions, clients=3 ) # Wait for pgbench pod to reach running state pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING) # Node drain with specific node type typed_nodes = node.get_typed_nodes( node_type=node_type, num_of_nodes=1 ) typed_node_name = typed_nodes[0].name # Node maintenance - to gracefully terminate all pods on the node node.drain_nodes([typed_node_name]) # Make the node schedulable again node.schedule_nodes([typed_node_name]) # Perform cluster and Ceph health checks self.sanity_helpers.health_check() # Wait for pg_bench pod to complete pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED) # Get pgbench pods pgbench_pods = pgsql.get_pgbench_pods() # Validate pgbench run and parse logs pgsql.validate_pgbench_run(pgbench_pods)
class TestCouchBaseNodeDrain(E2ETest): """ Deploy an CouchBase workload using operator """ @pytest.fixture() def cb_setup(self, couchbase_factory_fixture): """ Creates couchbase workload """ self.cb = couchbase_factory_fixture( replicas=3, run_in_bg=True, skip_analyze=True ) # Initialize Sanity instance self.sanity_helpers = Sanity() def test_run_couchbase_node_drain(self, cb_setup, node_type='master'): """ Test couchbase workload with node drain """ # Check worker node utilization (adm_top) get_node_resource_utilization_from_adm_top( node_type='worker', print_table=True ) # Node drain with specific node type typed_nodes = node.get_typed_nodes( node_type=node_type, num_of_nodes=1 ) typed_node_name = typed_nodes[0].name # Node maintenance - to gracefully terminate all pods on the node node.drain_nodes([typed_node_name]) # Make the node schedulable again node.schedule_nodes([typed_node_name]) # Perform cluster and Ceph health checks self.sanity_helpers.health_check() for sample in TimeoutSampler(300, 5, self.cb.result.done): if sample: break else: logging.info( "#### ....Waiting for couchbase threads to complete..." ) utils.ceph_health_check()
class TestCouchBaseNodeDrain(E2ETest): """ Deploy an CouchBase workload using operator """ @pytest.fixture() def cb_setup(self, couchbase_factory_fixture): """ Creates couchbase workload """ self.cb = couchbase_factory_fixture(replicas=3, run_in_bg=True, skip_analyze=True) # Initialize Sanity instance self.sanity_helpers = Sanity() def test_run_couchbase_node_drain(self, cb_setup, node_type='master'): """ Test couchbase workload with node drain """ # Check worker node utilization (adm_top) get_node_resource_utilization_from_adm_top(node_type='worker', print_table=True) # Node drain with specific node type typed_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=1) typed_node_name = typed_nodes[0].name # Node maintenance - to gracefully terminate all pods on the node node.drain_nodes([typed_node_name]) # Make the node schedulable again node.schedule_nodes([typed_node_name]) # Perform cluster and Ceph health checks bg_handler = flowtest.BackgroundOps() bg_ops = [self.cb.result] bg_handler.wait_for_bg_operations(bg_ops, timeout=3600) self.sanity_helpers.health_check()
class TestCouchBasePodRespin(E2ETest): """ Deploy an CouchBase workload using operator """ @pytest.fixture() def cb_setup(self, couchbase_factory_fixture): """ Creates couchbase workload """ self.cb = couchbase_factory_fixture(replicas=3, run_in_bg=True, skip_analyze=True) self.sanity_helpers = Sanity() @pytest.mark.parametrize( argnames=["pod_name"], argvalues=[ pytest.param(*['osd'], marks=pytest.mark.polarion_id("OCS-780")), pytest.param(*['mon'], marks=pytest.mark.polarion_id("OCS-779")), pytest.param(*['mgr'], marks=pytest.mark.polarion_id("OCS-781")), pytest.param(*['couchbase'], marks=pytest.mark.polarion_id("OCS-786")), ]) def test_run_couchbase_respin_pod(self, cb_setup, pod_name): log.info(f"Respin Ceph pod {pod_name}") if pod_name == 'couchbase': self.cb.respin_couchbase_app_pod() else: disruption = Disruptions() disruption.set_resource(resource=f'{pod_name}') disruption.delete_resource() bg_handler = flowtest.BackgroundOps() bg_ops = [self.cb.result] bg_handler.wait_for_bg_operations(bg_ops, timeout=3600) self.sanity_helpers.health_check()
class TestAutomatedRecoveryFromFailedNodes(ManageTest): """ Knip-678 Automated recovery from failed nodes """ @pytest.fixture(autouse=True) def teardown(self, request): def finalizer(): worker_nodes = get_worker_nodes() # Removing created label on all worker nodes remove_label_from_worker_node(worker_nodes, label_key="dc") request.addfinalizer(finalizer) @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() @pytest.mark.parametrize( argnames=["interface"], argvalues=[ pytest.param(*['rbd'], marks=pytest.mark.polarion_id("OCS-2100")), pytest.param(*['cephfs'], marks=pytest.mark.polarion_id("OCS-2101")), ]) def test_automated_recovery_from_failed_nodes_IPI_proactive( self, interface, pvc_factory, pod_factory, dc_pod_factory): """ Knip-678 Automated recovery from failed nodes Proactive case - IPI """ # Get OSD running nodes osd_running_nodes = get_osd_running_nodes() log.info(f"OSDs are running on nodes {osd_running_nodes}") # Label osd nodes with fedora app label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora') # Create DC app pods log.info("Creating DC based app pods") interface = (constants.CEPHBLOCKPOOL if interface == 'rbd' else constants.CEPHFILESYSTEM) dc_pod_obj = [] for i in range(2): dc_pod = dc_pod_factory(interface=interface, node_selector={'dc': 'fedora'}) pod.run_io_in_bg(dc_pod, fedora_dc=True) dc_pod_obj.append(dc_pod) # Get app pods running nodes dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj) log.info(f"DC app pod running nodes are {dc_pod_node_name}") # Get both osd and app pod running node common_nodes = get_both_osd_and_app_pod_running_node( osd_running_nodes, dc_pod_node_name) msg = "Common OSD and app running node(s) NOT found" assert (len(common_nodes) > 0), msg log.info(f"Common OSD and app pod running nodes are {common_nodes}") # Get the machine name using the node name machine_name = machine.get_machine_from_node_name(common_nodes[0]) log.info(f"{common_nodes[0]} associated machine is {machine_name}") # Get the machineset name using machine name machineset_name = machine.get_machineset_from_machine_name( machine_name) log.info( f"{common_nodes[0]} associated machineset is {machineset_name}") # Add a new node and label it add_new_node_and_label_it(machineset_name) # Delete the machine machine.delete_machine(machine_name) log.info(f"Successfully deleted machine {machine_name}") # DC app pods on the failed node will get automatically created on # other running node. Waiting for all dc app pod to reach running # state pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj) log.info("All the dc pods reached running state") pod.wait_for_storage_pods() # Check basic cluster functionality by creating resources # (pools, storageclasses, PVCs, pods - both CephFS and RBD), # run IO and delete the resources self.sanity_helpers.create_resources(pvc_factory, pod_factory) self.sanity_helpers.delete_resources() # Perform cluster and Ceph health checks self.sanity_helpers.health_check()
class TestJenkinsNodeDrain(E2ETest): """ Test running Jenkins and Node Drain """ @pytest.fixture() def jenkins_setup(self, jenkins): """ JENKINS test setup """ # Initialize Sanity instance self.sanity_helpers = Sanity() # Deployment of jenkins jenkins.create_ocs_jenkins_template() @pytest.mark.parametrize( argnames=['node_type', 'num_projects', 'num_of_builds'], argvalues=[ pytest.param(*[WORKER_MACHINE, 4, 3], marks=pytest.mark.polarion_id("OCS-2177")), pytest.param(*[MASTER_MACHINE, 3, 6], marks=pytest.mark.polarion_id("OCS-2176")), ]) @pytest.mark.usefixtures(jenkins_setup.__name__) def test_run_jenkins_drain_node(self, jenkins, node_type, num_projects, num_of_builds): """ Test Node Drain jenkins """ # Init number of projects jenkins.number_projects = num_projects # Create app jenkins jenkins.create_app_jenkins() # Create jenkins pvc jenkins.create_jenkins_pvc() # Create jenkins build config jenkins.create_jenkins_build_config() # Wait jenkins deploy pod reach to completed state jenkins.wait_for_jenkins_deploy_status(status=STATUS_COMPLETED) # Get relevant node nodes_drain = jenkins.get_node_name_where_jenkins_pod_not_hosted( node_type=node_type, num_of_nodes=1) # Init number of builds per project jenkins.number_builds_per_project = num_of_builds # Start Builds jenkins.start_build() if len(nodes_drain) > 0: # Node maintenance - to gracefully terminate all pods on the node drain_nodes(nodes_drain) # Make the node schedulable again schedule_nodes(nodes_drain) # Wait build reach 'Complete' state jenkins.wait_for_build_to_complete() # Print table of builds jenkins.print_completed_builds_results() # Perform cluster and Ceph health checks self.sanity_helpers.health_check()
class TestPgSQLNodeReboot(E2ETest): """ Test running PGSQL and with Ceph pods respin """ @pytest.fixture() def pgsql_setup(self, pgsql): """ PGSQL test setup """ # Deployment of postgres database pgsql.setup_postgresql(replicas=3) # Initialize Sanity instance self.sanity_helpers = Sanity() @pytest.mark.parametrize( argnames=["transactions", "pod_name"], argvalues=[ pytest.param(*[600, 'osd'], marks=pytest.mark.polarion_id("OCS-801")), pytest.param(*[600, 'postgres'], marks=pytest.mark.polarion_id("OCS-799")) ]) @pytest.mark.usefixtures(pgsql_setup.__name__) def test_run_pgsql_reboot_node(self, pgsql, nodes, transactions, pod_name): """ Test pgsql workload """ # Create pgbench benchmark pgsql.create_pgbench_benchmark(replicas=3, transactions=transactions, clients=3) # Start measuring time start_time = datetime.now() # Wait for pgbench pod to reach running state pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING) # Choose a node based on pod it contains if pod_name == 'postgres': node_list = pgsql.get_pgsql_nodes() elif pod_name == 'osd': node_list = get_osd_running_nodes() node_1 = get_node_objs(node_list[random.randint(0, len(node_list) - 1)]) # Check worker node utilization (adm_top) get_node_resource_utilization_from_adm_top(node_type='worker', print_table=True) # Restart relevant node nodes.restart_nodes(node_1) # Wait for pg_bench pod to complete pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED) # Calculate the time from running state to completed state end_time = datetime.now() diff_time = end_time - start_time log.info( f"\npgbench pod reached to completed state after {diff_time.seconds} seconds\n" ) # Get pgbench pods pgbench_pods = pgsql.get_pgbench_pods() # Validate pgbench run and parse logs pgsql.validate_pgbench_run(pgbench_pods) # Perform cluster and Ceph health checks self.sanity_helpers.health_check()
class TestDetachAttachWorkerVolume(ManageTest): """ Test class for detach and attach worker volume """ @pytest.fixture(autouse=True) def teardown(self, request, nodes): """ Restart nodes that are in status NotReady, for situations in which the test failed before restarting the node after detach volume, which leaves nodes in NotReady """ def finalizer(): not_ready_nodes = [ n for n in node.get_node_objs() if n.ocp.get_resource_status( n.name) == constants.NODE_NOT_READY ] logger.warning( f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}" ) if not_ready_nodes: nodes.restart_nodes(not_ready_nodes) node.wait_for_nodes_status() request.addfinalizer(finalizer) @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() @pytest.mark.polarion_id("OCS-1085") def test_detach_attach_worker_volume(self, nodes, pvc_factory, pod_factory): """ Detach and attach worker volume - Detach the data volume from one of the worker nodes - Validate cluster functionality, without checking cluster and Ceph health (as one node volume is detached, the cluster will be unhealthy) by creating resources and running IO - Wait for the volumes to be re-attached back to the worker node - Restart the node so the volume will get re-mounted """ # Get a data volume data_volume = nodes.get_data_volumes()[0] # Get the worker node according to the volume attachment worker = nodes.get_node_by_attached_volume(data_volume) # Detach volume (logging is done inside the function) nodes.detach_volume(data_volume) # Validate cluster is still functional try: # In case the selected node that its volume disk was detached was the one # running the ceph tools pod, we'll need to wait for a new ct pod to start. # For that, a function that connects to the ct pod is being used to check if # it's alive _ = get_admin_key() except CommandFailed as ex: if "connection timed out" in str(ex): logger.info( "Ceph tools box was running on the node that its data " "volume has been detached. 
Hence, waiting for a new " "Ceph tools box pod to spin up") wait_for_resource_count_change( func_to_use=get_all_pods, previous_num=1, namespace=config.ENV_DATA['cluster_namespace'], timeout=120, selector='app=rook-ceph-tools') else: raise finally: self.sanity_helpers.create_resources(pvc_factory, pod_factory) # Wait for worker volume to be re-attached automatically to the node assert nodes.wait_for_volume_attach(data_volume), ( f"Volume {data_volume.id} failed to be re-attached to a worker node" ) # Restart the instance so the volume will get re-mounted nodes.restart_nodes([worker]) # Cluster health check self.sanity_helpers.health_check() @pytest.mark.polarion_id("OCS-1086") def test_detach_attach_2_data_volumes(self, nodes, pvc_factory, pod_factory): """ Detach and attach disk from 2 worker nodes - Detach the data 2 of the data volumes from their worker nodes - Wait for the volumes to be re-attached back to the worker nodes - Restart the nodes so the volume will get re-mounted in each node - Check cluster health and functionality to make sure detach, attach and restart did not affect the cluster """ # Get 2 data volumes data_volumes = nodes.get_data_volumes()[:2] workers_and_volumes = [{ 'worker': nodes.get_node_by_attached_volume(vol), 'volume': vol } for vol in data_volumes] for worker_and_volume in workers_and_volumes: # Detach the volume (logging is done inside the function) nodes.detach_volume(worker_and_volume['volume']) for worker_and_volume in workers_and_volumes: # Wait for worker volume to be re-attached automatically to the node assert nodes.wait_for_volume_attach(worker_and_volume['volume']), ( f"Volume {worker_and_volume['volume']} " f"failed to be re-attached to a worker node") # Restart the instances so the volume will get re-mounted nodes.restart_nodes([ worker_and_volume['worker'] for worker_and_volume in workers_and_volumes ]) # Validate cluster is still functional self.sanity_helpers.health_check() self.sanity_helpers.create_resources(pvc_factory, pod_factory)
class TestNodesMaintenance(ManageTest): """ Test basic flows of maintenance (unschedule and drain) and activate operations, followed by cluster functionality and health checks """ @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() @tier1 @pytest.mark.parametrize( argnames=["node_type"], argvalues=[ pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1269")), pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1272")) ] ) def test_node_maintenance(self, node_type, pvc_factory, pod_factory): """ OCS-1269/OCS-1272: - Maintenance (mark as unscheduable and drain) 1 worker/master node - Check cluster functionality by creating resources (pools, storageclasses, PVCs, pods - both CephFS and RBD) - Mark the node as scheduable - Check cluster and Ceph health """ # Get 1 node typed_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=1) typed_node_name = typed_nodes[0].name # Maintenance the node (unschedule and drain) node.drain_nodes([typed_node_name]) # Check basic cluster functionality by creating resources # (pools, storageclasses, PVCs, pods - both CephFS and RBD), # run IO and delete the resources self.sanity_helpers.create_resources(pvc_factory, pod_factory) self.sanity_helpers.delete_resources() # Mark the node back to schedulable node.schedule_nodes([typed_node_name]) # Perform cluster and Ceph health checks self.sanity_helpers.health_check() @tier2 @pytest.mark.skipif( condition=config.ENV_DATA['platform'] != 'AWS', reason="Tests are not running on AWS deployed cluster" ) @pytest.mark.parametrize( argnames=["node_type"], argvalues=[ pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1292")), pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1293")) ] ) def test_node_maintenance_restart_activate( self, ec2_instances, aws_obj, pvc_factory, pod_factory, node_type ): """ OCS-1292/OCS-1293: - Maintenance (mark as unscheduable and drain) 1 worker/master node - Restart the node's ec2 instance - Mark the node as scheduable - Check cluster and Ceph health - Check cluster functionality by creating and deleting resources (pools, storageclasses, PVCs, pods - both CephFS and RBD) """ # Get 1 node typed_node = node.get_typed_nodes(node_type=node_type, num_of_nodes=1) assert typed_node, f"Failed to find a {node_type} node for the test" typed_node_name = typed_node[0].name # Maintenance the node (unschedule and drain). 
The function contains logging node.drain_nodes([typed_node_name]) instance = aws.get_instances_ids_and_names(typed_node) assert instance, f"Failed to get ec2 instances for node {typed_node_name}" # Restarting ec2 instance aws_obj.restart_ec2_instances(instances=instance, wait=True) node.wait_for_nodes_status( node_names=[typed_node_name], status=constants.NODE_READY_SCHEDULING_DISABLED ) # Mark the node back to schedulable node.schedule_nodes([typed_node_name]) # Check cluster and Ceph health and checking basic cluster # functionality by creating resources (pools, storageclasses, # PVCs, pods - both CephFS and RBD), run IO and delete the resources self.sanity_helpers.health_check() self.sanity_helpers.create_resources(pvc_factory, pod_factory) self.sanity_helpers.delete_resources() @tier2 @pytest.mark.parametrize( argnames=["nodes_type"], argvalues=[ pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1273")), pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1271")) ] ) def test_2_nodes_maintenance_same_type( self, pvc_factory, pod_factory, nodes_type ): """ OCS-1273/OCs-1271: - Maintenance (mark as unscheduable and drain) 2 worker/master nodes - Mark the nodes as scheduable - Check cluster and Ceph health - Check cluster functionality by creating resources (pools, storageclasses, PVCs, pods - both CephFS and RBD) """ # Get 2 nodes typed_nodes = node.get_typed_nodes(node_type=nodes_type, num_of_nodes=2) assert typed_nodes, f"Failed to find a {nodes_type} node for the test" typed_node_names = [typed_node.name for typed_node in typed_nodes] # Maintenance the nodes (unschedule and drain) node.drain_nodes(typed_node_names) # Mark the nodes back to schedulable node.schedule_nodes(typed_node_names) # Perform cluster and Ceph health checks self.sanity_helpers.health_check() # Check basic cluster functionality by creating resources # (pools, storageclasses, PVCs, pods - both CephFS and RBD), # run IO and delete the resources self.sanity_helpers.create_resources(pvc_factory, pod_factory) self.sanity_helpers.delete_resources() @tier2 @pytest.mark.polarion_id("OCS-1274") def test_2_nodes_different_types(self, pvc_factory, pod_factory): """ OCS-1274: - Maintenance (mark as unscheduable and drain) 1 worker node and 1 master node - Check cluster functionality by creating resources (pools, storageclasses, PVCs, pods - both CephFS and RBD) - Mark the nodes as scheduable - Check cluster and Ceph health """ # Get 1 node from each type nodes = [ node.get_typed_nodes( node_type=node_type, num_of_nodes=1 )[0] for node_type in ['worker', 'master'] ] assert nodes, f"Failed to find a nodes for the test" node_names = [typed_node.name for typed_node in nodes] # Maintenance the nodes (unschedule and drain) node.drain_nodes(node_names) # Check basic cluster functionality by creating resources # (pools, storageclasses, PVCs, pods - both CephFS and RBD), # run IO and delete the resources self.sanity_helpers.create_resources(pvc_factory, pod_factory) self.sanity_helpers.delete_resources() # Mark the nodes back to schedulable node.schedule_nodes(node_names) # Perform cluster and Ceph health checks self.sanity_helpers.health_check()
class TestJenkinsNodeReboot(E2ETest): """ Test running Jenkins and Node Reboot """ @pytest.fixture() def jenkins_setup(self, jenkins): """ JENKINS test setup """ # Initialize Sanity instance self.sanity_helpers = Sanity() # Deployment of jenkins jenkins.create_ocs_jenkins_template() @pytest.mark.parametrize( argnames=['node_type', 'num_projects', 'num_of_builds'], argvalues=[ pytest.param(*[MASTER_MACHINE, 2, 15], marks=pytest.mark.polarion_id("OCS-2202")), pytest.param(*[WORKER_MACHINE, 2, 15], marks=pytest.mark.polarion_id("OCS-2178")), ]) @pytest.mark.usefixtures(jenkins_setup.__name__) def test_run_jenkins_node_reboot(self, jenkins, nodes, node_type, num_projects, num_of_builds): """ Test Node Reboot jenkins """ # Init number of projects jenkins.number_projects = num_projects # Create app jenkins jenkins.create_app_jenkins() # Create jenkins pvc jenkins.create_jenkins_pvc() # Create jenkins build config jenkins.create_jenkins_build_config() # Wait jenkins deploy pod reach to completed state jenkins.wait_for_jenkins_deploy_status(status=STATUS_COMPLETED) # Get relevant node nodes_reboot = jenkins.get_node_name_where_jenkins_pod_not_hosted( node_type=node_type, num_of_nodes=1) # Init number of builds per project jenkins.number_builds_per_project = num_of_builds # Start Builds jenkins.start_build() if len(nodes_reboot) > 0: # Restart Node nodes.restart_nodes(get_node_objs(nodes_reboot)) else: log.info('No node was reboot') # Wait build reach 'Complete' state jenkins.wait_for_build_to_complete() # Print table of builds jenkins.print_completed_builds_results() # Perform cluster and Ceph health checks self.sanity_helpers.health_check()
class TestAutomatedRecoveryFromFailedNodes(ManageTest): """ Knip-678 Automated recovery from failed nodes """ @pytest.fixture(autouse=True) def teardown(self, request): def finalizer(): worker_nodes = get_worker_nodes() # Removing created label on all worker nodes remove_label_from_worker_node(worker_nodes, label_key="dc") request.addfinalizer(finalizer) @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() @pytest.mark.parametrize( argnames=["interface"], argvalues=[ pytest.param(*['rbd'], marks=pytest.mark.polarion_id("OCS-2100")), pytest.param(*['cephfs'], marks=pytest.mark.polarion_id("OCS-2101")), ]) def test_automated_recovery_from_failed_nodes_IPI_proactive( self, interface, pvc_factory, pod_factory, dc_pod_factory): """ Knip-678 Automated recovery from failed nodes Proactive case - IPI """ # Get OSD running nodes osd_running_nodes = get_osd_running_nodes() log.info(f"OSDs are running on nodes {osd_running_nodes}") # Label osd nodes with fedora app label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora') # Create DC app pods log.info("Creating DC based app pods") interface = (constants.CEPHBLOCKPOOL if interface == 'rbd' else constants.CEPHFILESYSTEM) dc_pod_obj = [] for i in range(2): dc_pod = dc_pod_factory(interface=interface, node_selector={'dc': 'fedora'}) pod.run_io_in_bg(dc_pod, fedora_dc=True) dc_pod_obj.append(dc_pod) # Get app pods running nodes dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj) log.info(f"DC app pod running nodes are {dc_pod_node_name}") # Get both osd and app pod running node common_nodes = get_both_osd_and_app_pod_running_node( osd_running_nodes, dc_pod_node_name) msg = "Common OSD and app running node(s) NOT found" assert (len(common_nodes) > 0), msg log.info(f"Common OSD and app pod running nodes are {common_nodes}") # Get the machine name using the node name machine_name = machine.get_machine_from_node_name(common_nodes[0]) log.info(f"{common_nodes[0]} associated machine is {machine_name}") # Get the machineset name using machine name machineset_name = machine.get_machineset_from_machine_name( machine_name) log.info( f"{common_nodes[0]} associated machineset is {machineset_name}") # Add a new node and label it add_new_node_and_label_it(machineset_name) # Delete the machine machine.delete_machine(machine_name) log.info(f"Successfully deleted machine {machine_name}") # DC app pods on the failed node will get automatically created on # other running node. Waiting for all dc app pod to reach running # state pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj) log.info("All the dc pods reached running state") # Check all OCS pods status, they should be in running state all_pod_obj = pod.get_all_pods( namespace=defaults.ROOK_CLUSTER_NAMESPACE) for pod_obj in all_pod_obj: if ('-1-deploy' or 'ocs-deviceset') not in pod_obj.name: try: helpers.wait_for_resource_state( resource=pod_obj, state=constants.STATUS_RUNNING, timeout=200) except ResourceWrongStatusException: # 'rook-ceph-crashcollector' on the failed node stucks at # pending state. BZ 1810014 tracks it. # Ignoring 'rook-ceph-crashcollector' pod health check as # WA and deleting its deployment so that the pod # disappears. 
Will revert this WA once the BZ is fixed if 'rook-ceph-crashcollector' in pod_obj.name: ocp_obj = ocp.OCP( namespace=defaults.ROOK_CLUSTER_NAMESPACE) pod_name = pod_obj.name deployment_name = '-'.join(pod_name.split("-")[:-2]) command = f"delete deployment {deployment_name}" ocp_obj.exec_oc_cmd(command=command) log.info(f"Deleted deployment for pod {pod_obj.name}") else: raise # Check basic cluster functionality by creating resources # (pools, storageclasses, PVCs, pods - both CephFS and RBD), # run IO and delete the resources self.sanity_helpers.create_resources(pvc_factory, pod_factory) self.sanity_helpers.delete_resources() # Perform cluster and Ceph health checks self.sanity_helpers.health_check()
class TestMonitoringBackedByOCS(E2ETest): """ Test cases to validate monitoring backed by OCS """ num_of_pvcs = 5 pvc_size = 5 @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() @pytest.fixture(autouse=True) def teardown(self, request, nodes): """ Restart nodes that are in status NotReady or unschedulable, for situations in which the test failed in between restarting or scheduling those nodes """ def finalizer(): # Validate all nodes are schedulable scheduling_disabled_nodes = [ n.name for n in get_node_objs() if n.ocp.get_resource_status( n.name) == constants.NODE_READY_SCHEDULING_DISABLED ] if scheduling_disabled_nodes: schedule_nodes(scheduling_disabled_nodes) # Validate all nodes are in READY state not_ready_nodes = [ n for n in get_node_objs() if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY ] log.warning( f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}" ) if not_ready_nodes: nodes.restart_nodes(not_ready_nodes) wait_for_nodes_status() log.info("All nodes are in Ready status") assert prometheus_health_check(), "Prometheus health is degraded" request.addfinalizer(finalizer) @pytest.fixture() def pods(self, multi_pvc_factory, dc_pod_factory): """ Prepare multiple dc pods for the test Returns: list: Pod instances """ sc = default_storage_class(interface_type=constants.CEPHBLOCKPOOL) pvc_objs = multi_pvc_factory(interface=constants.CEPHBLOCKPOOL, storageclass=sc, size=self.pvc_size, num_of_pvc=self.num_of_pvcs) pod_objs = [] for pvc_obj in pvc_objs: pod_objs.append(dc_pod_factory(pvc=pvc_obj)) # Check for the created pvc metrics on prometheus pod for pod_obj in pod_objs: assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" ) return pod_objs @pytest.mark.polarion_id("OCS-576") def test_monitoring_after_restarting_prometheus_pod(self, pods): """ Test case to validate prometheus pod restart should not have any functional impact """ # Get the prometheus pod prometheus_pod_obj = pod.get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']) for pod_object in prometheus_pod_obj: # Get the pvc which mounted on prometheus pod pod_info = pod_object.get() pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][ 'claimName'] # Restart the prometheus pod pod_object.delete(force=True) pod_obj = ocp.OCP(kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE) assert pod_obj.wait_for_resource(condition='Running', selector=f'app=prometheus', timeout=60) # Check the same pvc is mounted on new pod pod_info = pod_object.get() assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][ 'claimName'] in pvc_name, ( f"Old pvc not found after restarting the prometheus pod {pod_object.name}" ) for pod_obj in pods: assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" ) @pytest.mark.polarion_id("OCS-579") def test_monitoring_after_draining_node_where_prometheus_hosted( self, pods): """ Test case to validate when node is drained where prometheus is hosted, prometheus pod should re-spin on new healthy node and shouldn't be any data/metrics loss """ # Get the prometheus pod pod_obj_list = pod.get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']) for pod_obj in pod_obj_list: # Get the pvc which mounted on prometheus pod pod_info = pod_obj.get() 
pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][ 'claimName'] # Get the node where the prometheus pod is hosted prometheus_pod_obj = pod_obj.get() prometheus_node = prometheus_pod_obj['spec']['nodeName'] # Drain node where the prometheus pod hosted drain_nodes([prometheus_node]) # Validate node is in SchedulingDisabled state wait_for_nodes_status( [prometheus_node], status=constants.NODE_READY_SCHEDULING_DISABLED) # Validate all prometheus pod is running POD = ocp.OCP(kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE) assert POD.wait_for_resource( condition='Running', selector='app=prometheus', timeout=180), ( "One or more prometheus pods are not in running state") # Validate prometheus pod is re-spinned on new healthy node pod_info = pod_obj.get() new_node = pod_info['spec']['nodeName'] assert new_node not in prometheus_node, ( 'Promethues pod not re-spinned on new node') log.info(f"Prometheus pod re-spinned on new node {new_node}") # Validate same pvc is mounted on prometheus pod assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][ 'claimName'] in pvc_name, ( f"Old pvc not found after restarting the prometheus pod {pod_obj.name}" ) # Validate the prometheus health is ok assert prometheus_health_check(), ( "Prometheus cluster health is not OK") # Mark the nodes back to schedulable schedule_nodes([prometheus_node]) # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check() # Check for the created pvc metrics after rebooting the master nodes for pod_obj in pods: assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" ) @pytest.mark.polarion_id("OCS-580") def test_monitoring_after_respinning_ceph_pods(self, pods): """ Test case to validate respinning the ceph pods and its interaction with prometheus pod """ # Re-spin the ceph pods(i.e mgr, mon, osd, mds) one by one resource_to_delete = ['mgr', 'mon', 'osd'] disruption = Disruptions() for res_to_del in resource_to_delete: disruption.set_resource(resource=res_to_del) disruption.delete_resource() # Check for the created pvc metrics on prometheus pod for pod_obj in pods: assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" ) @pytest.mark.polarion_id("OCS-605") def test_monitoring_when_osd_down(self, pods): """ Test case to validate monitoring when osd is down """ # Get osd pods osd_pod_list = pod.get_osd_pods() # Make one of the osd down(first one) resource_name = osd_pod_list[0].get().get('metadata').get('name') assert modify_osd_replica_count(resource_name=resource_name, replica_count=0) # Validate osd is down pod_obj = ocp.OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE) pod_obj.wait_for_delete(resource_name=resource_name), ( f"Resources is not deleted {resource_name}") # Check for the created pvc metrics when osd is down for pod_obj in pods: assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" ) # Make osd up which was down assert modify_osd_replica_count(resource_name=resource_name, replica_count=1) # Validate osd is up and ceph health is ok self.sanity_helpers.health_check() @pytest.mark.polarion_id("OCS-606") def test_monitoring_when_one_of_the_prometheus_node_down( self, nodes, pods): """ Test case to validate when the prometheus pod is down and its 
interaction with prometheus """ # Get all prometheus pods pod_obj_list = pod.get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']) for pod_obj in pod_obj_list: # Get the node where the prometheus pod is hosted pod_node_obj = pod.get_pod_node(pod_obj) # Make one of the node down where the prometheus pod is hosted nodes.restart_nodes([pod_node_obj]) # Validate all nodes are in READY state wait_for_nodes_status() # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check() # Check all the prometheus pods are up for pod_obj in pod_obj_list: wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180) # Check for the created pvc metrics after restarting node where prometheus pod is hosted for pod_obj in pods: assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" ) log.info( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is collected" ) @pytest.mark.polarion_id("OCS-709") def test_monitoring_after_rebooting_master_node(self, nodes, pods): """ Test case to validate rebooting master node shouldn't delete the data collected on prometheus pod """ # Get the master node list master_nodes = get_typed_nodes(node_type='master') # Reboot one after one master nodes for node in master_nodes: nodes.restart_nodes([node]) wait_for_nodes_status_and_prometheus_health_check(pods) # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check() @pytest.mark.polarion_id("OCS-710") def test_monitoring_after_rebooting_node_where_mgr_is_running( self, nodes, pods): """ Test case to validate rebooting a node where mgr is running should not delete the data collected on prometheus pod """ # Get the mgr pod obj mgr_pod_obj = pod.get_mgr_pods() # Get the node where the mgr pod is hosted mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0]) # Reboot the node where the mgr pod is hosted nodes.restart_nodes([mgr_node_obj]) # Validate all nodes are in READY state wait_for_nodes_status() # Check for Ceph pods pod_obj = ocp.OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE) assert pod_obj.wait_for_resource(condition='Running', selector='app=rook-ceph-mgr', timeout=600) assert pod_obj.wait_for_resource(condition='Running', selector='app=rook-ceph-mon', resource_count=3, timeout=600) assert pod_obj.wait_for_resource(condition='Running', selector='app=rook-ceph-osd', resource_count=3, timeout=600) # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check() # Check for ceph health check metrics is updated with new mgr pod wait_to_update_mgrpod_info_prometheus_pod() # Check for the created pvc metrics after rebooting the node where mgr pod was running for pod_obj in pods: assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" ) @pytest.mark.polarion_id("OCS-711") def test_monitoring_shutdown_and_recovery_prometheus_node( self, nodes, pods): """ Test case to validate whether shutdown and recovery of a node where monitoring pods running has no functional impact """ # Get all prometheus pods prometheus_pod_obj_list = pod.get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']) for prometheus_pod_obj in prometheus_pod_obj_list: # Get the node where the prometheus pod is hosted prometheus_node_obj = 
pod.get_pod_node(prometheus_pod_obj) # Shutdown and recovery node(i,e. restart nodes) where the prometheus pod is hosted nodes.stop_nodes([prometheus_node_obj]) waiting_time = 20 log.info(f"Waiting for {waiting_time} seconds") time.sleep(waiting_time) nodes.start_nodes([prometheus_node_obj]) # Validate all nodes are in READY state wait_for_nodes_status() # Check all the prometheus pods are up for pod_obj in prometheus_pod_obj_list: wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180) # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check() # Check for the created pvc metrics after shutdown and recovery of prometheus nodes for pod_obj in pods: assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" )
class FlowOperations: """ Flow based operations class """ def __init__(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() def validate_cluster(self, cluster_check=False, node_status=False, pod_status=False, operation_name=""): """ Validates various ceph and ocs cluster checks Args: node_status (bool): Verifies node is Ready pod_status (bool): Verifies StorageCluster pods in expected state operation_name (str): Name of the operation, to Tag """ logger.info(f"{operation_name}: Verifying cluster health") assert ceph_health_check( defaults.ROOK_CLUSTER_NAMESPACE, tries=100), "Entry criteria FAILED: Cluster is Unhealthy" if cluster_check: self.sanity_helpers.health_check(tries=100) if node_status: logger.info(f"{operation_name}: Verifying whether node is ready") wait_for_nodes_status(status=constants.NODE_READY, timeout=300) if pod_status: logger.info( f"{operation_name}: Verifying StorageCluster pods are in running/completed state" ) assert check_pods_in_running_state( ), 'Some pods were not in expected state' def node_operations_entry_criteria(self, node_type, number_of_nodes, operation_name="Node Operation", network_fail_time=None): """ Entry criteria function for node related operations Args: node_type (str): Type of node number_of_nodes (int): Number of nodes operation_name (str): Name of the node operation network_fail_time (int): Total time to fail the network in a node Returns: tuple: containing the params used in Node operations """ self.validate_cluster(node_status=True, operation_name=operation_name) logger.info(f"Getting parameters related to: {operation_name}") typed_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=number_of_nodes) if network_fail_time: return typed_nodes, network_fail_time else: return typed_nodes def add_capacity_entry_criteria(self): """ Entry criteria verification function for add capacity operation Returns: tuple: containing the params used in add capacity exit operation """ self.validate_cluster(operation_name="Add Capacity") logger.info( "Add capacity: Getting restart count of pods before adding capacity" ) restart_count_before = pod_helpers.get_pod_restarts_count( defaults.ROOK_CLUSTER_NAMESPACE) logger.info( "Add capacity entry: Getting OSD pod count before adding capacity") osd_pods_before = pod_helpers.get_osd_pods() return osd_pods_before, restart_count_before def add_capacity_exit_criteria(self, restart_count_before, osd_pods_before): """ Exit criteria function for Add capacity operation Args: restart_count_before (dict): Restart counts of pods osd_pods_before (list): List of OSD pods before """ self.validate_cluster(operation_name="Add Capacity") logger.info( "Add capacity: Getting restart count of pods after adding capacity" ) restart_count_after = pod_helpers.get_pod_restarts_count( defaults.ROOK_CLUSTER_NAMESPACE) logger.info( f"Sum of restart count before = {sum(restart_count_before.values())}" ) logger.info( f"Sum of restart count after = {sum(restart_count_after.values())}" ) assert sum(restart_count_before.values()) == sum( restart_count_after.values() ), "Exit criteria verification FAILED: One or more pods got restarted" osd_pods_after = pod_helpers.get_osd_pods() number_of_osds_added = len(osd_pods_after) - len(osd_pods_before) logger.info( f"Number of OSDs added = {number_of_osds_added}, " f"before = {len(osd_pods_before)}, after = {len(osd_pods_after)}") assert number_of_osds_added == 3, "Exit criteria verification FAILED: osd count mismatch" logger.info("Add capacity: Exit criteria verification: Success")
class TestDiskFailures(ManageTest): """ Test class for detach and attach worker volume """ @pytest.fixture(autouse=True) def teardown(self, request, nodes): """ Restart nodes that are in status NotReady, for situations in which the test failed before restarting the node after detach volume, which leaves nodes in NotReady """ def finalizer(): not_ready_nodes = [ n for n in node.get_node_objs() if n .ocp.get_resource_status(n.name) == constants.NODE_NOT_READY ] logger.warning( f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}" ) if not_ready_nodes: nodes.restart_nodes(not_ready_nodes) node.wait_for_nodes_status() request.addfinalizer(finalizer) @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() @aws_platform_required @pytest.mark.polarion_id("OCS-1085") def test_detach_attach_worker_volume(self, nodes, pvc_factory, pod_factory): """ Detach and attach worker volume - Detach the data volume from one of the worker nodes - Validate cluster functionality, without checking cluster and Ceph health (as one node volume is detached, the cluster will be unhealthy) by creating resources and running IO - Wait for the volumes to be re-attached back to the worker node - Restart the node so the volume will get re-mounted """ # Get a data volume data_volume = nodes.get_data_volumes()[0] # Get the worker node according to the volume attachment worker = nodes.get_node_by_attached_volume(data_volume) # Detach volume (logging is done inside the function) nodes.detach_volume(data_volume, worker) # Validate cluster is still functional # In case the selected node that its volume disk was detached was the one # running the ceph tools pod, we'll need to wait for a new ct pod to start. # For that, a function that connects to the ct pod is being used to check if # it's alive assert wait_for_ct_pod_recovery(), "Ceph tools pod failed to come up on another node" self.sanity_helpers.create_resources(pvc_factory, pod_factory) # Wait for worker volume to be re-attached automatically to the node assert nodes.wait_for_volume_attach(data_volume), ( "Volume failed to be re-attached to a worker node" ) # Restart the instance so the volume will get re-mounted nodes.restart_nodes([worker]) # Cluster health check # W/A: For the investigation of BZ 1825675, timeout is increased to see if cluster # becomes healthy eventually # TODO: Remove 'tries=100' self.sanity_helpers.health_check(tries=100) @aws_platform_required @pytest.mark.polarion_id("OCS-1086") def test_detach_attach_2_data_volumes(self, nodes, pvc_factory, pod_factory): """ Detach and attach disk from 2 worker nodes - Detach the data 2 of the data volumes from their worker nodes - Wait for the volumes to be re-attached back to the worker nodes - Restart the nodes so the volume will get re-mounted in each node - Check cluster health and functionality to make sure detach, attach and restart did not affect the cluster """ # Get 2 data volumes data_volumes = nodes.get_data_volumes()[:2] workers_and_volumes = [ {'worker': nodes.get_node_by_attached_volume(vol), 'volume': vol} for vol in data_volumes ] for worker_and_volume in workers_and_volumes: # Detach the volume (logging is done inside the function) nodes.detach_volume( worker_and_volume['volume'], nodes.detach_volume(worker_and_volume['worker']) ) for worker_and_volume in workers_and_volumes: # Wait for worker volume to be re-attached automatically to the node assert nodes.wait_for_volume_attach(worker_and_volume['volume']), ( f"Volume 
{worker_and_volume['volume']} " f"failed to be re-attached to a worker node" ) # Restart the instances so the volume will get re-mounted nodes.restart_nodes( [worker_and_volume['worker'] for worker_and_volume in workers_and_volumes] ) # Validate cluster is still functional self.sanity_helpers.health_check() self.sanity_helpers.create_resources(pvc_factory, pod_factory) @bugzilla('1830702') @vsphere_platform_required @pytest.mark.polarion_id("OCS-2172") def test_recovery_from_volume_deletion(self, nodes, pvc_factory, pod_factory): """ Test cluster recovery from disk deletion from the platform side. Based on documented procedure detailed in https://bugzilla.redhat.com/show_bug.cgi?id=1787236#c16 """ logger.info("Picking a PV which will be deleted from the platform side") osd_pvs = get_deviceset_pvs() osd_pv = random.choice(osd_pvs) osd_pv_name = osd_pv.name # get the claim name logger.info(f"Getting the claim name for OSD PV {osd_pv_name}") claim_name = osd_pv.get().get('spec').get('claimRef').get('name') # Get the backing volume name logger.info(f"Getting the backing volume name for PV {osd_pv_name}") backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0] # Get the corresponding PVC logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}") osd_pvcs = get_deviceset_pvcs() osd_pvcs_count = len(osd_pvcs) osd_pvc = [ds for ds in osd_pvcs if ds.get().get('metadata').get('name') == claim_name][0] # Get the corresponding OSD pod logger.info(f"Getting the corresponding OSD pod of PVC {osd_pvc.name}") osd_pods = get_osd_pods() osd_pods_count = len(osd_pods) osd_pod = [ osd_pod for osd_pod in osd_pods if osd_pod.get() .get('metadata').get('labels').get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name ][0] # Get the node that has the OSD pod running on logger.info(f"Getting the node that has the OSD pod {osd_pod.name} running on") osd_node = get_pod_node(osd_pod) volume_size = osd_pvc.size osd_prepare_pods = get_osd_prepare_pods() osd_prepare_pod = [ pod for pod in osd_prepare_pods if pod.get().get('metadata') .get('labels').get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name ][0] osd_prepare_job_name = osd_prepare_pod.get().get('metadata').get('labels').get('job-name') osd_prepare_job = get_job_obj(osd_prepare_job_name) # Get the corresponding OSD deployment logger.info(f"Getting the corresponding OSD deployment for OSD PVC {claim_name}") osd_deployment = [ osd_pod for osd_pod in get_osd_deployments() if osd_pod.get() .get('metadata').get('labels').get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name ][0] # Delete the volume from the platform side logger.info(f"Deleting volume {backing_volume} from the platform side") nodes.detach_volume(backing_volume, osd_node) # Delete the OSD deployment osd_deployment_name = osd_deployment.name logger.info(f"Deleting OSD deployment {osd_deployment_name}") osd_deployment.delete() osd_deployment.ocp.wait_for_delete(resource_name=osd_deployment_name, timeout=120) # Delete the OSD prepare job osd_prepare_job.delete() osd_prepare_job.ocp.wait_for_delete(resource_name=osd_prepare_job_name, timeout=120) # Delete the OSD PVC osd_pvc_name = osd_pvc.name logger.info(f"Deleting OSD PVC {osd_pvc_name}") osd_pvc.delete() osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name) # Recreate a volume from the platform side logger.info("Creating a replacing volume from the platform side") nodes.create_and_attach_volume(osd_node, volume_size) # Delete the rook ceph operator pod to trigger reconciliation rook_operator_pod = get_operator_pods()[0] logger.info(f"deleting Rook 
Ceph operator pod {rook_operator_pod.name}") rook_operator_pod.delete() timeout = 600 # Wait for OSD PVC to get created and reach Bound state logger.info("Waiting for a new OSD PVC to get created and reach Bound state") assert osd_pvc.ocp.wait_for_resource( timeout=timeout, condition=constants.STATUS_BOUND, selector=constants.OSD_PVC_GENERIC_LABEL, resource_count=osd_pvcs_count ), ( f"Cluster recovery failed after {timeout} seconds. " f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: " f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}" ) # Wait for OSD pod to get created and reach Running state logger.info("Waiting for a new OSD pod to get created and reach Running state") assert osd_pod.ocp.wait_for_resource( timeout=timeout, condition=constants.STATUS_RUNNING, selector=constants.OSD_APP_LABEL, resource_count=osd_pods_count ), ( f"Cluster recovery failed after {timeout} seconds. " f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: " f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}" ) # Validate cluster is still functional self.sanity_helpers.health_check(tries=80) self.sanity_helpers.create_resources(pvc_factory, pod_factory)
class TestMonitoringBackedByOCS(E2ETest): """ Test cases to validate monitoring backed by OCS """ num_of_pvcs = 5 pvc_size = 5 @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() @pytest.fixture(autouse=True) def teardown(self, request, nodes): """ Restart nodes that are in status NotReady or unschedulable, for situations in which the test failed in between restarting or scheduling those nodes """ def finalizer(): # Validate all nodes are schedulable scheduling_disabled_nodes = [ n.name for n in get_node_objs() if n.ocp.get_resource_status( n.name) == constants.NODE_READY_SCHEDULING_DISABLED ] if scheduling_disabled_nodes: schedule_nodes(scheduling_disabled_nodes) # Validate all nodes are in READY state not_ready_nodes = [ n for n in get_node_objs() if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY ] log.warning( f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}" ) if not_ready_nodes: nodes.restart_nodes_by_stop_and_start(not_ready_nodes) wait_for_nodes_status() log.info("All nodes are in Ready status") assert prometheus_health_check(), "Prometheus health is degraded" request.addfinalizer(finalizer) @pytest.fixture() def pods(self, multi_pvc_factory, dc_pod_factory): """ Prepare multiple dc pods for the test Returns: list: Pod instances """ sc = default_storage_class(interface_type=constants.CEPHBLOCKPOOL) pvc_objs = multi_pvc_factory(interface=constants.CEPHBLOCKPOOL, storageclass=sc, size=self.pvc_size, num_of_pvc=self.num_of_pvcs) pod_objs = [] for pvc_obj in pvc_objs: pod_objs.append(dc_pod_factory(pvc=pvc_obj)) # Check for the created pvc metrics on prometheus pod for pod_obj in pod_objs: assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" ) return pod_objs @pytest.mark.polarion_id("OCS-576") def test_monitoring_after_restarting_prometheus_pod(self, pods): """ Test case to validate prometheus pod restart should not have any functional impact """ # Get the prometheus pod prometheus_pod_obj = pod.get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']) for pod_object in prometheus_pod_obj: # Get the pvc which mounted on prometheus pod pod_info = pod_object.get() pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][ 'claimName'] # Restart the prometheus pod pod_object.delete(force=True) pod_obj = ocp.OCP(kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE) assert pod_obj.wait_for_resource(condition='Running', selector='app=prometheus', timeout=60) # Check the same pvc is mounted on new pod pod_info = pod_object.get() assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][ 'claimName'] in pvc_name, ( f"Old pvc not found after restarting the prometheus pod {pod_object.name}" ) for pod_obj in pods: assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" ) @pytest.mark.polarion_id("OCS-579") def test_monitoring_after_draining_node_where_prometheus_hosted( self, pods): """ Test case to validate when node is drained where prometheus is hosted, prometheus pod should re-spin on new healthy node and shouldn't be any data/metrics loss """ # Get the prometheus pod pod_obj_list = pod.get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']) for pod_obj in pod_obj_list: # Get the pvc which mounted on prometheus pod 
pod_info = pod_obj.get() pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][ 'claimName'] # Get the node where the prometheus pod is hosted prometheus_pod_obj = pod_obj.get() prometheus_node = prometheus_pod_obj['spec']['nodeName'] # Drain node where the prometheus pod hosted drain_nodes([prometheus_node]) # Validate node is in SchedulingDisabled state wait_for_nodes_status( [prometheus_node], status=constants.NODE_READY_SCHEDULING_DISABLED) # Validate all prometheus pod is running POD = ocp.OCP(kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE) assert POD.wait_for_resource( condition='Running', selector='app=prometheus', timeout=180), ( "One or more prometheus pods are not in running state") # Validate prometheus pod is re-spinned on new healthy node pod_info = pod_obj.get() new_node = pod_info['spec']['nodeName'] assert new_node not in prometheus_node, ( 'Promethues pod not re-spinned on new node') log.info(f"Prometheus pod re-spinned on new node {new_node}") # Validate same pvc is mounted on prometheus pod assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][ 'claimName'] in pvc_name, ( f"Old pvc not found after restarting the prometheus pod {pod_obj.name}" ) # Validate the prometheus health is ok assert prometheus_health_check(), ( "Prometheus cluster health is not OK") # Mark the nodes back to schedulable schedule_nodes([prometheus_node]) # Wait some time after node scheduling back waiting_time = 30 log.info(f"Waiting {waiting_time} seconds...") time.sleep(waiting_time) # Validate node is in Ready State wait_for_nodes_status([prometheus_node], status=constants.NODE_READY) # Validate ceph health OK ceph_health_check(tries=40, delay=30) # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check() # Check for the created pvc metrics after rebooting the master nodes for pod_obj in pods: assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" ) @pytest.mark.polarion_id("OCS-580") def test_monitoring_after_respinning_ceph_pods(self, pods): """ Test case to validate respinning the ceph pods and its interaction with prometheus pod """ # Re-spin the ceph pods(i.e mgr, mon, osd, mds) one by one resource_to_delete = ['mgr', 'mon', 'osd'] disruption = Disruptions() for res_to_del in resource_to_delete: disruption.set_resource(resource=res_to_del) disruption.delete_resource() # Check for the created pvc metrics on prometheus pod for pod_obj in pods: assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" ) @pytest.mark.polarion_id("OCS-605") def test_monitoring_when_osd_down(self, pods): """ Test case to validate monitoring when osd is down """ # Get osd pods osd_pod_list = pod.get_osd_pods() # Make one of the osd down(first one) resource_name = osd_pod_list[0].get().get('metadata').get('name') assert modify_osd_replica_count(resource_name=resource_name, replica_count=0) # Validate osd is down pod_obj = ocp.OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE) pod_obj.wait_for_delete(resource_name=resource_name), ( f"Resources is not deleted {resource_name}") # Check for the created pvc metrics when osd is down for pod_obj in pods: assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" ) # Make osd up which was down assert 
modify_osd_replica_count(resource_name=resource_name, replica_count=1) # Validate osd is up and ceph health is ok self.sanity_helpers.health_check() @pytest.mark.polarion_id("OCS-606") def test_monitoring_when_one_of_the_prometheus_node_down( self, nodes, pods): """ Test case to validate when the prometheus pod is down and its interaction with prometheus """ # Get all prometheus pods pod_obj_list = pod.get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']) for pod_obj in pod_obj_list: # Get the node where the prometheus pod is hosted pod_node_obj = pod.get_pod_node(pod_obj) # Make one of the node down where the prometheus pod is hosted nodes.restart_nodes([pod_node_obj]) # Validate all nodes are in READY state retry((CommandFailed, ResourceWrongStatusException), tries=20, delay=15)(wait_for_nodes_status()) # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check(tries=40) # Check all the prometheus pods are up for pod_obj in pod_obj_list: wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180) # Check for the created pvc metrics after restarting node where prometheus pod is hosted for pod_obj in pods: assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" ) log.info( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is collected" ) @pytest.mark.polarion_id("OCS-709") def test_monitoring_after_rebooting_master_node(self, nodes, pods): """ Test case to validate rebooting master node shouldn't delete the data collected on prometheus pod """ # Get the master node list master_nodes = get_typed_nodes(node_type='master') # Reboot one after one master nodes for node in master_nodes: nodes.restart_nodes([node], wait=False) # Wait some time after rebooting master waiting_time = 40 log.info(f"Waiting {waiting_time} seconds...") time.sleep(waiting_time) wait_for_nodes_status_and_prometheus_health_check(pods) # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check() @pytest.mark.polarion_id("OCS-710") def test_monitoring_after_rebooting_node_where_mgr_is_running( self, nodes, pods): """ Test case to validate rebooting a node where mgr is running should not delete the data collected on prometheus pod """ # Get the mgr pod obj mgr_pod_obj = pod.get_mgr_pods() # Get the node where the mgr pod is hosted mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0]) # Reboot the node where the mgr pod is hosted nodes.restart_nodes([mgr_node_obj]) # Validate all nodes are in READY state retry((CommandFailed, ResourceWrongStatusException), tries=20, delay=15)(wait_for_nodes_status()) # Check for Ceph pods pod_obj = ocp.OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE) assert pod_obj.wait_for_resource(condition='Running', selector='app=rook-ceph-mgr', timeout=600) assert pod_obj.wait_for_resource(condition='Running', selector='app=rook-ceph-mon', resource_count=3, timeout=600) assert pod_obj.wait_for_resource(condition='Running', selector='app=rook-ceph-osd', resource_count=3, timeout=600) # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check(tries=40) # Check for ceph health check metrics is updated with new mgr pod wait_to_update_mgrpod_info_prometheus_pod() # Check for the created pvc metrics after rebooting the node where mgr pod was running for pod_obj in pods: assert 
check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" ) @pytest.mark.polarion_id("OCS-711") @skipif_aws_i3 def test_monitoring_shutdown_and_recovery_prometheus_node( self, nodes, pods): """ Test case to validate whether shutdown and recovery of a node where monitoring pods running has no functional impact """ # Get all prometheus pods prometheus_pod_obj_list = pod.get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']) for prometheus_pod_obj in prometheus_pod_obj_list: # Get the node where the prometheus pod is hosted prometheus_node_obj = pod.get_pod_node(prometheus_pod_obj) # Shutdown and recovery node(i,e. restart nodes) where the prometheus pod is hosted nodes.stop_nodes([prometheus_node_obj]) waiting_time = 20 log.info(f"Waiting for {waiting_time} seconds") time.sleep(waiting_time) nodes.start_nodes(nodes=[prometheus_node_obj]) # Validate all nodes are in READY state retry((CommandFailed, ResourceWrongStatusException), tries=20, delay=15)(wait_for_nodes_status()) # Check all the prometheus pods are up for pod_obj in prometheus_pod_obj_list: wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180) # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check(tries=40) # Check for the created pvc metrics after shutdown and recovery of prometheus nodes for pod_obj in pods: assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), ( f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected" ) @pytest.mark.polarion_id("OCS-638") def test_monitoring_delete_pvc(self): """ Test case to validate whether delete pvcs+configmap and recovery of a node where monitoring pods running has no functional impact """ # Get 'cluster-monitoring-config' configmap ocp_configmap = ocp.OCP(namespace=constants.MONITORING_NAMESPACE, kind='configmap') configmap_dict = ocp_configmap.get( resource_name='cluster-monitoring-config') dir_configmap = tempfile.mkdtemp(prefix='configmap_') yaml_file = f'{dir_configmap}/configmap.yaml' templating.dump_data_to_temp_yaml(configmap_dict, yaml_file) # Get prometheus and alertmanager pods prometheus_alertmanager_pods = pod.get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus', 'alertmanager']) # Get all pvc on monitoring namespace pvc_objs_list = pvc.get_all_pvc_objs( namespace=constants.MONITORING_NAMESPACE) # Delete configmap ocp_configmap.delete(resource_name='cluster-monitoring-config') # Delete all pvcs on monitoring namespace pvc.delete_pvcs(pvc_objs=pvc_objs_list) # Check all the prometheus and alertmanager pods are up for pod_obj in prometheus_alertmanager_pods: wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180) # Create configmap ocp_configmap.create(yaml_file=dir_configmap) # Check all the PVCs are up for pvc_obj in pvc_objs_list: wait_for_resource_state(resource=pvc_obj, state=constants.STATUS_BOUND, timeout=180) # Check all the prometheus and alertmanager pods are up # and pvc are mounted on monitoring pods for pod_obj in prometheus_alertmanager_pods: wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180) mount_point = pod_obj.exec_cmd_on_pod( command="df -kh", out_yaml_format=False, ) assert "/dev/rbd" in mount_point, f"pvc is not mounted on pod {pod.name}" log.info("Verified all pvc are mounted on monitoring pods") # Validate the prometheus health is ok 
assert prometheus_health_check(), ( "Prometheus cluster health is not OK") @pytest.mark.polarion_id("OCS-1535") def test_monitoring_shutdown_mgr_pod(self, pods): """ Monitoring backed by OCS, bring mgr down (replicas: 0) for some time and check Ceph-related metrics """ # Check ceph metrics available assert check_ceph_metrics_available(), ( "Failed to get results for some metrics before downscaling mgr deployment to 0" ) # Get mgr pod name and mgr deployment oc_deployment = ocp.OCP(kind=constants.DEPLOYMENT, namespace=ROOK_CLUSTER_NAMESPACE) mgr_deployments = oc_deployment.get( selector=constants.MGR_APP_LABEL)['items'] mgr = mgr_deployments[0]['metadata']['name'] pod_mgr_name = get_pod_name_by_pattern( pattern=mgr, namespace=ROOK_CLUSTER_NAMESPACE) log.info(f"Downscaling deployment {mgr} to 0") oc_deployment.exec_oc_cmd(f"scale --replicas=0 deployment/{mgr}") log.info(f"Waiting for mgr pod {pod_mgr_name[0]} to be deleted") oc_pod = ocp.OCP(kind=constants.POD, namespace=ROOK_CLUSTER_NAMESPACE) oc_pod.wait_for_delete(resource_name=pod_mgr_name[0]) log.info(f"Upscaling deployment {mgr} back to 1") oc_deployment.exec_oc_cmd(f"scale --replicas=1 deployment/{mgr}") log.info("Waiting for mgr pod to reach Running state") oc_pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.MGR_APP_LABEL) # Check ceph metrics available assert check_ceph_metrics_available(), ( "Failed to get results for some metrics after downscaling and upscaling mgr deployment" )
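# Several tests in this module wrap flaky status checks with a retry(...) helper.
# The sketch below shows the general decorator-factory pattern such a helper
# follows: retry(exceptions, tries, delay) returns a decorator, so the wrapped
# function is obtained first and then called, e.g.
# retry((TimeoutError,), tries=20, delay=15)(wait_for_nodes_status)(timeout=1800).
# This is an illustrative, stdlib-only sketch, not the ocs-ci implementation.
import time
from functools import wraps


def retry(exceptions, tries=3, delay=5):
    """Decorator factory: re-run the wrapped call when it raises `exceptions`."""
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            last_error = None
            for _ in range(tries):
                try:
                    return func(*args, **kwargs)
                except exceptions as err:
                    last_error = err
                    time.sleep(delay)
            raise last_error
        return wrapper
    return decorator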
class TestRegistryPodRespin(E2ETest): """ Test to run svt workload for pushing images to registry and with Ceph pods respin """ @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() @pytest.fixture(autouse=True) def setup(self, request): """ Setup and clean up the namespace """ self.project_name = 'test' ocp_obj = ocp.OCP(kind=constants.NAMESPACES) ocp_obj.new_project(project_name=self.project_name) def finalizer(): log.info("Clean up and remove namespace") ocp_obj.exec_oc_cmd(command=f'delete project {self.project_name}') # Reset namespace to default ocp.switch_to_default_rook_cluster_project() ocp_obj.wait_for_delete(resource_name=self.project_name) request.addfinalizer(finalizer) @pytest.mark.parametrize( argnames=["pod_name"], argvalues=[ pytest.param(*['mon'], marks=pytest.mark.polarion_id("OCS-1797")), pytest.param(*['osd'], marks=pytest.mark.polarion_id("OCS-1798")), pytest.param(*['mgr'], marks=pytest.mark.polarion_id("OCS-1799")), pytest.param(*['mds'], marks=pytest.mark.polarion_id("OCS-1790")) ]) def test_registry_respin_pod(self, pod_name): """ Test registry workload when backed by OCS respin of ceph pods """ # Respin relevant pod log.info(f"Respin Ceph pod {pod_name}") disruption = disruption_helpers.Disruptions() disruption.set_resource(resource=f'{pod_name}') disruption.delete_resource() # Pull and push images to registries log.info("Pull and push images to registries") image_pull_and_push( project_name=self.project_name, template='eap-cd-basic-s2i', image= 'registry.redhat.io/jboss-eap-7-tech-preview/eap-cd-openshift-rhel8:latest', pattern='eap-app') # Validate image exists in registries path validate_image_exists(namespace=self.project_name) # Validate image registry pods validate_registry_pod_status() # Validate cluster health ok and all pods are running self.sanity_helpers.health_check()
class TestNodeReplacement(ManageTest): """ Knip-894 Node replacement - AWS-IPI-Proactive """ @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() def test_nodereplacement_proactive(self, pvc_factory, pod_factory, dc_pod_factory): """ Knip-894 Node Replacement proactive """ # Get worker nodes worker_node_list = get_worker_nodes() log.info(f"Current available worker nodes are {worker_node_list}") osd_pods_obj = pod.get_osd_pods() osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name log.info(f"Selected OSD is {osd_node_name}") log.info("Creating dc pod backed with rbd pvc and running io in bg") for worker_node in worker_node_list: if worker_node != osd_node_name: rbd_dc_pod = dc_pod_factory(interface=constants.CEPHBLOCKPOOL, node_name=worker_node, size=20) pod.run_io_in_bg(rbd_dc_pod, expect_to_fail=False, fedora_dc=True) log.info("Creating dc pod backed with cephfs pvc and running io in bg") for worker_node in worker_node_list: if worker_node != osd_node_name: cephfs_dc_pod = dc_pod_factory( interface=constants.CEPHFILESYSTEM, node_name=worker_node, size=20) pod.run_io_in_bg(cephfs_dc_pod, expect_to_fail=False, fedora_dc=True) # Unscheduling node node.unschedule_nodes([osd_node_name]) # Draining Node node.drain_nodes([osd_node_name]) log.info("Getting machine name from specified node name") machine_name = machine.get_machine_from_node_name(osd_node_name) log.info(f"Node {osd_node_name} associated machine is {machine_name}") log.info( f"Deleting machine {machine_name} and waiting for new machine to come up" ) machine.delete_machine_and_check_state_of_new_spinned_machine( machine_name) new_machine_list = machine.get_machines() for machines in new_machine_list: # Trimming is done to get just machine name # eg:- machine_name:- prsurve-40-ocs-43-kbrvf-worker-us-east-2b-nlgkr # After trimming:- prsurve-40-ocs-43-kbrvf-worker-us-east-2b if re.match(machines.name[:-6], machine_name): new_machine_name = machines.name machineset_name = machine.get_machineset_from_machine_name( new_machine_name) log.info("Waiting for new worker node to be in ready state") machine.wait_for_new_node_to_be_ready(machineset_name) new_node_name = node.get_node_from_machine_name(new_machine_name) log.info("Adding ocs label to newly created worker node") node_obj = ocp.OCP(kind='node') node_obj.add_label(resource_name=new_node_name, label=constants.OPERATOR_NODE_LABEL) log.info( f"Successfully labeled {new_node_name} with OCS storage label") # Creating Resources log.info("Creating Resources using sanity helpers") self.sanity_helpers.create_resources(pvc_factory, pod_factory) # Deleting Resources self.sanity_helpers.delete_resources() # Verify everything running fine log.info( "Verifying All resources are Running and matches expected result") self.sanity_helpers.health_check()
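# The loop above identifies the newly spun machine by comparing machine names
# with the trailing random suffix trimmed (e.g. ...-worker-us-east-2b-nlgkr vs
# ...-worker-us-east-2b). One way to express that prefix check is sketched
# below; this is stdlib-only and illustrative, not the ocs-ci helper.
def same_machine_prefix(machine_a, machine_b):
    """Return True if both machine names match after trimming the 6-char suffix."""
    return machine_a[:-6] == machine_b[:-6]


assert same_machine_prefix(
    "prsurve-40-ocs-43-kbrvf-worker-us-east-2b-nlgkr",
    "prsurve-40-ocs-43-kbrvf-worker-us-east-2b-abcde",
)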
class TestNodesMaintenance(ManageTest): """ Test basic flows of maintenance (unschedule and drain) and activate operations, followed by cluster functionality and health checks """ @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() @pytest.fixture(autouse=True) def health_checker(self): """ Check Ceph health """ try: status = ceph_health_check_base() if status: log.info("Health check passed") except CephHealthException as e: # skip because ceph is not in good health pytest.skip(str(e)) @tier1 @pytest.mark.parametrize( argnames=["node_type"], argvalues=[ pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1269")), pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1272")) ]) def test_node_maintenance(self, node_type, pvc_factory, pod_factory): """ OCS-1269/OCS-1272: - Maintenance (mark as unscheduable and drain) 1 worker/master node - Check cluster functionality by creating resources (pools, storageclasses, PVCs, pods - both CephFS and RBD) - Mark the node as scheduable - Check cluster and Ceph health """ # Get 1 node of the type needed for the test iteration typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1) assert typed_nodes, f"Failed to find a {node_type} node for the test" typed_node_name = typed_nodes[0].name # Maintenance the node (unschedule and drain) drain_nodes([typed_node_name]) # Check basic cluster functionality by creating resources # (pools, storageclasses, PVCs, pods - both CephFS and RBD), # run IO and delete the resources self.sanity_helpers.create_resources(pvc_factory, pod_factory) self.sanity_helpers.delete_resources() # Mark the node back to schedulable schedule_nodes([typed_node_name]) # Perform cluster and Ceph health checks self.sanity_helpers.health_check() @tier4 @tier4b @aws_platform_required @pytest.mark.parametrize( argnames=["node_type"], argvalues=[ pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1292")), pytest.param(*['master'], marks=[ pytest.mark.polarion_id("OCS-1293"), bugzilla('1754287') ]) ]) def test_node_maintenance_restart_activate(self, nodes, pvc_factory, pod_factory, node_type): """ OCS-1292/OCS-1293: - Maintenance (mark as unscheduable and drain) 1 worker/master node - Restart the node - Mark the node as scheduable - Check cluster and Ceph health - Check cluster functionality by creating and deleting resources (pools, storageclasses, PVCs, pods - both CephFS and RBD) """ # Get 1 node of the type needed for the test iteration typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1) assert typed_nodes, f"Failed to find a {node_type} node for the test" typed_node_name = typed_nodes[0].name # Maintenance the node (unschedule and drain). 
The function contains logging drain_nodes([typed_node_name]) # Restarting the node nodes.restart_nodes(nodes=typed_nodes, wait=True) wait_for_nodes_status(node_names=[typed_node_name], status=constants.NODE_READY_SCHEDULING_DISABLED) # Mark the node back to schedulable schedule_nodes([typed_node_name]) # Check cluster and Ceph health and checking basic cluster # functionality by creating resources (pools, storageclasses, # PVCs, pods - both CephFS and RBD), run IO and delete the resources self.sanity_helpers.health_check() self.sanity_helpers.create_resources(pvc_factory, pod_factory) self.sanity_helpers.delete_resources() @tier3 @pytest.mark.parametrize( argnames=["nodes_type"], argvalues=[ pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1273")), pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1271")) ]) def test_2_nodes_maintenance_same_type(self, nodes_type): """ OCS-1273/OCS-1271: - Try draining 2 nodes of the same type - should fail - Check cluster and Ceph health """ # Get 2 nodes typed_nodes = get_typed_nodes(node_type=nodes_type, num_of_nodes=2) assert typed_nodes, f"Failed to find a {nodes_type} node for the test" typed_node_names = [typed_node.name for typed_node in typed_nodes] # Try draining 2 nodes - should fail try: drain_nodes(typed_node_names) except TimeoutExpired: log.info( f"Draining of nodes {typed_node_names} failed as expected") schedule_nodes(typed_node_names) # Perform cluster and Ceph health checks self.sanity_helpers.health_check() @tier2 @pytest.mark.polarion_id("OCS-1274") def test_2_nodes_different_types(self, pvc_factory, pod_factory): """ OCS-1274: - Maintenance (mark as unschedulable and drain) 1 worker node and 1 master node - Check cluster functionality by creating resources (pools, storageclasses, PVCs, pods - both CephFS and RBD) - Mark the nodes as schedulable - Check cluster and Ceph health """ # Get 1 node from each type nodes = [ get_typed_nodes(node_type=node_type, num_of_nodes=1)[0] for node_type in ['worker', 'master'] ] assert nodes, "Failed to find nodes for the test" node_names = [typed_node.name for typed_node in nodes] # Maintenance the nodes (unschedule and drain) drain_nodes(node_names) # Check basic cluster functionality by creating resources # (pools, storageclasses, PVCs, pods - both CephFS and RBD), # run IO and delete the resources self.sanity_helpers.create_resources(pvc_factory, pod_factory) self.sanity_helpers.delete_resources() # Mark the nodes back to schedulable schedule_nodes(node_names) # Perform cluster and Ceph health checks self.sanity_helpers.health_check() @tier4 @tier4b @aws_platform_required @pytest.mark.parametrize( argnames=["interface"], argvalues=[ pytest.param(*['rbd'], marks=pytest.mark.polarion_id("OCS-2128")), pytest.param(*['cephfs'], marks=pytest.mark.polarion_id("OCS-2129")), ]) def test_simultaneous_drain_of_two_ocs_nodes(self, pvc_factory, pod_factory, dc_pod_factory, interface): """ OCS-2128/OCS-2129: - Create PVCs and start IO on DC based app pods - Add one extra node in two of the AZs and label the nodes with OCS storage label - Maintenance (mark as unschedulable and drain) 2 worker nodes simultaneously - Confirm that OCS and DC pods are in running state - Remove unscheduled nodes - Check cluster functionality by creating resources (pools, storageclasses, PVCs, pods - both CephFS and RBD) - Check cluster and Ceph health """ # Get OSD running nodes osd_running_worker_nodes = get_osd_running_nodes() log.info(f"OSDs are running on nodes {osd_running_worker_nodes}") # Label OSD nodes with the fedora app label
label_worker_node(osd_running_worker_nodes, label_key='dc', label_value='fedora') log.info("Successfully labeled worker nodes with {dc:fedora}") # Create DC app pods log.info("Creating DC based app pods and starting IO in background") interface = (constants.CEPHBLOCKPOOL if interface == 'rbd' else constants.CEPHFILESYSTEM) dc_pod_obj = [] for i in range(2): dc_pod = dc_pod_factory(interface=interface, node_selector={'dc': 'fedora'}) pod.run_io_in_bg(dc_pod, fedora_dc=True) dc_pod_obj.append(dc_pod) # Get the machine name using the node name machine_names = [ machine.get_machine_from_node_name(osd_running_worker_node) for osd_running_worker_node in osd_running_worker_nodes[:2] ] log.info(f"{osd_running_worker_nodes} associated " f"machines are {machine_names}") # Get the machineset name using machine name machineset_names = [ machine.get_machineset_from_machine_name(machine_name) for machine_name in machine_names ] log.info(f"{osd_running_worker_nodes} associated machinesets " f"are {machineset_names}") # Add a new node and label it add_new_node_and_label_it(machineset_names[0]) add_new_node_and_label_it(machineset_names[1]) # Drain 2 nodes drain_nodes(osd_running_worker_nodes[:2]) # Check the pods should be in running state all_pod_obj = pod.get_all_pods(wait=True) for pod_obj in all_pod_obj: if not any(name in pod_obj.name for name in ('-1-deploy', 'ocs-deviceset')): try: helpers.wait_for_resource_state( resource=pod_obj, state=constants.STATUS_RUNNING, timeout=200) except ResourceWrongStatusException: # The 'rook-ceph-crashcollector' pod on the failed node gets stuck in # Pending state. BZ 1810014 tracks it. # As a workaround, ignoring the 'rook-ceph-crashcollector' pod health # check and deleting its deployment so that the pod # disappears. Will revert this workaround once the BZ is fixed if 'rook-ceph-crashcollector' in pod_obj.name: ocp_obj = ocp.OCP( namespace=defaults.ROOK_CLUSTER_NAMESPACE) pod_name = pod_obj.name deployment_name = '-'.join(pod_name.split("-")[:-2]) command = f"delete deployment {deployment_name}" ocp_obj.exec_oc_cmd(command=command) log.info(f"Deleted deployment for pod {pod_obj.name}") # DC app pods on the drained node will get automatically created on another # running node in the same AZ. Waiting for all DC app pods to reach running state pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj) log.info("All the dc pods reached running state") # Remove unscheduled nodes # In scenarios where the drain is attempted on a >3 worker setup, # post completion of drain we are removing the unscheduled nodes so # that we maintain 3 worker nodes. log.info(f"Removing unscheduled nodes {osd_running_worker_nodes[:2]}") remove_node_objs = get_node_objs(osd_running_worker_nodes[:2]) remove_nodes(remove_node_objs) # Check basic cluster functionality by creating resources # (pools, storageclasses, PVCs, pods - both CephFS and RBD), # run IO and delete the resources self.sanity_helpers.create_resources(pvc_factory, pod_factory) self.sanity_helpers.delete_resources() # Perform cluster and Ceph health checks self.sanity_helpers.health_check()
class TestDetachAttachWorkerVolumeAWS(ManageTest): """ Test class for detach and attach worker volume """ @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() @pytest.mark.polarion_id("OCS-1085") def test_detach_attach_worker_volume(self, aws_obj, pvc_factory, pod_factory): """ Detach and attach worker volume - Detach the data volume from one of the worker nodes - Validate cluster functionality, without checking cluster and Ceph health (as one node volume is detached, the cluster will be unhealthy) by creating resources and running IO - Attach back the volume to the node - Restart the node so the volume will get re-mounted """ # Requesting 1 worker node for the test as this case includes detach and # attach of data volume of 1 worker node worker = node.get_typed_nodes(num_of_nodes=1) assert worker, "Failed to find a worker node for the test" worker = worker[0] # Get the worker node's ec2 instance ID and name instance = aws.get_instances_ids_and_names([worker]) assert instance, f"Failed to get ec2 instances for node {worker.name}" instance_id = [*instance][0] # Get the ec2 instance data volume Volume instance ec2_volume = aws.get_data_volumes(instance_id)[0] # Detach volume (logging is done inside the function) aws_obj.detach_volume(ec2_volume) # Validate cluster is still functional self.sanity_helpers.create_resources(pvc_factory, pod_factory) # Attach volume (logging is done inside the function) aws_obj.attach_volume(ec2_volume, instance_id) # Restart the instance so the volume will get re-mounted aws_obj.restart_ec2_instances(instances=instance, wait=True) # Cluster health check self.sanity_helpers.health_check() @pytest.mark.polarion_id("OCS-1086") def test_detach_attach_2_workers_volumes(self, aws_obj, pvc_factory, pod_factory): """ Detach and attach disk from 2 worker nodes - Detach the data volume from 2 of the worker nodes - Attach back the volume to the worker nodes - Restart the nodes so the volume will get re-mounted in each node - Check cluster health and functionality to make sure detach, attach and restart did not affect the cluster """ # Requesting 2 worker nodes for the test as this case includes # detach and attach of data volume of 1 worker node workers = node.get_typed_nodes(num_of_nodes=2) assert workers, "Failed to find worker nodes for the test" # Get the worker nodes ec2 instance IDs and names instances = aws.get_instances_ids_and_names(workers) assert instances, ( f"Failed to get ec2 instances for node {[w.name for w in workers]}" ) for instance in instances.items(): instance_id = [*instance][0] # Get the ec2 instance data volume Volume instance ec2_volume = aws.get_data_volumes(instance_id)[0] # Detach volume (logging is done inside the function) aws_obj.detach_volume(ec2_volume) # Attach volume (logging is done inside the function) aws_obj.attach_volume(ec2_volume, instance_id) # Restart the instances so the volume will get re-mounted aws_obj.restart_ec2_instances(instances=instances, wait=True) # Validate cluster is still functional self.sanity_helpers.health_check() self.sanity_helpers.create_resources(pvc_factory, pod_factory)
class TestWhenOneOfThePrometheusNodeDown(E2ETest): """ When the nodes are down, there should not be any functional impact on monitoring pods. All the data/metrics should be collected correctly. """ @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() @workloads def test_monitoring_when_one_of_the_prometheus_node_down( self, test_fixture): """ Test case to validate when the prometheus pod is down and interaction with prometheus """ namespace_list, pvc_objs, pod_objs, sc = test_fixture aws_obj = aws.AWS() # Get all the openshift-monitoring pods monitoring_pod_obj_list = pod.get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE) # Get the worker node list workers = get_typed_nodes(node_type='worker') # Get all prometheus pods pod_obj_list = pod.get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']) for pod_obj in pod_obj_list: # Get the node where the prometheus pod is hosted prometheus_pod_obj = pod_obj.get() prometheus_node = prometheus_pod_obj['spec']['nodeName'] prometheus_node = [ node for node in workers if node.get().get('metadata').get('name') == prometheus_node ] # Make one of the node down where the prometheus pod is hosted instances = aws.get_instances_ids_and_names(prometheus_node) aws_obj.restart_ec2_instances(instances=instances, wait=True, force=True) # Validate all nodes are in READY state wait_for_nodes_status() # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check() # Check all the monitoring pods are up for pod_obj in monitoring_pod_obj_list: wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING) # Check for the created pvc metrics after nodes restarting for pvc_obj in pvc_objs: assert check_pvcdata_collected_on_prometheus(pvc_obj.name), ( f"On prometheus pod for created pvc {pvc_obj.name} related data is not collected" ) # Create projects after restarting nodes namespaces = helpers.create_multilpe_projects(number_of_project=1) namespace_list.extend(namespaces) # Create pvcs after restarting nodes pvcs = [ helpers.create_pvc(sc_name=sc.name, namespace=each_namespace.namespace) for each_namespace in namespaces ] for pvc_obj in pvcs: helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND) pvc_obj.reload() pvc_objs.extend(pvcs) # Create app pods after restarting nodes pods = [ helpers.create_pod(interface_type=constants.CEPHBLOCKPOOL, pvc_name=each_pvc.name, namespace=each_pvc.namespace) for each_pvc in pvcs ] for pod_obj in pods: helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING) pod_obj.reload() pod_objs.extend(pods) # Check for the created pvc metrics on prometheus pod after restarting nodes for pvc_obj in pvcs: assert check_pvcdata_collected_on_prometheus(pvc_obj.name), ( f"On prometheus pod for created pvc {pvc_obj.name} related data is not collected" )
class TestAMQNodeReboot(E2ETest): """ Test case to reboot or shutdown and recovery node when amq workload is running """ @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() @pytest.fixture(autouse=True) def teardown(self, request, nodes): """ Restart nodes that are in status NotReady for situations in which the test failed in between """ def finalizer(): # Validate all nodes are in READY state not_ready_nodes = [ n for n in get_node_objs() if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY ] log.warning( f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}" ) if not_ready_nodes: nodes.restart_nodes_by_stop_and_start(not_ready_nodes) wait_for_nodes_status() log.info("All nodes are in Ready status") request.addfinalizer(finalizer) @pytest.fixture() def amq_setup(self, amq_factory_fixture): """ Creates amq cluster and run benchmarks """ sc_name = default_storage_class(interface_type=constants.CEPHBLOCKPOOL) self.amq, self.threads = amq_factory_fixture(sc_name=sc_name.name) @pytest.mark.parametrize( argnames=["node_type"], argvalues=[ pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1282")), pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1281")) ]) def test_amq_after_rebooting_node(self, node_type, nodes, amq_setup): """ Test case to validate rebooting master node shouldn't effect amq workloads running in background """ # Get all amq pods pod_obj_list = get_all_pods(namespace=constants.AMQ_NAMESPACE) # Get the node list node = get_typed_nodes(node_type, num_of_nodes=1) # Reboot one master nodes nodes.restart_nodes(node, wait=False) # Wait some time after rebooting master waiting_time = 40 log.info(f"Waiting {waiting_time} seconds...") time.sleep(waiting_time) # Validate all nodes and services are in READY state and up retry((CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException), tries=60, delay=15)(ocp.wait_for_cluster_connectivity(tries=400)) retry((CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException), tries=60, delay=15)(wait_for_nodes_status(timeout=1800)) # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check() # Check all amq pods are up and running assert POD.wait_for_resource(condition='Running', resource_count=len(pod_obj_list), timeout=300) # Validate the results log.info("Validate message run completely") for thread in self.threads: thread.result(timeout=1800) @pytest.mark.polarion_id("OCS-1278") def test_amq_after_shutdown_and_recovery_worker_node( self, nodes, amq_setup): """ Test case to validate shutdown and recovery node shouldn't effect amq workloads running in background """ # Get all amq pods pod_obj_list = get_all_pods(namespace=constants.AMQ_NAMESPACE) # Get the node list node = get_typed_nodes(node_type='worker', num_of_nodes=1) # Reboot one master nodes nodes.stop_nodes(nodes=node) waiting_time = 20 log.info(f"Waiting for {waiting_time} seconds") time.sleep(waiting_time) nodes.start_nodes(nodes=node) # Validate all nodes are in READY state and up retry((CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException), tries=30, delay=15)(wait_for_nodes_status(timeout=1800)) # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check() # Check all amq pods are up and running assert POD.wait_for_resource(condition='Running', resource_count=len(pod_obj_list), timeout=300) # Validate the results log.info("Validate message 
run completely") for thread in self.threads: thread.result(timeout=1800)
class TestAutomatedRecoveryFromFailedNodes(ManageTest): """ Knip-678 Automated recovery from failed nodes - Reactive """ threads = [] @pytest.fixture(autouse=True) def teardown(self, request): def finalizer(): worker_nodes = get_worker_nodes() # Removing created label on all worker nodes remove_label_from_worker_node(worker_nodes, label_key="dc") for thread in self.threads: thread.join() ceph_health_check() request.addfinalizer(finalizer) @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() @pytest.mark.parametrize( argnames=["interface", "failure"], argvalues=[ pytest.param(*['rbd', 'shutdown'], marks=[ pytest.mark.polarion_id("OCS-2102"), pytest.mark.bugzilla("1830015") ]), pytest.param(*['rbd', 'terminate'], marks=pytest.mark.polarion_id("OCS-2103")), pytest.param(*['cephfs', 'shutdown'], marks=[ pytest.mark.polarion_id("OCS-2104"), pytest.mark.bugzilla("1830015") ]), pytest.param(*['cephfs', 'terminate'], marks=pytest.mark.polarion_id("OCS-2105")), ]) def test_automated_recovery_from_failed_nodes_IPI_reactive( self, nodes, pvc_factory, pod_factory, failure, dc_pod_factory, interface): """ Knip-678 Automated recovery from failed nodes Reactive case - IPI """ # Get OSD running nodes osd_running_nodes = get_osd_running_nodes() log.info(f"OSDs are running on nodes {osd_running_nodes}") # Label osd nodes with fedora app label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora') # Create DC app pods log.info("Creating DC based app pods") if interface == 'rbd': interface = constants.CEPHBLOCKPOOL elif interface == 'cephfs': interface = constants.CEPHFILESYSTEM dc_pod_obj = [] for i in range(2): dc_pod = dc_pod_factory(interface=interface, node_selector={'dc': 'fedora'}) self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True)) dc_pod_obj.append(dc_pod) # Get app pods running nodes dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj) log.info(f"DC app pod running nodes are {dc_pod_node_name}") # Get both osd and app pod running node common_nodes = get_both_osd_and_app_pod_running_node( osd_running_nodes, dc_pod_node_name) log.info(f"Both OSD and app pod is running on nodes {common_nodes}") # Get the machine name using the node name machine_name = machine.get_machine_from_node_name(common_nodes[0]) log.info(f"{common_nodes[0]} associated machine is {machine_name}") # Get the machineset name using machine name machineset_name = machine.get_machineset_from_machine_name( machine_name) log.info( f"{common_nodes[0]} associated machineset is {machineset_name}") # Add a new node and label it add_new_node_and_label_it(machineset_name) # Get the failure node obj failure_node_obj = get_node_objs(node_names=[common_nodes[0]]) # Induce failure on the selected failure node log.info(f"Inducing failure on node {failure_node_obj[0].name}") if failure == "shutdown": nodes.stop_nodes(failure_node_obj, wait=True) log.info(f"Successfully powered off node: " f"{failure_node_obj[0].name}") elif failure == "terminate": nodes.terminate_nodes(failure_node_obj, wait=True) log.info(f"Successfully terminated node : " f"{failure_node_obj[0].name} instance") try: # DC app pods on the failed node will get automatically created on other # running node. 
Waiting for all dc app pod to reach running state pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj, timeout=720) log.info("All the dc pods reached running state") pod.wait_for_storage_pods() except ResourceWrongStatusException: if failure == "shutdown": nodes.terminate_nodes(failure_node_obj, wait=True) log.info(f"Successfully terminated node : " f"{failure_node_obj[0].name} instance") raise # Check basic cluster functionality by creating resources # (pools, storageclasses, PVCs, pods - both CephFS and RBD), # run IO and delete the resources self.sanity_helpers.create_resources(pvc_factory, pod_factory) self.sanity_helpers.delete_resources() # Perform cluster and Ceph health checks self.sanity_helpers.health_check()
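# The reactive recovery test above induces failure on a node that hosts both an
# OSD pod and one of the DC app pods. A minimal, stdlib-only sketch of that
# node-set intersection is shown below for illustration; it is not the
# ocs-ci get_both_osd_and_app_pod_running_node helper.
def nodes_running_both(osd_nodes, app_pod_nodes):
    """Return the node names that appear in both lists."""
    return [name for name in osd_nodes if name in app_pod_nodes]


# Example: nodes_running_both(['worker-0', 'worker-1'], ['worker-1', 'worker-2'])
# returns ['worker-1'].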
class TestNodesRestart(ManageTest): """ Test ungraceful cluster shutdown """ @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() @pytest.fixture(autouse=True) def teardown(self, request, nodes): """ Make sure all nodes are up again """ def finalizer(): nodes.restart_nodes_by_stop_and_start_teardown() request.addfinalizer(finalizer) @pytest.mark.parametrize( argnames=["force"], argvalues=[ pytest.param(*[True], marks=pytest.mark.polarion_id("OCS-894")), pytest.param(*[False], marks=[ pytest.mark.polarion_id("OCS-895"), aws_platform_required ]) ]) def test_nodes_restart(self, nodes, pvc_factory, pod_factory, force): """ Test nodes restart (from the platform layer, i.e, EC2 instances, VMWare VMs) """ ocp_nodes = get_node_objs() nodes.restart_nodes_by_stop_and_start(nodes=ocp_nodes, force=force) self.sanity_helpers.health_check() self.sanity_helpers.create_resources(pvc_factory, pod_factory) @bugzilla('1754287') @pytest.mark.polarion_id("OCS-2015") def test_rolling_nodes_restart(self, nodes, pvc_factory, pod_factory): """ Test restart nodes one after the other and check health status in between """ ocp_nodes = get_node_objs() for node in ocp_nodes: nodes.restart_nodes(nodes=[node], wait=False) self.sanity_helpers.health_check(cluster_check=False, tries=60) self.sanity_helpers.create_resources(pvc_factory, pod_factory) @pytest.mark.parametrize( argnames=["interface", "operation"], argvalues=[ pytest.param(*['rbd', 'create_resources'], marks=pytest.mark.polarion_id("OCS-1138")), pytest.param(*['rbd', 'delete_resources'], marks=pytest.mark.polarion_id("OCS-1241")), pytest.param(*['cephfs', 'create_resources'], marks=pytest.mark.polarion_id("OCS-1139")), pytest.param(*['cephfs', 'delete_resources'], marks=pytest.mark.polarion_id("OCS-1242")) ]) def test_pv_provisioning_under_degraded_state_stop_provisioner_pod_node( self, nodes, pvc_factory, pod_factory, interface, operation): """ Test PV provisioning under degraded state - stop the node that has the provisioner pod running on OCS-1138: - Stop 1 worker node that has the RBD provisioner pod running on - Wait for the RBD pod provisioner to come up again to running status - Validate cluster functionality, without checking cluster and Ceph health by creating resources and running IO - Start the worker node - Check cluster and Ceph health OCS-1241: - Stop 1 worker node that has the RBD provisioner pod running on - Wait for the RBD pod provisioner to come up again to running status - Validate cluster functionality, without checking cluster and Ceph health by deleting resources and running IO - Start the worker node - Check cluster and Ceph health OCS-1139: - Stop 1 worker node that has the CephFS provisioner pod running on - Wait for the CephFS pod provisioner to come up again to running status - Validate cluster functionality, without checking cluster and Ceph health by creating resources and running IO - Start the worker node - Check cluster and Ceph health OCS-1242: - Stop 1 worker node that has the CephFS provisioner pod running on - Wait for the CephFS pod provisioner to come up again to running status - Validate cluster functionality, without checking cluster and Ceph health by deleting resources and running IO - Start the worker node - Check cluster and Ceph health """ if operation == 'delete_resources': # Create resources that their deletion will be tested later self.sanity_helpers.create_resources(pvc_factory, pod_factory) provisioner_pods = None # Get the provisioner pod 
according to the interface if interface == 'rbd': provisioner_pods = pod.get_rbdfsplugin_provisioner_pods() elif interface == 'cephfs': provisioner_pods = pod.get_cephfsplugin_provisioner_pods() provisioner_pod = provisioner_pods[0] # Making sure that the node is not running the rook operator pod: provisioner_node = pod.get_pod_node(provisioner_pod) rook_operator_pod = pod.get_operator_pods()[0] operator_node = pod.get_pod_node(rook_operator_pod) if operator_node.get().get('metadata').get( 'name') == provisioner_node.get().get('metadata').get('name'): provisioner_pod = provisioner_pods[1] provisioner_pod_name = provisioner_pod.name logger.info( f"{interface} provisioner pod found: {provisioner_pod_name}") # Get the node name that has the provisioner pod running on provisioner_node = pod.get_pod_node(provisioner_pod) provisioner_node_name = provisioner_node.get().get('metadata').get( 'name') logger.info( f"{interface} provisioner pod is running on node {provisioner_node_name}" ) # Stopping the nodes nodes.stop_nodes(nodes=[provisioner_node]) # Wait for the provisioner pod to get to running status selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL if ( interface == 'rbd') else constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL # Wait for the provisioner pod to reach Terminating status logger.info( f"Waiting for pod {provisioner_pod_name} to reach status Terminating" ) assert provisioner_pod.ocp.wait_for_resource( timeout=600, resource_name=provisioner_pod.name, condition=constants.STATUS_TERMINATING ), f"{interface} provisioner pod failed to reach status Terminating" logger.info( f"Pod {provisioner_pod_name} has reached status Terminating") # Wait for the provisioner pod to be started and reach running status logger.info( f"Waiting for {interface} provisioner pod to reach status Running") # After this change https://github.com/rook/rook/pull/3642/, there are # 2 provisioners for each interface assert provisioner_pod.ocp.wait_for_resource( timeout=600, condition=constants.STATUS_RUNNING, selector=selector, resource_count=2 ), f"{interface} provisioner pod failed to reach status Running" logger.info(f"{interface} provisioner pod has reached status Running") if operation == 'create_resources': # Cluster validation (resources creation and IO running) self.sanity_helpers.create_resources(pvc_factory, pod_factory) elif operation == 'delete_resources': # Cluster validation (resources creation and IO running) self.sanity_helpers.delete_resources() # Starting the nodes nodes.start_nodes(nodes=[provisioner_node]) # Checking cluster and Ceph health self.sanity_helpers.health_check() @pytest.mark.parametrize( argnames=["operation"], argvalues=[ pytest.param(*['create_resources'], marks=[pytest.mark.polarion_id("OCS-2016")]), pytest.param(*['delete_resources'], marks=[pytest.mark.polarion_id("OCS-2017")]), ]) def test_pv_provisioning_under_degraded_state_stop_rook_operator_pod_node( self, nodes, pvc_factory, pod_factory, operation): """ Test PV provisioning under degraded state - stop the node that has the rook operator pod running on OCS-2016: - Stop 1 worker node that has the rook ceph operator pod running on - Wait for the rook ceph operator pod to come up again to running status - Validate cluster functionality, without checking cluster and Ceph health by creating resources and running IO - Start the worker node - Check cluster and Ceph health OCS-2017: - Stop 1 worker node that has the rook ceph operator pod running on - Wait for the rook ceph operator pod to come up again to running status - Validate 
cluster functionality, without checking cluster and Ceph health by deleting resources - Start the worker node - Check cluster and Ceph health """ if operation == 'delete_resources': # Create resources that their deletion will be tested later self.sanity_helpers.create_resources(pvc_factory, pod_factory) rook_operator_pods = pod.get_operator_pods() rook_operator_pod = rook_operator_pods[0] rook_operator_pod_name = rook_operator_pod.name logger.info(f"rook operator pod found: {rook_operator_pod_name}") # Get the node name that has the rook operator pod running on operator_node = pod.get_pod_node(rook_operator_pod) operator_node_name = operator_node.get().get('metadata').get('name') logger.info( f"{rook_operator_pod_name} pod is running on node {operator_node_name}" ) # Stopping the node nodes.stop_nodes(nodes=[operator_node]) # Wait for the rook operator pod to get to running status selector = constants.OPERATOR_LABEL # Wait for the rook operator pod to reach Terminating status logger.info( f"Waiting for pod {rook_operator_pod_name} to reach status Terminating" ) assert rook_operator_pod.ocp.wait_for_resource( timeout=600, resource_name=rook_operator_pod_name, condition=constants.STATUS_TERMINATING ), "rook operator pod failed to reach status Terminating" logger.info( f"Pod {rook_operator_pod_name} has reached status Terminating") # Wait for the rook operator pod to be started and reach running status logger.info( f"Waiting for pod {rook_operator_pod_name} to reach status Running" ) assert rook_operator_pod.ocp.wait_for_resource( timeout=600, condition=constants.STATUS_RUNNING, selector=selector, resource_count=1 ), "rook operator pod failed to reach status Running" logger.info("rook operator pod has reached status Running") assert wait_for_ct_pod_recovery( ), "Ceph tools pod failed to come up on another node" if operation == 'create_resources': # Cluster validation (resources creation and IO running) self.sanity_helpers.create_resources(pvc_factory, pod_factory) elif operation == 'delete_resources': # Cluster validation (resources creation and IO running) self.sanity_helpers.delete_resources() # Starting the nodes nodes.start_nodes(nodes=[operator_node]) # Checking cluster and Ceph health self.sanity_helpers.health_check()
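# The provisioner-node test above deliberately picks a provisioner pod that is
# not scheduled on the same node as the rook operator, so stopping that node
# does not also take the operator down. A stdlib-only illustrative sketch of
# that selection; the node_of callable is a hypothetical stand-in for
# pod.get_pod_node.
def pick_pod_not_on_node(candidate_pods, excluded_node_name, node_of):
    """Return the first pod whose hosting node differs from excluded_node_name."""
    for candidate in candidate_pods:
        if node_of(candidate) != excluded_node_name:
            return candidate
    return None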
class TestNodeReplacement(ManageTest): """ Knip-894 Node replacement - AWS-IPI-Proactive """ @pytest.fixture(autouse=True) def init_sanity(self): """ Initialize Sanity instance """ self.sanity_helpers = Sanity() def test_nodereplacement_proactive(self, pvc_factory, pod_factory, dc_pod_factory): """ Knip-894 Node Replacement proactive """ # Get worker nodes worker_node_list = get_worker_nodes() log.info(f"Current available worker nodes are {worker_node_list}") osd_pods_obj = pod.get_osd_pods() osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name log.info(f"Selected OSD is {osd_node_name}") log.info("Creating dc pod backed with rbd pvc and running io in bg") for worker_node in worker_node_list: if worker_node != osd_node_name: rbd_dc_pod = dc_pod_factory(interface=constants.CEPHBLOCKPOOL, node_name=worker_node, size=20) pod.run_io_in_bg(rbd_dc_pod, expect_to_fail=False, fedora_dc=True) log.info("Creating dc pod backed with cephfs pvc and running io in bg") for worker_node in worker_node_list: if worker_node != osd_node_name: cephfs_dc_pod = dc_pod_factory(interface=constants.CEPHFILESYSTEM, node_name=worker_node, size=20) pod.run_io_in_bg(cephfs_dc_pod, expect_to_fail=False, fedora_dc=True) if config.ENV_DATA['platform'].lower() == constants.AWS_PLATFORM: if config.ENV_DATA['deployment_type'] == 'ipi': node.delete_and_create_osd_node_aws_ipi(osd_node_name) elif config.ENV_DATA['deployment_type'] == 'upi': node.delete_and_create_osd_node_aws_upi(osd_node_name) else: pytest.fail( f"ocs-ci config 'deployment_type' value '{config.ENV_DATA['deployment_type']}' is not valid, " f"results of this test run are all invalid.") elif config.ENV_DATA['platform'].lower() == constants.VSPHERE_PLATFORM: pytest.skip("Skipping add node in Vmware platform due to " "https://bugzilla.redhat.com/show_bug.cgi?id=1844521" ) # Creating Resources log.info("Creating Resources using sanity helpers") self.sanity_helpers.create_resources(pvc_factory, pod_factory) # Deleting Resources self.sanity_helpers.delete_resources() # Verify everything running fine log.info("Verifying All resources are Running and matches expected result") self.sanity_helpers.health_check(tries=30)
class TestPgSQLNodeReboot(E2ETest): """ Test running PGSQL workload while a node is drained """ @pytest.fixture() def pgsql_setup(self, pgsql): """ PGSQL test setup """ # Deployment of postgres database pgsql.setup_postgresql(replicas=3) # Initialize Sanity instance self.sanity_helpers = Sanity() @pytest.mark.usefixtures(pgsql_setup.__name__) def test_run_pgsql_node_drain(self, pgsql, transactions=900, node_type='master'): """ Test pgsql workload while draining a node """ # Create pgbench benchmark pgsql.create_pgbench_benchmark(replicas=3, transactions=transactions, clients=3) # Start measuring time start_time = datetime.now() # Wait for pgbench pod to reach running state pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING) # Check worker node utilization (adm_top) get_node_resource_utilization_from_adm_top(node_type='worker', print_table=True) # Node drain with specific node type typed_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=1) typed_node_name = typed_nodes[0].name # Node maintenance - to gracefully terminate all pods on the node node.drain_nodes([typed_node_name]) # Make the node schedulable again node.schedule_nodes([typed_node_name]) # Perform cluster and Ceph health checks self.sanity_helpers.health_check() # Wait for pgbench pod to complete pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED) # Calculate the time from running state to completed state end_time = datetime.now() diff_time = end_time - start_time log.info( f"\npgbench pod reached completed state after {diff_time.seconds} seconds\n" ) # Get pgbench pods pgbench_pods = pgsql.get_pgbench_pods() # Validate pgbench run and parse logs pgsql.validate_pgbench_run(pgbench_pods)