class TestDiskFailures(ManageTest):
    """
    Test class for detach and attach worker volume

    """

    def detach_volume_and_wait_for_attach(self, nodes, data_volume, worker_node):
        """
        Detach an EBS volume from an AWS instance and wait for the volume
        to be re-attached

        Args:
            nodes: The platform nodes object; it must provide
                ``detach_volume`` and ``wait_for_volume_attach``
            data_volume (Volume): The ec2 volume to detach
            worker_node (OCS): The OCS object of the EC2 instance

        Raises:
            AWSTimeoutException: If the detach timed out for a reason other
                than the volume already being back "in-use"
            AssertionError: If the volume was not re-attached

        """
        try:
            # Detach volume (logging is done inside the function)
            nodes.detach_volume(data_volume, worker_node)
        except AWSTimeoutException as e:
            # The membership test has to run on the exception text: an
            # exception instance is not a container, so `in e` would raise
            # TypeError instead of matching the message.
            if "Volume state: in-use" in str(e):
                logger.info(
                    f"Volume {data_volume} re-attached successfully to worker"
                    f" node {worker_node}"
                )
            else:
                raise
        else:
            # Wait for worker volume to be re-attached automatically
            # to the node
            assert nodes.wait_for_volume_attach(data_volume), (
                f"Volume {data_volume} failed to be re-attached to worker "
                f"node {worker_node}"
            )

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in status NotReady, for situations in
        which the test failed before restarting the node after detach volume,
        which leaves nodes in NotReady

        """

        def finalizer():
            not_ready_nodes = [
                n
                for n in node.get_node_objs()
                if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
            ]
            # Only warn (and restart) when there is something to report
            if not_ready_nodes:
                logger.warning(
                    f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

            # Restart node if the osd stays at CLBO state
            # NOTE(review): "state" of a container status looks like it may be
            # a mapping rather than a plain string - confirm that this
            # comparison against STATUS_CLBO can ever be true.
            for osd_pod in get_osd_pods():
                container_state = (
                    osd_pod.get().get("status").get("containerStatuses")[0].get("state")
                )
                if container_state == constants.STATUS_CLBO:
                    node_obj = get_pod_node(osd_pod)
                    nodes.restart_nodes([node_obj])
                    node.wait_for_nodes_status([node_obj.name])

            # Verify OSD encrypted
            if config.ENV_DATA.get("encryption_at_rest"):
                osd_encryption_verification()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @cloud_platform_required
    @pytest.mark.polarion_id("OCS-1085")
    @bugzilla("1825675")
    def test_detach_attach_worker_volume(
        self, nodes, pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    ):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Wait for the volumes to be re-attached back to the worker node
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be
          unhealthy) by creating resources and running IO
        - Restart the node so the volume will get re-mounted

        """
        # Get a data volume
        data_volume = nodes.get_data_volumes()[0]
        # Get the worker node according to the volume attachment
        worker = nodes.get_node_by_attached_volume(data_volume)

        # Detach volume and wait for the volume to attach
        self.detach_volume_and_wait_for_attach(nodes, data_volume, worker)

        # Validate cluster is still functional
        # In case the selected node that its volume disk was detached was the one
        # running the ceph tools pod, we'll need to wait for a new ct pod to start.
        # For that, a function that connects to the ct pod is being used to check if
        # it's alive
        assert (
            wait_for_ct_pod_recovery()
        ), "Ceph tools pod failed to come up on another node"

        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )

        # Restart the instance so the volume will get re-mounted
        nodes.restart_nodes([worker])

        # Cluster health check
        # W/A: For the investigation of BZ 1825675, timeout is increased to see if cluster
        # becomes healthy eventually
        # TODO: Remove 'tries=100'
        self.sanity_helpers.health_check(tries=100)

    @cloud_platform_required
    @pytest.mark.polarion_id("OCS-1086")
    def test_detach_attach_2_data_volumes(
        self, nodes, pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    ):
        """
        Detach and attach disk from 2 worker nodes

        - Detach 2 of the data volumes from their worker nodes
        - Wait for the volumes to be re-attached back to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Get 2 data volumes
        data_volumes = nodes.get_data_volumes()[:2]
        workers_and_volumes = [
            {"worker": nodes.get_node_by_attached_volume(vol), "volume": vol}
            for vol in data_volumes
        ]

        for worker_and_volume in workers_and_volumes:
            # Detach volume and wait for the volume to attach
            self.detach_volume_and_wait_for_attach(
                nodes, worker_and_volume["volume"], worker_and_volume["worker"]
            )

        # Restart the instances so the volume will get re-mounted
        nodes.restart_nodes(
            [worker_and_volume["worker"] for worker_and_volume in workers_and_volumes]
        )

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )

    @bugzilla("1830702")
    @vsphere_platform_required
    @pytest.mark.polarion_id("OCS-2172")
    def test_recovery_from_volume_deletion(
        self, nodes, pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    ):
        """
        Test cluster recovery from disk deletion from the platform side.
        Based on documented procedure detailed in
        https://bugzilla.redhat.com/show_bug.cgi?id=1823183

        """
        logger.info("Picking a PV which to be deleted from the platform side")
        osd_pvs = get_deviceset_pvs()
        osd_pv = random.choice(osd_pvs)
        osd_pv_name = osd_pv.name
        # get the claim name
        logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
        claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

        # Get the backing volume name
        logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
        backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

        # Get the corresponding PVC
        logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
        osd_pvcs = get_deviceset_pvcs()
        osd_pvcs_count = len(osd_pvcs)
        osd_pvc = [
            ds for ds in osd_pvcs if ds.get().get("metadata").get("name") == claim_name
        ][0]

        # Get the corresponding OSD pod and ID
        logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
        osd_pods = get_osd_pods()
        osd_pods_count = len(osd_pods)
        osd_pod = [
            osd_pod
            for osd_pod in osd_pods
            if osd_pod.get()
            .get("metadata")
            .get("labels")
            .get(constants.CEPH_ROOK_IO_PVC_LABEL)
            == claim_name
        ][0]
        logger.info(f"OSD_POD {osd_pod.name}")
        osd_id = get_osd_pod_id(osd_pod)

        # Get the node that has the OSD pod running on
        logger.info(f"Getting the node that has the OSD pod {osd_pod.name} running on")
        osd_node = get_pod_node(osd_pod)
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            pod
            for pod in osd_prepare_pods
            if pod.get().get("metadata").get("labels").get(constants.CEPH_ROOK_IO_PVC_LABEL)
            == claim_name
        ][0]
        osd_prepare_job_name = (
            osd_prepare_pod.get().get("metadata").get("labels").get("job-name")
        )
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

        # Get the corresponding OSD deployment
        logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
        osd_deployment = [
            osd_pod
            for osd_pod in get_osd_deployments()
            if osd_pod.get()
            .get("metadata")
            .get("labels")
            .get(constants.CEPH_ROOK_IO_PVC_LABEL)
            == claim_name
        ][0]
        osd_deployment_name = osd_deployment.name

        # Delete the volume from the platform side
        logger.info(f"Deleting {backing_volume} from the platform side")
        nodes.detach_volume(backing_volume, osd_node)

        # Scale down OSD deployment
        logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
        ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
        ocp_obj.exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}")

        # Force delete OSD pod if necessary
        osd_pod_name = osd_pod.name
        logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
        try:
            osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
        except TimeoutError:
            osd_pod.delete(force=True)
            osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

        # Run ocs-osd-removal job
        osd_removal_job = run_osd_removal_job(osd_id)
        assert osd_removal_job, "ocs-osd-removal failed to create"
        is_completed = verify_osd_removal_job_completed_successfully(osd_id)
        assert is_completed, "ocs-osd-removal-job is not in status 'completed'"
        logger.info("ocs-osd-removal-job completed successfully")

        osd_pvc_name = osd_pvc.name
        ocp_version = get_ocp_version()
        # The manual cleanup steps below are only needed before OCP 4.6
        is_pre_4_6 = Version.coerce(ocp_version) < Version.coerce("4.6")

        if is_pre_4_6:
            # Delete the OSD prepare job
            logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
            osd_prepare_job.delete()
            osd_prepare_job.ocp.wait_for_delete(
                resource_name=osd_prepare_job_name, timeout=120
            )

            # Delete the OSD PVC
            logger.info(f"Deleting OSD PVC {osd_pvc_name}")
            osd_pvc.delete()
            osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

            # Delete the OSD deployment
            logger.info(f"Deleting OSD deployment {osd_deployment_name}")
            osd_deployment.delete()
            osd_deployment.ocp.wait_for_delete(
                resource_name=osd_deployment_name, timeout=120
            )
        else:
            # If ocp version is '4.6' and above the osd removal job should
            # delete the OSD prepare job, OSD PVC, OSD deployment
            # We just need to verify the old PV is in the expected status
            logger.info(
                f"Verify that the old PV '{osd_pv_name}' is in the expected status"
            )
            if cluster.is_lso_cluster():
                expected_old_pv_statuses = [constants.STATUS_RELEASED]
            else:
                expected_old_pv_statuses = [
                    constants.STATUS_RELEASED,
                    constants.STATUS_FAILED,
                ]

            # Use a real message: logger.warning() returns None, which would
            # leave the AssertionError without any explanation.
            assert (
                osd_pv.ocp.get_resource_status(osd_pv_name) in expected_old_pv_statuses
            ), (
                f"The old PV '{osd_pv_name}' is not in "
                f"the expected statuses: {expected_old_pv_statuses}"
            )

        # Delete PV
        logger.info(f"Verifying deletion of PV {osd_pv_name}")
        try:
            osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
        except TimeoutError:
            osd_pv.delete()
            osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

        # If we use LSO, we need to create and attach a new disk manually
        if cluster.is_lso_cluster():
            node.add_disk_to_node(osd_node)

        if is_pre_4_6:
            # Delete the rook ceph operator pod to trigger reconciliation
            rook_operator_pod = get_operator_pods()[0]
            logger.info(f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
            rook_operator_pod.delete()

        # Delete the OSD removal job
        logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
        is_deleted = delete_osd_removal_job(osd_id)
        assert is_deleted, "Failed to delete ocs-osd-removal-job"
        logger.info("ocs-osd-removal-job deleted successfully")

        timeout = 600
        # Wait for OSD PVC to get created and reach Bound state
        logger.info("Waiting for a new OSD PVC to get created and reach Bound state")
        assert osd_pvc.ocp.wait_for_resource(
            timeout=timeout,
            condition=constants.STATUS_BOUND,
            selector=constants.OSD_PVC_GENERIC_LABEL,
            resource_count=osd_pvcs_count,
        ), (
            f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
            f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
        )

        # Wait for OSD pod to get created and reach Running state
        logger.info("Waiting for a new OSD pod to get created and reach Running state")
        assert osd_pod.ocp.wait_for_resource(
            timeout=timeout,
            condition=constants.STATUS_RUNNING,
            selector=constants.OSD_APP_LABEL,
            resource_count=osd_pods_count,
        ), (
            f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
            f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
        )

        # We need to silence the old osd crash warning due to BZ https://bugzilla.redhat.com/show_bug.cgi?id=1896810
        # This is a workaround - issue for tracking: https://github.com/red-hat-storage/ocs-ci/issues/3438
        if not is_pre_4_6:
            silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
                osd_pod_name
            )
            if not silence_osd_crash:
                logger.info("Didn't find ceph osd crash warning")

        # Validate cluster is still functional
        self.sanity_helpers.health_check(tries=120)
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
class TestRestartMgrWhileTwoMonsDown(ManageTest):
    """
    Restart mgr pod while two mon pods are down

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(scope="function", autouse=True)
    def teardown(self, request):
        """
        Scale the mons that the test scaled down back to one replica and
        wait for the pods to be running

        """

        def finalizer():
            # `mons_scale` (and `oc`, which is assigned right before it) only
            # exist once the test body got far enough to select the mons; fall
            # back to an empty list so an early failure does not turn into an
            # AttributeError during teardown.
            for mon_scale in getattr(self, "mons_scale", []):
                self.oc.exec_oc_cmd(f"scale --replicas=1 deployment/{mon_scale}")
            wait_for_pods_to_be_running(timeout=600)

        request.addfinalizer(finalizer)

    def test_restart_mgr_while_two_mons_down(
        self, pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    ):
        """
        Test Procedure:
        1.Scaling down two mons:
        oc scale --replicas=0 deploy/rook-ceph-mon-a
        oc scale --replicas=0 deploy/rook-ceph-mon-b

        2.Restarting mgr
        oc delete pod -l app=rook-ceph-mgr

        3.sleep 5 seconds

        4.Scaling mons back up
        oc scale --replicas=1 deploy/rook-ceph-mon-a
        oc scale --replicas=1 deploy/rook-ceph-mon-b

        5.sleep 10

        6.Waiting for mgr pod move to running state:
        oc get pod -l app=rook-ceph-mgr

        """
        self.oc = ocp.OCP(
            kind=constants.DEPLOYMENT, namespace=config.ENV_DATA["cluster_namespace"]
        )
        mons = [
            mon["metadata"]["name"]
            for mon in get_deployments_having_label(
                constants.MON_APP_LABEL, defaults.ROOK_CLUSTER_NAMESPACE
            )
        ]
        self.mons_scale = mons[0:2]

        # The scale-down / restart / scale-up cycle is repeated 10 times
        # (index runs from 1 to 10)
        tries = 11
        for index in range(1, tries):
            log.info(f"Scaling down two mons {self.mons_scale}, index={index}")
            for mon_scale in self.mons_scale:
                self.oc.exec_oc_cmd(f"scale --replicas=0 deployment/{mon_scale}")

            log.info(f"Restarting mgr pod, index={index}")
            mgr_pod = get_mgr_pods()
            mgr_pod[0].delete(wait=True)

            time.sleep(5)

            log.info(f"Scaling up two mons {self.mons_scale}, index={index}")
            for mon_scale in self.mons_scale:
                self.oc.exec_oc_cmd(f"scale --replicas=1 deployment/{mon_scale}")

            time.sleep(10)

            log.info(f"Waiting for mgr pod move to Running state, index={index}")
            mgr_pod_obj = ocp.OCP(
                kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
            )
            assert mgr_pod_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.MGR_APP_LABEL,
                resource_count=1,
                timeout=100,
            ), f"Mgr pod didn't move to Running state after 100 seconds, index={index}"

        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        # Use the same logger as the rest of the class instead of the root
        # `logging` module
        log.info("Deleting Resources using sanity helpers")
        self.sanity_helpers.delete_resources()
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes - Reactive

    """

    # Background IO threads started by the test. This list lives on the class,
    # so it is shared by every parametrized run; the teardown empties it again.
    threads = []

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Remove the "dc" label from the worker nodes, wait for the background
        IO threads and run a Ceph health check

        """

        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")
            for thread in self.threads:
                thread.join()
            # Drop the joined threads so the next parametrized run does not
            # wait on (and keep accumulating) threads of previous runs.
            self.threads.clear()
            ceph_health_check()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(
                *["rbd", "shutdown"],
                marks=[
                    pytest.mark.polarion_id("OCS-2102"),
                    pytest.mark.bugzilla("1845666"),
                ],
            ),
            pytest.param(
                *["rbd", "terminate"], marks=pytest.mark.polarion_id("OCS-2103")
            ),
            pytest.param(
                *["cephfs", "shutdown"],
                marks=[
                    pytest.mark.polarion_id("OCS-2104"),
                    pytest.mark.bugzilla("1845666"),
                ],
            ),
            pytest.param(
                *["cephfs", "terminate"], marks=pytest.mark.polarion_id("OCS-2105")
            ),
        ],
    )
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
        self, nodes, pvc_factory, pod_factory, failure, dc_pod_factory, interface
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI

        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes, label_key="dc", label_value="fedora")

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == "rbd":
            interface = constants.CEPHBLOCKPOOL
        elif interface == "cephfs":
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(interface=interface, node_selector={"dc": "fedora"})
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(machine_name)
        log.info(f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: " f"{failure_node_obj[0].name}")
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully terminated node : "
                f"{failure_node_obj[0].name} instance"
            )

        try:
            # DC app pods on the failed node will get automatically created on other
            # running node. Waiting for all dc app pod to reach running state
            pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj, timeout=720)
            log.info("All the dc pods reached running state")
            pod.wait_for_storage_pods()
        except ResourceWrongStatusException:
            # A node that was only shut down is terminated before the failure
            # is propagated
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(
                    f"Successfully terminated node : "
                    f"{failure_node_obj[0].name} instance"
                )
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestRGWAndNoobaaDBHostNodeFailure(ManageTest):
    """
    Test to verify fail node hosting
    RGW pods and Noobaa-db pods and its impact

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def create_obc_creation(self, bucket_factory, mcg_obj, key):
        """
        Create a bucket, then write an object to it and read it back

        Args:
            bucket_factory: Factory called with ``amount=1, interface="OC"``
                to create the bucket
            mcg_obj: Object passed through to the S3 put/get helpers
            key (str): Key of the object that is written and read

        Raises:
            AssertionError: If putting or getting the object fails

        """
        # Create a bucket then read & write
        bucket_name = bucket_factory(amount=1, interface="OC")[0].name
        obj_data = "A random string data"
        assert s3_put_object(
            mcg_obj, bucket_name, key, obj_data
        ), f"Failed: Put object, {key}"
        assert s3_get_object(mcg_obj, bucket_name, key), f"Failed: Get object, {key}"

    def test_rgw_host_node_failure(
        self, nodes, node_restart_teardown, mcg_obj, bucket_factory
    ):
        """
        Test case to fail node where RGW and Noobaa-db-0 hosting
        and verify new pod spuns on healthy node

        """
        # Get rgw pods
        rgw_pod_obj = get_rgw_pods()

        # Get nooba pods
        noobaa_pod_obj = get_noobaa_pods()

        # Get the node where noobaa-db hosted
        noobaa_pod_node = None
        for noobaa_pod in noobaa_pod_obj:
            if noobaa_pod.name == "noobaa-db-0":
                noobaa_pod_node = get_pod_node(noobaa_pod)
        # Fail with a clear message instead of a NameError further down
        assert noobaa_pod_node is not None, "noobaa-db-0 pod was not found"

        node_obj = None
        for rgw_pod in rgw_pod_obj:
            pod_node = rgw_pod.get().get("spec").get("nodeName")
            if pod_node == noobaa_pod_node.name:
                # Stop the node
                log.info(
                    f"Stopping node {pod_node} where"
                    f" rgw pod {rgw_pod.name} and noobaa-db-0 hosted"
                )
                node_obj = get_node_objs(node_names=[pod_node])
                nodes.stop_nodes(node_obj)

                # Validate old rgw pod went terminating state
                wait_for_resource_state(
                    resource=rgw_pod, state=constants.STATUS_TERMINATING, timeout=720
                )

                # Validate new rgw pod spun
                ocp_obj = OCP(
                    kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
                )
                ocp_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    resource_count=len(rgw_pod_obj),
                    selector=constants.RGW_APP_LABEL,
                )

                # Create OBC and read and write
                self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-1")

        # No node was stopped when no RGW pod shares a node with noobaa-db-0;
        # there is nothing to start again in that case.
        assert (
            node_obj is not None
        ), "No rgw pod is hosted on the same node as noobaa-db-0"

        # Start the node
        nodes.start_nodes(node_obj)

        # Create OBC and read and write
        self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-2")

        # Verify cluster health
        self.sanity_helpers.health_check()

        # Verify all storage pods are running
        wait_for_storage_pods()
def init_sanity(self):
    """
    Create a Sanity helper and store it on ``self.sanity_helpers``

    """
    helpers = Sanity()
    self.sanity_helpers = helpers
class TestNodesRestartMS(ManageTest):
    """
    Node restart scenarios for clusters that run as a managed service

    """

    @pytest.fixture(autouse=True)
    def setup(self, create_scale_pods_and_pvcs_using_kube_job_on_ms_consumers):
        """
        Remember the current cluster index, create the Sanity helper and keep
        the pods/PVCs factory for later use

        """
        self.orig_index = config.cur_index
        self.sanity_helpers = Sanity()
        factory = create_scale_pods_and_pvcs_using_kube_job_on_ms_consumers
        self.create_pods_and_pvcs_factory = factory

    def create_resources(self):
        """
        Create pods and PVCs on the consumer cluster(s) through the stored
        factory

        """
        # On a consumer cluster only the current index is used; otherwise all
        # the consumer indexes are
        consumer_indexes = (
            [config.cur_index]
            if is_ms_consumer_cluster()
            else config.get_consumer_indexes_list()
        )
        self.create_pods_and_pvcs_factory(consumer_indexes=consumer_indexes)

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Bring every node back to Ready, return to the original cluster
        index and check the Ceph health

        """

        def finalizer():
            for ocp_node in get_node_objs():
                recover_node_to_ready_state(ocp_node)

            logger.info("Switch to the original cluster index")
            config.switch_ctx(self.orig_index)
            ceph_health_check()

        request.addfinalizer(finalizer)

    @tier4a
    @pytest.mark.polarion_id("OCS-3980")
    def test_osd_node_restart_and_check_osd_pods_status(self, nodes):
        """
        1) Restart one of the osd nodes.
        2) Check that the osd pods associated with the node should change to a Terminating state.
        3) Wait for the node to reach Ready state.
        4) Check that the new osd pods with the same ids start on the same node.
        5) Check the worker nodes security groups.

        """
        # This is a workaround due to the issue https://github.com/red-hat-storage/ocs-ci/issues/6162
        if is_ms_consumer_cluster():
            logger.info(
                "The test is applicable only for an MS provider cluster. "
                "Switching to the provider cluster..."
            )
            config.switch_to_provider()

        self.create_resources()

        osd_node_name = random.choice(get_osd_running_nodes())
        osd_node = get_node_objs([osd_node_name])[0]

        old_osd_pod_ids = get_node_osd_ids(osd_node_name)
        logger.info(f"osd pod ids: {old_osd_pod_ids}")
        node_osd_pod_names = [
            osd_pod.name for osd_pod in pod.get_osd_pods_having_ids(old_osd_pod_ids)
        ]

        logger.info(f"Going to restart the node {osd_node_name}")
        nodes.restart_nodes(nodes=[osd_node], wait=False)

        logger.info("Verify the node osd pods go into a Terminating state")
        all_terminating = pod.wait_for_pods_to_be_in_statuses(
            [constants.STATUS_TERMINATING], node_osd_pod_names
        )
        assert all_terminating, "Not all the node osd pods are in a Terminating state"

        wait_for_nodes_status(node_names=[osd_node_name])
        assert wait_for_osd_ids_come_up_on_node(
            osd_node_name, old_osd_pod_ids, timeout=300
        )
        logger.info(
            f"the osd ids {old_osd_pod_ids} Successfully come up on the node {osd_node_name}"
        )

        logger.info("Verify the worker nodes security groups on the provider...")
        assert verify_worker_nodes_security_groups()

    @tier4a
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(
                constants.WORKER_MACHINE, marks=pytest.mark.polarion_id("OCS-3982")
            ),
            pytest.param(
                constants.MASTER_MACHINE, marks=pytest.mark.polarion_id("OCS-3981")
            ),
        ],
    )
    def test_nodes_restart(self, nodes, node_type):
        """
        Restart nodes of the given type from the platform layer

        """
        node_count = len(get_nodes(node_type=node_type))

        # All the worker nodes are restarted, but only two nodes of any
        # other type
        selection = {"node_type": node_type}
        if node_type != constants.WORKER_MACHINE:
            selection["num_of_nodes"] = 2
        ocp_nodes = get_nodes(**selection)

        nodes.restart_nodes(nodes=ocp_nodes, wait=False)
        wait_for_node_count_to_reach_status(node_count=node_count, node_type=node_type)
        self.sanity_helpers.health_check()
        self.create_resources()

    @tier4b
    @bugzilla("1754287")
    @pytest.mark.polarion_id("OCS-2015")
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(constants.WORKER_MACHINE),
            pytest.param(constants.MASTER_MACHINE),
        ],
    )
    def test_rolling_nodes_restart(self, nodes, node_type):
        """
        Restart the nodes one at a time, checking the health status after
        each restart

        """
        for ocp_node in get_nodes(node_type=node_type):
            nodes.restart_nodes(nodes=[ocp_node], wait=False)
            self.sanity_helpers.health_check(cluster_check=False, tries=60)

        self.create_resources()
class TestNodesRestart(ManageTest):
    """
    Test ungraceful cluster shutdown

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Create the Sanity helper used by the tests

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Make sure all nodes are up again

        """
        # The nodes object's own stop-and-start teardown is the finalizer
        request.addfinalizer(
            lambda: nodes.restart_nodes_by_stop_and_start_teardown()
        )

    @pytest.mark.parametrize(
        argnames=["force"],
        argvalues=[
            pytest.param(True, marks=pytest.mark.polarion_id("OCS-894")),
            pytest.param(
                False,
                marks=[pytest.mark.polarion_id("OCS-895"), aws_platform_required],
            ),
        ],
    )
    def test_nodes_restart(self, nodes, pvc_factory, pod_factory, force):
        """
        Test nodes restart (from the platform layer, i.e, EC2 instances, VMWare VMs)

        """
        all_nodes = get_node_objs()
        nodes.restart_nodes_by_stop_and_start(nodes=all_nodes, force=force)
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @bugzilla("1754287")
    @pytest.mark.polarion_id("OCS-2015")
    def test_rolling_nodes_restart(self, nodes, pvc_factory, pod_factory):
        """
        Test restart nodes one after the other and check health status in between

        """
        for ocp_node in get_node_objs():
            nodes.restart_nodes(nodes=[ocp_node], wait=False)
            self.sanity_helpers.health_check(cluster_check=False, tries=60)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @pytest.mark.parametrize(
        argnames=["interface", "operation"],
        argvalues=[
            pytest.param(
                "rbd", "create_resources", marks=pytest.mark.polarion_id("OCS-1138")
            ),
            pytest.param(
                "rbd", "delete_resources", marks=pytest.mark.polarion_id("OCS-1241")
            ),
            pytest.param(
                "cephfs",
                "create_resources",
                marks=pytest.mark.polarion_id("OCS-1139"),
            ),
            pytest.param(
                "cephfs",
                "delete_resources",
                marks=pytest.mark.polarion_id("OCS-1242"),
            ),
        ],
    )
    def test_pv_provisioning_under_degraded_state_stop_provisioner_pod_node(
        self, nodes, pvc_factory, pod_factory, interface, operation
    ):
        """
        Test PV provisioning under degraded state -
        stop the node that has the provisioner pod running on

        OCS-1138:
        - Stop 1 worker node that has the RBD provisioner
          pod running on
        - Wait for the RBD pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1241:
        - Stop 1 worker node that has the RBD provisioner
          pod running on
        - Wait for the RBD pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1139:
        - Stop 1 worker node that has the CephFS provisioner
          pod running on
        - Wait for the CephFS pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1242:
        - Stop 1 worker node that has the CephFS provisioner
          pod running on
        - Wait for the CephFS pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        """
        deleting = operation == "delete_resources"
        if deleting:
            # Create resources that their deletion will be tested later
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Get the provisioner pods according to the interface
        provisioner_pods = None
        if interface == "rbd":
            provisioner_pods = pod.get_rbdfsplugin_provisioner_pods()
        elif interface == "cephfs":
            provisioner_pods = pod.get_cephfsplugin_provisioner_pods()
        provisioner_pod = provisioner_pods[0]

        # Making sure that the node is not running the rook operator pod:
        # when the first provisioner pod shares its node with the operator,
        # the second provisioner pod is used instead
        candidate_node = pod.get_pod_node(provisioner_pod)
        operator_node = pod.get_pod_node(pod.get_operator_pods()[0])
        operator_host = operator_node.get().get("metadata").get("name")
        candidate_host = candidate_node.get().get("metadata").get("name")
        if operator_host == candidate_host:
            provisioner_pod = provisioner_pods[1]

        provisioner_pod_name = provisioner_pod.name
        logger.info(f"{interface} provisioner pod found: {provisioner_pod_name}")

        # Get the node name that has the provisioner pod running on
        provisioner_node = pod.get_pod_node(provisioner_pod)
        provisioner_node_name = provisioner_node.get().get("metadata").get("name")
        logger.info(
            f"{interface} provisioner pod is running on node {provisioner_node_name}"
        )

        # Stopping the nodes
        nodes.stop_nodes(nodes=[provisioner_node])

        # Label used to select the provisioner pods of the tested interface
        if interface == "rbd":
            selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        else:
            selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL

        # Wait for the provisioner pod to reach Terminating status
        logger.info(
            f"Waiting for pod {provisioner_pod_name} to reach status Terminating"
        )
        reached_terminating = provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=provisioner_pod.name,
            condition=constants.STATUS_TERMINATING,
        )
        assert (
            reached_terminating
        ), f"{interface} provisioner pod failed to reach status Terminating"
        logger.info(f"Pod {provisioner_pod_name} has reached status Terminating")

        # Wait for the provisioner pod to be started and reach running status
        logger.info(f"Waiting for {interface} provisioner pod to reach status Running")
        # After this change https://github.com/rook/rook/pull/3642/, there are
        # 2 provisioners for each interface
        reached_running = provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=2,
        )
        assert (
            reached_running
        ), f"{interface} provisioner pod failed to reach status Running"
        logger.info(f"{interface} provisioner pod has reached status Running")

        if operation == "create_resources":
            # Cluster validation (resources creation and IO running)
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        elif deleting:
            # Cluster validation (deletion of the resources created earlier)
            self.sanity_helpers.delete_resources()

        # Starting the nodes
        nodes.start_nodes(nodes=[provisioner_node])

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()

    @pytest.mark.parametrize(
        argnames=["operation"],
        argvalues=[
            pytest.param(
                "create_resources", marks=[pytest.mark.polarion_id("OCS-2016")]
            ),
            pytest.param(
                "delete_resources", marks=[pytest.mark.polarion_id("OCS-2017")]
            ),
        ],
    )
    def test_pv_provisioning_under_degraded_state_stop_rook_operator_pod_node(
        self, nodes, pvc_factory, pod_factory, operation
    ):
        """
        Test PV provisioning under degraded state -
        stop the node that has the rook operator pod running on

        OCS-2016:
        - Stop 1 worker node that has the rook ceph operator pod running on
        - Wait for the rook ceph operator pod to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-2017:
        - Stop 1 worker node that has the rook ceph operator pod running on
        - Wait for the rook ceph operator pod to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources
        - Start the worker node
        - Check cluster and Ceph health

        """
        deleting = operation == "delete_resources"
        if deleting:
            # Create resources that their deletion will be tested later
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        rook_operator_pod = pod.get_operator_pods()[0]
        rook_operator_pod_name = rook_operator_pod.name
        logger.info(f"rook operator pod found: {rook_operator_pod_name}")

        # Get the node name that has the rook operator pod running on
        operator_node = pod.get_pod_node(rook_operator_pod)
        operator_node_name = operator_node.get().get("metadata").get("name")
        logger.info(
            f"{rook_operator_pod_name} pod is running on node {operator_node_name}"
        )

        # Stopping the node
        nodes.stop_nodes(nodes=[operator_node])

        # Label used to select the rook operator pod
        selector = constants.OPERATOR_LABEL

        # Wait for the rook operator pod to reach Terminating status
        logger.info(
            f"Waiting for pod {rook_operator_pod_name} to reach status Terminating"
        )
        reached_terminating = rook_operator_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=rook_operator_pod_name,
            condition=constants.STATUS_TERMINATING,
        )
        assert reached_terminating, "rook operator pod failed to reach status Terminating"
        logger.info(f"Pod {rook_operator_pod_name} has reached status Terminating")

        # Wait for the rook operator pod to be started and reach running status
        logger.info(f"Waiting for pod {rook_operator_pod_name} to reach status Running")
        reached_running = rook_operator_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=1,
        )
        assert reached_running, "rook operator pod failed to reach status Running"
        logger.info("rook operator pod has reached status Running")

        ct_pod_recovered = wait_for_ct_pod_recovery()
        assert ct_pod_recovered, "Ceph tools pod failed to come up on another node"

        if operation == "create_resources":
            # Cluster validation (resources creation and IO running)
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        elif deleting:
            # Cluster validation (deletion of the resources created earlier)
            self.sanity_helpers.delete_resources()

        # Starting the nodes
        nodes.start_nodes(nodes=[operator_node])

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()

    @skipif_no_lso
    @bugzilla("1873938")
    @pytest.mark.polarion_id("OCS-2448")
    def test_pv_after_reboot_node(self, nodes):
        """
        Verify unexpected PV is not created after node reboot on LSO cluster

        """
        pv_before_reset = get_pv_names()
        worker_nodes = get_nodes(node_type=constants.WORKER_MACHINE, num_of_nodes=3)
        ocp_obj = OCP(kind=constants.PV)
        for worker_node in worker_nodes:
            # Restart one worker node
            nodes.restart_nodes(nodes=[worker_node], wait=True)
            self.sanity_helpers.health_check(cluster_check=False, tries=60)
            logger.info(f"Verify PV after reboot {worker_node}")
            pv_after_reset = get_pv_names()
            # Only PVs that appeared after the reboot and use the
            # "localblock" storage class count as unexpected
            pv_diff = set(pv_after_reset) - set(pv_before_reset)
            pv_new = [
                pv
                for pv in pv_diff
                if ocp_obj.get(resource_name=pv)["spec"]["storageClassName"]
                == "localblock"
            ]
            assert (
                not pv_new
            ), f"Unexpected PV {pv_new} created after reboot {worker_node}"
        logger.info("SUCCESS - No new PV was created.")
class TestNodeReplacementWithIO(ManageTest):
    """
    Knip-894 Proactive node replacement while IO is running

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Create the Sanity helper used for resource creation and health checks

        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive_with_io_running(
        self,
        pvc_factory,
        pod_factory,
        dc_pod_factory,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Knip-894 Proactive node replacement with IO running in the background

        Steps:
            - Start background IO from DC pods (RBD first, then CephFS) on
              every worker node except the selected OSD node
            - Delete and re-create the selected OSD node
            - Create and delete resources through the sanity helpers
            - Run the health check and, when encryption at rest is enabled,
              the OSD encryption verification

        """
        # Collect the worker nodes and pick the OSD node to be replaced
        available_workers = node.get_worker_nodes()
        log.info(f"Current available worker nodes are {available_workers}")

        osd_node_name = select_osd_node_name()

        # One entry per storage interface, processed in order: the log line
        # announcing the phase and the interface the DC pods are backed by
        io_phases = (
            (
                "Creating dc pod backed with rbd pvc and running io in bg",
                constants.CEPHBLOCKPOOL,
            ),
            (
                "Creating dc pod backed with cephfs pvc and running io in bg",
                constants.CEPHFILESYSTEM,
            ),
        )
        for phase_message, storage_interface in io_phases:
            log.info(phase_message)
            for worker_name in available_workers:
                # The node that is about to be replaced hosts no IO pod
                if worker_name == osd_node_name:
                    continue
                io_pod = dc_pod_factory(
                    interface=storage_interface, node_name=worker_name, size=20
                )
                pod.run_io_in_bg(io_pod, expect_to_fail=False, fedora_dc=True)

        delete_and_create_osd_node(osd_node_name)

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )

        # Deleting Resources
        self.sanity_helpers.delete_resources()

        # Verify everything running fine
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)

        # Verify OSD is encrypted
        encryption_enabled = config.ENV_DATA.get("encryption_at_rest")
        if encryption_enabled:
            osd_encryption_verification()
def osd_device_replacement(nodes):
    """
    Replace a randomly picked OSD device

    The flow is:
        - Pick a random deviceset PV and resolve its claim, backing volume,
          PVC, OSD pod, OSD id, node and OSD deployment
        - Detach the backing volume and scale the OSD deployment down to 0
        - Run the ocs-osd-removal job and wait for it to complete
        - On OCP < 4.6 delete the OSD prepare job, PVC and deployment
          manually; on OCP >= 4.6 only verify the old PV status
        - Wait for the old PV to be deleted (deleting it explicitly when the
          wait times out)
        - On LSO clusters add a new disk to the node
        - On OCP < 4.6 delete the rook operator pod to trigger reconciliation
        - Delete the removal job, then wait for the OSD PVCs and OSD pods to
          return to their original counts in Bound / Running state
        - On OCP >= 4.6 silence the old OSD crash warning
        - Run the sanity health check

    Args:
        nodes: The platform nodes object; ``get_data_volumes`` and
            ``detach_volume`` are called on it

    Raises:
        AssertionError: If the removal job is not created, does not complete
            or cannot be deleted, if the old PV is not in an expected status
            (OCP >= 4.6), or if the OSD PVCs / pods do not recover in time

    """
    logger.info("Picking a PV which to be deleted from the platform side")
    osd_pvs = get_deviceset_pvs()
    osd_pv = random.choice(osd_pvs)
    osd_pv_name = osd_pv.name

    # get the claim name
    logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
    claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

    # Get the backing volume name
    logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
    backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

    # Get the corresponding PVC
    logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
    osd_pvcs = get_deviceset_pvcs()
    # Remembered so recovery can be verified against the original count
    osd_pvcs_count = len(osd_pvcs)
    osd_pvc = [
        ds for ds in osd_pvcs if ds.get().get("metadata").get("name") == claim_name
    ][0]

    # Get the corresponding OSD pod and ID
    logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
    osd_pods = get_osd_pods()
    # Remembered so recovery can be verified against the original count
    osd_pods_count = len(osd_pods)
    osd_pod = [
        osd_pod
        for osd_pod in osd_pods
        if osd_pod.get()
        .get("metadata")
        .get("labels")
        .get(constants.CEPH_ROOK_IO_PVC_LABEL)
        == claim_name
    ][0]
    logger.info(f"OSD_POD {osd_pod.name}")
    osd_id = get_osd_pod_id(osd_pod)

    # Get the node that has the OSD pod running on
    logger.info(f"Getting the node that has the OSD pod {osd_pod.name} running on")
    osd_node = get_pod_node(osd_pod)

    # The replacement flow differs before and after OCP 4.6; evaluate the
    # comparison once and reuse it
    ocp_version = get_ocp_version()
    is_pre_4_6 = Version.coerce(ocp_version) < Version.coerce("4.6")

    if is_pre_4_6:
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            prepare_pod
            for prepare_pod in osd_prepare_pods
            if prepare_pod.get()
            .get("metadata")
            .get("labels")
            .get(constants.CEPH_ROOK_IO_PVC_LABEL)
            == claim_name
        ][0]
        osd_prepare_job_name = (
            osd_prepare_pod.get().get("metadata").get("labels").get("job-name")
        )
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

    # Get the corresponding OSD deployment
    logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
    osd_deployment = [
        deployment
        for deployment in get_osd_deployments()
        if deployment.get()
        .get("metadata")
        .get("labels")
        .get(constants.CEPH_ROOK_IO_PVC_LABEL)
        == claim_name
    ][0]
    osd_deployment_name = osd_deployment.name

    # Delete the volume from the platform side
    logger.info(f"Deleting {backing_volume} from the platform side")
    nodes.detach_volume(backing_volume, osd_node)

    # Scale down OSD deployment
    logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
    ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
    ocp_obj.exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}")

    # Force delete OSD pod if necessary
    osd_pod_name = osd_pod.name
    logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
    try:
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
    except TimeoutError:
        osd_pod.delete(force=True)
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

    # Run ocs-osd-removal job
    osd_removal_job = run_osd_removal_job([osd_id])
    assert osd_removal_job, "ocs-osd-removal failed to create"
    is_completed = verify_osd_removal_job_completed_successfully(osd_id)
    assert is_completed, "ocs-osd-removal-job is not in status 'completed'"
    logger.info("ocs-osd-removal-job completed successfully")

    osd_pvc_name = osd_pvc.name

    if is_pre_4_6:
        # Delete the OSD prepare job
        logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(
            resource_name=osd_prepare_job_name, timeout=120
        )

        # Delete the OSD PVC
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Delete the OSD deployment
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(
            resource_name=osd_deployment_name, timeout=120
        )
    else:
        # If ocp version is '4.6' and above the osd removal job should
        # delete the OSD prepare job, OSD PVC, OSD deployment
        # We just need to verify the old PV is in the expected status
        logger.info(f"Verify that the old PV '{osd_pv_name}' is in the expected status")
        if cluster.is_lso_cluster():
            expected_old_pv_statuses = [constants.STATUS_RELEASED]
        else:
            expected_old_pv_statuses = [
                constants.STATUS_RELEASED,
                constants.STATUS_FAILED,
            ]

        # The message has to be a string: using the return value of
        # logger.warning() here would raise AssertionError(None)
        old_pv_status = osd_pv.ocp.get_resource_status(osd_pv_name)
        assert old_pv_status in expected_old_pv_statuses, (
            f"The old PV '{osd_pv_name}' is not in "
            f"the expected statuses: {expected_old_pv_statuses}"
        )

    # Delete PV
    logger.info(f"Verifying deletion of PV {osd_pv_name}")
    try:
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
    except TimeoutError:
        osd_pv.delete()
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

    # If we use LSO, we need to create and attach a new disk manually
    if cluster.is_lso_cluster():
        node.add_disk_to_node(osd_node)

    if is_pre_4_6:
        # Delete the rook ceph operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
        rook_operator_pod.delete()

    # Delete the OSD removal job
    logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
    is_deleted = delete_osd_removal_job(osd_id)
    assert is_deleted, "Failed to delete ocs-osd-removal-job"
    logger.info("ocs-osd-removal-job deleted successfully")

    timeout = 600

    # Wait for OSD PVC to get created and reach Bound state
    logger.info("Waiting for a new OSD PVC to get created and reach Bound state")
    assert osd_pvc.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_BOUND,
        selector=constants.OSD_PVC_GENERIC_LABEL,
        resource_count=osd_pvcs_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
        f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
    )

    # Wait for OSD pod to get created and reach Running state
    logger.info("Waiting for a new OSD pod to get created and reach Running state")
    assert osd_pod.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=osd_pods_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
        f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
    )

    # We need to silence the old osd crash warning due to BZ https://bugzilla.redhat.com/show_bug.cgi?id=1896810
    # This is a workaround - issue for tracking: https://github.com/red-hat-storage/ocs-ci/issues/3438
    if not is_pre_4_6:
        silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
            osd_pod_name
        )
        if not silence_osd_crash:
            logger.info("Didn't find ceph osd crash warning")

    sanity_helpers = Sanity()
    sanity_helpers.health_check(tries=120)
class TestAMQPodRespin(E2ETest):
    """
    Run open messages on an AMQ cluster backed by RBD while Ceph pods or the
    AMQ pods are respun

    """

    @pytest.fixture()
    def amq_setup(self, amq_factory_fixture):
        """
        Create the AMQ cluster, start the benchmark and prepare Sanity

        """
        rbd_storage_class = default_storage_class(
            interface_type=constants.CEPHBLOCKPOOL
        )
        self.amq, self.threads = amq_factory_fixture(sc_name=rbd_storage_class.name)

        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["pod_name"],
        argvalues=[
            pytest.param(target, marks=pytest.mark.polarion_id(case_id))
            for target, case_id in (
                ("osd", "OCS-1276"),
                ("mon", "OCS-1275"),
                ("mgr", "OCS-2222"),
                ("rbdplugin", "OCS-1277"),
                ("rbdplugin_provisioner", "OCS-1283"),
                ("operator", "OCS-2223"),
                ("amq", "OCS-1280"),
            )
        ],
    )
    @pytest.mark.usefixtures(amq_setup.__name__)
    def test_run_amq_respin_pod(self, pod_name):
        """
        Respin the requested pod type while the AMQ workload is running, then
        wait for the workload threads and run the health check

        """
        # Respin relevant pod
        if pod_name != "amq":
            log.info(f"Respin Ceph pod {pod_name}")
            ceph_disruption = Disruptions()
            ceph_disruption.set_resource(resource=f"{pod_name}")
            ceph_disruption.delete_resource()
        else:
            # Every AMQ application pod group is respun, one pattern at a time
            amq_pod_patterns = (
                "cluster-operator",
                "my-cluster-kafka",
                "my-cluster-zookeeper",
                "my-connect-cluster-connect",
                "my-bridge-bridge",
            )
            for amq_pattern in amq_pod_patterns:
                respin_amq_app_pod(
                    kafka_namespace=constants.AMQ_NAMESPACE, pod_pattern=amq_pattern
                )

        # Validate the results
        log.info("Validate message run completely")
        for worker_thread in self.threads:
            worker_thread.result(timeout=1800)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=40)
class TestPgSQLNodeReboot(E2ETest):
    """
    Test running a PGSQL workload while a node hosting a relevant pod is
    rebooted

    """

    @pytest.fixture()
    def pgsql_setup(self, pgsql):
        """
        PGSQL test setup: deploy postgres and prepare the Sanity helper

        """
        # Deployment of postgres database
        pgsql.setup_postgresql(replicas=3)

        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["transactions", "pod_name"],
        argvalues=[
            pytest.param(*[600, "osd"], marks=pytest.mark.polarion_id("OCS-801")),
            pytest.param(
                *[600, "postgres"], marks=pytest.mark.polarion_id("OCS-799")
            ),
        ],
    )
    @pytest.mark.usefixtures(pgsql_setup.__name__)
    def test_run_pgsql_reboot_node(self, pgsql, nodes, transactions, pod_name):
        """
        Run pgbench, reboot one node that hosts the requested pod type and
        verify that the benchmark completes and the cluster is healthy

        Args:
            pgsql: The pgsql workload fixture
            nodes: The platform nodes fixture; ``restart_nodes`` is called on it
            transactions (int): Number of pgbench transactions
            pod_name (str): "postgres" or "osd" - selects the node candidates

        Raises:
            ValueError: If ``pod_name`` is not one of the supported values

        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(
            replicas=3, transactions=transactions, clients=3
        )

        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Choose a node based on pod it contains
        if pod_name == "postgres":
            node_list = pgsql.get_pgsql_nodes()
        elif pod_name == "osd":
            node_list = get_osd_running_nodes()
        else:
            # Fail with a clear message instead of an unbound local below
            raise ValueError(f"Unsupported pod_name: {pod_name}")

        # The other callers of get_node_objs pass a list of node names, so
        # wrap the randomly chosen name in a list as well
        node_1 = get_node_objs([random.choice(node_list)])

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(
            node_type="worker", print_table=True
        )

        # Restart relevant node
        nodes.restart_nodes(node_1)

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state.
        # total_seconds() is used because timedelta.seconds drops whole days
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached to completed state after {int(diff_time.total_seconds())} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestHugePages(E2ETest):
    """
    Enable huge pages post ODF installation

    """

    @pytest.fixture(scope="function", autouse=True)
    def huge_pages_setup(self, request):
        """
        Initialize Sanity and register the teardown that disables huge pages

        """
        self.sanity_helpers = Sanity()

        def finalizer():
            """
            Removes huge pages on worker nodes and verifies all pods are up

            """
            disable_huge_pages()

            wait_for_nodes_status(status=constants.NODE_READY, timeout=600)

            nodes = get_nodes()
            for node in nodes:
                # After disabling, no 2Mi huge pages may remain allocatable;
                # the message reports the removal failure, not the apply step
                assert (
                    node.get()["status"]["allocatable"]["hugepages-2Mi"] == "0"
                ), f"Huge pages is not removed on {node.name}"

            log.info("Wait for all pods to be in running state")
            wait_for_pods_to_be_running(timeout=600)
            # NOTE(review): `sanity_helpers` is a bare name here, not
            # `self.sanity_helpers`; it must resolve to a module-level name
            # exposing ceph_health_check - confirm against the file imports
            sanity_helpers.ceph_health_check(tries=120)

        request.addfinalizer(finalizer)

    def test_hugepages_post_odf_deployment(
        self,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
        node_restart_teardown,
    ):
        """
        Test to verify that after enabling huge pages the nodes come up with
        higher page size and all odf cluster pods come back up.

        """
        # Applies huge pages on the cluster nodes
        enable_huge_pages()

        log.info("Wait for all worker node to be READY state")
        wait_for_nodes_status(status=constants.NODE_READY, timeout=600)

        nodes = get_nodes()
        for node in nodes:
            # Every node is expected to expose 64Mi of 2Mi huge pages
            assert (
                node.get()["status"]["allocatable"]["hugepages-2Mi"] == "64Mi"
            ), f"Huge pages is not applied on {node.name}"

        log.info("Wait for all storage cluster pods to be in running state")
        wait_for_pods_to_be_running(timeout=600)

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory, False
        )

        # Deleting Resources
        log.info("Deleting the resources created")
        self.sanity_helpers.delete_resources()

        # Verify everything running fine
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes

    """

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that strips the "dc" label from all worker nodes

        """

        def finalizer():
            # Removing created label on all worker nodes
            remove_label_from_worker_node(get_worker_nodes(), label_key="dc")

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Create the Sanity helper used by the test

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param("rbd", marks=pytest.mark.polarion_id("OCS-2100")),
            pytest.param("cephfs", marks=pytest.mark.polarion_id("OCS-2101")),
        ],
    )
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self,
        interface,
        pvc_factory,
        pod_factory,
        dc_pod_factory,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI

        Two DC app pods with background IO are placed on OSD nodes, a new
        node is added to the machineset of a node hosting both an OSD and an
        app pod, that node's machine is deleted, and the cluster is expected
        to recover.

        """
        # Get OSD running nodes
        osd_node_names = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_node_names}")

        # Label osd nodes with fedora app
        label_worker_node(osd_node_names, label_key="dc", label_value="fedora")

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == "rbd":
            storage_interface = constants.CEPHBLOCKPOOL
        else:
            storage_interface = constants.CEPHFILESYSTEM

        app_pods = []
        for _ in range(2):
            app_pod = dc_pod_factory(
                interface=storage_interface, node_selector={"dc": "fedora"}
            )
            pod.run_io_in_bg(app_pod, fedora_dc=True)
            app_pods.append(app_pod)

        # Get app pods running nodes
        app_node_names = get_app_pod_running_nodes(app_pods)
        log.info(f"DC app pod running nodes are {app_node_names}")

        # Get both osd and app pod running node
        shared_nodes = get_both_osd_and_app_pod_running_node(
            osd_node_names, app_node_names
        )
        assert len(shared_nodes) > 0, "Common OSD and app running node(s) NOT found"
        log.info(f"Common OSD and app pod running nodes are {shared_nodes}")

        # The first common node is the one whose machine gets deleted
        failed_node_name = shared_nodes[0]

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(failed_node_name)
        log.info(f"{failed_node_name} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(machine_name)
        log.info(f"{failed_node_name} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods on the failed node will get automatically created on
        # other running node. Waiting for all dc app pod to reach running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(app_pods)
        log.info("All the dc pods reached running state")
        pod.wait_for_storage_pods()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestNodesMaintenance(ManageTest):
    """
    Test basic flows of maintenance (unschedule and drain) and
    activate operations, followed by cluster functionality and health checks

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        SanityExternalCluster is used when storagecluster_independent_check()
        is truthy, Sanity otherwise.

        """
        if storagecluster_independent_check():
            self.sanity_helpers = SanityExternalCluster()
        else:
            self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def health_checker(self):
        """
        Check Ceph health and skip the test when Ceph is not healthy

        """
        try:
            status = ceph_health_check_base()
            if status:
                log.info("Health check passed")
        except CephHealthException as e:
            # skip because ceph is not in good health
            pytest.skip(str(e))

    @tier1
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*["worker"], marks=pytest.mark.polarion_id("OCS-1269")),
            pytest.param(*["master"], marks=pytest.mark.polarion_id("OCS-1272")),
        ],
    )
    def test_node_maintenance(
        self,
        reduce_and_resume_cluster_load,
        node_type,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unscheduable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as scheduable
        - Check cluster and Ceph health

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain)
        drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=90)

    @tier4
    @tier4b
    @skipif_bm
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*["worker"], marks=pytest.mark.polarion_id("OCS-1292")),
            pytest.param(*["master"], marks=pytest.mark.polarion_id("OCS-1293")),
        ],
    )
    def test_node_maintenance_restart_activate(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        node_type,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unscheduable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as scheduable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        reboot_events_cmd = (
            f"get events -A --field-selector involvedObject.name="
            f"{typed_node_name},reason=Rebooted -o yaml"
        )

        # Find the number of reboot events in 'typed_node_name'
        num_events = len(typed_nodes[0].ocp.exec_oc_cmd(reboot_events_cmd)["items"])

        # Maintenance the node (unschedule and drain). The function contains logging
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=False)

        try:
            wait_for_nodes_status(
                node_names=[typed_node_name],
                status=constants.NODE_NOT_READY_SCHEDULING_DISABLED,
            )
        except ResourceWrongStatusException:
            # Sometimes, the node will be back to running state quickly so
            # that the status change won't be detected. Verify the node was
            # actually restarted by checking the reboot events count
            new_num_events = len(
                typed_nodes[0].ocp.exec_oc_cmd(reboot_events_cmd)["items"]
            )
            assert new_num_events > num_events, (
                f"Reboot event not found. "
                f"Node {typed_node_name} did not restart."
            )

        wait_for_nodes_status(
            node_names=[typed_node_name],
            status=constants.NODE_READY_SCHEDULING_DISABLED,
        )

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        self.sanity_helpers.delete_resources()

    @tier3
    @pytest.mark.parametrize(
        argnames=["nodes_type"],
        argvalues=[
            pytest.param(*["worker"], marks=pytest.mark.polarion_id("OCS-1273")),
            pytest.param(*["master"], marks=pytest.mark.polarion_id("OCS-1271")),
        ],
    )
    def test_2_nodes_maintenance_same_type(self, nodes_type):
        """
        OCS-1273/OCS-1271:
        - Try draining 2 nodes from the same type - should fail
        - Check cluster and Ceph health

        """
        # Get 2 nodes
        typed_nodes = get_nodes(node_type=nodes_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {nodes_type} node for the test"

        typed_node_names = [typed_node.name for typed_node in typed_nodes]

        # Try draining 2 nodes - should fail
        # NOTE(review): a drain that unexpectedly succeeds is not reported as
        # a failure here - confirm whether that is intended
        try:
            drain_nodes(typed_node_names)
        except TimeoutExpired:
            log.info(f"Draining of nodes {typed_node_names} failed as expected")

        schedule_nodes(typed_node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier2
    @pytest.mark.polarion_id("OCS-1274")
    def test_2_nodes_different_types(
        self, pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    ):
        """
        OCS-1274:
        - Maintenance (mark as unscheduable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as scheduable
        - Check cluster and Ceph health

        """
        # Get 1 node from each type
        nodes = [
            get_nodes(node_type=node_type, num_of_nodes=1)[0]
            for node_type in ["worker", "master"]
        ]
        assert nodes, "Failed to find a nodes for the test"

        node_names = [typed_node.name for typed_node in nodes]

        # Maintenance the nodes (unschedule and drain)
        drain_nodes(node_names)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        self.sanity_helpers.delete_resources()

        # Mark the nodes back to schedulable
        schedule_nodes(node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier4
    @tier4b
    @aws_based_platform_required
    @ipi_deployment_required
    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(*["rbd"], marks=pytest.mark.polarion_id("OCS-2128")),
            pytest.param(*["cephfs"], marks=pytest.mark.polarion_id("OCS-2129")),
        ],
    )
    def test_simultaneous_drain_of_two_ocs_nodes(
        self,
        pvc_factory,
        pod_factory,
        dc_pod_factory,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        OCS-2128/OCS-2129:
        - Create PVCs and start IO on DC based app pods
        - Add one extra node in two of the AZs and label the nodes
          with OCS storage label
        - Maintenance (mark as unscheduable and drain) 2 worker nodes
          simultaneously
        - Confirm that OCS and DC pods are in running state
        - Remove unscheduled nodes
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Check cluster and Ceph health

        """
        # Get OSD running nodes
        osd_running_worker_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_worker_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(
            osd_running_worker_nodes, label_key="dc", label_value="fedora"
        )
        log.info("Successfully labeled worker nodes with {dc:fedora}")

        # Create DC app pods
        log.info("Creating DC based app pods and starting IO in background")
        interface = (
            constants.CEPHBLOCKPOOL if interface == "rbd" else constants.CEPHFILESYSTEM
        )
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(interface=interface, node_selector={"dc": "fedora"})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get the machine name using the node name
        machine_names = [
            machine.get_machine_from_node_name(osd_running_worker_node)
            for osd_running_worker_node in osd_running_worker_nodes[:2]
        ]
        log.info(
            f"{osd_running_worker_nodes} associated " f"machine are {machine_names}"
        )

        # Get the machineset name using machine name
        machineset_names = [
            machine.get_machineset_from_machine_name(machine_name)
            for machine_name in machine_names
        ]
        log.info(
            f"{osd_running_worker_nodes} associated machineset "
            f"is {machineset_names}"
        )

        # Add a new node and label it
        add_new_node_and_label_it(machineset_names[0])
        add_new_node_and_label_it(machineset_names[1])

        # Drain 2 nodes
        drain_nodes(osd_running_worker_nodes[:2])

        # Check the pods should be in running state
        # Pods whose name contains one of these fragments are not waited for.
        # Each fragment is tested on its own: the expression
        # ("-1-deploy" or "ocs-deviceset") evaluates to "-1-deploy" only.
        skipped_name_parts = ("-1-deploy", "ocs-deviceset")
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
            if any(part in pod_obj.name for part in skipped_name_parts):
                continue
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj, state=constants.STATUS_RUNNING, timeout=200
                )
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node stucks at
                # pending state. BZ 1810014 tracks it.
                # Ignoring 'rook-ceph-crashcollector' pod health check as
                # WA and deleting its deployment so that the pod
                # disappears. Will revert this WA once the BZ is fixed
                if "rook-ceph-crashcollector" in pod_obj.name:
                    ocp_obj = ocp.OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                    pod_name = pod_obj.name
                    # Drop the last two dash-separated parts of the pod name
                    # to get the deployment name
                    deployment_name = "-".join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")

        # DC app pods on the drained node will get automatically created on other
        # running node in same AZ. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj, timeout=1200)
        log.info("All the dc pods reached running state")

        # Remove unscheduled nodes
        # In scenarios where the drain is attempted on >3 worker setup,
        # post completion of drain we are removing the unscheduled nodes so
        # that we maintain 3 worker nodes.
        log.info(f"Removing scheduled nodes {osd_running_worker_nodes[:2]}")
        remove_node_objs = get_node_objs(osd_running_worker_nodes[:2])
        remove_nodes(remove_node_objs)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @bugzilla("1861104")
    @bugzilla("1946573")
    @pytest.mark.polarion_id("OCS-2524")
    @tier4b
    def test_pdb_check_simultaneous_node_drains(
        self,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
        node_drain_teardown,
    ):
        """
        - Check for OSD PDBs before drain
        - Maintenance (mark as unschedulable and drain) 2 worker node with delay of 30 secs
        - Drain will be completed on worker node A
        - Drain will be pending on worker node B due to blocking PDBs
        - Check mon failover in first 10 mins, then 15 and 20 mins
        - Check the OSD PDBs
        - Mark the node A as schedulable
        - Let drain finish on Node B
        - Again check mon failover in first 10 mins and then in intervals
        - Mark the node B as schedulable
        - Check cluster and Ceph health

        Each mon-count and mon-PDB wait is a real assertion: the test fails
        as soon as one of the samplers times out.

        """
        # Validate OSD PDBs before drain operation
        assert (
            not validate_existence_of_blocking_pdb()
        ), "Blocking PDBs exist, Can't perform drain"

        # Get 2 worker nodes to drain
        typed_nodes = get_nodes(num_of_nodes=2)
        assert len(typed_nodes) == 2, "Failed to find worker nodes for the test"
        node_A = typed_nodes[0].name
        node_B = typed_nodes[1].name

        # Drain Node A and validate blocking PDBs
        drain_nodes([node_A])
        assert (
            validate_existence_of_blocking_pdb()
        ), "Blocking PDBs not created post drain"

        # Inducing delay between 2 drains
        # Node-B drain expected to be in pending due to blocking PDBs
        time.sleep(30)
        try:
            drain_nodes([node_B])
            # After the drain check Mon failover in 10th, 15th and 20th min
            timeout = [600, 300, 300]
            for failover in timeout:
                sample = TimeoutSampler(
                    timeout=failover,
                    sleep=10,
                    func=helpers.check_number_of_mon_pods,
                )
                # `assert "<text>"` is always true; assert on the result
                assert sample.wait_for_func_status(
                    result=True
                ), "Number of mon pods not equal to expected_mon_count=3"
        except TimeoutExpired:
            # Mark the node-A back to schedulable and let drain finish in Node-B
            schedule_nodes([node_A])
            time.sleep(40)

        # Validate OSD PDBs
        assert (
            validate_existence_of_blocking_pdb()
        ), "Blocking PDBs not created post second drain"

        # Mark the node-B back to schedulable and recover the cluster
        schedule_nodes([node_B])

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=validate_existence_of_blocking_pdb,
        )
        if not sample.wait_for_func_status(result=False):
            log.error("Blocking PDBs still exist")

        # After the drain check mon failover in 10th, 15th and 20th Min
        timeout = [600, 300, 300]
        for failover in timeout:
            sample = TimeoutSampler(
                timeout=failover,
                sleep=10,
                func=helpers.check_number_of_mon_pods,
            )
            assert sample.wait_for_func_status(
                result=True
            ), "Number of Mon pods not equal to expected_mon_count=3"

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=verify_pdb_mon,
            disruptions_allowed=1,
            max_unavailable_mon=1,
        )
        assert sample.wait_for_func_status(
            result=True
        ), "The expected mon-pdb is not equal to actual mon pdb"

        # wait for storage pods
        pod.wait_for_storage_pods()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=50)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        self.sanity_helpers.delete_resources()
class TestPgSQLPodRespin(E2ETest):
    """
    Test running PGSQL and with Ceph pods respin

    """

    @pytest.fixture()
    def pgsql_setup(self, pgsql):
        """
        PGSQL test setup: deploy postgres and prepare the Sanity helper

        """
        # Deployment of postgres database
        pgsql.setup_postgresql(replicas=3)

        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

    # NOTE: the "postgers" spelling is kept as-is: the parametrize value and
    # the comparison inside the test must match each other
    @pytest.mark.parametrize(
        argnames=["transactions", "pod_name"],
        argvalues=[
            pytest.param(*[600, "mon"], marks=pytest.mark.polarion_id("OCS-802")),
            pytest.param(*[600, "osd"], marks=pytest.mark.polarion_id("OCS-803")),
            pytest.param(*[600, "mgr"], marks=pytest.mark.polarion_id("OCS-804")),
            pytest.param(
                *[600, "postgers"], marks=pytest.mark.polarion_id("OCS-809")
            ),
        ],
    )
    @pytest.mark.usefixtures(pgsql_setup.__name__)
    def test_run_pgsql_respin_pod(self, pgsql, transactions, pod_name):
        """
        Run pgbench, respin the requested pod type while it is running and
        verify that the benchmark completes and the cluster is healthy

        Args:
            pgsql: The pgsql workload fixture
            transactions (int): Number of pgbench transactions
            pod_name (str): "postgers" respins the pgsql app pod, any other
                value is handed to Disruptions as the resource to delete

        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(
            replicas=3, transactions=transactions, clients=3
        )

        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Check worker node utilization(adm_top)
        get_node_resource_utilization_from_adm_top(
            node_type="worker", print_table=True
        )

        # Respin relevant pod
        if pod_name == "postgers":
            pgsql.respin_pgsql_app_pod()
        else:
            log.info(f"Respin Ceph pod {pod_name}")
            disruption = disruption_helpers.Disruptions()
            disruption.set_resource(resource=f"{pod_name}")
            disruption.delete_resource()

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state.
        # total_seconds() is used because timedelta.seconds drops whole days
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached to completed state after {int(diff_time.total_seconds())} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=40)
class TestNodeReplacement(ManageTest):
    """
    Knip-894 Node replacement - AWS-IPI-Reactive
    """

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that removes the "dc" label from the worker
        nodes and, when encryption at rest is configured, verifies OSD
        encryption
        """

        def finalizer():
            # Removing created label on all worker nodes
            remove_label_from_worker_node(get_worker_nodes(), label_key="dc")

            # Verify OSD encrypted
            if config.ENV_DATA.get("encryption_at_rest"):
                osd_encryption_verification()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(
                *["rbd", "power off"], marks=pytest.mark.polarion_id("OCS-2118")
            ),
            pytest.param(
                *["rbd", "network failure"],
                marks=pytest.mark.polarion_id("OCS-2120"),
            ),
            pytest.param(
                *["cephfs", "power off"], marks=pytest.mark.polarion_id("OCS-2119")
            ),
            pytest.param(
                *["cephfs", "network failure"],
                marks=pytest.mark.polarion_id("OCS-2121"),
            ),
        ],
    )
    def test_node_replacement_reactive_aws_ipi(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        dc_pod_factory,
        failure,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Knip-894 Node replacement - AWS-IPI-Reactive

        Fails a node that runs both an OSD and a DC app pod (by power off
        or network failure), deletes its machine, waits for the replacement
        node, labels it and then validates the cluster.
        """
        # Get worker nodes
        workers_before = get_worker_nodes()

        # Get OSD running nodes
        osd_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_nodes, label_key="dc", label_value="fedora")

        # Create DC app pods
        log.info("Creating DC based app pods")
        # Translate the short interface name; unknown values pass through
        interface = {
            "rbd": constants.CEPHBLOCKPOOL,
            "cephfs": constants.CEPHFILESYSTEM,
        }.get(interface, interface)
        app_pods = []
        for _ in range(2):
            app_pod = dc_pod_factory(
                interface=interface, node_selector={"dc": "fedora"}
            )
            pod.run_io_in_bg(app_pod, fedora_dc=True)
            app_pods.append(app_pod)

        # Get app pods running nodes
        app_pod_nodes = get_app_pod_running_nodes(app_pods)
        log.info(f"DC app pod running nodes are {app_pod_nodes}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_nodes, app_pod_nodes
        )
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        target_node_name = common_nodes[0]
        machine_name = machine.get_machine_from_node_name(target_node_name)
        log.info(f"{target_node_name} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(machine_name)
        log.info(f"{target_node_name} associated machineset is {machineset_name}")

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[target_node_name])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "power off":
            # Power off AWS worker node instance
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: {failure_node_obj[0].name}")
        elif failure == "network failure":
            # Induce Network failure
            node_network_failure([failure_node_obj[0].name])

        # Add annotation to the failed node
        machine.add_annotation_to_machine(
            annotation="machine.openshift.io/exclude-node-draining=''",
            machine_name=machine_name,
        )

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Wait for the new machine to spin
        log.info("Waiting for the new node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)

        # Get the node name of new spun node
        workers_after = get_worker_nodes()
        new_spun_node = list(set(workers_after) - set(workers_before))
        log.info(f"New spun node is {new_spun_node}")

        # Label it
        node_ocp = ocp.OCP(kind="node")
        node_ocp.add_label(
            resource_name=new_spun_node[0], label=constants.OPERATOR_NODE_LABEL
        )
        log.info(f"Successfully labeled {new_spun_node} with OCS storage label")

        # DC app pods on the failed node will get automatically created on other
        # running node. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(app_pods, timeout=1200)
        log.info("All the dc pods reached running state")

        pod.wait_for_storage_pods()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestRGWAndNoobaaDBHostNodeFailure(ManageTest):
    """
    Test to verify fail node hosting
    RGW pods and Noobaa-db pods and its impact
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    def create_obc_creation(self, bucket_factory, mcg_obj, key):
        """
        Create a bucket, then write an object to it and read it back

        Args:
            bucket_factory: Factory used to create one bucket ("OC" interface)
            mcg_obj: MCG object passed to the s3 put/get helpers
            key (str): Object key to write and read

        Raises:
            AssertionError: If the put or the get helper returns a falsy value
        """
        # Create a bucket then read & write
        bucket_name = bucket_factory(amount=1, interface="OC", timeout=120)[0].name
        obj_data = "A random string data"
        assert s3_put_object(
            mcg_obj, bucket_name, key, obj_data
        ), f"Failed: Put object, {key}"
        assert s3_get_object(mcg_obj, bucket_name, key), f"Failed: Get object, {key}"

    def test_rgw_host_node_failure(
        self, nodes, node_restart_teardown, node_drain_teardown, mcg_obj, bucket_factory
    ):
        """
        Test case to fail node where RGW and the NooBaa DB are hosted
        and verify the new pods spin on a healthy node
        """
        # Get nooba pods
        noobaa_pod_obj = get_noobaa_pods()

        # Get the node where noobaa-db hosted
        noobaa_pod_node = None
        for noobaa_pod in noobaa_pod_obj:
            if noobaa_pod.name in [
                constants.NB_DB_NAME_46_AND_BELOW,
                constants.NB_DB_NAME_47_AND_ABOVE,
            ]:
                noobaa_pod_node = get_pod_node(noobaa_pod)
        assert noobaa_pod_node is not None, "Could not find the NooBaa DB pod"

        # Validate if RGW pod and noobaa-db are hosted on same node
        # If not, make sure both pods are hosted on same node
        log.info("Validate if RGW pod and noobaa-db are hosted on same node")
        rgw_pod_obj = get_rgw_pods()
        rgw_pod_node_list = [
            rgw_pod.get().get("spec").get("nodeName") for rgw_pod in rgw_pod_obj
        ]
        if noobaa_pod_node.name not in rgw_pod_node_list:
            log.info(
                "Unschedule other two nodes such that RGW "
                "pod moves to node where NooBaa DB pod hosted"
            )
            worker_node_list = get_worker_nodes()
            node_names = list(set(worker_node_list) - {noobaa_pod_node.name})
            unschedule_nodes(node_names=node_names)
            ocp_obj = OCP(
                kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
            )
            rgw_pod_obj[0].delete()
            ocp_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_count=len(rgw_pod_obj),
                selector=constants.RGW_APP_LABEL,
                timeout=300,
                sleep=5,
            )
            log.info("Schedule those nodes again")
            schedule_nodes(node_names=node_names)

            # Check the ceph health OK
            ceph_health_check(tries=90, delay=15)

            # Verify all storage pods are running
            wait_for_storage_pods()

            # Check again the rgw pod move to node where NooBaa DB pod hosted.
            # Compare node *names* and require at least one match: asserting
            # on a non-empty list of booleans would always pass.
            rgw_pod_node_names = [
                get_pod_node(rgw_pod).name for rgw_pod in get_rgw_pods()
            ]
            assert noobaa_pod_node.name in rgw_pod_node_names, (
                "RGW Pod didn't move to node where NooBaa DB pod"
                " hosted even after cordoned and uncordoned nodes. "
                f"RGW pod hosted: {rgw_pod_node_names}. "
                f"NooBaa DB pod hosted: {noobaa_pod_node.name}"
            )

        log.info("RGW and noobaa-db are hosted on same node start the test execution")
        rgw_pod_obj = get_rgw_pods()
        for rgw_pod in rgw_pod_obj:
            pod_node = rgw_pod.get().get("spec").get("nodeName")
            if pod_node == noobaa_pod_node.name:
                # Stop the node
                log.info(
                    f"Stopping node {pod_node} where"
                    f" rgw pod {rgw_pod.name} and NooBaa DB are hosted"
                )
                node_obj = get_node_objs(node_names=[pod_node])
                nodes.stop_nodes(node_obj)

                # Validate old rgw pod went terminating state
                wait_for_resource_state(
                    resource=rgw_pod, state=constants.STATUS_TERMINATING, timeout=720
                )

                # Validate new rgw pod spun
                ocp_obj = OCP(
                    kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
                )
                ocp_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    resource_count=len(rgw_pod_obj),
                    selector=constants.RGW_APP_LABEL,
                )

                # Start the node
                nodes.start_nodes(node_obj)

                # Check the ceph health OK
                ceph_health_check(tries=90, delay=15)

                # Verify all storage pods are running
                wait_for_storage_pods()

                # Create OBC and read and write
                self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-2")

        # Verify cluster health
        self.sanity_helpers.health_check()
class FlowOperations:
    """
    Flow based operations class
    """

    def __init__(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    def validate_cluster(
        self,
        cluster_check=False,
        node_status=False,
        pod_status=False,
        operation_name="",
    ):
        """
        Validates various ceph and ocs cluster checks

        The Ceph health check always runs; the remaining checks are opt-in.

        Args:
            cluster_check (bool): Additionally runs the Sanity health check
            node_status (bool): Verifies node is Ready
            pod_status (bool): Verifies StorageCluster pods in expected state
            operation_name (str): Name of the operation, to Tag

        Raises:
            AssertionError: If the Ceph health check returns a falsy value
        """
        logger.info(f"{operation_name}: Verifying cluster health")
        assert ceph_health_check(
            defaults.ROOK_CLUSTER_NAMESPACE, tries=100
        ), "Entry criteria FAILED: Cluster is Unhealthy"
        if cluster_check:
            self.sanity_helpers.health_check(tries=100)
        if node_status:
            logger.info(f"{operation_name}: Verifying whether node is ready")
            wait_for_nodes_status(status=constants.NODE_READY, timeout=300)
        if pod_status:
            logger.info(
                f"{operation_name}: Verifying StorageCluster pods are in running/completed state"
            )
            wait_for_storage_pods()

    def node_operations_entry_criteria(
        self,
        node_type,
        number_of_nodes,
        operation_name="Node Operation",
        network_fail_time=None,
    ):
        """
        Entry criteria function for node related operations

        Args:
            node_type (str): Type of node
            number_of_nodes (int): Number of nodes
            operation_name (str): Name of the node operation
            network_fail_time (int): Total time to fail the network in a node

        Returns:
            The selected nodes alone when ``network_fail_time`` is falsy,
            otherwise a tuple of (selected nodes, network_fail_time)

        """
        self.validate_cluster(node_status=True, operation_name=operation_name)

        logger.info(f"Getting parameters related to: {operation_name}")
        typed_nodes = node.get_nodes(node_type=node_type, num_of_nodes=number_of_nodes)
        if network_fail_time:
            return typed_nodes, network_fail_time
        return typed_nodes

    def add_capacity_entry_criteria(self):
        """
        Entry criteria verification function for add capacity operation

        Returns:
            tuple: (OSD pods before, pod restart counts before), the params
                used in add capacity exit operation

        """
        self.validate_cluster(operation_name="Add Capacity")

        logger.info(
            "Add capacity: Getting restart count of pods before adding capacity"
        )
        restart_count_before = pod_helpers.get_pod_restarts_count(
            defaults.ROOK_CLUSTER_NAMESPACE
        )

        logger.info("Add capacity entry: Getting OSD pod count before adding capacity")
        osd_pods_before = pod_helpers.get_osd_pods()

        return osd_pods_before, restart_count_before

    def add_capacity_exit_criteria(
        self, restart_count_before, osd_pods_before, expected_osds_added=3
    ):
        """
        Exit criteria function for Add capacity operation

        Args:
            restart_count_before (dict): Restart counts of pods
            osd_pods_before (list): List of OSD pods before
            expected_osds_added (int): Number of OSD pods the add capacity
                operation is expected to have added (defaults to 3)

        Raises:
            AssertionError: If the summed restart counts changed or the
                number of added OSD pods differs from ``expected_osds_added``

        """
        self.validate_cluster(operation_name="Add Capacity")

        logger.info("Add capacity: Getting restart count of pods after adding capacity")
        restart_count_after = pod_helpers.get_pod_restarts_count(
            defaults.ROOK_CLUSTER_NAMESPACE
        )
        # Sum once; the totals are both logged and compared
        restarts_before = sum(restart_count_before.values())
        restarts_after = sum(restart_count_after.values())
        logger.info(f"Sum of restart count before = {restarts_before}")
        logger.info(f"Sum of restart count after = {restarts_after}")
        assert (
            restarts_before == restarts_after
        ), "Exit criteria verification FAILED: One or more pods got restarted"

        osd_pods_after = pod_helpers.get_osd_pods()
        number_of_osds_added = len(osd_pods_after) - len(osd_pods_before)
        logger.info(
            f"Number of OSDs added = {number_of_osds_added}, "
            f"before = {len(osd_pods_before)}, after = {len(osd_pods_after)}"
        )
        assert (
            number_of_osds_added == expected_osds_added
        ), "Exit criteria verification FAILED: osd count mismatch"

        logger.info("Add capacity: Exit criteria verification: Success")
class TestCouchBaseNodeReboot(E2ETest):
    """
    Deploy an CouchBase workload using operator
    """

    @pytest.fixture()
    def cb_setup(self, couchbase_new_factory_fixture, node_restart_teardown):
        """
        Creates couchbase workload
        """
        self.cb = couchbase_new_factory_fixture(
            replicas=3, run_in_bg=True, skip_analyze=True
        )

        # Initialize Sanity instance
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["pod_name_of_node"],
        argvalues=[
            pytest.param(*["osd"], marks=pytest.mark.polarion_id("OCS-776")),
            pytest.param(*["master"], marks=pytest.mark.polarion_id("OCS-783")),
            pytest.param(*["couchbase"], marks=pytest.mark.polarion_id("OCS-776")),
        ],
    )
    def test_run_couchbase_node_reboot(self, cb_setup, nodes, pod_name_of_node):
        """
        Test couchbase workload with node reboot

        Args:
            cb_setup: Fixture that starts the couchbase workload
            nodes: Platform nodes fixture used to restart nodes
            pod_name_of_node (str): Selects the node to reboot: "master"
                reboots one master node, "osd" or "couchbase" reboot a
                random node from the matching node list
        """
        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(
            node_type="worker", print_table=True
        )
        get_node_resource_utilization_from_adm_top(
            node_type="master", print_table=True
        )

        if pod_name_of_node == "couchbase":
            node_list = self.cb.get_couchbase_nodes()
        elif pod_name_of_node == "osd":
            node_list = get_osd_running_nodes()
        elif pod_name_of_node == "master":
            master_node = get_nodes(pod_name_of_node, num_of_nodes=1)

        # Restart relevant node
        if pod_name_of_node == "master":
            nodes.restart_nodes(master_node, wait=False)
            waiting_time = 40
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)
        else:
            # get_node_objs takes a list of node names, so wrap the single
            # randomly chosen name in a list
            restart_node = get_node_objs([random.choice(node_list)])
            nodes.restart_nodes(restart_node)

        # Validate all nodes and services are in READY state and up.
        # retry(...) must wrap the callable itself and the wrapped callable
        # is then invoked; wrapping the *result* of a call retries nothing.
        retry(
            (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(ocp.wait_for_cluster_connectivity)(tries=400)
        retry(
            (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_nodes_status)(timeout=1800)
        bg_handler = flowtest.BackgroundOps()
        bg_ops = [self.cb.result]
        retry((CommandFailed), tries=60, delay=15)(bg_handler.wait_for_bg_operations)(
            bg_ops, timeout=3600
        )
        self.sanity_helpers.health_check(tries=40)
class TestOCSWorkerNodeShutdown(ManageTest):
    """
    Test case validate both the MDS pods rbd and cephfs plugin Provisioner
    pods and not running on same node post shutdown and recovery
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @pytest.mark.polarion_id("OCS-2315")
    def test_check_pod_status_after_two_nodes_shutdown_recovery(
        self, nodes, node_restart_teardown
    ):
        """
        Test case to check MDS pods rbd and cephfs plugin Provisioner pods
        not running on same node post shutdown and recovery node

        Two worker nodes are stopped for 10 minutes and started again; the
        test then asserts that each pod group is spread over distinct nodes.
        """
        # Get MDS, rbd, cephfs plugin provisioner pods running nodes
        # before shutdown
        log.info("Check pod nodes before nodes shutdown")
        list_of_nodes_running_pods(selector="rook-ceph-mds")
        list_of_nodes_running_pods(selector="csi-rbdplugin-provisioner")
        list_of_nodes_running_pods(selector="csi-cephfsplugin-provisioner")

        # Get the node list
        node = get_nodes(node_type="worker", num_of_nodes=2)

        # Shutdown 2 worker nodes for 10 mins
        nodes.stop_nodes(nodes=node)

        waiting_time = 600
        log.info(f"Waiting for {waiting_time} seconds")
        time.sleep(waiting_time)

        nodes.start_nodes(nodes=node)

        # Validate all nodes are in READY state and up.
        # retry(...) must wrap the callable itself and the wrapped callable
        # is then invoked; wrapping the *result* of a call retries nothing.
        retry(
            (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
            tries=30,
            delay=15,
        )(wait_for_nodes_status)(timeout=1800)

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check()
        wait_for_storage_pods()

        # Get MDS, rbd & cephfs plugin provisioner pods running
        # nodes post-recovery
        mds_running_nodes_after_recovery = list_of_nodes_running_pods(
            selector="rook-ceph-mds"
        )
        rbd_provisioner_running_nodes_after_recovery = list_of_nodes_running_pods(
            selector="csi-rbdplugin-provisioner"
        )
        cephfs_provisioner_running_nodes_after_recovery = list_of_nodes_running_pods(
            selector="csi-cephfsplugin-provisioner"
        )

        # A node list without duplicates means no two pods share a node
        assert len(set(mds_running_nodes_after_recovery)) == len(
            mds_running_nodes_after_recovery
        ), "MDS running on same node, Not expected!!!"
        log.info("MDS pods not running on same node")
        assert len(set(rbd_provisioner_running_nodes_after_recovery)) == len(
            rbd_provisioner_running_nodes_after_recovery
        ), "rbd plugin provisioner pods running on Same node, Not expected"
        log.info("RBD plugin provisioner pods not running on same node")
        assert len(set(cephfs_provisioner_running_nodes_after_recovery)) == len(
            cephfs_provisioner_running_nodes_after_recovery
        ), "cephfs plugin provisioner pods running on Same node, Not expected"
        log.info("CEPHFS plugin provisioner pods not running on same node")
class TestAMQNodeReboot(E2ETest):
    """
    Test case to reboot or shutdown and recovery
    node when amq workload is running
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in status NotReady
        for situations in which the test failed in between
        """

        def finalizer():
            # Validate all nodes are in READY state
            not_ready_nodes = [
                n
                for n in get_node_objs()
                if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
            ]
            if not_ready_nodes:
                # Only warn when there is actually something to recover
                log.warning(
                    f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes_by_stop_and_start(not_ready_nodes)
                wait_for_nodes_status()

            log.info("All nodes are in Ready status")

        request.addfinalizer(finalizer)

    @pytest.fixture()
    def amq_setup(self, amq_factory_fixture):
        """
        Creates amq cluster and run benchmarks
        """
        sc_name = default_storage_class(interface_type=constants.CEPHBLOCKPOOL)
        self.amq, self.threads = amq_factory_fixture(sc_name=sc_name.name)

    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*["worker"], marks=pytest.mark.polarion_id("OCS-1282")),
            pytest.param(*["master"], marks=pytest.mark.polarion_id("OCS-1281")),
        ],
    )
    def test_amq_after_rebooting_node(self, node_type, nodes, amq_setup):
        """
        Test case to validate rebooting a worker or master node
        shouldn't effect amq workloads running in background

        Args:
            node_type (str): Type of the node to reboot ("worker" or "master")
            nodes: Platform nodes fixture used to restart the node
            amq_setup: Fixture that starts the amq workload
        """
        # Get all amq pods
        pod_obj_list = get_all_pods(namespace=constants.AMQ_NAMESPACE)

        # Get the node list
        node = get_nodes(node_type, num_of_nodes=1)

        # Reboot one node of the requested type
        nodes.restart_nodes(node, wait=False)

        # Wait some time after rebooting the node
        waiting_time = 40
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)

        # Validate all nodes and services are in READY state and up.
        # retry(...) must wrap the callable itself and the wrapped callable
        # is then invoked; wrapping the *result* of a call retries nothing.
        retry(
            (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(ocp.wait_for_cluster_connectivity)(tries=400)
        retry(
            (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(wait_for_nodes_status)(timeout=1800)

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check()

        # Check all amq pods are up and running
        assert POD.wait_for_resource(
            condition="Running", resource_count=len(pod_obj_list), timeout=300
        )

        # Validate the results
        log.info("Validate message run completely")
        for thread in self.threads:
            thread.result(timeout=1800)

    @pytest.mark.polarion_id("OCS-1278")
    def test_amq_after_shutdown_and_recovery_worker_node(self, nodes, amq_setup):
        """
        Test case to validate shutdown and recovery node
        shouldn't effect amq workloads running in background
        """
        # Get all amq pods
        pod_obj_list = get_all_pods(namespace=constants.AMQ_NAMESPACE)

        # Get the node list
        node = get_nodes(node_type="worker", num_of_nodes=1)

        # Shut down one worker node
        nodes.stop_nodes(nodes=node)

        waiting_time = 20
        log.info(f"Waiting for {waiting_time} seconds")
        time.sleep(waiting_time)

        nodes.start_nodes(nodes=node)

        # Validate all nodes are in READY state and up (retry wraps the
        # callable, which is then invoked with its arguments)
        retry(
            (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
            tries=30,
            delay=15,
        )(wait_for_nodes_status)(timeout=1800)

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check()

        # Check all amq pods are up and running
        assert POD.wait_for_resource(
            condition="Running", resource_count=len(pod_obj_list), timeout=300
        )

        # Validate the results
        log.info("Validate message run completely")
        for thread in self.threads:
            thread.result(timeout=1800)
class TestNoobaaSTSHostNodeFailure(ManageTest):
    """
    Test to verify NooBaa Statefulset pods recovers in case of a node failure
    """

    # Maps a NooBaa statefulset/deployment name to the label selecting its pod
    labels_map = {
        constants.NOOBAA_CORE_STATEFULSET: constants.NOOBAA_CORE_POD_LABEL,
        constants.NOOBAA_DB_STATEFULSET: constants.NOOBAA_DB_LABEL_47_AND_ABOVE,
        constants.NOOBAA_OPERATOR_DEPLOYMENT: constants.NOOBAA_OPERATOR_POD_LABEL,
    }

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        if storagecluster_independent_check():
            self.sanity_helpers = SanityExternalCluster()
        else:
            self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["noobaa_sts", "respin_noobaa_operator"],
        argvalues=[
            pytest.param(
                *[constants.NOOBAA_CORE_STATEFULSET, False],
                marks=pytest.mark.polarion_id("OCS-2672"),
            ),
            pytest.param(
                *[constants.NOOBAA_DB_STATEFULSET, False],
                marks=pytest.mark.polarion_id("OCS-2668"),
            ),
            pytest.param(
                *[constants.NOOBAA_CORE_STATEFULSET, True],
                marks=pytest.mark.polarion_id("OCS-2669"),
            ),
            pytest.param(
                *[constants.NOOBAA_DB_STATEFULSET, True],
                marks=pytest.mark.polarion_id("OCS-2670"),
            ),
        ],
    )
    def test_noobaa_sts_host_node_failure(
        self,
        noobaa_sts,
        respin_noobaa_operator,
        mcg_obj,
        bucket_factory,
        nodes,
        node_restart_teardown,
    ):
        """
        Test case to fail node where NooBaa Statefulset pod (noobaa-core, noobaa-db)
        is hosted and verify the pod is rescheduled on a healthy node

        Args:
            noobaa_sts (str): Statefulset whose host node is stopped; a key
                of ``labels_map``
            respin_noobaa_operator (bool): Force-delete the NooBaa operator
                pod while the node is going down
        """
        pod_obj = OCP(
            kind=constants.POD, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )

        # Get noobaa statefulset pod and node where it is hosted
        noobaa_sts_pod = get_noobaa_pods(noobaa_label=self.labels_map[noobaa_sts])[0]
        noobaa_sts_pod_node = get_pod_node(noobaa_sts_pod)
        log.info(f"{noobaa_sts_pod.name} is running on {noobaa_sts_pod_node.name}")

        # Get the NooBaa operator pod and node where it is hosted
        # Check if NooBaa operator and statefulset pod are hosted on same node
        noobaa_operator_pod = get_noobaa_pods(
            noobaa_label=self.labels_map[constants.NOOBAA_OPERATOR_DEPLOYMENT]
        )[0]
        noobaa_operator_pod_node = get_pod_node(noobaa_operator_pod)
        log.info(
            f"{noobaa_operator_pod.name} is running on {noobaa_operator_pod_node.name}"
        )
        if noobaa_sts_pod_node.name == noobaa_operator_pod_node.name:
            operator_on_same_node = True
            log.info(
                f"{noobaa_sts_pod.name} and {noobaa_operator_pod.name} are running on same node."
            )
        else:
            operator_on_same_node = False
            log.info(
                f"{noobaa_sts_pod.name} and {noobaa_operator_pod.name} are running on different node."
            )

        # Stop the node
        log.info(
            f"Stopping {noobaa_sts_pod_node.name} where {noobaa_sts_pod.name} is hosted"
        )
        # The node is stopped in a worker thread so the NotReady wait and the
        # optional operator respin can overlap with it. The context manager
        # shuts the executor down even if one of the steps below raises.
        with ThreadPoolExecutor(max_workers=1) as executor:
            stop_thread = executor.submit(
                nodes.stop_nodes, nodes=[noobaa_sts_pod_node]
            )
            node.wait_for_nodes_status(
                node_names=[noobaa_sts_pod_node.name], status=constants.NODE_NOT_READY
            )

            # Disrupt NooBaa operator
            if respin_noobaa_operator:
                noobaa_operator_pod.delete(force=True)

            # Check result of 'stop_thread'
            stop_thread.result()

        # Wait for NooBaa operator pod to reach terminating state if on same node
        # and not respun
        if operator_on_same_node and not respin_noobaa_operator:
            wait_for_resource_state(
                resource=noobaa_operator_pod,
                state=constants.STATUS_TERMINATING,
                timeout=360,
            )

        # Wait for NooBaa operator pod to reach running state
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=self.labels_map[constants.NOOBAA_OPERATOR_DEPLOYMENT],
            resource_count=1,
        )

        # Verify NooBaa statefulset pod reschedules on another node
        try:
            for pod_list in TimeoutSampler(
                60,
                3,
                get_noobaa_pods,
                noobaa_label=self.labels_map[noobaa_sts],
            ):
                if len(pod_list) == 1:
                    pod_node = get_pod_node(pod_list[0])
                    if pod_node.name != noobaa_sts_pod_node.name:
                        log.info(
                            f"{pod_list[0].name} has been rescheduled on {pod_node.name}"
                        )
                        break
                    log.info(f"Waiting for {noobaa_sts_pod.name} pod to be rescheduled")
        except TimeoutExpiredError:
            raise TimeoutExpiredError(
                f"{noobaa_sts_pod.name} pod not rescheduled within 60 seconds"
            )

        # Wait for rescheduled pod to reach Running state.
        # For noobaa-db pod which is attached to a PV it may take more time (~8 minutes)
        # until the new pod can attach to the PV
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=self.labels_map[noobaa_sts],
            resource_count=1,
            timeout=800 if noobaa_sts == constants.NOOBAA_DB_STATEFULSET else 60,
            sleep=30 if noobaa_sts == constants.NOOBAA_DB_STATEFULSET else 3,
        )

        # Start the node
        log.info(
            f"Starting {noobaa_sts_pod_node.name} where {noobaa_sts_pod.name} was hosted"
        )
        nodes.start_nodes(nodes=[noobaa_sts_pod_node])
        node.wait_for_nodes_status(
            node_names=[noobaa_sts_pod_node.name], status=constants.NODE_READY
        )

        log.info("Wait for all pods to be in running state")
        wait_for_pods_to_be_running(timeout=300)

        # Check cluster health
        self.sanity_helpers.health_check()

        # Creates bucket then writes, reads and deletes objects
        # TODO: Reduce timeout in future versions once 2028559 is fixed
        self.sanity_helpers.obc_put_obj_create_delete(
            mcg_obj, bucket_factory, timeout=900
        )
class TestNonOCSTaintAndTolerations(E2ETest):
    """
    Test to test non ocs taints on ocs nodes
    and toleration
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Make sure all nodes are untainted
        """

        def finalizer():
            untainted = untaint_nodes(
                taint_label="xyz=true:NoSchedule",
            )
            assert untainted, "Failed to untaint"

        request.addfinalizer(finalizer)

    def test_non_ocs_taint_and_tolerations(self):
        """
        Test runs the following steps
        1. Taint ocs nodes with non-ocs taint
        2. Set tolerations on storagecluster, subscription and configmap
        3. Respin all ocs pods and check if it runs on ocs nodes with tolerations
        4. Add Capacity
        """
        # Taint all nodes with non-ocs taint
        worker_names = get_worker_nodes()
        taint_nodes(nodes=worker_names, taint_label="xyz=true:NoSchedule")

        # Add tolerations to the storagecluster
        storagecluster_obj = ocp.OCP(
            resource_name=constants.DEFAULT_CLUSTERNAME,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            kind=constants.STORAGECLUSTER,
        )
        tolerations = (
            '{"tolerations": [{"effect": "NoSchedule", "key": "xyz",'
            '"operator": "Equal", "value": "true"}, '
            '{"effect": "NoSchedule", "key": "node.ocs.openshift.io/storage", '
            '"operator": "Equal", "value": "true"}]}'
        )
        storagecluster_patch = (
            f'{{"spec": {{"placement": {{"all": {tolerations}, "mds": {tolerations}, '
            f'"noobaa-core": {tolerations}, "rgw": {tolerations}}}}}}}'
        )
        storagecluster_obj.patch(params=storagecluster_patch, format_type="merge")

        # Add tolerations to the subscription
        subscription_names = ocp.get_all_resource_names_of_a_kind(
            kind=constants.SUBSCRIPTION
        )
        subscription_patch = (
            '{"spec": {"config":  {"tolerations": '
            '[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
            '"value": "true"}]}}}'
        ).replace("  ", " ")
        for subscription_name in subscription_names:
            ocp.OCP(
                resource_name=subscription_name,
                namespace=defaults.ROOK_CLUSTER_NAMESPACE,
                kind=constants.SUBSCRIPTION,
            ).patch(params=subscription_patch, format_type="merge")

        # Add tolerations to the configmap rook-ceph-operator-config
        configmap_obj = ocp.OCP(
            kind=constants.CONFIGMAP,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            resource_name=constants.ROOK_OPERATOR_CONFIGMAP,
        )
        configmap_data = configmap_obj.get().get("data")
        csi_tolerations = configmap_data.get("CSI_PLUGIN_TOLERATIONS")
        # NOTE(review): the continuation keys of the appended YAML entry are
        # indented by a single space here - confirm this matches the
        # indentation the configmap value expects.
        csi_tolerations = csi_tolerations + (
            '\n- key: xyz\n operator: Equal\n value: "true"\n effect: NoSchedule'
        )
        # Escape quotes and newlines so the value can sit inside a JSON string
        escaped_tolerations = csi_tolerations.replace('"', '\\"')
        escaped_tolerations = escaped_tolerations.replace("\n", "\\n")
        configmap_patch = (
            f'[{{"op": "replace", "path": "/data/CSI_PLUGIN_TOLERATIONS", "value": "{escaped_tolerations}" }}, '
            f'{{"op": "replace", "path": "/data/CSI_PROVISIONER_TOLERATIONS", "value": "{escaped_tolerations}" }}]'
        )
        configmap_obj.patch(params=configmap_patch, format_type="json")

        # After edit noticed few pod respins as expected
        assert wait_for_pods_to_be_running()

        # Respin all pods and check it if is still running
        # Excluding tool-box pod because of https://bugzilla.redhat.com/show_bug.cgi?id=2012084
        pods_to_respin = get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            selector=["rook-ceph-tools"],
            exclude_selector=True,
        )
        for ocs_pod in pods_to_respin:
            ocs_pod.delete(wait=False)
        assert wait_for_pods_to_be_running(timeout=600, sleep=15)
        self.sanity_helpers.health_check()

        # Add capacity to check if new osds has toleration
        osd_size = storage_cluster.get_osd_size()
        count = storage_cluster.add_capacity(osd_size)
        pod_ocp = ocp.OCP(
            kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"]
        )
        replica_count = 1 if is_flexible_scaling_enabled() else 3
        osds_running = pod_ocp.wait_for_resource(
            timeout=300,
            condition=constants.STATUS_RUNNING,
            selector=constants.OSD_APP_LABEL,
            resource_count=count * replica_count,
        )
        assert osds_running, "New OSDs failed to reach running state"
        check_ceph_health_after_add_capacity(ceph_rebalance_timeout=2500)
class TestMonitoringBackedByOCS(E2ETest):
    """
    Test cases to validate monitoring backed by OCS
    """

    num_of_pvcs = 5
    pvc_size = 5

    @staticmethod
    def _claim_name(pod_info):
        """
        Return the PVC claim name of the first volume in a pod definition

        Args:
            pod_info (dict): Pod definition as returned by the pod object's get()

        Returns:
            str: spec.volumes[0].persistentVolumeClaim.claimName
        """
        return pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]

    @staticmethod
    def _assert_pvc_data_collected(pod_objs):
        """
        Assert that prometheus has collected data for the pvc of every given pod

        Args:
            pod_objs (list): Pod objects exposing ``pvc.name``
        """
        for pod_obj in pod_objs:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"

    @staticmethod
    def _wait_for_all_nodes_ready():
        """
        Call wait_for_nodes_status() with retries

        The function object itself (not its result) is handed to retry(), so
        CommandFailed / ResourceWrongStatusException raised while waiting
        really trigger another attempt.
        """
        retry((CommandFailed, ResourceWrongStatusException), tries=20, delay=15)(
            wait_for_nodes_status
        )()

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in status NotReady or unschedulable,
        for situations in which the test failed in between restarting
        or scheduling those nodes
        """

        def finalizer():
            # Validate all nodes are schedulable
            scheduling_disabled_nodes = [
                n.name
                for n in get_node_objs()
                if n.ocp.get_resource_status(n.name)
                == constants.NODE_READY_SCHEDULING_DISABLED
            ]
            if scheduling_disabled_nodes:
                schedule_nodes(scheduling_disabled_nodes)

            # Validate all nodes are in READY state
            not_ready_nodes = [
                n
                for n in get_node_objs()
                if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
            ]
            if not_ready_nodes:
                # Only warn when there is actually something to report
                log.warning(
                    f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes_by_stop_and_start(not_ready_nodes)
                wait_for_nodes_status()

            log.info("All nodes are in Ready status")

            assert prometheus_health_check(), "Prometheus health is degraded"

        request.addfinalizer(finalizer)

    @pytest.fixture()
    def pods(self, multi_pvc_factory, dc_pod_factory):
        """
        Prepare multiple dc pods for the test

        Returns:
            list: Pod instances
        """
        sc = default_storage_class(interface_type=constants.CEPHBLOCKPOOL)

        pvc_objs = multi_pvc_factory(
            interface=constants.CEPHBLOCKPOOL,
            storageclass=sc,
            size=self.pvc_size,
            num_of_pvc=self.num_of_pvcs,
        )

        pod_objs = [dc_pod_factory(pvc=pvc_obj) for pvc_obj in pvc_objs]

        # Check for the created pvc metrics on prometheus pod
        self._assert_pvc_data_collected(pod_objs)
        return pod_objs

    @pytest.mark.polarion_id("OCS-576")
    def test_monitoring_after_restarting_prometheus_pod(self, pods):
        """
        Test case to validate prometheus pod restart
        should not have any functional impact
        """
        # Get the prometheus pod
        prometheus_pod_obj = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE, selector=["prometheus"]
        )
        monitoring_pod_ocp = ocp.OCP(
            kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE
        )

        for pod_object in prometheus_pod_obj:
            # Get the pvc which mounted on prometheus pod
            pvc_name = self._claim_name(pod_object.get())

            # Restart the prometheus pod
            pod_object.delete(force=True)
            assert monitoring_pod_ocp.wait_for_resource(
                condition="Running", selector="app=prometheus", timeout=60
            )

            # Check the same pvc is mounted on new pod
            # (exact match; a substring test could accept a different claim)
            assert (
                self._claim_name(pod_object.get()) == pvc_name
            ), f"Old pvc not found after restarting the prometheus pod {pod_object.name}"

        self._assert_pvc_data_collected(pods)

    @pytest.mark.polarion_id("OCS-579")
    def test_monitoring_after_draining_node_where_prometheus_hosted(self, pods):
        """
        Test case to validate when node is drained where prometheus
        is hosted, prometheus pod should re-spin on new healthy node
        and shouldn't be any data/metrics loss
        """
        # Get the prometheus pod
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE, selector=["prometheus"]
        )
        monitoring_pod_ocp = ocp.OCP(
            kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE
        )

        for pod_obj in pod_obj_list:
            # Get the pvc which mounted on prometheus pod
            pod_info = pod_obj.get()
            pvc_name = self._claim_name(pod_info)

            # Get the node where the prometheus pod is hosted
            prometheus_pod_obj = pod_obj.get()
            prometheus_node = prometheus_pod_obj["spec"]["nodeName"]

            # Drain node where the prometheus pod hosted
            drain_nodes([prometheus_node])

            # Validate node is in SchedulingDisabled state
            wait_for_nodes_status(
                [prometheus_node], status=constants.NODE_READY_SCHEDULING_DISABLED
            )

            # Validate all prometheus pod is running
            assert monitoring_pod_ocp.wait_for_resource(
                condition="Running", selector="app=prometheus", timeout=180
            ), "One or more prometheus pods are not in running state"

            # Validate prometheus pod is re-spinned on new healthy node
            # (compare names exactly; a substring test misjudges e.g.
            # "node-1" against "node-10")
            pod_info = pod_obj.get()
            new_node = pod_info["spec"]["nodeName"]
            assert (
                new_node != prometheus_node
            ), "Promethues pod not re-spinned on new node"
            log.info(f"Prometheus pod re-spinned on new node {new_node}")

            # Validate same pvc is mounted on prometheus pod
            assert (
                self._claim_name(pod_info) == pvc_name
            ), f"Old pvc not found after restarting the prometheus pod {pod_obj.name}"

            # Validate the prometheus health is ok
            assert prometheus_health_check(), "Prometheus cluster health is not OK"

            # Mark the nodes back to schedulable
            schedule_nodes([prometheus_node])

            # Wait some time after node scheduling back
            waiting_time = 30
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate node is in Ready State
            wait_for_nodes_status([prometheus_node], status=constants.NODE_READY)

            # Validate ceph health OK
            ceph_health_check(tries=40, delay=30)

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check(tries=40)

        # Check for the created pvc metrics after draining the prometheus nodes
        self._assert_pvc_data_collected(pods)

    @pytest.mark.polarion_id("OCS-580")
    def test_monitoring_after_respinning_ceph_pods(self, pods):
        """
        Test case to validate respinning the ceph pods and
        its interaction with prometheus pod
        """
        # Re-spin the ceph pods (i.e. mgr, mon, osd) one by one
        resource_to_delete = ["mgr", "mon", "osd"]
        disruption = Disruptions()
        for res_to_del in resource_to_delete:
            disruption.set_resource(resource=res_to_del)
            disruption.delete_resource()

        # Check for the created pvc metrics on prometheus pod
        self._assert_pvc_data_collected(pods)

        # Validate osd is up and ceph health is ok
        self.sanity_helpers.health_check(tries=40)

    @pytest.mark.polarion_id("OCS-605")
    def test_monitoring_when_osd_down(self, pods):
        """
        Test case to validate monitoring when osd is down
        """
        # Get osd pods
        osd_pod_list = pod.get_osd_pods()

        # Make one of the osd down(first one)
        resource_name = osd_pod_list[0].get().get("metadata").get("name")
        assert modify_osd_replica_count(resource_name=resource_name, replica_count=0)

        # Validate osd is down
        # NOTE(review): the original had the message but no ``assert``, so the
        # result was silently discarded; this assumes wait_for_delete returns
        # a truthy value on success - confirm.
        ocs_pod_ocp = ocp.OCP(
            kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
        )
        assert ocs_pod_ocp.wait_for_delete(
            resource_name=resource_name
        ), f"Resources is not deleted {resource_name}"

        # Check for the created pvc metrics when osd is down
        self._assert_pvc_data_collected(pods)

        # Make osd up which was down
        assert modify_osd_replica_count(resource_name=resource_name, replica_count=1)

        # Validate osd is up and ceph health is ok
        self.sanity_helpers.health_check(tries=40)

    @pytest.mark.polarion_id("OCS-606")
    def test_monitoring_when_one_of_the_prometheus_node_down(self, nodes, pods):
        """
        Test case to validate when the prometheus pod is down and its
        interaction with prometheus
        """
        # Get all prometheus pods
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE, selector=["prometheus"]
        )

        for pod_obj in pod_obj_list:
            # Get the node where the prometheus pod is hosted
            pod_node_obj = pod.get_pod_node(pod_obj)

            # Make one of the node down where the prometheus pod is hosted
            nodes.restart_nodes([pod_node_obj])

            # Validate all nodes are in READY state
            self._wait_for_all_nodes_ready()

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check(tries=40)

        # Check all the prometheus pods are up
        for pod_obj in pod_obj_list:
            wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180
            )

        # Check for the created pvc metrics after restarting node where prometheus pod is hosted
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            log.info(
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is collected"
            )

    @pytest.mark.polarion_id("OCS-709")
    def test_monitoring_after_rebooting_master_node(self, nodes, pods):
        """
        Test case to validate rebooting master node shouldn't delete
        the data collected on prometheus pod
        """
        # Get the master node list
        master_nodes = get_nodes(node_type="master")

        # Reboot one after one master nodes
        for master_node in master_nodes:
            nodes.restart_nodes([master_node], wait=False)

            # Wait some time after rebooting master
            waiting_time = 40
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)
            wait_for_nodes_status_and_prometheus_health_check(pods)

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check(tries=40)

    @pytest.mark.polarion_id("OCS-710")
    def test_monitoring_after_rebooting_node_where_mgr_is_running(self, nodes, pods):
        """
        Test case to validate rebooting a node where mgr is running
        should not delete the data collected on prometheus pod
        """
        # Get the mgr pod obj
        mgr_pod_obj = pod.get_mgr_pods()

        # Get the node where the mgr pod is hosted
        mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

        # Reboot the node where the mgr pod is hosted
        nodes.restart_nodes([mgr_node_obj])

        # Validate all nodes are in READY state
        self._wait_for_all_nodes_ready()

        # Check for Ceph pods
        ceph_pod_ocp = ocp.OCP(
            kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
        )
        assert ceph_pod_ocp.wait_for_resource(
            condition="Running", selector="app=rook-ceph-mgr", timeout=600
        )
        assert ceph_pod_ocp.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-mon",
            resource_count=3,
            timeout=600,
        )
        assert ceph_pod_ocp.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-osd",
            resource_count=3,
            timeout=600,
        )

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check(tries=40)

        # Check for ceph health check metrics is updated with new mgr pod
        wait_to_update_mgrpod_info_prometheus_pod()

        # Check for the created pvc metrics after rebooting the node where mgr pod was running
        self._assert_pvc_data_collected(pods)

    @pytest.mark.polarion_id("OCS-711")
    @skipif_aws_i3
    def test_monitoring_shutdown_and_recovery_prometheus_node(self, nodes, pods):
        """
        Test case to validate whether shutdown and recovery of a
        node where monitoring pods running has no functional impact
        """
        # Get all prometheus pods
        prometheus_pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE, selector=["prometheus"]
        )

        for prometheus_pod_obj in prometheus_pod_obj_list:
            # Get the node where the prometheus pod is hosted
            prometheus_node_obj = pod.get_pod_node(prometheus_pod_obj)

            # Shutdown and recovery node(i,e. restart nodes) where the prometheus pod is hosted
            nodes.stop_nodes([prometheus_node_obj])

            waiting_time = 20
            log.info(f"Waiting for {waiting_time} seconds")
            time.sleep(waiting_time)

            nodes.start_nodes(nodes=[prometheus_node_obj])

            # Validate all nodes are in READY state
            self._wait_for_all_nodes_ready()

        # Check all the prometheus pods are up
        for pod_obj in prometheus_pod_obj_list:
            wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180
            )

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check(tries=40)

        # Check for the created pvc metrics after shutdown and recovery of prometheus nodes
        self._assert_pvc_data_collected(pods)

    @pytest.mark.polarion_id("OCS-638")
    def test_monitoring_delete_pvc(self):
        """
        Test case to validate whether delete pvcs+configmap and recovery of a
        node where monitoring pods running has no functional impact
        """
        # Get 'cluster-monitoring-config' configmap
        ocp_configmap = ocp.OCP(
            namespace=constants.MONITORING_NAMESPACE, kind="configmap"
        )
        configmap_dict = ocp_configmap.get(resource_name="cluster-monitoring-config")
        dir_configmap = tempfile.mkdtemp(prefix="configmap_")
        yaml_file = f"{dir_configmap}/configmap.yaml"
        templating.dump_data_to_temp_yaml(configmap_dict, yaml_file)

        # Get prometheus and alertmanager pods
        prometheus_alertmanager_pods = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=["prometheus", "alertmanager"],
        )

        # Get all pvc on monitoring namespace
        pvc_objs_list = pvc.get_all_pvc_objs(namespace=constants.MONITORING_NAMESPACE)

        # Delete configmap
        ocp_configmap.delete(resource_name="cluster-monitoring-config")

        # Delete all pvcs on monitoring namespace
        pvc.delete_pvcs(pvc_objs=pvc_objs_list)

        # Check all the prometheus and alertmanager pods are up
        for pod_obj in prometheus_alertmanager_pods:
            wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180
            )

        # Create configmap
        # NOTE(review): the temp directory (not yaml_file) is passed here, as
        # in the original - confirm this is intended.
        ocp_configmap.create(yaml_file=dir_configmap)

        # Check all the PVCs are up
        for pvc_obj in pvc_objs_list:
            wait_for_resource_state(
                resource=pvc_obj, state=constants.STATUS_BOUND, timeout=180
            )

        # Check all the prometheus and alertmanager pods are up
        # and pvc are mounted on monitoring pods
        for pod_obj in prometheus_alertmanager_pods:
            wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180
            )
            mount_point = pod_obj.exec_cmd_on_pod(
                command="df -kh",
                out_yaml_format=False,
            )
            # Report the pod under test; ``pod`` is the helper module here
            assert (
                "/dev/rbd" in mount_point
            ), f"pvc is not mounted on pod {pod_obj.name}"
        log.info("Verified all pvc are mounted on monitoring pods")

        # Validate the prometheus health is ok
        assert prometheus_health_check(), "Prometheus cluster health is not OK"

    @pytest.mark.polarion_id("OCS-1535")
    def test_monitoring_shutdown_mgr_pod(self, pods):
        """
        Montoring backed by OCS, bring mgr down(replica: 0) for some time
        and check ceph related metrics
        """
        # Check ceph metrics available
        assert (
            check_ceph_metrics_available()
        ), "failed to get results for some metrics before Downscaling deployment mgr to 0"

        # Get pod mgr name and mgr deployment
        oc_deployment = ocp.OCP(
            kind=constants.DEPLOYMENT, namespace=ROOK_CLUSTER_NAMESPACE
        )
        mgr_deployments = oc_deployment.get(selector=constants.MGR_APP_LABEL)["items"]
        mgr = mgr_deployments[0]["metadata"]["name"]
        pod_mgr_name = get_pod_name_by_pattern(
            pattern=mgr, namespace=ROOK_CLUSTER_NAMESPACE
        )

        log.info(f"Downscaling deployment {mgr} to 0")
        oc_deployment.exec_oc_cmd(f"scale --replicas=0 deployment/{mgr}")

        log.info(f"Wait for a mgr pod {pod_mgr_name[0]} to be deleted")
        oc_pod = ocp.OCP(kind=constants.POD, namespace=ROOK_CLUSTER_NAMESPACE)
        oc_pod.wait_for_delete(resource_name=pod_mgr_name[0])

        log.info(f"Upscaling deployment {mgr} back to 1")
        oc_deployment.exec_oc_cmd(f"scale --replicas=1 deployment/{mgr}")

        log.info("Waiting for mgr pod to be reach Running state")
        oc_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING, selector=constants.MGR_APP_LABEL
        )

        # Check ceph metrics available
        check_ceph_metrics_available_within_time()
class TestJenkinsNodeReboot(E2ETest):
    """
    Run Jenkins builds while a node is being rebooted
    """

    @pytest.fixture()
    def jenkins_setup(self, jenkins):
        """
        Prepare the Jenkins test: create the Sanity helper and
        deploy the OCS Jenkins template
        """
        self.sanity_helpers = Sanity()
        jenkins.create_ocs_jenkins_template()

    @pytest.mark.parametrize(
        argnames=["node_type", "num_projects", "num_of_builds"],
        argvalues=[
            pytest.param(
                MASTER_MACHINE, 2, 15, marks=pytest.mark.polarion_id("OCS-2202")
            ),
            pytest.param(
                WORKER_MACHINE, 2, 15, marks=pytest.mark.polarion_id("OCS-2178")
            ),
        ],
    )
    @pytest.mark.usefixtures(jenkins_setup.__name__)
    def test_run_jenkins_node_reboot(
        self, jenkins, nodes, node_type, num_projects, num_of_builds
    ):
        """
        Start Jenkins builds, reboot one node of the requested type that
        does not host a Jenkins pod, and wait for the builds to complete

        Args:
            jenkins: Jenkins fixture object driving the workload
            nodes: Platform nodes fixture used to restart the node
            node_type (str): Type of the node to reboot
            num_projects (int): Number of Jenkins projects
            num_of_builds (int): Number of builds per project
        """
        # Bring up the Jenkins app, its pvc and build config for all projects
        jenkins.number_projects = num_projects
        jenkins.create_app_jenkins()
        jenkins.create_jenkins_pvc()
        jenkins.create_jenkins_build_config()

        # The deploy pod has to reach the completed state before going on
        jenkins.wait_for_jenkins_deploy_status(status=STATUS_COMPLETED)

        # Pick one node of the requested type without a jenkins pod on it
        reboot_candidates = jenkins.get_node_name_where_jenkins_pod_not_hosted(
            node_type=node_type, num_of_nodes=1
        )

        # Kick off the builds
        jenkins.number_builds_per_project = num_of_builds
        jenkins.start_build()

        # Reboot the selected node while the builds are running
        if len(reboot_candidates) == 0:
            log.info("No node was reboot")
        else:
            nodes.restart_nodes(get_node_objs(reboot_candidates))

        # Builds must reach 'Complete'; then print the results table
        jenkins.wait_for_build_to_complete()
        jenkins.print_completed_builds_results()

        # Cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=40)
class TestRegistryRebootNode(E2ETest):
    """
    Run the svt workload that pushes images to the registry
    while nodes are being rebooted
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Create the Sanity helper used for the health checks
        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def setup(self, project_factory, node_restart_teardown):
        """
        Create the test project

        ``node_restart_teardown`` is requested as a fixture only; it is not
        referenced in the body.
        """
        self.project_name = "test"
        project_factory(project_name=self.project_name)

    def _push_images_and_validate(self):
        """
        Pull and push images to the registry, then validate they exist
        """
        log.info("Pull and push images to registries")
        image_pull_and_push(
            project_name=self.project_name,
            template="eap-cd-basic-s2i",
            image="registry.redhat.io/jboss-eap-7-tech-preview/eap-cd-openshift-rhel8:latest",
            pattern="eap-app",
        )
        validate_image_exists(namespace=self.project_name)

    @staticmethod
    def _wait_for_cluster_and_nodes():
        """
        Wait, with retries, for cluster connectivity and then for node status
        """
        retriable = (
            CommandFailed,
            TimeoutError,
            AssertionError,
            ResourceWrongStatusException,
        )
        wait_connectivity = retry(retriable, tries=60, delay=15)(
            wait_for_cluster_connectivity
        )
        wait_connectivity(tries=400)
        wait_nodes = retry(retriable, tries=60, delay=15)(wait_for_nodes_status)
        wait_nodes(timeout=900)

    def _validate_cluster_and_registry(self):
        """
        Check cluster health, storage pods, registry pods and pushed images
        """
        self.sanity_helpers.health_check(tries=40)
        wait_for_storage_pods()
        validate_registry_pod_status()
        validate_image_exists(namespace=self.project_name)

    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(MASTER_MACHINE, marks=pytest.mark.polarion_id("OCS-1803")),
            pytest.param(WORKER_MACHINE, marks=pytest.mark.polarion_id("OCS-1795")),
        ],
    )
    def test_registry_reboot_node(self, node_type, nodes):
        """
        Registry workload backed by OCS with a reboot of a single node
        """
        # Select one node of the requested type
        selected = get_nodes(node_type, num_of_nodes=1)

        # Populate the registry and make sure the images are there
        self._push_images_and_validate()

        # Reboot the node without waiting for it to come back
        nodes.restart_nodes(selected, wait=False)

        # Cluster and nodes have to recover
        self._wait_for_cluster_and_nodes()

        # Health, storage pods, registry pods and images must all be fine
        self._validate_cluster_and_registry()

    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(MASTER_MACHINE, marks=pytest.mark.polarion_id("OCS-1802")),
            pytest.param(WORKER_MACHINE, marks=pytest.mark.polarion_id("OCS-1804")),
        ],
    )
    def test_registry_rolling_reboot_node(self, node_type, nodes):
        """
        Registry workload backed by OCS with the nodes rebooted one by one
        """
        # All nodes of the requested type
        targets = get_nodes(node_type)

        # Populate the registry and make sure the images are there
        self._push_images_and_validate()

        for target in targets:
            # Reboot the node without waiting for it to come back
            log.info(target.name)
            nodes.restart_nodes([target], wait=False)

            # Give the reboot some time to kick in
            waiting_time = 40
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Cluster and nodes have to recover before the next reboot
            self._wait_for_cluster_and_nodes()

        # Health, storage pods, registry pods and images must all be fine
        self._validate_cluster_and_registry()