def test_2_nodes_different_types(self, pvc_factory, pod_factory):
    """
    OCS-1274:
    - Maintenance (mark as unschedulable and drain) 1 worker node and 1 master node
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    - Mark the nodes as schedulable
    - Check cluster and Ceph health
    """
    # Get 1 node from each type
    nodes = [
        node.get_typed_nodes(
            node_type=node_type, num_of_nodes=1
        )[0] for node_type in ['worker', 'master']
    ]
    assert nodes, "Failed to find nodes for the test"
    node_names = [typed_node.name for typed_node in nodes]

    # Maintenance the nodes (unschedule and drain)
    node.drain_nodes(node_names)

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Mark the nodes back to schedulable
    node.schedule_nodes(node_names)

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def finalizer():
    # Validate all nodes are schedulable
    scheduling_disabled_nodes = [
        n.name for n in get_node_objs() if n.ocp.get_resource_status(
            n.name) == constants.NODE_READY_SCHEDULING_DISABLED
    ]
    if scheduling_disabled_nodes:
        schedule_nodes(scheduling_disabled_nodes)

    # Validate all nodes are in READY state
    not_ready_nodes = [
        n for n in get_node_objs()
        if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
    ]
    if not_ready_nodes:
        log.warning(
            f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
        )
        nodes.restart_nodes_by_stop_and_start(not_ready_nodes)
        wait_for_nodes_status()

    log.info("All nodes are in Ready status")

    assert prometheus_health_check(), "Prometheus health is degraded"
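# Note: a teardown like the finalizer above is normally registered from a pytest
# fixture via request.addfinalizer. The sketch below shows that wiring only as an
# illustration; the fixture name and the assumption that get_node_objs/schedule_nodes
# are importable from the same helpers used above are mine, not taken from ocs-ci.
import pytest


@pytest.fixture()
def uncordon_nodes_teardown(request):
    """Hypothetical fixture: uncordon any node left in Ready,SchedulingDisabled."""

    def finalizer():
        # Same recovery logic as the finalizer above
        scheduling_disabled_nodes = [
            n.name
            for n in get_node_objs()
            if n.ocp.get_resource_status(n.name)
            == constants.NODE_READY_SCHEDULING_DISABLED
        ]
        if scheduling_disabled_nodes:
            schedule_nodes(scheduling_disabled_nodes)

    request.addfinalizer(finalizer)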
def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
    """
    OCS-1269/OCS-1272:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    - Mark the node as schedulable
    - Check cluster and Ceph health
    """
    # Get 1 node
    typed_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
    typed_node_name = typed_nodes[0].name

    # Maintenance the node (unschedule and drain)
    node.drain_nodes([typed_node_name])

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Mark the node back to schedulable
    node.schedule_nodes([typed_node_name])

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
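# For readers unfamiliar with what drain_nodes/schedule_nodes ultimately do: they
# drive the standard OpenShift node-maintenance commands. The stand-alone sketch
# below uses the `oc` CLI directly; the exact flag set and timeout are assumptions
# for illustration, not a copy of the ocs-ci implementation.
import subprocess


def drain_node(node_name, timeout="1800s"):
    """Cordon and drain a node, evicting everything except DaemonSet pods."""
    subprocess.run(
        [
            "oc", "adm", "drain", node_name,
            "--force",
            "--ignore-daemonsets",
            "--delete-emptydir-data",
            f"--timeout={timeout}",
        ],
        check=True,
    )


def uncordon_node(node_name):
    """Mark the node schedulable again."""
    subprocess.run(["oc", "adm", "uncordon", node_name], check=True)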
def test_node_maintenance_restart_activate(self, nodes, pvc_factory, pod_factory, node_type):
    """
    OCS-1292/OCS-1293:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Restart the node
    - Mark the node as schedulable
    - Check cluster and Ceph health
    - Check cluster functionality by creating and deleting resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    """
    # Get 1 node of the type needed for the test iteration
    typed_nodes = get_nodes(node_type=node_type, num_of_nodes=1)
    assert typed_nodes, f"Failed to find a {node_type} node for the test"
    typed_node_name = typed_nodes[0].name

    reboot_events_cmd = (
        f"get events -A --field-selector involvedObject.name="
        f"{typed_node_name},reason=Rebooted -o yaml"
    )

    # Find the number of reboot events in 'typed_node_name'
    num_events = len(
        typed_nodes[0].ocp.exec_oc_cmd(reboot_events_cmd)["items"]
    )

    # Maintenance the node (unschedule and drain). The function contains logging
    drain_nodes([typed_node_name])

    # Restarting the node
    nodes.restart_nodes(nodes=typed_nodes, wait=False)

    try:
        wait_for_nodes_status(
            node_names=[typed_node_name],
            status=constants.NODE_NOT_READY_SCHEDULING_DISABLED,
        )
    except ResourceWrongStatusException:
        # Sometimes, the node will be back to running state quickly so
        # that the status change won't be detected. Verify the node was
        # actually restarted by checking the reboot events count
        new_num_events = len(
            typed_nodes[0].ocp.exec_oc_cmd(reboot_events_cmd)["items"]
        )
        assert new_num_events > num_events, (
            f"Reboot event not found. "
            f"Node {typed_node_name} did not restart."
        )

    wait_for_nodes_status(
        node_names=[typed_node_name],
        status=constants.NODE_READY_SCHEDULING_DISABLED,
    )

    # Mark the node back to schedulable
    schedule_nodes([typed_node_name])

    # Check cluster and Ceph health and check basic cluster
    # functionality by creating resources (pools, storageclasses,
    # PVCs, pods - both CephFS and RBD), run IO and delete the resources
    self.sanity_helpers.health_check()
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()
def test_node_maintenance_restart_activate(self, nodes, pvc_factory, pod_factory, node_type):
    """
    OCS-1292/OCS-1293:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Restart the node
    - Mark the node as schedulable
    - Check cluster and Ceph health
    - Check cluster functionality by creating and deleting resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    """
    # Get 1 node of the type needed for the test iteration
    typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
    assert typed_nodes, f"Failed to find a {node_type} node for the test"
    typed_node_name = typed_nodes[0].name

    # Maintenance the node (unschedule and drain). The function contains logging
    drain_nodes([typed_node_name])

    # Restarting the node
    nodes.restart_nodes(nodes=typed_nodes, wait=True)

    wait_for_nodes_status(
        node_names=[typed_node_name],
        status=constants.NODE_READY_SCHEDULING_DISABLED
    )

    # Mark the node back to schedulable
    schedule_nodes([typed_node_name])

    # Check cluster and Ceph health and check basic cluster
    # functionality by creating resources (pools, storageclasses,
    # PVCs, pods - both CephFS and RBD), run IO and delete the resources
    self.sanity_helpers.health_check()
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()
def finalizer():
    scheduling_disabled_nodes = [
        n.name for n in get_node_objs() if n.ocp.get_resource_status(
            n.name) == constants.NODE_READY_SCHEDULING_DISABLED
    ]
    if scheduling_disabled_nodes:
        schedule_nodes(scheduling_disabled_nodes)
def test_node_maintenance(self, reduce_and_resume_cluster_load, node_type, pvc_factory, pod_factory):
    """
    OCS-1269/OCS-1272:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    - Mark the node as schedulable
    - Check cluster and Ceph health
    """
    # Get 1 node of the type needed for the test iteration
    typed_nodes = get_nodes(node_type=node_type, num_of_nodes=1)
    assert typed_nodes, f"Failed to find a {node_type} node for the test"
    typed_node_name = typed_nodes[0].name

    # Maintenance the node (unschedule and drain)
    drain_nodes([typed_node_name])

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Mark the node back to schedulable
    schedule_nodes([typed_node_name])

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check(tries=90)
def test_run_couchbase_node_drain(self, cb_setup, node_type='master'):
    """
    Test couchbase workload with node drain
    """
    # Check worker node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(
        node_type='worker', print_table=True
    )

    # Node drain with specific node type
    typed_nodes = node.get_typed_nodes(
        node_type=node_type, num_of_nodes=1
    )
    typed_node_name = typed_nodes[0].name

    # Node maintenance - to gracefully terminate all pods on the node
    node.drain_nodes([typed_node_name])

    # Make the node schedulable again
    node.schedule_nodes([typed_node_name])

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()

    for sample in TimeoutSampler(300, 5, self.cb.result.done):
        if sample:
            break
        else:
            logging.info(
                "#### ....Waiting for couchbase threads to complete..."
            )
    utils.ceph_health_check()
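# The `for sample in TimeoutSampler(300, 5, func)` idiom above polls a callable
# every 5 seconds for up to 300 seconds, yielding its return value each round.
# If the ocs-ci helper is not available, the same behaviour can be approximated
# with a plain loop; this is a simplified sketch, not the library implementation.
import time


def poll_until_true(func, timeout=300, sleep=5):
    """Call func every `sleep` seconds until it returns truthy or `timeout` expires."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if func():
            return True
        time.sleep(sleep)
    return False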
def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
    """
    OCS-1269/OCS-1272:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    - Mark the node as schedulable
    - Check cluster and Ceph health
    """
    # Get a list of 2 nodes. Pick one of them after checking
    # which one doesn't have the rook operator running on it
    typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=2)
    typed_node_name = typed_nodes[0].name

    # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
    rook_operator_pod = pod.get_operator_pods()[0]
    operator_node = pod.get_pod_node(rook_operator_pod)
    if operator_node.get().get('metadata').get('name') == typed_node_name:
        typed_node_name = typed_nodes[1].name
    # End of workaround for BZ 1778488

    # Maintenance the node (unschedule and drain)
    drain_nodes([typed_node_name])

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Mark the node back to schedulable
    schedule_nodes([typed_node_name])

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def test_2_nodes_maintenance_same_type(
    self, pvc_factory, pod_factory, nodes_type
):
    """
    OCS-1273/OCS-1271:
    - Maintenance (mark as unschedulable and drain) 2 worker/master nodes
    - Mark the nodes as schedulable
    - Check cluster and Ceph health
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    """
    # Get 2 nodes
    typed_nodes = node.get_typed_nodes(node_type=nodes_type, num_of_nodes=2)
    assert typed_nodes, f"Failed to find {nodes_type} nodes for the test"

    typed_node_names = [typed_node.name for typed_node in typed_nodes]

    # Maintenance the nodes (unschedule and drain)
    node.drain_nodes(typed_node_names)

    # Mark the nodes back to schedulable
    node.schedule_nodes(typed_node_names)

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()
def test_run_pgsql_node_drain(self, pgsql, transactions=5600, node_type="worker"):
    """
    Test pgsql workload with node drain
    """
    # Create pgbench benchmark
    pgsql.create_pgbench_benchmark(replicas=3, transactions=transactions, clients=3)

    # Start measuring time
    start_time = datetime.now()

    # Wait for pgbench pod to reach running state
    pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

    # Check worker node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(node_type="worker", print_table=True)

    # Select a node where pgbench is not running for drain
    typed_nodes = [
        node1.name for node1 in node.get_nodes(node_type=node_type)
    ]
    filter_list = pgsql.filter_pgbench_nodes_from_nodeslist(typed_nodes)
    typed_node_name = filter_list[random.randint(0, len(filter_list) - 1)]
    log.info(f"Selected node {typed_node_name} for node drain operation")

    # Node maintenance - to gracefully terminate all pods on the node
    node.drain_nodes([typed_node_name])

    # Make the node schedulable again
    node.schedule_nodes([typed_node_name])

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check(tries=40)

    # Wait for pg_bench pod to complete
    pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

    # Calculate the time from running state to completed state
    end_time = datetime.now()
    diff_time = end_time - start_time
    log.info(
        f"\npgbench pod reached the completed state after {diff_time.seconds} seconds\n"
    )

    # Get pgbench pods
    pgbench_pods = pgsql.get_pgbench_pods()

    # Validate pgbench run and parse logs
    pgsql.validate_pgbench_run(pgbench_pods)
def test_run_pgsql_node_drain(self, pgsql, transactions=900, node_type="master"):
    """
    Test pgsql workload with node drain
    """
    # Create pgbench benchmark
    pgsql.create_pgbench_benchmark(replicas=3, transactions=transactions, clients=3)

    # Start measuring time
    start_time = datetime.now()

    # Wait for pgbench pod to reach running state
    pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

    # Check worker node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(node_type="worker", print_table=True)

    # Node drain with specific node type
    typed_nodes = node.get_nodes(node_type=node_type, num_of_nodes=1)
    typed_node_name = typed_nodes[0].name

    # Node maintenance - to gracefully terminate all pods on the node
    node.drain_nodes([typed_node_name])

    # Make the node schedulable again
    node.schedule_nodes([typed_node_name])

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()

    # Wait for pg_bench pod to complete
    pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

    # Calculate the time from running state to completed state
    end_time = datetime.now()
    diff_time = end_time - start_time
    log.info(
        f"\npgbench pod reached the completed state after {diff_time.seconds} seconds\n"
    )

    # Get pgbench pods
    pgbench_pods = pgsql.get_pgbench_pods()

    # Validate pgbench run and parse logs
    pgsql.validate_pgbench_run(pgbench_pods)
def check_automated_recovery_from_drain_node(nodes):
    """
    1) Drain one worker node.
    2) Delete the OSD pods associated with the node.
    3) The new OSD pods with the same ids that come up should be in a Pending state.
    4) Schedule the worker node.
    5) The OSD pods associated with the node should go back into a Running state,
       and come up on the same node.
    """
    osd_node_name = random.choice(get_osd_running_nodes())
    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")
    node_osd_pods = get_osd_pods_having_ids(old_osd_pod_ids)

    unschedule_nodes([osd_node_name])
    log.info(f"Successfully unscheduled the node: {osd_node_name}")

    log.info("Delete the node osd pods")
    delete_pods(node_osd_pods)

    new_osd_pods = wait_for_osd_pods_having_ids(osd_ids=old_osd_pod_ids)
    new_osd_pod_names = [p.name for p in new_osd_pods]

    wnodes = get_worker_nodes()
    if len(wnodes) <= 3:
        expected_pods_status = constants.STATUS_PENDING
    else:
        expected_pods_status = constants.STATUS_RUNNING

    log.info(
        f"Verify the new osd pods {new_osd_pod_names} go into a {expected_pods_status} state"
    )
    res = wait_for_pods_to_be_in_statuses(
        [expected_pods_status],
        new_osd_pod_names,
        raise_pod_not_found_error=True,
    )
    assert res, f"Not all the node osd pods are in a {expected_pods_status} state"

    log.info(f"Wait for the node: {osd_node_name} to be scheduled")
    schedule_nodes([osd_node_name])
    log.info(f"Successfully scheduled the node {osd_node_name}")

    if len(wnodes) <= 3:
        assert wait_for_osd_ids_come_up_on_node(osd_node_name, old_osd_pod_ids)
        log.info(
            f"The osd ids {old_osd_pod_ids} successfully came up on the node {osd_node_name}"
        )
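# Rook labels each OSD pod with a `ceph-osd-id` label, so a helper like
# get_node_osd_ids above can presumably be approximated by listing the OSD pods
# scheduled on the node and reading that label. The sketch below is an
# illustration of that idea only, not the ocs-ci implementation; the namespace
# and label names are assumptions based on a default Rook/OCS install.
import json
import subprocess


def osd_ids_on_node(node_name, namespace="openshift-storage"):
    """Return the ceph-osd-id labels of OSD pods currently running on the given node."""
    out = subprocess.run(
        [
            "oc", "get", "pods",
            "-n", namespace,
            "-l", "app=rook-ceph-osd",
            "--field-selector", f"spec.nodeName={node_name}",
            "-o", "json",
        ],
        check=True,
        capture_output=True,
        text=True,
    ).stdout
    pods = json.loads(out)["items"]
    return [p["metadata"]["labels"]["ceph-osd-id"] for p in pods]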
def test_run_jenkins_drain_node(
    self, jenkins, node_type, num_projects, num_of_builds
):
    """
    Test jenkins workload with node drain
    """
    # Init number of projects
    jenkins.number_projects = num_projects

    # Create app jenkins
    jenkins.create_app_jenkins()

    # Create jenkins pvc
    jenkins.create_jenkins_pvc()

    # Create jenkins build config
    jenkins.create_jenkins_build_config()

    # Wait for the jenkins deploy pod to reach the completed state
    jenkins.wait_for_jenkins_deploy_status(status=STATUS_COMPLETED)

    # Get relevant node
    nodes_drain = jenkins.get_node_name_where_jenkins_pod_not_hosted(
        node_type=node_type, num_of_nodes=1
    )

    # Init number of builds per project
    jenkins.number_builds_per_project = num_of_builds

    # Start Builds
    jenkins.start_build()

    if len(nodes_drain) > 0:
        # Node maintenance - to gracefully terminate all pods on the node
        drain_nodes(nodes_drain)
        # Make the node schedulable again
        schedule_nodes(nodes_drain)

    # Wait for builds to reach the 'Complete' state
    jenkins.wait_for_build_to_complete()

    # Print table of builds
    jenkins.print_completed_builds_results()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check(tries=40)
def finalizer():
    """
    Make sure that none of the cluster's nodes are left in a
    'Ready,SchedulingDisabled' state, and mark any such nodes as
    schedulable again
    """
    scheduling_disabled_nodes = [
        n.name for n in get_node_objs() if n.ocp.get_resource_status(
            n.name) == constants.NODE_READY_SCHEDULING_DISABLED
    ]
    if scheduling_disabled_nodes:
        schedule_nodes(scheduling_disabled_nodes)

    # Remove label created for DC app pods on all worker nodes
    node_objs = get_node_objs()
    for node_obj in node_objs:
        if "dc" in node_obj.get().get("metadata").get("labels").keys():
            remove_label_from_worker_node([node_obj.name], label_key="dc")
def test_node_maintenance_restart_activate(self, nodes, pvc_factory, pod_factory, node_type):
    """
    OCS-1292/OCS-1293:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Restart the node
    - Mark the node as schedulable
    - Check cluster and Ceph health
    - Check cluster functionality by creating and deleting resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    """
    # Get a list of 2 nodes. Pick one of them after checking
    # which one doesn't have the rook operator running on it
    typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=2)
    assert typed_nodes, f"Failed to find a {node_type} node for the test"
    typed_node_name = typed_nodes[0].name

    # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
    rook_operator_pod = pod.get_operator_pods()[0]
    operator_node = pod.get_pod_node(rook_operator_pod)
    if operator_node.get().get('metadata').get('name') == typed_node_name:
        typed_node_name = typed_nodes[1].name
    # End of workaround for BZ 1778488

    # Maintenance the node (unschedule and drain). The function contains logging
    drain_nodes([typed_node_name])

    # Restarting the node
    nodes.restart_nodes(nodes=typed_nodes, wait=True)

    wait_for_nodes_status(
        node_names=[typed_node_name],
        status=constants.NODE_READY_SCHEDULING_DISABLED
    )

    # Mark the node back to schedulable
    schedule_nodes([typed_node_name])

    # Check cluster and Ceph health and check basic cluster
    # functionality by creating resources (pools, storageclasses,
    # PVCs, pods - both CephFS and RBD), run IO and delete the resources
    self.sanity_helpers.health_check()
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()
def test_run_couchbase_node_drain(self, cb_setup, node_type="master"): """ Test couchbase workload with node drain """ # Check worker node utilization (adm_top) get_node_resource_utilization_from_adm_top(node_type="worker", print_table=True) # Node drain with specific node type typed_nodes = node.get_nodes(node_type=node_type, num_of_nodes=1) typed_node_name = typed_nodes[0].name # Node maintenance - to gracefully terminate all pods on the node node.drain_nodes([typed_node_name]) # Make the node schedulable again node.schedule_nodes([typed_node_name]) # Perform cluster and Ceph health checks bg_handler = flowtest.BackgroundOps() bg_ops = [self.cb.result] bg_handler.wait_for_bg_operations(bg_ops, timeout=3600) self.sanity_helpers.health_check()
def test_2_nodes_maintenance_same_type(self, nodes_type):
    """
    OCS-1273/OCS-1271:
    - Try draining 2 nodes from the same type - should fail
    - Check cluster and Ceph health
    """
    # Get 2 nodes
    typed_nodes = get_nodes(node_type=nodes_type, num_of_nodes=2)
    assert typed_nodes, f"Failed to find a {nodes_type} node for the test"

    typed_node_names = [typed_node.name for typed_node in typed_nodes]

    # Try draining 2 nodes - should fail
    try:
        drain_nodes(typed_node_names)
    except TimeoutExpired:
        log.info(f"Draining of nodes {typed_node_names} failed as expected")

    schedule_nodes(typed_node_names)

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def test_run_pgsql_node_drain(
    self, pgsql, transactions=900, node_type='master'
):
    """
    Test pgsql workload with node drain
    """
    # Create pgbench benchmark
    pgsql.create_pgbench_benchmark(
        replicas=3, transactions=transactions, clients=3
    )

    # Wait for pgbench pod to reach running state
    pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

    # Node drain with specific node type
    typed_nodes = node.get_typed_nodes(
        node_type=node_type, num_of_nodes=1
    )
    typed_node_name = typed_nodes[0].name

    # Node maintenance - to gracefully terminate all pods on the node
    node.drain_nodes([typed_node_name])

    # Make the node schedulable again
    node.schedule_nodes([typed_node_name])

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()

    # Wait for pg_bench pod to complete
    pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

    # Get pgbench pods
    pgbench_pods = pgsql.get_pgbench_pods()

    # Validate pgbench run and parse logs
    pgsql.validate_pgbench_run(pgbench_pods)
def measure_stop_ceph_mon(measurement_dir, create_mon_quorum_loss, threading_lock):
    """
    Downscales Ceph Monitor deployment, measures the time when it was
    downscaled and monitors alerts that were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for stopping
            Ceph Monitor pod
    """
    oc = ocp.OCP(
        kind=constants.DEPLOYMENT,
        namespace=config.ENV_DATA["cluster_namespace"],
        threading_lock=threading_lock,
    )
    mon_deployments = oc.get(selector=constants.MON_APP_LABEL)["items"]
    mons = [deployment["metadata"]["name"] for deployment in mon_deployments]

    # Get monitor deployments to stop:
    # if mon quorum is to be lost, split_index will be 1,
    # otherwise leave an even number of monitors running
    split_index = (
        1 if create_mon_quorum_loss else len(mons) // 2 if len(mons) > 3 else 2
    )
    mons_to_stop = mons[split_index:]
    logger.info(f"Monitors to stop: {mons_to_stop}")
    logger.info(f"Monitors left to run: {mons[:split_index]}")

    # run_time of operation
    run_time = 60 * 14

    def stop_mon():
        """
        Downscale Ceph Monitor deployments for 14 minutes. For the first
        15 minutes the alert CephMonQuorumAtRisk should be in 'Pending'
        state. After 15 minutes the alert turns into 'Firing' state.
        This configuration of monitoring can be observed in ceph-mixins
        which are used in the project:
            https://github.com/ceph/ceph-mixins/blob/d22afe8c0da34490cb77e52a202eefcf4f62a869/config.libsonnet#L16
        The 'Firing' state shouldn't actually happen because the monitor
        should be automatically redeployed shortly after 10 minutes.

        Returns:
            str: Names of downscaled deployments
        """
        nonlocal oc
        nonlocal mons_to_stop
        for mon in mons_to_stop:
            logger.info(f"Downscaling deployment {mon} to 0")
            oc.exec_oc_cmd(f"scale --replicas=0 deployment/{mon}")
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return mons_to_stop

    test_file = os.path.join(
        measurement_dir, f"measure_stop_ceph_mon_{split_index}.json"
    )

    if config.ENV_DATA["platform"].lower() in constants.MANAGED_SERVICE_PLATFORMS:
        # It seems that it takes longer to propagate incidents to PagerDuty.
        # Adding 6 extra minutes so that the alert is actually triggered, and
        # unscheduling worker nodes so that the monitor is not replaced
        worker_node_names = [
            node.name for node in get_nodes(node_type=constants.WORKER_MACHINE)
        ]
        unschedule_nodes(worker_node_names)
        measured_op = measure_operation(stop_mon, test_file, minimal_time=60 * 20)
        schedule_nodes(worker_node_names)
    else:
        measured_op = measure_operation(stop_mon, test_file)

    # Expected minimal downtime of a mon inflicted by this fixture
    measured_op["min_downtime"] = run_time - (60 * 2)

    # Get the new list of monitors to make sure that new monitors were deployed
    mon_deployments = oc.get(selector=constants.MON_APP_LABEL)["items"]
    mons = [deployment["metadata"]["name"] for deployment in mon_deployments]

    # Check that downscaled monitors are removed, as OCS should redeploy them,
    # but only when we are running this for the first time
    check_old_mons_deleted = all(mon not in mons for mon in mons_to_stop)
    if measured_op["first_run"] and not check_old_mons_deleted:
        for mon in mons_to_stop:
            logger.info(f"Upscaling deployment {mon} back to 1")
            oc.exec_oc_cmd(f"scale --replicas=1 deployment/{mon}")
        if (
            not split_index == 1
            and config.ENV_DATA["platform"].lower()
            not in constants.MANAGED_SERVICE_PLATFORMS
        ):
            msg = f"Downscaled monitors {mons_to_stop} were not replaced"
            assert check_old_mons_deleted, msg

    # Wait for ceph to return into HEALTH_OK state after the mon deployment
    # is returned back to normal
    ceph_health_check(tries=20, delay=15)

    return measured_op
def test_rook_operator_restart_during_mon_failover(self, node_drain_teardown):
    """
    Verify the number of mon pods is three when draining a node
    """
    sample = TimeoutSampler(
        timeout=100,
        sleep=10,
        func=verify_pdb_mon,
        disruptions_allowed=1,
        max_unavailable_mon=1,
    )
    assert sample.wait_for_func_status(result=True), (
        "The expected pdb state is not equal to the actual pdb state"
    )

    log.info("Get the worker node name where a mon pod is running")
    mon_pod_objs = get_mon_pods()
    node_name = mon_pod_objs[0].data["spec"]["nodeName"]

    drain_nodes([node_name])

    sample = TimeoutSampler(
        timeout=100,
        sleep=10,
        func=verify_pdb_mon,
        disruptions_allowed=0,
        max_unavailable_mon=1,
    )
    assert sample.wait_for_func_status(result=True), (
        "The expected pdb state is not equal to the actual pdb state"
    )

    timeout = 1400
    log.info(f"Verify the number of mon pods is 3 for {timeout} seconds")
    sample = TimeoutSampler(timeout=timeout, sleep=10, func=check_number_of_mon_pods)
    assert not sample.wait_for_func_status(result=False), (
        "There are more than 3 mon pods."
    )

    log.info("Respin the rook-ceph operator pod")
    rook_ceph_operator_pod_obj = get_operator_pods()
    rook_ceph_operator_pod_obj[0].delete()

    schedule_nodes([node_name])

    log.info("Wait for all the pods in openshift-storage to be running.")
    assert wait_for_pods_to_be_running(timeout=300)

    sample = TimeoutSampler(
        timeout=100,
        sleep=10,
        func=verify_pdb_mon,
        disruptions_allowed=1,
        max_unavailable_mon=1,
    )
    assert sample.wait_for_func_status(result=True), (
        "The expected pdb state is not equal to the actual pdb state"
    )

    ceph_health_check()

    assert check_number_of_mon_pods(), "The number of mon pods is not equal to 3"
def test_pdb_check_simultaneous_node_drains(
    self,
    pvc_factory,
    pod_factory,
    bucket_factory,
    rgw_bucket_factory,
    node_drain_teardown,
):
    """
    - Check for OSD PDBs before drain
    - Maintenance (mark as unschedulable and drain) 2 worker nodes with a delay of 30 secs
    - Drain will be completed on worker node A
    - Drain will be pending on worker node B due to blocking PDBs
    - Check mon failover in the first 10 mins, then 15 and 20 mins
    - Check the OSD PDBs
    - Mark node A as schedulable
    - Let the drain finish on node B
    - Again check mon failover in the first 10 mins and then in intervals
    - Mark node B as schedulable
    - Check cluster and Ceph health
    """
    # Validate OSD PDBs before drain operation
    assert (
        not validate_existence_of_blocking_pdb()
    ), "Blocking PDBs exist, can't perform drain"

    # Get 2 worker nodes to drain
    typed_nodes = get_nodes(num_of_nodes=2)
    assert len(typed_nodes) == 2, "Failed to find worker nodes for the test"
    node_A = typed_nodes[0].name
    node_B = typed_nodes[1].name

    # Drain Node A and validate blocking PDBs
    drain_nodes([node_A])
    assert (
        validate_existence_of_blocking_pdb()
    ), "Blocking PDBs not created post drain"

    # Inducing delay between 2 drains
    # Node-B drain expected to be pending due to blocking PDBs
    time.sleep(30)
    try:
        drain_nodes([node_B])
        # After the drain, check mon failover in the 10th, 15th and 20th min
        timeout = [600, 300, 300]
        for failover in timeout:
            sample = TimeoutSampler(
                timeout=failover,
                sleep=10,
                func=helpers.check_number_of_mon_pods,
            )
            assert sample.wait_for_func_status(result=True), (
                "Number of mon pods not equal to expected_mon_count=3"
            )
    except TimeoutExpired:
        # Mark node-A back to schedulable and let the drain finish on Node-B
        schedule_nodes([node_A])
        time.sleep(40)

    # Validate OSD PDBs
    assert (
        validate_existence_of_blocking_pdb()
    ), "Blocking PDBs not created post second drain"

    # Mark node-B back to schedulable and recover the cluster
    schedule_nodes([node_B])

    sample = TimeoutSampler(
        timeout=100,
        sleep=10,
        func=validate_existence_of_blocking_pdb,
    )
    if not sample.wait_for_func_status(result=False):
        log.error("Blocking PDBs still exist")

    # After the drain, check mon failover in the 10th, 15th and 20th min
    timeout = [600, 300, 300]
    for failover in timeout:
        sample = TimeoutSampler(
            timeout=failover,
            sleep=10,
            func=helpers.check_number_of_mon_pods,
        )
        assert sample.wait_for_func_status(result=True), (
            "Number of mon pods not equal to expected_mon_count=3"
        )

    sample = TimeoutSampler(
        timeout=100,
        sleep=10,
        func=verify_pdb_mon,
        disruptions_allowed=1,
        max_unavailable_mon=1,
    )
    assert sample.wait_for_func_status(result=True), (
        "The expected mon-pdb is not equal to the actual mon pdb"
    )

    # Wait for storage pods
    pod.wait_for_storage_pods()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check(tries=50)

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(
        pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    )
    self.sanity_helpers.delete_resources()
def run_in_bg(
    self, nodes, multiple_snapshot_and_clone_of_postgres_pvc_factory, sc_name=None
):
    log.info("Starting multiple creation & clone of postgres PVC in Background")
    bg_handler = flowtest.BackgroundOps()
    executor_run_bg_ops = ThreadPoolExecutor(max_workers=1)
    pgsql_snapshot_and_clone = executor_run_bg_ops.submit(
        bg_handler.handler,
        multiple_snapshot_and_clone_of_postgres_pvc_factory,
        pvc_size_new=25,
        pgsql=self.pgsql,
        sc_name=sc_name,
        iterations=1,
    )
    log.info("Started creation of snapshots & clones in background")

    flow_ops = flowtest.FlowOperations()

    log.info("Starting operation 1: Pod Restarts")
    disruption = Disruptions()
    pod_obj_list = [
        "osd",
        "mon",
        "mgr",
        "operator",
        "rbdplugin",
        "rbdplugin_provisioner",
    ]
    for pod in pod_obj_list:
        disruption.set_resource(resource=f"{pod}")
        disruption.delete_resource()
    log.info("Verifying exit criteria for operation 1: Pod Restarts")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Pod Restarts"
    )

    log.info("Starting operation 2: Node Reboot")
    node_names = flow_ops.node_operations_entry_criteria(
        node_type="worker", number_of_nodes=3, operation_name="Node Reboot"
    )
    # Reboot node
    nodes.restart_nodes(node_names)
    log.info("Verifying exit criteria for operation 2: Node Reboot")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node Reboot"
    )

    log.info("Starting operation 3: Node Drain")
    node_name = flow_ops.node_operations_entry_criteria(
        node_type="worker", number_of_nodes=1, operation_name="Node Drain"
    )
    # Node maintenance - to gracefully terminate all pods on the node
    drain_nodes([node_name[0].name])
    # Make the node schedulable again
    schedule_nodes([node_name[0].name])
    log.info("Verifying exit criteria for operation 3: Node Drain")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node Drain"
    )

    log.info("Waiting for background operations to be completed")
    bg_handler.wait_for_bg_operations([pgsql_snapshot_and_clone], timeout=600)
def test_rgw_host_node_failure(
    self, nodes, node_restart_teardown, node_drain_teardown, mcg_obj, bucket_factory
):
    """
    Test case to fail the node where RGW and the NooBaa DB are hosted
    and verify the new pods spin on a healthy node
    """
    # Get noobaa pods
    noobaa_pod_obj = get_noobaa_pods()

    # Get the node where noobaa-db is hosted
    noobaa_pod_node = None
    for noobaa_pod in noobaa_pod_obj:
        if noobaa_pod.name in [
            constants.NB_DB_NAME_46_AND_BELOW,
            constants.NB_DB_NAME_47_AND_ABOVE,
        ]:
            noobaa_pod_node = get_pod_node(noobaa_pod)
    if noobaa_pod_node is None:
        assert False, "Could not find the NooBaa DB pod"

    # Validate if the RGW pod and noobaa-db are hosted on the same node.
    # If not, make sure both pods are hosted on the same node
    log.info("Validate if RGW pod and noobaa-db are hosted on the same node")
    rgw_pod_obj = get_rgw_pods()
    rgw_pod_node_list = [
        rgw_pod.get().get("spec").get("nodeName") for rgw_pod in rgw_pod_obj
    ]
    if not list(
        set(rgw_pod_node_list).intersection(noobaa_pod_node.name.split())
    ):
        log.info(
            "Unschedule the other two nodes such that the RGW "
            "pod moves to the node where the NooBaa DB pod is hosted"
        )
        worker_node_list = get_worker_nodes()
        node_names = list(
            set(worker_node_list) - set(noobaa_pod_node.name.split())
        )
        unschedule_nodes(node_names=node_names)
        ocp_obj = OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        rgw_pod_obj[0].delete()
        ocp_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_count=len(rgw_pod_obj),
            selector=constants.RGW_APP_LABEL,
            timeout=300,
            sleep=5,
        )
        log.info("Schedule those nodes again")
        schedule_nodes(node_names=node_names)

        # Check the ceph health OK
        ceph_health_check(tries=90, delay=15)

        # Verify all storage pods are running
        wait_for_storage_pods()

        # Check again that the rgw pod moved to the node where the NooBaa DB pod is hosted
        rgw_pod_obj_list = get_rgw_pods()
        rgw_pod_node_list = [
            get_pod_node(rgw_pod_obj) for rgw_pod_obj in rgw_pod_obj_list
        ]
        value = [
            True if rgw_pod_node.name == noobaa_pod_node.name else False
            for rgw_pod_node in rgw_pod_node_list
        ]
        assert any(value), (
            "RGW Pod didn't move to the node where the NooBaa DB pod is "
            "hosted even after cordoning and uncordoning nodes. "
            f"RGW pod hosted: {rgw_pod_node_list} "
            f"NooBaa DB pod hosted: {noobaa_pod_node.name}"
        )

    log.info(
        "RGW and noobaa-db are hosted on the same node, starting the test execution"
    )
    rgw_pod_obj = get_rgw_pods()
    for rgw_pod in rgw_pod_obj:
        pod_node = rgw_pod.get().get("spec").get("nodeName")
        if pod_node == noobaa_pod_node.name:
            # Stop the node
            log.info(
                f"Stopping node {pod_node} where"
                f" rgw pod {rgw_pod.name} and NooBaa DB are hosted"
            )
            node_obj = get_node_objs(node_names=[pod_node])
            nodes.stop_nodes(node_obj)

            # Validate the old rgw pod went into terminating state
            wait_for_resource_state(
                resource=rgw_pod, state=constants.STATUS_TERMINATING, timeout=720
            )

            # Validate a new rgw pod spun
            ocp_obj = OCP(
                kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
            )
            ocp_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_count=len(rgw_pod_obj),
                selector=constants.RGW_APP_LABEL,
            )

            # Start the node
            nodes.start_nodes(node_obj)

            # Check the ceph health OK
            ceph_health_check(tries=90, delay=15)

            # Verify all storage pods are running
            wait_for_storage_pods()

            # Create OBC and read and write
            self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-2")

    # Verify cluster health
    self.sanity_helpers.health_check()
def test_pvc_snapshot_and_clone(
    self, nodes, multiple_snapshot_and_clone_of_postgres_pvc_factory
):
    """
    1. Deploy PGSQL workload
    2. Take a snapshot of the pgsql PVC.
    3. Create a new PVC out of that snapshot or restore snapshot
    4. Create a clone of restored snapshot
    5. Attach a new pgsql pod to it.
    6. Resize cloned pvc
    7. Create snapshots of cloned pvc and restore those snapshots
    8. Attach a new pgsql pod to it and Resize the new restored pvc
    9. Repeat the above steps in bg when performing base operation:
       restart pods, worker node reboot, node drain, device replacement
    """
    log.info("Starting multiple creation & clone of postgres PVC in Background")
    bg_handler = flowtest.BackgroundOps()
    executor_run_bg_ops = ThreadPoolExecutor(max_workers=1)
    pgsql_snapshot_and_clone = executor_run_bg_ops.submit(
        bg_handler.handler,
        multiple_snapshot_and_clone_of_postgres_pvc_factory,
        pvc_size_new=25,
        pgsql=self.pgsql,
        iterations=1,
    )
    log.info("Started creation of snapshots & clones in background")

    flow_ops = flowtest.FlowOperations()

    log.info("Starting operation 1: Pod Restarts")
    disruption = Disruptions()
    pod_obj_list = [
        "osd",
        "mon",
        "mgr",
        "operator",
        "rbdplugin",
        "rbdplugin_provisioner",
    ]
    for pod in pod_obj_list:
        disruption.set_resource(resource=f"{pod}")
        disruption.delete_resource()
    log.info("Verifying exit criteria for operation 1: Pod Restarts")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Pod Restarts"
    )

    log.info("Starting operation 2: Node Reboot")
    node_names = flow_ops.node_operations_entry_criteria(
        node_type="worker", number_of_nodes=3, operation_name="Node Reboot"
    )
    # Reboot node
    nodes.restart_nodes(node_names)
    log.info("Verifying exit criteria for operation 2: Node Reboot")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node Reboot"
    )

    log.info("Starting operation 3: Node Drain")
    node_name = flow_ops.node_operations_entry_criteria(
        node_type="worker", number_of_nodes=1, operation_name="Node Drain"
    )
    # Node maintenance - to gracefully terminate all pods on the node
    drain_nodes([node_name[0].name])
    # Make the node schedulable again
    schedule_nodes([node_name[0].name])
    log.info("Verifying exit criteria for operation 3: Node Drain")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node Drain"
    )

    log.info("Waiting for background operations to be completed")
    bg_handler.wait_for_bg_operations([pgsql_snapshot_and_clone], timeout=600)
def test_mcg_namespace_disruptions_crd(
    self,
    mcg_obj,
    cld_mgr,
    awscli_pod,
    bucketclass_dict,
    bucket_factory,
    node_drain_teardown,
):
    """
    Test MCG namespace disruption flow

    1. Create NS resources with CRDs
    2. Create NS bucket with CRDs
    3. Upload to NS bucket
    4. Delete noobaa related pods and verify integrity of objects
    5. Create public access policy on NS bucket and verify Get op
    6. Drain nodes containing noobaa pods and verify integrity of objects
    7. Perform put operation to validate public access denial
    8. Edit/verify and remove objects on NS bucket
    """
    data = "Sample string content to write to a S3 object"
    object_key = "ObjKey-" + str(uuid.uuid4().hex)
    awscli_node_name = awscli_pod.get()["spec"]["nodeName"]

    aws_s3_creds = {
        "access_key_id": cld_mgr.aws_client.access_key,
        "access_key": cld_mgr.aws_client.secret_key,
        "endpoint": constants.MCG_NS_AWS_ENDPOINT,
        "region": config.ENV_DATA["region"],
    }

    # S3 account details
    user_name = "nb-user" + str(uuid.uuid4().hex)
    email = user_name + "@mail.com"

    logger.info("Setting up test files for upload, to the bucket/resources")
    setup_base_objects(awscli_pod, MCG_NS_ORIGINAL_DIR, MCG_NS_RESULT_DIR, amount=3)

    # Create the namespace resource and verify health
    ns_buc = bucket_factory(
        amount=1,
        interface=bucketclass_dict["interface"],
        bucketclass=bucketclass_dict,
    )[0]

    ns_bucket = ns_buc.name
    aws_target_bucket = ns_buc.bucketclass.namespacestores[0].uls_name

    logger.info(f"Namespace bucket: {ns_bucket} created")

    logger.info(f"Uploading objects to ns bucket: {ns_bucket}")
    sync_object_directory(
        awscli_pod,
        src=MCG_NS_ORIGINAL_DIR,
        target=f"s3://{ns_bucket}",
        s3_obj=mcg_obj,
    )

    for pod_to_respin in self.labels_map:
        logger.info(f"Re-spinning mcg resource: {self.labels_map[pod_to_respin]}")
        pod_obj = pod.Pod(
            **pod.get_pods_having_label(
                label=self.labels_map[pod_to_respin],
                namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            )[0]
        )

        pod_obj.delete(force=True)

        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=self.labels_map[pod_to_respin],
            resource_count=1,
            timeout=300,
        )

        logger.info(
            f"Downloading objects from ns bucket: {ns_bucket} "
            f"after re-spinning: {self.labels_map[pod_to_respin]}"
        )
        sync_object_directory(
            awscli_pod,
            src=f"s3://{ns_bucket}",
            target=MCG_NS_RESULT_DIR,
            s3_obj=mcg_obj,
        )

        logger.info(
            f"Verifying integrity of objects "
            f"after re-spinning: {self.labels_map[pod_to_respin]}"
        )
        compare_directory(
            awscli_pod, MCG_NS_ORIGINAL_DIR, MCG_NS_RESULT_DIR, amount=3
        )

    # S3 account
    user = NoobaaAccount(mcg_obj, name=user_name, email=email, buckets=[ns_bucket])
    logger.info(f"Noobaa account: {user.email_id} with S3 access created")

    # Admin sets Public access policy(*)
    bucket_policy_generated = gen_bucket_policy(
        user_list=["*"],
        actions_list=["GetObject"],
        resources_list=[f'{ns_bucket}/{"*"}'],
    )
    bucket_policy = json.dumps(bucket_policy_generated)

    logger.info(
        f"Creating bucket policy on bucket: {ns_bucket} with wildcard (*) Principal"
    )
    put_policy = put_bucket_policy(mcg_obj, ns_bucket, bucket_policy)
    logger.info(f"Put bucket policy response from Admin: {put_policy}")

    logger.info(f"Getting bucket policy on bucket: {ns_bucket}")
    get_policy = get_bucket_policy(mcg_obj, ns_bucket)
    logger.info(f"Got bucket policy: {get_policy['Policy']}")

    # MCG admin writes an object to bucket
    logger.info(f"Writing object on bucket: {ns_bucket} by admin")
    assert s3_put_object(mcg_obj, ns_bucket, object_key, data), "Failed: PutObject"

    # Verifying whether Get operation is allowed to any S3 user
    logger.info(
        f"Get object action on namespace bucket: {ns_bucket} "
        f"with user: {user.email_id}"
    )
{ns_bucket} " f"with user: {user.email_id}" ) assert s3_get_object(user, ns_bucket, object_key), "Failed: GetObject" # Upload files to NS target logger.info( f"Uploading objects directly to ns resource target: {aws_target_bucket}" ) sync_object_directory( awscli_pod, src=MCG_NS_ORIGINAL_DIR, target=f"s3://{aws_target_bucket}", signed_request_creds=aws_s3_creds, ) for pod_to_drain in self.labels_map: pod_obj = pod.Pod( **pod.get_pods_having_label( label=self.labels_map[pod_to_drain], namespace=defaults.ROOK_CLUSTER_NAMESPACE, )[0] ) # Retrieve the node name on which the pod resides node_name = pod_obj.get()["spec"]["nodeName"] if awscli_node_name == node_name: logger.info( f"Skipping node drain since aws cli pod node: " f"{awscli_node_name} is same as {pod_to_drain} " f"pod node: {node_name}" ) continue # Drain the node drain_nodes([node_name]) wait_for_nodes_status( [node_name], status=constants.NODE_READY_SCHEDULING_DISABLED ) schedule_nodes([node_name]) wait_for_nodes_status(timeout=300) # Retrieve the new pod pod_obj = pod.Pod( **pod.get_pods_having_label( label=self.labels_map[pod_to_drain], namespace=defaults.ROOK_CLUSTER_NAMESPACE, )[0] ) wait_for_resource_state(pod_obj, constants.STATUS_RUNNING, timeout=120) # Verify all storage pods are running wait_for_storage_pods() logger.info( f"Downloading objects from ns bucket: {ns_bucket} " f"after draining node: {node_name} with pod {pod_to_drain}" ) sync_object_directory( awscli_pod, src=f"s3://{ns_bucket}", target=MCG_NS_RESULT_DIR, s3_obj=mcg_obj, ) logger.info( f"Verifying integrity of objects " f"after draining node with pod: {pod_to_drain}" ) compare_directory( awscli_pod, MCG_NS_ORIGINAL_DIR, MCG_NS_RESULT_DIR, amount=3 ) logger.info(f"Editing the namespace resource bucket: {ns_bucket}") namespace_bucket_update( mcg_obj, bucket_name=ns_bucket, read_resource=[aws_target_bucket], write_resource=aws_target_bucket, ) logger.info(f"Verifying object download after edit on ns bucket: {ns_bucket}") sync_object_directory( awscli_pod, src=f"s3://{ns_bucket}", target=MCG_NS_RESULT_DIR, s3_obj=mcg_obj, ) # Verifying whether Put object action is denied logger.info( f"Verifying whether user: {user.email_id} has only public read access" ) logger.info(f"Removing objects from ns bucket: {ns_bucket}") rm_object_recursive(awscli_pod, target=ns_bucket, mcg_obj=mcg_obj)
def test_crashcollector_pod_existence_on_ceph_pods_running_nodes(
    self, add_nodes, node_drain_teardown
):
    """
    Add node with OCS label and verify crashcollector created on new node
    """
    failure_domain = get_failure_domin()
    logger.info(f"The failure domain is {failure_domain}")

    if failure_domain in ("zone", "rack"):
        old_node_rack_zone_dict = get_node_rack_or_zone_dict(failure_domain)
        logger.info(f"The old node rack/zone dict is {old_node_rack_zone_dict}")

    old_nodes = get_node_names()

    logger.info("Add one worker node with OCS label")
    add_nodes(ocs_nodes=True, node_count=1)

    new_node_name = list(set(get_node_names()) - set(old_nodes))[0]
    new_node = get_node_objs([new_node_name])[0]
    logger.info(f"New worker node is {new_node_name}")

    logger.info(f"Checking if the rack/zone of the node {new_node_name} exists")
    timeout = 120
    sample = TimeoutSampler(
        timeout=timeout,
        sleep=10,
        func=self.is_node_rack_or_zone_exist,
        node_obj=get_node_objs([new_node_name])[0],
        failure_domain=failure_domain,
    )
    assert sample.wait_for_func_status(
        result=True
    ), f"Didn't find the node rack/zone after {timeout} seconds"

    if failure_domain in ("zone", "rack"):
        new_node_rack_zone_dict = get_node_rack_or_zone_dict(failure_domain)
        logger.info(f"The new node rack/zone dict is {new_node_rack_zone_dict}")

        new_rack_zone = get_node_rack_or_zone(failure_domain, new_node)
        logger.info(f"New worker node {new_node_name} in zone/rack {new_rack_zone}")

        for node, rack_zone in old_node_rack_zone_dict.items():
            if rack_zone == new_rack_zone:
                drain_node = node
    else:
        drain_node = old_nodes[0]

    drain_nodes([drain_node])

    logging.info("Wait for 3 mon pods to be in Running state")
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    assert pod.wait_for_resource(
        condition="Running",
        selector=constants.MON_APP_LABEL,
        resource_count=3,
        timeout=1400,
    )
    assert verify_rook_ceph_crashcollector_pods_where_rook_ceph_pods_are_running()

    schedule_nodes([drain_node])

    logging.info("Wait for 3 osd pods to be in Running state")
    assert pod.wait_for_resource(
        condition="Running",
        selector=constants.OSD_APP_LABEL,
        resource_count=3,
        timeout=600,
    )

    assert verify_rook_ceph_crashcollector_pods_where_rook_ceph_pods_are_running()
def test_base_operation_node_drain(
    self,
    node_drain_teardown,
    node_restart_teardown,
    nodes,
    pgsql_factory_fixture,
    project_factory,
    multi_pvc_factory,
    mcg_obj,
    bucket_factory,
):
    """
    Test covers the following flow operations while running workloads in the background:
    1. Node drain
    2. Add capacity
    3. Node reboot
    4. Node n/w failure
    """
    logger.info("Starting IO operations in Background")
    project = project_factory()
    bg_handler = flowtest.BackgroundOps()
    executor_run_bg_ios_ops = ThreadPoolExecutor(max_workers=3)

    pgsql_workload = executor_run_bg_ios_ops.submit(
        bg_handler.handler,
        pgsql_factory_fixture,
        replicas=1,
        clients=1,
        transactions=100,
        timeout=100,
        iterations=1,
    )
    logging.info("Started pgsql workload in background")

    flow_ops = flowtest.FlowOperations()

    obc_ios = executor_run_bg_ios_ops.submit(
        bg_handler.handler,
        flow_ops.sanity_helpers.obc_put_obj_create_delete,
        mcg_obj,
        bucket_factory,
        iterations=30,
    )
    logging.info("Started object IOs in background")

    pvc_create_delete = executor_run_bg_ios_ops.submit(
        bg_handler.handler,
        flow_ops.sanity_helpers.create_pvc_delete,
        multi_pvc_factory,
        project,
        iterations=70,
    )
    logging.info("Started pvc create and delete in background")

    logger.info("Starting operation 1: Node Drain")
    node_name = flow_ops.node_operations_entry_criteria(
        node_type="worker", number_of_nodes=1, operation_name="Node Drain"
    )
    # Node maintenance - to gracefully terminate all pods on the node
    node.drain_nodes([node_name[0].name])
    # Make the node schedulable again
    node.schedule_nodes([node_name[0].name])
    logger.info("Verifying exit criteria for operation 1: Node Drain")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node Drain"
    )

    logger.info("Starting operation 2: Add Capacity")
    osd_pods_before, restart_count_before = flow_ops.add_capacity_entry_criteria()
    # Add capacity
    osd_size = storage_cluster.get_osd_size()
    result = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    if is_flexible_scaling_enabled():
        replica_count = 1
    else:
        replica_count = 3
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * replica_count,
    )
    logger.info("Verifying exit criteria for operation 2: Add Capacity")
    flow_ops.add_capacity_exit_criteria(restart_count_before, osd_pods_before)

    logger.info("Starting operation 3: Node Restart")
    node_name = flow_ops.node_operations_entry_criteria(
        node_type="worker", number_of_nodes=1, operation_name="Node Restart"
    )
    # Node failure (reboot)
    nodes.restart_nodes(nodes=node_name)
    logger.info("Verifying exit criteria for operation 3: Node Restart")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node Restart"
    )

    logger.info("Starting operation 4: Node network fail")
    node_name, nw_fail_time = flow_ops.node_operations_entry_criteria(
        node_type="worker",
        number_of_nodes=1,
        network_fail_time=300,
        operation_name="Node N/W failure",
    )
    # Node n/w interface failure
    node.node_network_failure(node_name[0].name)
    logger.info(f"Waiting for {nw_fail_time} seconds")
    sleep(nw_fail_time)
    # Reboot the unresponsive node(s)
    logger.info(f"Stop and start the unresponsive node(s): {node_name[0].name}")
    nodes.restart_nodes_by_stop_and_start(nodes=node_name)
    logger.info("Verifying exit criteria for operation 4: Node network fail")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node N/W failure"
    )

    logger.info(
        "Waiting for final iteration of background operations to be completed"
    )
    bg_ops = [pvc_create_delete, obc_ios, pgsql_workload]
    bg_handler.wait_for_bg_operations(bg_ops, timeout=600)
def test_monitoring_after_draining_node_where_prometheus_hosted(self, pods):
    """
    Test case to validate that when the node where prometheus is hosted
    is drained, the prometheus pod re-spins on a new healthy node and
    there is no data/metrics loss
    """
    # Get the prometheus pods
    pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']
    )

    for pod_obj in pod_obj_list:
        # Get the pvc which is mounted on the prometheus pod
        pod_info = pod_obj.get()
        pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim']['claimName']

        # Get the node where the prometheus pod is hosted
        prometheus_pod_obj = pod_obj.get()
        prometheus_node = prometheus_pod_obj['spec']['nodeName']

        # Drain the node where the prometheus pod is hosted
        drain_nodes([prometheus_node])

        # Validate the node is in SchedulingDisabled state
        wait_for_nodes_status(
            [prometheus_node], status=constants.NODE_READY_SCHEDULING_DISABLED
        )

        # Validate all prometheus pods are running
        POD = ocp.OCP(
            kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE
        )
        assert POD.wait_for_resource(
            condition='Running', selector='app=prometheus', timeout=180
        ), "One or more prometheus pods are not in running state"

        # Validate the prometheus pod is re-spun on a new healthy node
        pod_info = pod_obj.get()
        new_node = pod_info['spec']['nodeName']
        assert new_node not in prometheus_node, (
            'Prometheus pod not re-spun on new node'
        )
        log.info(f"Prometheus pod re-spun on new node {new_node}")

        # Validate the same pvc is mounted on the prometheus pod
        assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
            'claimName'] in pvc_name, (
            f"Old pvc not found after restarting the prometheus pod {pod_obj.name}"
        )

        # Validate the prometheus health is ok
        assert prometheus_health_check(), "Prometheus cluster health is not OK"

        # Mark the node back to schedulable
        schedule_nodes([prometheus_node])

        # Wait some time after scheduling the node back
        waiting_time = 30
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)

        # Validate the node is in Ready state
        wait_for_nodes_status([prometheus_node], status=constants.NODE_READY)

        # Validate ceph health OK
        ceph_health_check(tries=40, delay=30)

    # Check the nodes are in Ready state and the cluster health is ok
    self.sanity_helpers.health_check()

    # Check for the created pvc metrics after rebooting the master nodes
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
        )
def test_pdb_check_simultaneous_node_drains(
    self,
    pvc_factory,
    pod_factory,
    bucket_factory,
    rgw_bucket_factory,
    node_drain_teardown,
):
    """
    - Check for OSD PDBs before drain
    - Maintenance (mark as unschedulable and drain) 2 worker nodes with a delay of 30 secs
    - Drain will be completed on worker node A
    - Drain will be pending on worker node B due to blocking PDBs
    - Check the OSD PDBs
    - Mark node A as schedulable
    - Let the drain finish on node B
    - Mark node B as schedulable
    - Check cluster and Ceph health
    """
    # Validate OSD PDBs before drain operation
    assert (
        not validate_existence_of_blocking_pdb()
    ), "Blocking PDBs exist, can't perform drain"

    # Get 2 worker nodes to drain
    typed_nodes = get_nodes(num_of_nodes=2)
    assert len(typed_nodes) == 2, "Failed to find worker nodes for the test"
    node_A = typed_nodes[0].name
    node_B = typed_nodes[1].name

    # Drain Node A and validate blocking PDBs
    drain_nodes([node_A])
    assert (
        validate_existence_of_blocking_pdb()
    ), "Blocking PDBs not created post drain"

    # Inducing delay between 2 drains
    # Node-B drain expected to be pending due to blocking PDBs
    time.sleep(30)
    try:
        drain_nodes([node_B])
    except TimeoutExpired:
        # Mark node-A back to schedulable and let the drain finish on Node-B
        schedule_nodes([node_A])
        time.sleep(40)

    # Validate OSD PDBs
    assert (
        validate_existence_of_blocking_pdb()
    ), "Blocking PDBs not created post second drain"

    # Mark node-B back to schedulable and recover the cluster
    schedule_nodes([node_B])

    sample = TimeoutSampler(
        timeout=100,
        sleep=10,
        func=validate_existence_of_blocking_pdb,
    )
    if not sample.wait_for_func_status(result=False):
        log.error("Blocking PDBs still exist")

    # Wait for storage pods
    pod.wait_for_storage_pods()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check(tries=50)

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(
        pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    )
    self.sanity_helpers.delete_resources()
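# validate_existence_of_blocking_pdb, used above, presumably reports whether any
# OSD PodDisruptionBudget currently allows zero disruptions, which is what makes
# the second drain hang. The stand-alone approximation below is a sketch only;
# the namespace and the assumption that blocking PDBs show up as
# disruptionsAllowed == 0 on rook-ceph-osd* PDBs are mine, not taken from ocs-ci.
import json
import subprocess


def blocking_osd_pdb_exists(namespace="openshift-storage"):
    """Return True if any rook-ceph-osd PDB currently allows zero disruptions."""
    out = subprocess.run(
        ["oc", "get", "pdb", "-n", namespace, "-o", "json"],
        check=True,
        capture_output=True,
        text=True,
    ).stdout
    for pdb in json.loads(out)["items"]:
        name = pdb["metadata"]["name"]
        if name.startswith("rook-ceph-osd"):
            if pdb["status"].get("disruptionsAllowed", 0) == 0:
                return True
    return False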