def test_monitoring_after_respinning_ceph_pods(self, test_fixture):
    """
    Test case to validate respinning the ceph pods and their
    interaction with the prometheus pod
    """
    namespace_list, pvc_objs, pod_objs, sc = test_fixture

    # Re-spin the ceph pods (i.e. mgr, mon, osd) one by one
    resource_to_delete = ['mgr', 'mon', 'osd']
    disruption = disruption_helpers.Disruptions()
    for res_to_del in resource_to_delete:
        disruption.set_resource(resource=res_to_del)
        disruption.delete_resource()

    # Check for the created pvc metrics after respinning the ceph pods
    for pvc_obj in pvc_objs:
        assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
            f"Data for created pvc {pvc_obj.name} is not collected on the prometheus pod"
        )

    # Create projects after respinning the ceph pods
    namespaces = helpers.create_multilpe_projects(number_of_project=2)
    namespace_list.extend(namespaces)

    # Create pvcs after respinning the ceph pods
    pvcs = [
        helpers.create_pvc(sc_name=sc.name, namespace=each_namespace.namespace)
        for each_namespace in namespaces
    ]
    for pvc_obj in pvcs:
        helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND)
        pvc_obj.reload()
    pvc_objs.extend(pvcs)

    # Create app pods after respinning the ceph pods
    pods = [
        helpers.create_pod(
            interface_type=constants.CEPHBLOCKPOOL,
            pvc_name=each_pvc.name,
            namespace=each_pvc.namespace,
        )
        for each_pvc in pvcs
    ]
    for pod_obj in pods:
        helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
        pod_obj.reload()
    pod_objs.extend(pods)

    # Check for the created pvc metrics on the prometheus pod
    for pvc_obj in pvcs:
        assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
            f"Data for created pvc {pvc_obj.name} is not collected on the prometheus pod"
        )
def test_monitoring_when_osd_down(self, pods):
    """
    Test case to validate monitoring when an osd is down
    """
    # Get the osd pods
    osd_pod_list = pod.get_osd_pods()

    # Take the first osd down by scaling its deployment to zero replicas
    resource_name = osd_pod_list[0].get().get('metadata').get('name')
    assert modify_osd_replica_count(resource_name=resource_name, replica_count=0)

    # Validate the osd pod is deleted
    pod_obj = ocp.OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    assert pod_obj.wait_for_delete(resource_name=resource_name), (
        f"Resource {resource_name} is not deleted"
    )

    # Check for the created pvc metrics when the osd is down
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"
        )

    # Bring the osd that was taken down back up
    assert modify_osd_replica_count(resource_name=resource_name, replica_count=1)

    # Validate the osd is up and ceph health is OK
    self.sanity_helpers.health_check()
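# The helper modify_osd_replica_count used above is not defined in this
# section. The sketch below illustrates what such a helper could look like,
# assuming the OSD pod name encodes its owning deployment
# ('rook-ceph-osd-<id>-...') and that ocp.OCP exposes a patch() method;
# both are assumptions for illustration, not the library's actual code.
def modify_osd_replica_count_sketch(resource_name, replica_count):
    """
    Scale the rook-ceph OSD deployment owning the pod ``resource_name``
    to ``replica_count`` replicas. Returns True on success.
    """
    ocp_obj = ocp.OCP(
        kind=constants.DEPLOYMENT, namespace=defaults.ROOK_CLUSTER_NAMESPACE
    )
    # 'rook-ceph-osd-0-<hash>-<hash>' -> deployment 'rook-ceph-osd-0' (assumed naming)
    deployment_name = '-'.join(resource_name.split('-')[0:4])
    params = f'{{"spec": {{"replicas": {replica_count}}}}}'
    return ocp_obj.patch(resource_name=deployment_name, params=params)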
def pods(self, multi_pvc_factory, dc_pod_factory):
    """
    Prepare multiple dc pods for the test

    Returns:
        list: Pod instances
    """
    sc = default_storage_class(interface_type=constants.CEPHBLOCKPOOL)
    pvc_objs = multi_pvc_factory(
        interface=constants.CEPHBLOCKPOOL,
        storageclass=sc,
        size=self.pvc_size,
        num_of_pvc=self.num_of_pvcs,
    )
    pod_objs = []
    for pvc_obj in pvc_objs:
        pod_objs.append(dc_pod_factory(pvc=pvc_obj))

    # Check for the created pvc metrics on the prometheus pod
    for pod_obj in pod_objs:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"
        )

    return pod_objs
def test_monitoring_after_rebooting_node_where_mgr_is_running(self):
    """
    Test case to validate that rebooting a node where mgr is running
    does not delete the data collected on the prometheus pod
    """
    aws_obj = aws.AWS()

    # Get the mgr pod obj
    mgr_pod_obj = pod.get_mgr_pods()

    # Get the node where the mgr pod is hosted
    mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

    # Reboot the node where the mgr pod is hosted
    instances = aws.get_instances_ids_and_names([mgr_node_obj])
    aws_obj.restart_ec2_instances(instances=instances, wait=True, force=True)

    # Validate all nodes are in READY state
    wait_for_nodes_status()

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check()

    # Check that the ceph health metrics are updated with the new mgr pod
    wait_to_update_mgrpod_info_prometheus_pod()

    # Check for the created pvc metrics after rebooting the node where the mgr pod was running
    for pod_obj in self.pod_objs:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"
        )
def test_fixture(request, storageclass_factory):
    """
    Setup and teardown
    """
    def teardown():
        # Delete created app pods and pvcs
        assert pod.delete_pods(pod_objs)
        assert pvc.delete_pvcs(pvc_objs)

        # Switch to the default project
        ret = ocp.switch_to_default_rook_cluster_project()
        assert ret, 'Failed to switch to default rook cluster project'

        # Delete created projects
        for prj in namespace_list:
            prj.delete(resource_name=prj.namespace)

        # Validate all nodes are in READY state
        wait_for_nodes_status()

    request.addfinalizer(teardown)

    # Create a storage class
    sc = storageclass_factory()

    # Create projects
    namespace_list = helpers.create_multilpe_projects(number_of_project=1)

    # Create pvcs
    pvc_objs = [
        helpers.create_pvc(sc_name=sc.name, namespace=each_namespace.namespace)
        for each_namespace in namespace_list
    ]
    for pvc_obj in pvc_objs:
        helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND)
        pvc_obj.reload()

    # Create app pods
    pod_objs = [
        helpers.create_pod(
            interface_type=constants.CEPHBLOCKPOOL,
            pvc_name=each_pvc.name,
            namespace=each_pvc.namespace,
        )
        for each_pvc in pvc_objs
    ]
    for pod_obj in pod_objs:
        helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
        pod_obj.reload()

    # Check for the created pvc metrics on the prometheus pod
    for pvc_obj in pvc_objs:
        assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
            f"Data for created pvc {pvc_obj.name} is not collected on the prometheus pod"
        )

    return namespace_list, pvc_objs, pod_objs, sc
def test_monitoring_after_rebooting_node_where_mgr_is_running(self, nodes, pods):
    """
    Test case to validate that rebooting a node where mgr is running
    does not delete the data collected on the prometheus pod
    """
    # Get the mgr pod obj
    mgr_pod_obj = pod.get_mgr_pods()

    # Get the node where the mgr pod is hosted
    mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

    # Reboot the node where the mgr pod is hosted
    nodes.restart_nodes([mgr_node_obj])

    # Validate all nodes are in READY state; retry the check since the API
    # server may be briefly unreachable right after the reboot
    retry((CommandFailed, ResourceWrongStatusException), tries=20, delay=15)(
        wait_for_nodes_status
    )()

    # Check for Ceph pods
    pod_obj = ocp.OCP(
        kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
    )
    assert pod_obj.wait_for_resource(
        condition="Running", selector="app=rook-ceph-mgr", timeout=600
    )
    assert pod_obj.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-mon",
        resource_count=3,
        timeout=600,
    )
    assert pod_obj.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-osd",
        resource_count=3,
        timeout=600,
    )

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check(tries=40)

    # Check that the ceph health metrics are updated with the new mgr pod
    wait_to_update_mgrpod_info_prometheus_pod()

    # Check for the created pvc metrics after rebooting the node where the mgr pod was running
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(
            pod_obj.pvc.name
        ), f"Data for created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"
def test_monitoring_after_rebooting_master_node(self, pod_factory):
    """
    Test case to validate rebooting the master nodes and their
    interaction with the prometheus pods
    """
    aws_obj = aws.AWS()

    # Get the master node list
    master_nodes = get_typed_nodes(node_type='master')

    # Reboot the master nodes one by one
    for node in master_nodes:
        instances = aws.get_instances_ids_and_names([node])
        aws_obj.restart_ec2_instances(instances=instances, wait=True, force=True)

        # Validate all master nodes are in READY state
        wait_for_master_node_to_be_running_state()

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check()

    # Check for the created pvc metrics after rebooting the master nodes
    for pod_obj in self.pod_objs:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"
        )

    pod_obj = pod_factory(
        interface=constants.CEPHBLOCKPOOL, status=constants.STATUS_RUNNING
    )
    self.pod_objs.extend([pod_obj])

    # Check for the newly created pvc metrics on the prometheus pod
    assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
        f"Data for created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"
    )
def test_fixture(self, pod_factory, num_of_pod=2):
    """
    Create resources for the test
    """
    self.pod_objs = [
        pod_factory(
            interface=constants.CEPHBLOCKPOOL, status=constants.STATUS_RUNNING
        )
        for _ in range(num_of_pod)
    ]

    # Check for the created pvc metrics on the prometheus pod
    for pod_obj in self.pod_objs:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"
        )
def wait_for_nodes_status_and_prometheus_health_check(pods):
    """
    Waits for all the nodes to be in Ready state and checks
    prometheus health
    """
    # Validate all nodes are in READY state
    wait_for_nodes_status(timeout=900)

    # Check for the created pvc metrics after rebooting the nodes
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"
        )

    assert prometheus_health_check(), "Prometheus health is degraded"
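# check_pvcdata_collected_on_prometheus is asserted throughout this section
# but never defined here. The sketch below shows one way such a check might
# work; the PrometheusAPI interface and the metric name are assumptions for
# illustration only, not the helper's actual implementation.
def check_pvcdata_collected_on_prometheus_sketch(pvc_name):
    """
    Return True if prometheus has scraped volume metrics for ``pvc_name``.
    """
    from ocs_ci.utility.prometheus import PrometheusAPI

    prometheus = PrometheusAPI()
    # kubelet_volume_stats_* series carry a 'persistentvolumeclaim' label,
    # so an empty query result means no samples exist for this PVC yet
    result = prometheus.query(
        f'kubelet_volume_stats_capacity_bytes'
        f'{{persistentvolumeclaim="{pvc_name}"}}'
    )
    return len(result) > 0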
def test_monitoring_shutdown_and_recovery_prometheus_node(self, nodes, pods):
    """
    Test case to validate that shutting down and recovering a node
    where monitoring pods are running has no functional impact
    """
    # Get all prometheus pods
    prometheus_pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']
    )

    for prometheus_pod_obj in prometheus_pod_obj_list:
        # Get the node where the prometheus pod is hosted
        prometheus_node_obj = pod.get_pod_node(prometheus_pod_obj)

        # Shut down and recover (i.e. restart) the node where the prometheus pod is hosted
        nodes.stop_nodes([prometheus_node_obj])
        waiting_time = 20
        log.info(f"Waiting for {waiting_time} seconds")
        time.sleep(waiting_time)
        nodes.start_nodes(nodes=[prometheus_node_obj])

    # Validate all nodes are in READY state; retry the check since the API
    # server may be briefly unreachable right after the restart
    retry((CommandFailed, ResourceWrongStatusException), tries=20, delay=15)(
        wait_for_nodes_status
    )()

    # Check all the prometheus pods are up
    for pod_obj in prometheus_pod_obj_list:
        wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180
        )

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check(tries=40)

    # Check for the created pvc metrics after shutdown and recovery of the prometheus nodes
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"
        )
def test_monitoring_after_respinning_ceph_pods(self, pods):
    """
    Test case to validate respinning the ceph pods and their
    interaction with the prometheus pod
    """
    # Re-spin the ceph pods (i.e. mgr, mon, osd) one by one
    resource_to_delete = ['mgr', 'mon', 'osd']
    disruption = Disruptions()
    for res_to_del in resource_to_delete:
        disruption.set_resource(resource=res_to_del)
        disruption.delete_resource()

    # Check for the created pvc metrics on the prometheus pod
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"
        )
def test_monitoring_when_one_of_the_prometheus_node_down(self, nodes, pods):
    """
    Test case to validate monitoring when a node hosting a prometheus
    pod goes down
    """
    # Get all prometheus pods
    pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']
    )

    for pod_obj in pod_obj_list:
        # Get the node where the prometheus pod is hosted
        pod_node_obj = pod.get_pod_node(pod_obj)

        # Take down the node where the prometheus pod is hosted
        nodes.restart_nodes([pod_node_obj])

        # Validate all nodes are in READY state; retry the check since the
        # API server may be briefly unreachable right after the restart
        retry((CommandFailed, ResourceWrongStatusException), tries=20, delay=15)(
            wait_for_nodes_status
        )()

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check(tries=40)

    # Check all the prometheus pods are up
    for pod_obj in pod_obj_list:
        wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180
        )

    # Check for the created pvc metrics after restarting the node where the prometheus pod is hosted
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"
        )
        log.info(
            f"Data for created pvc {pod_obj.pvc.name} is collected on the prometheus pod"
        )
def test_monitoring_after_respinning_ceph_pods(self, pods):
    """
    Test case to validate respinning the ceph pods and their
    interaction with the prometheus pod
    """
    # Re-spin the ceph pods (i.e. mgr, mon, osd) one by one
    resource_to_delete = ["mgr", "mon", "osd"]
    disruption = Disruptions()
    for res_to_del in resource_to_delete:
        disruption.set_resource(resource=res_to_del)
        disruption.delete_resource()

    # Check for the created pvc metrics on the prometheus pod
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(
            pod_obj.pvc.name
        ), f"Data for created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"

    # Validate ceph health is OK after the re-spins
    self.sanity_helpers.health_check(tries=40)
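# Disruptions.set_resource()/delete_resource() drive the re-spins above.
# The sketch below illustrates the underlying pattern -- delete one pod of
# a ceph daemon type and wait for the operator to replace it. The label map
# mirrors the "app=rook-ceph-*" selectors used elsewhere in this section;
# treat the function as an illustrative assumption, not the helper's real code.
def respin_ceph_pod_sketch(resource):
    """
    Delete one pod of the given daemon type ('mgr'/'mon'/'osd') and wait
    until a replacement pod reaches Running state.
    """
    app_labels = {
        'mgr': 'app=rook-ceph-mgr',
        'mon': 'app=rook-ceph-mon',
        'osd': 'app=rook-ceph-osd',
    }
    pod_obj = ocp.OCP(
        kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
    )
    # Pick the first pod matching the daemon label and delete it
    victim = pod_obj.get(selector=app_labels[resource])['items'][0]
    pod_obj.delete(resource_name=victim['metadata']['name'], wait=True)
    # The rook operator should schedule a replacement; wait for it to run
    assert pod_obj.wait_for_resource(
        condition='Running', selector=app_labels[resource], timeout=300
    )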
def test_monitoring_after_restarting_prometheus_pod(self, pods):
    """
    Test case to validate that a prometheus pod restart
    has no functional impact
    """
    # Get the prometheus pods
    prometheus_pod_obj = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE, selector=["prometheus"]
    )

    for pod_object in prometheus_pod_obj:
        # Get the pvc which is mounted on the prometheus pod
        pod_info = pod_object.get()
        pvc_name = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"][
            "claimName"
        ]

        # Restart the prometheus pod
        pod_object.delete(force=True)
        pod_obj = ocp.OCP(
            kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE
        )
        assert pod_obj.wait_for_resource(
            condition="Running", selector="app=prometheus", timeout=60
        )

        # Check the same pvc is mounted on the new pod
        pod_info = pod_object.get()
        assert (
            pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
            == pvc_name
        ), f"Old pvc not found after restarting the prometheus pod {pod_object.name}"

    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(
            pod_obj.pvc.name
        ), f"Data for created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"
def test_monitoring_after_draining_node_where_prometheus_hosted(self, pods):
    """
    Test case to validate that when the node hosting prometheus is
    drained, the prometheus pod re-spins on a new healthy node
    without any data/metrics loss
    """
    # Get the prometheus pods
    pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']
    )

    for pod_obj in pod_obj_list:
        # Get the pvc which is mounted on the prometheus pod
        pod_info = pod_obj.get()
        pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
            'claimName'
        ]

        # Get the node where the prometheus pod is hosted
        prometheus_pod_obj = pod_obj.get()
        prometheus_node = prometheus_pod_obj['spec']['nodeName']

        # Drain the node where the prometheus pod is hosted
        drain_nodes([prometheus_node])

        # Validate the node is in SchedulingDisabled state
        wait_for_nodes_status(
            [prometheus_node], status=constants.NODE_READY_SCHEDULING_DISABLED
        )

        # Validate all prometheus pods are running
        POD = ocp.OCP(
            kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE
        )
        assert POD.wait_for_resource(
            condition='Running', selector='app=prometheus', timeout=180
        ), "One or more prometheus pods are not in running state"

        # Validate the prometheus pod is re-spun on a new healthy node
        pod_info = pod_obj.get()
        new_node = pod_info['spec']['nodeName']
        assert new_node != prometheus_node, (
            'Prometheus pod was not re-spun on a new node'
        )
        log.info(f"Prometheus pod re-spun on new node {new_node}")

        # Validate the same pvc is mounted on the prometheus pod
        assert pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
            'claimName'
        ] == pvc_name, (
            f"Old pvc not found after restarting the prometheus pod {pod_obj.name}"
        )

        # Validate the prometheus health is OK
        assert prometheus_health_check(), (
            "Prometheus cluster health is not OK"
        )

        # Mark the node schedulable again
        schedule_nodes([prometheus_node])

        # Wait some time after scheduling the node back
        waiting_time = 30
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)

        # Validate the node is in Ready state
        wait_for_nodes_status([prometheus_node], status=constants.NODE_READY)

        # Validate ceph health is OK
        ceph_health_check(tries=40, delay=30)

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check()

    # Check for the created pvc metrics after draining the node
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"
        )
def test_monitoring_when_one_of_the_prometheus_node_down(self, test_fixture):
    """
    Test case to validate monitoring when a node hosting a prometheus
    pod goes down
    """
    namespace_list, pvc_objs, pod_objs, sc = test_fixture
    aws_obj = aws.AWS()

    # Get all the openshift-monitoring pods
    monitoring_pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE
    )

    # Get the worker node list
    workers = get_typed_nodes(node_type='worker')

    # Get all prometheus pods
    pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']
    )

    for pod_obj in pod_obj_list:
        # Get the node where the prometheus pod is hosted
        prometheus_pod_obj = pod_obj.get()
        prometheus_node = prometheus_pod_obj['spec']['nodeName']
        prometheus_node = [
            node for node in workers
            if node.get().get('metadata').get('name') == prometheus_node
        ]

        # Take down the node where the prometheus pod is hosted
        instances = aws.get_instances_ids_and_names(prometheus_node)
        aws_obj.restart_ec2_instances(instances=instances, wait=True, force=True)

        # Validate all nodes are in READY state
        wait_for_nodes_status()

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check()

    # Check all the monitoring pods are up
    for pod_obj in monitoring_pod_obj_list:
        wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING)

    # Check for the created pvc metrics after restarting the nodes
    for pvc_obj in pvc_objs:
        assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
            f"Data for created pvc {pvc_obj.name} is not collected on the prometheus pod"
        )

    # Create projects after restarting the nodes
    namespaces = helpers.create_multilpe_projects(number_of_project=1)
    namespace_list.extend(namespaces)

    # Create pvcs after restarting the nodes
    pvcs = [
        helpers.create_pvc(sc_name=sc.name, namespace=each_namespace.namespace)
        for each_namespace in namespaces
    ]
    for pvc_obj in pvcs:
        helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND)
        pvc_obj.reload()
    pvc_objs.extend(pvcs)

    # Create app pods after restarting the nodes
    pods = [
        helpers.create_pod(
            interface_type=constants.CEPHBLOCKPOOL,
            pvc_name=each_pvc.name,
            namespace=each_pvc.namespace,
        )
        for each_pvc in pvcs
    ]
    for pod_obj in pods:
        helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
        pod_obj.reload()
    pod_objs.extend(pods)

    # Check for the created pvc metrics on the prometheus pod after restarting the nodes
    for pvc_obj in pvcs:
        assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
            f"Data for created pvc {pvc_obj.name} is not collected on the prometheus pod"
        )