import logging
import random

import pytest

# NOTE: The helper imports below are assumed from the ocs-ci project layout
# (ocs_ci.ocs.node and ocs_ci.ocs.resources.pod); exact paths may differ in
# your checkout. `wait_for_change_in_rook_ceph_pods` and
# `is_managed_service_cluster` are also used below; their module locations
# are not shown in this snippet.
from ocs_ci.ocs import constants
from ocs_ci.ocs.node import (
    get_node_mon_ids,
    get_node_osd_ids,
    get_node_pods,
    get_ocs_nodes,
    get_worker_nodes,
    wait_for_node_count_to_reach_status,
    wait_for_nodes_status,
)
from ocs_ci.ocs.resources.pod import (
    check_pods_after_node_replacement,
    get_mon_pod_id,
    get_mon_pods,
    get_osd_pod_id,
    get_osd_pods,
    get_rook_ceph_pod_names,
    wait_for_pods_to_be_running,
)

log = logging.getLogger(__name__)


def get_rook_ceph_pod_names_not_in_node(node_name):
    """
    Get all the rook ceph pod names that are not running on the node

    Args:
        node_name (str): The node name

    Returns:
        list: List of the rook ceph pod names that are not running on the
            node 'node_name'

    """
    rook_ceph_pod_names_set = set(get_rook_ceph_pod_names())
    node_pods = get_node_pods(node_name)
    node_pod_names_set = {p.name for p in node_pods}
    return list(rook_ceph_pod_names_set - node_pod_names_set)
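# A minimal, illustrative sketch of the set-difference logic above, using
# hypothetical in-memory data in place of the live-cluster queries
# (`get_rook_ceph_pod_names` and `get_node_pods`). It is not part of the test
# flow, and the leading underscore keeps pytest from collecting it.
def _demo_get_pod_names_not_in_node():
    all_rook_ceph_pods = {
        "rook-ceph-osd-0-abc",
        "rook-ceph-osd-1-def",
        "rook-ceph-mon-a-ghi",
    }
    pods_on_failed_node = {"rook-ceph-osd-1-def"}
    # Same set difference the helper performs: keep only the pods that are
    # NOT scheduled on the node about to be shut down.
    survivors = sorted(all_rook_ceph_pods - pods_on_failed_node)
    assert survivors == ["rook-ceph-mon-a-ghi", "rook-ceph-osd-0-abc"]
    return survivors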
def test_check_pods_status_after_node_failure(self, nodes, node_restart_teardown):
    """
    Test the pod statuses after a node failure event.
    All the rook ceph pods should be in 'Running' or 'Completed'
    state after a node failure event.

    """
    ocs_nodes = get_ocs_nodes()
    if not ocs_nodes:
        pytest.skip("No OCS nodes found in the cluster")

    wnodes = get_worker_nodes()

    ocs_node = random.choice(ocs_nodes)
    node_name = ocs_node.name
    log.info(f"Selected node is '{node_name}'")
    # Save the rook ceph pods, the osd ids, and the mon ids before
    # shutting down the node
    rook_ceph_pod_names_not_in_node = get_rook_ceph_pod_names_not_in_node(node_name)
    node_osd_ids = get_node_osd_ids(node_name)
    node_mon_ids = get_node_mon_ids(node_name)

    log.info(f"Shutting down node '{node_name}'")
    nodes.stop_nodes([ocs_node])
    wait_for_nodes_status(node_names=[node_name], status=constants.NODE_NOT_READY)
    log.info(f"The node '{node_name}' reached '{constants.NODE_NOT_READY}' status")

    log.info("Wait for a change in the rook ceph pod statuses...")
    timeout = 480
    is_rook_ceph_pods_status_changed = wait_for_change_in_rook_ceph_pods(
        node_name, timeout=timeout
    )
    assert (
        is_rook_ceph_pods_status_changed
    ), f"Rook Ceph pods status didn't change after {timeout} seconds"

    log.info("Checking that the rook ceph pods are in 'Running' or 'Completed' state")
    previous_timeout = 480
    timeout = 600
    are_pods_running = wait_for_pods_to_be_running(
        pod_names=rook_ceph_pod_names_not_in_node, timeout=timeout, sleep=30
    )
    assert are_pods_running, (
        f"The pods are not 'Running' even after the timeout was increased "
        f"from {previous_timeout} to {timeout} seconds"
    )

    # Get the rook ceph pod names, excluding the osd and mon pods that
    # still carry the failed node's ids
    osd_pods = get_osd_pods()
    new_node_osd_id_names_set = {
        p.name for p in osd_pods if get_osd_pod_id(p) in node_osd_ids
    }
    mon_pods = get_mon_pods()
    new_node_mon_id_names_set = {
        p.name for p in mon_pods if get_mon_pod_id(p) in node_mon_ids
    }
    new_node_osd_mon_id_names_set = new_node_osd_id_names_set.union(
        new_node_mon_id_names_set
    )
    rook_ceph_pod_names_set = set(get_rook_ceph_pod_names())
    new_rook_ceph_pod_names = list(
        rook_ceph_pod_names_set - new_node_osd_mon_id_names_set
    )

    log.info(
        "Verify that the new rook ceph pods are in 'Running' or 'Completed' state"
    )
    timeout = 300
    are_new_pods_running = wait_for_pods_to_be_running(
        pod_names=new_rook_ceph_pod_names, timeout=timeout, sleep=20
    )
    assert (
        are_new_pods_running
    ), f"The new pods are not 'Running' after {timeout} seconds"

    log.info("All the pods are in 'Running' or 'Completed' state")

    if is_managed_service_cluster():
        log.info(
            "When using the managed service, the worker node should recover "
            "automatically, either by restarting the node or by removing it "
            "and creating a new one. "
            "Waiting for all the worker nodes to be ready..."
        )
        wait_for_node_count_to_reach_status(node_count=len(wnodes), timeout=900)
        log.info("Waiting for all the pods to be running")
        assert check_pods_after_node_replacement(), "Not all the pods are running"
    else:
        log.info(f"Starting the node '{node_name}' again...")
        nodes.start_nodes(nodes=[ocs_node])
        wait_for_nodes_status(node_names=[node_name])
        log.info("Waiting for all the pods to be running")
        wait_for_pods_to_be_running(timeout=600)

    log.info("Checking that the cluster health is OK...")
    self.sanity_helpers.health_check(tries=40)
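# The test above leans on polling helpers that take `timeout`/`sleep`
# parameters (e.g. `wait_for_pods_to_be_running`, `wait_for_nodes_status`).
# The sketch below is NOT ocs-ci's implementation; it is a minimal
# poll-until-true loop of the same shape, with `check_fn` standing in for
# any of the status checks (an assumed, illustrative signature).
import time


def _wait_until(check_fn, timeout=480, sleep=30):
    """Poll check_fn() until it returns True or `timeout` seconds expire.

    Returns True on success, False if the deadline passes without a
    successful check.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        if check_fn():
            return True
        time.sleep(sleep)
    return False


# Example usage: wait up to 60 seconds, checking every 5 seconds.
# _wait_until(lambda: cluster_is_healthy(), timeout=60, sleep=5)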