def noobaa_running_node_restart(pod_name):
    """
    Function to restart the node on which the given noobaa pod is running

    Args:
        pod_name (str): Name of noobaa pod

    """
    nb_pod_obj = pod.get_pod_obj(
        (get_pod_name_by_pattern(
            pattern=pod_name, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        ))[0],
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    nb_node_name = pod.get_pod_node(nb_pod_obj).name
    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()
    nb_nodes = get_node_objs(node_names=nb_node_name)
    log.info(f"{pod_name} is running on {nb_node_name}")
    log.info(f"Restarting node: {nb_node_name}....")
    nodes.restart_nodes_by_stop_and_start(nodes=nb_nodes, force=True)

    # Validate nodes are up and running
    wait_for_nodes_status()
    ceph_health_check(tries=30, delay=60)
    helpers.wait_for_resource_state(
        nb_pod_obj, constants.STATUS_RUNNING, timeout=180
    )
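# --- Hedged usage sketch (illustrative, not part of the original module) ---
# One way the helper above might be invoked from a test. The "noobaa-core"
# pod-name pattern is an assumption for illustration only; any pattern that
# matches a running noobaa pod in the storage namespace should behave the same.
def example_restart_noobaa_core_node():
    # Restart the node hosting the matched noobaa pod; the helper itself waits
    # for the node to become Ready, Ceph health to recover, and the pod to
    # return to the Running state.
    noobaa_running_node_restart(pod_name="noobaa-core")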
def test_scale_osds_reboot_nodes(
    self, interface, project_factory, multi_pvc_factory, dc_pod_factory
):
    """
    Check storage utilization; if it is low, run IO.
    Scale OSDs from 3 to 6, check for rebalance and reboot the workers
    """
    current_osd_count = count_cluster_osd()
    proj_obj = project_factory()
    if current_osd_count == 3:
        while not validate_osd_utilization(osd_used=50):
            # Create PVCs
            pvc_objs = multi_pvc_factory(
                project=proj_obj,
                interface=interface,
                size=self.pvc_size,
                num_of_pvc=self.num_of_pvcs,
            )
            dc_pod_objs = list()
            for pvc_obj in pvc_objs:
                dc_pod_objs.append(dc_pod_factory(pvc=pvc_obj))
            wait_for_dc_app_pods_to_reach_running_state(dc_pod_objs)
            for pod_obj in dc_pod_objs:
                pod_obj.run_io(
                    storage_type='fs', size='3G', runtime='60',
                    fio_filename=f'{pod_obj.name}_io'
                )

    # Add capacity
    osd_size = storage_cluster.get_osd_size()
    count = storage_cluster.add_capacity(osd_size)
    pod = OCP(
        kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace']
    )
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=count * 3,
    )
    assert ceph_health_check(), "New OSDs failed to reach running state"

    cluster = CephCluster()
    # Get rebalance status
    rebalance_status = cluster.get_rebalance_status()
    logger.info(rebalance_status)
    if rebalance_status:
        time_taken = cluster.time_taken_to_complete_rebalance()
        logger.info(f"The time taken to complete rebalance {time_taken}")

    # Rolling reboot on worker nodes
    worker_nodes = get_typed_nodes(node_type='worker')
    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()
    for node in worker_nodes:
        nodes.restart_nodes(nodes=[node])
        wait_for_nodes_status()
    assert ceph_health_check(
        delay=180
    ), "Failed, Ceph health bad after nodes reboot"
def nodes():
    """
    Return an instance of the relevant platform nodes class
    (e.g. AWSNodes, VMWareNodes) to be later used in the test
    for nodes related operations, like nodes restart,
    detach/attach volume, etc.

    """
    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()
    return nodes
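# --- Hedged usage sketch (illustrative, not part of the original module) ---
# How a test could consume the `nodes` fixture above: pytest injects the
# platform-specific nodes object (AWSNodes, VMWareNodes, ...) and the same
# restart/wait helpers used elsewhere in this section drive the reboot.
# The import paths below are assumptions for illustration.
from ocs_ci.ocs.node import get_node_objs, wait_for_nodes_status


def test_restart_one_node_example(nodes):
    """Illustrative test: restart a single node through the platform abstraction."""
    node_objs = get_node_objs()
    # Restart the first node via the platform-specific implementation
    nodes.restart_nodes(nodes=[node_objs[0]])
    # Block until all nodes report Ready again
    wait_for_nodes_status()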
def test_rolling_reboot_node(self, node_type):
    """
    Test rolling reboot of nodes
    """
    # Get info from SCALE_DATA_FILE for validation
    if os.path.exists(SCALE_DATA_FILE):
        file_data = templating.load_yaml(SCALE_DATA_FILE)
        namespace = file_data.get("NAMESPACE")
        pod_scale_list = file_data.get("POD_SCALE_LIST")
        pvc_scale_list = file_data.get("PVC_SCALE_LIST")
    else:
        raise FileNotFoundError

    node_list = list()

    # Rolling reboot nodes
    if node_type == constants.WORKER_MACHINE:
        tmp_list = get_nodes(node_type=node_type)
        ocs_node_list = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
        for tmp in tmp_list:
            if tmp.name in ocs_node_list:
                node_list.append(tmp)
    else:
        node_list = get_nodes(node_type=node_type)

    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()

    for node in node_list:
        nodes.restart_nodes(nodes=[node])
        scale_lib.validate_node_and_oc_services_are_up_after_reboot()

    # Validate storage pods are running
    wait_for_storage_pods()

    # Validate cluster health ok and all pods are running
    assert utils.ceph_health_check(
        delay=180
    ), "Ceph health in bad state after node reboots"

    # Validate all PVCs from namespace are in Bound state
    assert scale_lib.validate_all_pvcs_and_check_state(
        namespace=namespace, pvc_scale_list=pvc_scale_list
    )

    # Validate all PODs from namespace are up and running
    assert scale_lib.validate_all_pods_and_check_state(
        namespace=namespace, pod_scale_list=pod_scale_list
    )
def cycle_nodes(cluster_path, action):
    """
    Start/Stop AWS nodes to save costs when not in use.

    Args:
        cluster_path (str): location of cluster path that has auth files
        action (str): action to perform, either start or stop

    """
    node_obj_file = os.path.join(cluster_path, NODE_OBJ_FILE)
    nodes_file = os.path.join(cluster_path, NODE_FILE)
    instance_file = os.path.join(cluster_path, INSTANCE_FILE)
    if action == 'stop':
        ceph = CephCluster()
        ceph.set_noout()
        node_objs = get_node_objs()
        kls = platform_nodes.PlatformNodesFactory()
        nodes = kls.get_nodes_platform()
        with open(instance_file, "wb") as instance_file:
            log.info("Storing ocs instances objects")
            pickle.dump(nodes.get_ec2_instances(nodes=node_objs), instance_file)
        with open(nodes_file, "wb") as node_file:
            log.info("Storing ocp nodes objects")
            pickle.dump(nodes, node_file)
        with open(node_obj_file, "wb") as node_obj_file:
            log.info("Stopping all nodes")
            pickle.dump(node_objs, node_obj_file)
        nodes.stop_nodes(nodes=node_objs)
    elif action == 'start':
        with open(instance_file, "rb") as instance_file:
            log.info("Reading instance objects")
            instances = pickle.load(instance_file)
        with open(nodes_file, "rb") as node_file:
            log.info("Reading ocp nodes object")
            nodes = pickle.load(node_file)
        with open(node_obj_file, "rb") as node_obj_file:
            log.info("Starting ocs nodes")
            node_objs = pickle.load(node_obj_file)
        nodes.start_nodes(instances=instances, nodes=node_objs)
        unset_noout()
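# --- Hedged usage sketch (illustrative, not part of the original module) ---
# Stopping a cluster when it is idle and starting it again later, reusing the
# pickled node/instance objects written by cycle_nodes(). The cluster path
# below is an assumed example location of the directory holding the auth files.
def example_cycle_cluster():
    cluster_path = os.path.expanduser("~/clusters/my-ocs-cluster")
    # Set the noout flag, persist node/instance objects, and stop all nodes
    cycle_nodes(cluster_path, action="stop")
    # ... later: reload the pickled objects, start the nodes and unset noout
    cycle_nodes(cluster_path, action="start")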