def test_scale_osds_reboot_nodes(
    self, interface, project_factory, multi_pvc_factory, dc_pod_factory
):
    """
    Check storage utilization; if it is below 50%, run IO until it is not,
    scale OSDs from 3 to 6, check for rebalance and reboot the worker nodes.
    """
    current_osd_count = count_cluster_osd()
    proj_obj = project_factory()
    if current_osd_count == 3:
        while not validate_osd_utilization(osd_used=50):
            # Create PVCs and attach a DC pod to each of them
            pvc_objs = multi_pvc_factory(
                project=proj_obj,
                interface=interface,
                size=self.pvc_size,
                num_of_pvc=self.num_of_pvcs,
            )
            dc_pod_objs = list()
            for pvc_obj in pvc_objs:
                dc_pod_objs.append(dc_pod_factory(pvc=pvc_obj))
            wait_for_dc_app_pods_to_reach_running_state(dc_pod_objs)

            # Run fio on every pod to drive utilization up
            for pod_obj in dc_pod_objs:
                pod_obj.run_io(
                    storage_type='fs',
                    size='3G',
                    runtime='60',
                    fio_filename=f'{pod_obj.name}_io',
                )

    # Add capacity: add_capacity() returns the new storage device set
    # count, and each device set maps to 3 OSD pods (replica of 3)
    osd_size = storage_cluster.get_osd_size()
    count = storage_cluster.add_capacity(osd_size)
    pod = OCP(
        kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace']
    )
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=count * 3,
    )
    assert ceph_health_check(), "New OSDs failed to reach running state"

    # Get rebalance status and, if rebalancing, measure completion time
    cluster = CephCluster()
    rebalance_status = cluster.get_rebalance_status()
    logger.info(rebalance_status)
    if rebalance_status:
        time_taken = cluster.time_taken_to_complete_rebalance()
        logger.info(f"The time taken to complete rebalance: {time_taken}")

    # Rolling reboot on worker nodes: restart one node at a time and
    # wait for nodes to come back to Ready before the next restart
    worker_nodes = get_typed_nodes(node_type='worker')
    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()
    for node in worker_nodes:
        nodes.restart_nodes(nodes=[node])
        wait_for_nodes_status()

    assert ceph_health_check(delay=180), (
        "Failed, Ceph health bad after nodes reboot"
    )
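# The wait above expects `count * 3` OSD pods because add_capacity()
# returns the new storage-device-set count and, with the default replica
# of 3, each device set maps to three OSD pods. A minimal sketch of that
# arithmetic; the names and the replica value here are assumptions for
# illustration, not taken from ocs-ci:
OSD_REPLICAS = 3  # assumption: default StorageCluster replica count


def expected_osd_pods(device_set_count):
    """Total rook-ceph-osd pods expected after scaling capacity."""
    return device_set_count * OSD_REPLICAS


assert expected_osd_pods(1) == 3  # initial cluster: 3 OSDs
assert expected_osd_pods(2) == 6  # after add_capacity: scaled 3 -> 6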
def create_scale_pods(
    self,
    scale_count=1500,
    pods_per_iter=5,
    io_runtime=None,
    pvc_size=None,
    start_io=None,
):
    """
    Main function with the scale pod creation flow and checks to add nodes.
    For other platforms the instance_type param will not be considered.

    Args:
        scale_count (int): Scale pod+PVC count
        pods_per_iter (int): Number of PVC-POD pairs to be created per PVC
            type. For example, if 5, then 20 PVC+POD will be created, 5 of
            each of the 4 PVC types. Test with values between 5 and 10.
        io_runtime (sec): Fio run time in seconds
        pvc_size (Gi): Size of PVC
        start_io (bool): If True start IO, else don't
    """
    self.ms_name, all_pod_obj = [], []
    if not 5 <= pods_per_iter <= 10:
        raise UnexpectedBehaviour(
            "Pods_per_iter value should be in-between 5-10"
        )

    # Check for expected worker count and add workers if needed
    expected_worker_count = get_expected_worker_count(scale_count)
    if check_and_add_enough_worker(expected_worker_count):
        if (
            config.ENV_DATA["deployment_type"] == "ipi"
            and config.ENV_DATA["platform"].lower() == "aws"
        ):
            for obj in machine.get_machineset_objs():
                if "app" in obj.name:
                    self.ms_name.append(obj.name)
        else:
            self.ms_name = []

    # Create namespace
    self.create_and_set_namespace()

    # Continue to iterate till the scale PVC limit is reached
    while True:
        if scale_count <= len(all_pod_obj):
            logger.info(f"Scaled {scale_count} PVCs and pods")
            if cluster.validate_pg_balancer():
                logger.info(
                    "OSD consumption and PG distribution is good to continue"
                )
            else:
                raise UnexpectedBehaviour("Unequal PG distribution to OSDs")
            break
        else:
            logger.info(f"Scaled PVC and POD count {len(all_pod_obj)}")
            self.pod_obj, self.pvc_obj = self.create_multi_pvc_pod(
                pods_per_iter, io_runtime, start_io, pvc_size
            )
            all_pod_obj.extend(self.pod_obj)
            try:
                # Check enough resources are available in the dedicated
                # app workers
                check_enough_resource_available_in_workers(
                    self.ms_name, self.pod_dict_path
                )

                # Check ceph cluster OSD utilization
                if not cluster.validate_osd_utilization(osd_used=75):
                    logger.info("Cluster OSD utilization is below 75%")
                elif not cluster.validate_osd_utilization(osd_used=83):
                    logger.warning("Cluster OSD utilization is above 75%")
                else:
                    raise CephHealthException("Cluster OSDs are near full")

                # Start a new namespace once 500 pods exist in the
                # current one
                pod_objs = pod.get_all_pods(
                    namespace=self.namespace_list[-1].namespace
                )
                if len(pod_objs) >= 500:
                    self.create_and_set_namespace()
            except UnexpectedBehaviour:
                logger.error(
                    f"Scaling of cluster failed after {len(all_pod_obj)} "
                    f"pod creation"
                )
                raise UnexpectedBehaviour(
                    "Scaling PVC+POD failed; analyze setup and logs for "
                    "more details"
                )
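# A hedged usage sketch: in ocs-ci this method lives on the FioPodScale
# helper (ocs_ci.ocs.scale_lib). The constructor arguments shown below
# are illustrative assumptions and may differ between ocs-ci versions.
from ocs_ci.ocs import constants
from ocs_ci.ocs.scale_lib import FioPodScale

fioscale = FioPodScale(
    kind=constants.DEPLOYMENTCONFIG,         # DC-based app pods
    pod_dict_path=constants.FEDORA_DC_YAML,  # pod template to scale
)
# Scale to 1500 PVC+pod pairs, 5 per PVC type per iteration, with IO on.
fioscale.create_scale_pods(
    scale_count=1500,
    pods_per_iter=5,
    io_runtime=3600,
    pvc_size=20,
    start_io=True,
)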