Example #1
0
    def test_scale_osds_reboot_nodes(self, interface, project_factory,
                                     multi_pvc_factory, dc_pod_factory):
        """
        Fill the cluster with IO if needed, add OSD capacity, reboot workers.

        Flow:
            1. If the cluster currently reports exactly 3 OSDs, keep creating
               PVC-backed DC pods and running fio on them until
               validate_osd_utilization(osd_used=50) passes.
            2. Add capacity, wait for the OSD pods to be Running and check
               Ceph health.
            3. Log the rebalance status, and the time taken to complete the
               rebalance when a status is reported.
            4. Restart the worker nodes one by one, then check Ceph health
               again.
        """
        initial_osd_count = count_cluster_osd()
        project = project_factory()

        # The fill loop is only entered when the cluster started with 3 OSDs;
        # the utilization check is re-evaluated before every iteration.
        fill_required = initial_osd_count == 3
        while fill_required and not validate_osd_utilization(osd_used=50):
            new_pvcs = multi_pvc_factory(
                project=project,
                interface=interface,
                size=self.pvc_size,
                num_of_pvc=self.num_of_pvcs,
            )
            app_pods = [dc_pod_factory(pvc=new_pvc) for new_pvc in new_pvcs]
            wait_for_dc_app_pods_to_reach_running_state(app_pods)

            # Start fio on every freshly created pod
            for app_pod in app_pods:
                app_pod.run_io(
                    storage_type='fs',
                    size='3G',
                    runtime='60',
                    fio_filename=f'{app_pod.name}_io',
                )

        # Add capacity and wait until the expected number of OSD pods run
        new_osd_size = storage_cluster.get_osd_size()
        new_count = storage_cluster.add_capacity(new_osd_size)
        osd_pod_ocp = OCP(
            kind=constants.POD,
            namespace=config.ENV_DATA['cluster_namespace'],
        )
        osd_pod_ocp.wait_for_resource(
            timeout=300,
            condition=constants.STATUS_RUNNING,
            selector='app=rook-ceph-osd',
            resource_count=new_count * 3,
        )
        assert ceph_health_check(), "New OSDs failed to reach running state"

        # Report rebalance status and, when reported, how long it took
        ceph_cluster = CephCluster()
        rebalancing = ceph_cluster.get_rebalance_status()
        logger.info(rebalancing)
        if rebalancing:
            elapsed = ceph_cluster.time_taken_to_complete_rebalance()
            logger.info(f"The time taken to complete rebalance {elapsed}")

        # Rolling reboot: restart one worker at a time and call
        # wait_for_nodes_status() before moving on to the next one
        workers = get_typed_nodes(node_type='worker')
        platform = platform_nodes.PlatformNodesFactory().get_nodes_platform()
        for worker in workers:
            platform.restart_nodes(nodes=[worker])
            wait_for_nodes_status()

        health_ok = ceph_health_check(delay=180)
        assert health_ok, "Failed, Ceph health bad after nodes reboot"
Example #2
0
    def create_scale_pods(
        self,
        scale_count=1500,
        pods_per_iter=5,
        io_runtime=None,
        pvc_size=None,
        start_io=None,
    ):
        """
        Create PVC+POD pairs in batches until ``scale_count`` pods exist.

        Before scaling, the expected worker count is computed and workers
        are added if needed; on AWS IPI deployments the names of machinesets
        containing "app" are recorded in ``self.ms_name``. After every batch
        the worker resources, the OSD utilization and the pod count of the
        latest namespace are checked. Once the target is reached, the PG
        balancer is validated.

        Args:
            scale_count (int): Scale pod+pvc count
            pods_per_iter (int): Number of PVC-POD to be created per PVC type
                in each iteration. Must be between 5 and 10 (inclusive).
                Original note: if 5, then 20 PVC+POD will be created with 5
                each of 4 PVC types (depends on create_multi_pvc_pod).
            io_runtime (sec): Fio run time in seconds
            pvc_size (Gi): size of PVC
            start_io (bool): If True start IO else don't

        Raises:
            UnexpectedBehaviour: If ``pods_per_iter`` is outside 5-10, if an
                UnexpectedBehaviour is raised by the per-iteration checks,
                or if the PG balancer validation fails.
            CephHealthException: If validate_osd_utilization(osd_used=83)
                passes, i.e. the cluster OSDs are near full.
                NOTE(review): if CephHealthException subclasses
                UnexpectedBehaviour it is re-raised as UnexpectedBehaviour
                by the handler below - confirm.

        """
        self.ms_name = []
        all_pod_obj = []
        if not 5 <= pods_per_iter <= 10:
            # Message now matches the range that is actually enforced
            raise UnexpectedBehaviour(
                "Pods_per_iter value should be in-between 5-10")

        # Check for expected worker count
        expected_worker_count = get_expected_worker_count(scale_count)
        if check_and_add_enough_worker(expected_worker_count):
            is_aws_ipi = (
                config.ENV_DATA["deployment_type"] == "ipi"
                and config.ENV_DATA["platform"].lower() == "aws"
            )
            # On any other platform self.ms_name simply stays empty
            if is_aws_ipi:
                self.ms_name.extend(
                    obj.name
                    for obj in machine.get_machineset_objs()
                    if "app" in obj.name
                )

        # Create namespace
        self.create_and_set_namespace()

        # Continue to iterate till the scale pvc limit is reached
        while len(all_pod_obj) < scale_count:
            logger.info(f"Scaled PVC and POD count {len(all_pod_obj)}")
            self.pod_obj, self.pvc_obj = self.create_multi_pvc_pod(
                pods_per_iter, io_runtime, start_io, pvc_size)
            all_pod_obj.extend(self.pod_obj)
            try:
                # Check enough resources available in the dedicated app workers
                check_enough_resource_available_in_workers(
                    self.ms_name, self.pod_dict_path)

                # Check for ceph cluster OSD utilization
                if not cluster.validate_osd_utilization(osd_used=75):
                    logger.info("Cluster OSD utilization is below 75%")
                elif not cluster.validate_osd_utilization(osd_used=83):
                    logger.warning("Cluster OSD utilization is above 75%")
                else:
                    raise CephHealthException("Cluster OSDs are near full")

                # Check for 500 pods per namespace
                pod_objs = pod.get_all_pods(
                    namespace=self.namespace_list[-1].namespace)
                if len(pod_objs) >= 500:
                    self.create_and_set_namespace()

            except UnexpectedBehaviour as err:
                logger.error(
                    f"Scaling of cluster failed after {len(all_pod_obj)} pod creation"
                )
                # Chain explicitly so the original failure stays visible
                raise UnexpectedBehaviour(
                    "Scaling PVC+POD failed analyze setup and log for more details"
                ) from err

        logger.info(f"Scaled {scale_count} pvc and pods")

        # Final sanity check once the requested scale has been reached
        if not cluster.validate_pg_balancer():
            raise UnexpectedBehaviour(
                "Unequal PG distribution to OSDs")
        logger.info(
            "OSD consumption and PG distribution is good to continue"
        )