Example #1
    def test_add_capacity_with_resource_delete(
        self,
        add_capacity_setup,
        workload_storageutilization_rbd,
        resource_name,
        resource_id,
        is_kill_resource_repeatedly,
    ):
        """
        The function gets the resource name and id, adds capacity to the cluster,
        and then deletes the resource while the storage capacity is being increased.

        Args:
            resource_name (str): the name of the resource to delete
            resource_id (int): the id of the resource to delete
            is_kill_resource_repeatedly (bool): if True, kill the resource repeatedly;
                if False, delete the resource only once

        """
        used_percentage = get_percent_used_capacity()
        logging.info(
            f"storageutilization is completed. used capacity = {used_percentage}"
        )

        osd_pods_before = pod_helpers.get_osd_pods()
        number_of_osd_pods_before = len(osd_pods_before)

        d = Disruptions()
        d.set_resource(resource_name)

        self.new_pods_in_status_running = False

        osd_size = storage_cluster.get_osd_size()
        logging.info(f"Adding one new set of OSDs. osd size = {osd_size}")
        storagedeviceset_count = storage_cluster.add_capacity(osd_size)
        logging.info("Adding one new set of OSDs was issued without problems")

        # Wait for the new OSDs to come up. After the first new OSD reaches the Init
        # status, delete the resource. After deleting the resource we expect all the
        # new OSDs, as well as the deleted resource's new pod, to be in status Running.
        pod_helpers.wait_for_new_osd_pods_to_come_up(number_of_osd_pods_before)
        logging.info(
            f"Delete a {resource_name} pod while storage capacity is getting increased"
        )
        if is_kill_resource_repeatedly:
            with ThreadPoolExecutor() as executor:
                executor.submit(self.kill_resource_repeatedly, resource_name,
                                resource_id)
                self.wait_for_osd_pods_to_be_running(storagedeviceset_count)
        else:
            d.delete_resource(resource_id)
            self.wait_for_osd_pods_to_be_running(storagedeviceset_count)

        self.new_pods_in_status_running = True
        logging.info(
            "Finished verifying add capacity when one of the pods gets deleted"
        )
        logging.info("Waiting for ceph health check to finished...")
        check_ceph_health_after_add_capacity()
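Every example in this collection polls get_percent_used_capacity(). For orientation, here is a minimal sketch of how such a helper can be derived from `ceph df` JSON output. This is not the ocs-ci implementation: the field names and the direct local `ceph` invocation are assumptions (ocs-ci runs Ceph commands through the rook-ceph toolbox pod).

import json
import subprocess


def percent_used_capacity_sketch():
    """Sketch: cluster used-capacity percentage from `ceph df` JSON output.

    Assumes the top-level `stats` section exposes total_used_bytes and total_bytes.
    """
    out = subprocess.check_output(["ceph", "df", "--format", "json"])
    stats = json.loads(out)["stats"]
    return 100 * stats["total_used_bytes"] / stats["total_bytes"]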
Example #2
    def test_new_sc_new_rbd_pool(
        self,
        replica,
        compression,
        volume_binding_mode,
        pvc_status,
        storageclass_factory,
        pvc_factory,
        pod_factory,
    ):
        """
        This test function does the following:
        * Creates a Storage Class backed by a new rbd pool
        * Creates a PVC using the new Storage Class
        * Mounts the PVC to an app pod
        * Runs IO on the app pod
        """
        interface_type = constants.CEPHBLOCKPOOL
        sc_obj = storageclass_factory(
            interface=interface_type,
            new_rbd_pool=True,
            replica=replica,
            compression=compression,
            volume_binding_mode=volume_binding_mode,
        )

        log.info(f"Creating a PVC using {sc_obj.name}")
        pvc_obj = pvc_factory(interface=interface_type,
                              storageclass=sc_obj,
                              size=10,
                              status=pvc_status)
        log.info(f"PVC: {pvc_obj.name} created successfully using "
                 f"{sc_obj.name}")

        # Create an app pod and mount the PVC
        log.info(f"Creating an app pod and mounting {pvc_obj.name}")
        pod_obj = pod_factory(interface=interface_type, pvc=pvc_obj)
        log.info(
            f"{pod_obj.name} created successfully and mounted {pvc_obj.name}")

        # Run IO on the app pod for some time
        log.info(f"Running FIO on {pod_obj.name}")
        pod_obj.run_io(
            "fs",
            size="1G",
            rate="1500m",
            runtime=60,
            buffer_compress_percentage=60,
            buffer_pattern="0xdeadface",
            bs="8K",
            jobs=5,
            readwrite="readwrite",
        )
        cluster_used_space = get_percent_used_capacity()
        log.info(f"Cluster used space with replica size {replica}, "
                 f"compression mode {compression}={cluster_used_space}")
        cbp_name = sc_obj.get().get("parameters").get("pool")
        if compression != "none":
            validate_compression(cbp_name)
        validate_replica_data(cbp_name, replica)
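Where compression is enabled on the new pool, the savings can also be inspected directly with `ceph df detail`. The sketch below is a standalone illustration, not ocs-ci's validate_compression(); the per-pool JSON field names (compress_bytes_used, compress_under_bytes) are assumptions about the Ceph output.

import json
import subprocess


def pool_compression_stats_sketch(pool_name):
    """Sketch: return (compressed bytes used, bytes eligible for compression)
    for a single pool, assuming `ceph df detail` JSON exposes these fields."""
    out = subprocess.check_output(["ceph", "df", "detail", "--format", "json"])
    for pool in json.loads(out)["pools"]:
        if pool["name"] == pool_name:
            stats = pool["stats"]
            return stats["compress_bytes_used"], stats["compress_under_bytes"]
    raise ValueError(f"pool {pool_name} not found")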
Example #3
    def calculate_crd_data(self):
        """
        Get the storage capacity and calculate the pod count and PVC size

        """

        ceph_used_capacity_percent = get_percent_used_capacity()
        logger.info(f"Ceph used capacity percent is {ceph_used_capacity_percent}%")

        ceph_capacity = self.ceph_cluster.get_ceph_capacity()
        logger.info(f"Total storage capacity is {ceph_capacity} GiB")

        self.percent_to_fill = self.percent_to_fill - ceph_used_capacity_percent
        logger.info(f"Percentage to fill is {self.percent_to_fill}%")

        self.total_data_set = int(ceph_capacity * (int(self.percent_to_fill) / 100))
        self.filesize = int(
            self.crd_data["spec"]["workload"]["args"]["filesize"].replace("GiB", "")
        )

        # Make sure that filesize>=10 and servers<=60
        self.servers = 60
        self.filesize = int(self.total_data_set / self.servers)
        if self.filesize < 10:
            self.filesize = 10
            self.servers = int(self.total_data_set / self.filesize)

        self.crd_data["spec"]["workload"]["args"]["filesize"] = f"{self.filesize}GiB"
        self.crd_data["spec"]["workload"]["args"][
            "storagesize"
        ] = f"{int(self.total_data_set)}Gi"
        self.crd_data["spec"]["workload"]["args"]["servers"] = self.servers
        self.crd_data["spec"]["workload"]["args"]["bs"] = "1024KiB"
        self.crd_data["spec"]["workload"]["args"]["jobs"] = ["write", "read"]
        self.crd_data["spec"]["workload"]["args"]["iodepth"] = 1
Example #4
    def test_new_sc_new_rbd_pool_e2e_wl(
        self,
        storageclass_factory,
        amq_factory_fixture,
        couchbase_factory_fixture,
        pgsql_factory_fixture,
        replica,
        compression,
    ):
        """
        Testing workloads on new storage class with new cephblockpool
        """
        interface_type = constants.CEPHBLOCKPOOL
        sc_obj = storageclass_factory(
            interface=interface_type,
            new_rbd_pool=True,
            replica=replica,
            compression=compression,
        )
        self.amq, self.threads = amq_factory_fixture(sc_name=sc_obj.name)
        self.cb = couchbase_factory_fixture(sc_name=sc_obj.name,
                                            run_in_bg=True)
        self.pgsql = pgsql_factory_fixture(replicas=3,
                                           clients=3,
                                           transactions=600,
                                           sc_name=sc_obj.name)

        bg_handler = flowtest.BackgroundOps()
        bg_ops = [self.cb.result]
        bg_handler.wait_for_bg_operations(bg_ops, timeout=3600)
        cluster_used_space = get_percent_used_capacity()
        log.info(
            f"Cluster used space percentage with replica size {replica}, "
            f"compression mode {compression}: {cluster_used_space}")
Example #5
    def test_new_sc_new_rbd_pool_e2e_wl(
        self,
        storageclass_factory,
        amq_factory_fixture,
        couchbase_factory_fixture,
        pgsql_factory_fixture,
        replica,
        compression,
    ):
        """
        Testing workloads on new storage class with new cephblockpool
        """
        interface_type = constants.CEPHBLOCKPOOL
        sc_obj = storageclass_factory(
            interface=interface_type,
            new_rbd_pool=True,
            replica=replica,
            compression=compression,
        )
        bg_handler = flowtest.BackgroundOps()
        executor_run_bg_ios_ops = ThreadPoolExecutor(max_workers=5)
        self.amq, self.threads = amq_factory_fixture(sc_name=sc_obj.name)

        cb_workload = executor_run_bg_ios_ops.submit(
            bg_handler.handler,
            couchbase_factory_fixture,
            sc_name=sc_obj.name,
            replicas=3,
            skip_analyze=True,
            run_in_bg=False,
            num_items="1000",
            num_threads="1",
            iterations=1,
        )

        pgsql_workload = executor_run_bg_ios_ops.submit(
            bg_handler.handler,
            pgsql_factory_fixture,
            replicas=1,
            clients=1,
            transactions=100,
            timeout=100,
            sc_name=sc_obj.name,
            iterations=1,
        )
        bg_handler = flowtest.BackgroundOps()
        bg_ops = [pgsql_workload, cb_workload]
        bg_handler.wait_for_bg_operations(bg_ops, timeout=3600)
        # Validate the AMQ results
        log.info("Validating that the AMQ message runs completed")
        for thread in self.threads:
            thread.result(timeout=1800)

        cluster_used_space = get_percent_used_capacity()
        log.info(
            f"Cluster used space percentage with replica size {replica}, "
            f"compression mode {compression}: {cluster_used_space}"
        )
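The two e2e workload examples above hand their workloads to flowtest.BackgroundOps and then block on the results. A minimal sketch of the same pattern using only concurrent.futures is shown below; the BackgroundOps names are taken from the examples, while everything else here is an assumption and omits ocs-ci's logging and retry handling.

from concurrent.futures import ThreadPoolExecutor, wait


def run_in_background(workloads, timeout=3600):
    """Sketch: submit each (callable, kwargs) pair, wait for all of them,
    and propagate the first workload exception."""
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(fn, **kwargs) for fn, kwargs in workloads]
        done, not_done = wait(futures, timeout=timeout)
        if not_done:
            raise TimeoutError(f"{len(not_done)} background workloads did not finish")
        for future in done:
            future.result()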
Example #6
    def test_add_capacity_osd_pod_delete(self, workload_storageutilization_rbd):
        """
        Test add capacity when one of the osd pods gets deleted
        in the middle of the process.
        """
        used_percentage = get_percent_used_capacity()
        logging.info(f"storageutilization is completed. used capacity = {used_percentage}")

        max_osds = 15
        osd_pods_before = pod_helpers.get_osd_pods()
        number_of_osd_pods_before = len(osd_pods_before)
        if number_of_osd_pods_before >= max_osds:
            pytest.skip("We have maximum of osd's in the cluster")

        d = Disruptions()
        d.set_resource('osd')

        osd_size = storage_cluster.get_osd_size()
        logging.info(f"Adding one new set of OSDs. osd size = {osd_size}")
        storagedeviceset_count = storage_cluster.add_capacity(osd_size)
        logging.info("Adding one new set of OSDs was issued without problems")

        # The OSD count goes down by one and then gradually goes back up by one,
        # and finally the OSD count will be storagedeviceset_count * 3
        pod_helpers.wait_for_new_osd_pods_to_come_up(number_of_osd_pods_before)
        logging.info("Delete an osd pod while storage capacity is getting increased")
        d.delete_resource(1)

        pod = OCP(
            kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace']
        )

        pod.wait_for_resource(
            timeout=420,
            condition=constants.STATUS_RUNNING,
            selector='app=rook-ceph-osd',
            resource_count=storagedeviceset_count * 3
        )

        logging.info("Finished verifying add capacity when one of the osd pods gets deleted")
        logging.info("Waiting for ceph health check to finished...")
        ceph_health_check(
            namespace=config.ENV_DATA['cluster_namespace'], tries=80
        )
Example #7
    def cluster_filler(self):
        curl_cmd = (
            f""" curl {constants.REMOTE_FILE_URL} --output {constants.FILE_PATH} """
        )
        logging.info("downloading......")
        run_cmd(cmd=curl_cmd)
        logging.info("finished")
        with ThreadPoolExecutor() as executor:
            for pod in self.pods_to_fill:
                executor.submit(
                    pod_helpers.upload,
                    pod.name,
                    constants.FILE_PATH,
                    "/mnt/",
                    namespace=self.namespace,
                )
                logging.info(f"### initiated downloader for {pod.name}")

        filler_executor = ThreadPoolExecutor()
        while not self.cluster_filled:
            for copy_iter in range(self.concurrent_copies):
                for each_pod in self.pods_to_fill:
                    self.used_capacity = get_percent_used_capacity()
                    logging.info(
                        f"### used capacity %age = {self.used_capacity}")
                    if self.used_capacity <= self.percent_required_filled:
                        filler_executor.submit(self.filler, each_pod)
                        logging.info(
                            f"#### Ran copy operation on pod {each_pod.name}. copy_iter # {copy_iter}"
                        )
                    else:
                        logging.info(
                            f"############ Cluster filled to the expected capacity "
                            f"{self.percent_required_filled}")
                        self.cluster_filled = True
                        break
                if self.cluster_filled:
                    return True
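Stripped of the pod plumbing, cluster_filler() boils down to "keep submitting copy jobs until the used capacity crosses the threshold". A minimal sketch of that control flow, where get_used_percent and fill_one_pod are hypothetical stand-ins rather than ocs-ci helpers:

from concurrent.futures import ThreadPoolExecutor


def fill_until(percent_required_filled, pods, get_used_percent, fill_one_pod):
    """Sketch: submit one copy job per pod per pass and stop as soon as the
    cluster used-capacity percentage reaches the requested threshold."""
    with ThreadPoolExecutor() as executor:
        while True:
            for pod in pods:
                if get_used_percent() > percent_required_filled:
                    return True
                executor.submit(fill_one_pod, pod)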
Example #8
    def reach_cluster_load_percentage(self):
        """
        Reach the cluster limit and then drop to the given target percentage.
        The number of pods needed for the desired target percentage is determined by
        creating pods one by one, while examining the cluster latency. Once the latency
        is greater than 250 ms and it is growing exponentially, it means that
        the cluster limit has been reached.
        Then, drop to the target percentage by deleting all pods and re-creating
        them with a smaller value of the FIO 'rate' param.
        This leaves the needed number of pods running IO, keeping the cluster load
        at around the desired percentage.

        """
        if not self.target_percentage:
            logger.warning("The target percentage was not provided. Breaking")
            return
        if not 0.1 < self.target_percentage < 0.95:
            logger.warning(
                f"The target percentage is {self.target_percentage * 100}% which is "
                "not within the accepted range. Therefore, IO will not be started"
            )
            return
        low_diff_counter = 0
        cluster_limit = None
        latency_vals = list()
        time_to_wait = 60 * 30
        time_before = time.time()

        self.current_iops = self.get_query(query=constants.IOPS_QUERY)

        # Creating FIO DeploymentConfig pods one by one, with a large value of the FIO
        # 'rate' arg, in order to determine the cluster limit faster.
        # Once determined, these pods will be deleted. Then, new FIO DC pods will be
        # created with a smaller value of the 'rate' param, in order to reach the
        # target percentage more accurately.
        while True:
            wait = False if len(self.dc_objs) <= 1 else True
            self.increase_load_and_print_data(rate='250M', wait=wait)
            if self.current_iops > self.previous_iops:
                cluster_limit = self.current_iops

            latency = self.calc_trim_metric_mean(metric=constants.LATENCY_QUERY) * 1000
            latency_vals.append(latency)
            logger.info(f"Latency values: {latency_vals}")

            iops_diff = (self.current_iops / self.previous_iops * 100) - 100
            low_diff_counter += 1 if -15 < iops_diff < 10 else 0

            cluster_used_space = get_percent_used_capacity()

            if len(latency_vals) > 1 and latency > 250:
                # Checking for exponential growth. If the latest latency sample
                # is more than 128 times the first latency sample, we can conclude
                # that the cluster limit, in terms of IOPS, has been reached.
                # See https://blog.docbert.org/vdbench-curve/ for more details.
                # In other cases, when the first latency sample is greater than 3 ms,
                # the multiplication factor we check against is lower, in order to
                # determine the cluster limit faster.
                if latency > latency_vals[0] * 2 ** 7 or (
                    3 < latency_vals[0] < 50 and len(latency_vals) > 5
                ):
                    logger.info(
                        wrap_msg("The cluster limit was determined by latency growth")
                    )
                    break

            # In case the latency is greater than 2 seconds,
            # the limit has most likely been reached
            elif latency > 2000:
                logger.info(
                    wrap_msg(f"The limit was determined by the high latency - {latency} ms")
                )
                break

            # For clusters whose nodes do not meet the minimum
            # resource requirements, the cluster limit is reached
            # while the latency remains low. In that case, the cluster limit
            # needs to be determined by the following condition on the IOPS
            # diff between FIO pod creation iterations
            elif low_diff_counter > 3:
                logger.warning(
                    wrap_msg(
                        "Limit was determined by low IOPS diff between "
                        f"iterations - {iops_diff:.2f}%"
                    )
                )
                break

            elif time.time() > time_before + time_to_wait:
                logger.warning(
                    wrap_msg(
                        "Could not determine the cluster IOPS limit within"
                        f"the given {time_to_wait} seconds timeout. Breaking"
                    )
                )
                break

            elif cluster_used_space > 60:
                logger.warning(
                    wrap_msg(
                        f"Cluster used space is {cluster_used_space}%. Could "
                        "not reach the cluster IOPS limit before the "
                        "used spaced reached 60%. Breaking"
                    )
                )
                break

        self.cluster_limit = cluster_limit
        logger.info(wrap_msg(f"The cluster IOPS limit is {self.cluster_limit:.2f}"))
        logger.info("Deleting all DC FIO pods that have large FIO rate")
        while self.dc_objs:
            self.decrease_load(wait=False)

        target_iops = self.cluster_limit * self.target_percentage

        range_map = RangeKeyDict(
            {
                (0, 500): (6, 0.82, 0.4),
                (500, 1000): (8, 0.84, 0.45),
                (1000, 1500): (10, 0.86, 0.5),
                (1500, 2000): (12, 0.88, 0.55),
                (2000, 2500): (14, 0.90, 0.6),
                (2500, 3000): (16, 0.92, 0.65),
                (3000, 3500): (18, 0.94, 0.7),
                (3500, math.inf): (20, 0.96, 0.75),
            }
        )
        self.rate = f'{range_map[target_iops][0]}M'
        # Creating the first pod with a small FIO 'rate' param, to speed up the process.
        # In the meantime, the load will drop, following the deletion of the
        # FIO pods with a large FIO 'rate' param
        logger.info("Creating FIO pods, one by one, until the target percentage is reached")
        self.increase_load_and_print_data(rate=self.rate)
        msg = (
            f"The target load, in IOPS, is: {target_iops}, which is "
            f"{self.target_percentage*100}% of the {self.cluster_limit} cluster limit"
        )
        logger.info(wrap_msg(msg))

        while self.current_iops < target_iops * range_map[target_iops][1]:
            wait = False if self.current_iops < target_iops * range_map[target_iops][2] else True
            self.increase_load_and_print_data(rate=self.rate, wait=wait)

        msg = f"The target load, of {self.target_percentage * 100}%, has been reached"
        logger.info(wrap_msg(msg))
        self.target_pods_number = len(self.dc_objs)
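The range_map above translates a target IOPS figure into a (rate, stop-fraction, wait-fraction) tuple. Assuming RangeKeyDict comes from the range-key-dict package on PyPI, a lookup returns the value whose half-open key range contains the queried number, as in this short illustration:

from range_key_dict import RangeKeyDict

range_map = RangeKeyDict(
    {
        (0, 500): (6, 0.82, 0.4),
        (500, 1000): (8, 0.84, 0.45),
        (1000, 1500): (10, 0.86, 0.5),
    }
)

# 1200 falls in the (1000, 1500) range, so the lookup yields (10, 0.86, 0.5).
rate_mb, stop_fraction, wait_fraction = range_map[1200]
print(f"rate={rate_mb}M, stop at {stop_fraction * 100}% of the target IOPS")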
Example #9
    def reach_cluster_load_percentage(self):
        """
        Reach the cluster limit and then drop to the given target percentage.
        The number of pods needed for the desired target percentage is determined by
        creating pods one by one, while examining the cluster latency. Once the latency
        is greater than 250 ms and it is growing exponentially, it means that
        the cluster limit has been reached.
        Then, drop to the target percentage by deleting all pods and re-creating
        them with a smaller value of the FIO 'rate' param.
        This leaves the needed number of pods running IO, keeping the cluster load
        at around the desired percentage.

        """
        if not self.target_percentage:
            logger.warning("The target percentage was not provided. Breaking")
            return
        if not 0.1 < self.target_percentage < 0.95:
            logger.warning(
                f"The target percentage is {self.target_percentage * 100}% which is "
                "not within the accepted range. Therefore, IO will not be started"
            )
            return
        low_diff_counter = 0
        limit_reached = False
        cluster_limit = None
        latency_vals = list()
        time_to_wait = 60 * 30
        time_before = time.time()

        self.current_iops = self.get_query(query=constants.IOPS_QUERY)

        # Creating FIO DeploymentConfig pods one by one, with a large value of the FIO
        # 'rate' arg, in order to determine the cluster limit faster.
        # Once determined, these pods will be deleted. Then, new FIO DC pods will be
        # created with a smaller value of the 'rate' param, in order to reach the
        # target percentage more accurately.
        rate = '250M'
        while not limit_reached:
            self.increase_load_and_print_data(rate=rate)
            if self.current_iops > self.previous_iops:
                cluster_limit = self.current_iops

            latency = self.calc_trim_metric_mean(
                metric=constants.LATENCY_QUERY) * 1000
            latency_vals.append(latency)
            logger.info(f"Latency values: {latency_vals}")

            if len(latency_vals) > 1 and latency > 250:
                # Checking for exponential growth
                if latency > latency_vals[0] * 2**7:
                    logger.info("Latency exponential growth was detected")
                    limit_reached = True

            # In case the latency is greater than 3 seconds,
            # the limit has most likely been reached
            if latency > 3000:
                logger.info("Limit was determined by latency, which is "
                            f"higher than 3 seconds - {latency} ms")
                limit_reached = True

            # For clusters whose nodes do not meet the minimum
            # resource requirements, the cluster limit is reached
            # while the latency remains low. In that case, the cluster limit
            # needs to be determined by the following condition on the IOPS
            # diff between FIO pod creation iterations
            iops_diff = (self.current_iops / self.previous_iops * 100) - 100
            low_diff_counter += 1 if -15 < iops_diff < 10 else 0
            if low_diff_counter > 3:
                logger.warning("Limit was determined by low IOPS diff between "
                               f"iterations - {iops_diff:.2f}%")
                limit_reached = True

            if time.time() > time_before + time_to_wait:
                logger.warning(
                    "Could not determine the cluster IOPS limit within"
                    f"\nthe given {time_to_wait} seconds timeout. Breaking")
                limit_reached = True

            cluster_used_space = get_percent_used_capacity()
            if cluster_used_space > 60:
                logger.warning(
                    f"Cluster used space is {cluster_used_space}%. Could "
                    "not reach the cluster IOPS limit before the "
                    "used spaced reached 60%. Breaking")
                limit_reached = True

        self.cluster_limit = cluster_limit
        logger.info(
            wrap_msg(f"The cluster IOPS limit is {self.cluster_limit:.2f}"))
        logger.info(
            f"Deleting all DC FIO pods that have FIO rate parameter of {rate}")
        while self.dc_objs:
            self.decrease_load(wait=False)

        # Creating the first pod with a small FIO 'rate' param, to speed up the process.
        # In the meantime, the load will drop, following the deletion of the
        # FIO pods with a large FIO 'rate' param
        rate = '15M'
        logger.info(
            f"Creating FIO pods with a rate parameter of {rate}, one by "
            "one, until the target percentage is reached")
        self.increase_load(rate=rate)
        target_iops = self.cluster_limit * self.target_percentage
        self.current_iops = self.get_query(query=constants.IOPS_QUERY)
        msg = (
            f"The target load, in IOPS, is: {target_iops}, which is "
            f"{self.target_percentage*100}% of the {self.cluster_limit} cluster limit"
        )
        logger.info(wrap_msg(msg))

        while self.current_iops < target_iops * 0.95:
            wait = False if self.current_iops < target_iops / 2 else True
            self.increase_load_and_print_data(rate=rate, wait=wait)

        msg = f"The target load, of {self.target_percentage * 100}%, has been reached"
        logger.info(wrap_msg(msg))
        self.target_pods_number = len(self.dc_objs)
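Both variants of reach_cluster_load_percentage() stop probing once the latest latency sample exceeds the first sample by a factor of 2**7 (and is above 250 ms). A compact standalone illustration of that check, with made-up sample values:

latency_vals = [2.0, 4.5, 9.0, 30.0, 120.0, 310.0]   # milliseconds, illustrative

latest = latency_vals[-1]
limit_reached = (
    len(latency_vals) > 1
    and latest > 250
    and latest > latency_vals[0] * 2 ** 7
)
print(f"limit reached: {limit_reached}")   # True: 310 > 2.0 * 128 = 256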