def setting_storage_usage(self):
        """
        Get the storage capacity, calculate the storage usage and
        set the workload CR file parameters.

        """

        ceph_cluster = CephCluster()
        ceph_capacity = ceph_cluster.get_ceph_capacity()
        log.info(f"Total storage capacity is {ceph_capacity} GiB")
        self.total_data_set = int(ceph_capacity * 0.4)
        self.filesize = int(
            self.fio_cr["spec"]["workload"]["args"]["filesize"].replace(
                "GiB", ""))
        # To make sure the number of App pods will not be more than 50 in case
        # of a large data set, change the size of the file each pod will work on
        if self.total_data_set > 500:
            self.filesize = int(ceph_capacity * 0.008)
            self.fio_cr["spec"]["workload"]["args"][
                "filesize"] = f"{self.filesize}GiB"
            # make sure that the storage size is larger than the file size
            self.fio_cr["spec"]["workload"]["args"][
                "storagesize"] = f"{int(self.filesize * 1.2)}Gi"
        self.fio_cr["spec"]["workload"]["args"]["servers"] = int(
            self.total_data_set / self.filesize)
        log.info(f"Total Data set to work on is : {self.total_data_set} GiB")
    def setting_storage_usage(self):
        """
        Get the storage capacity, calculate the storage usage and
        set the workload CR file parameters.

        """

        # for development mode - use parameters for short test run
        if self.dev_mode:
            log.info("Setting up parameters for development mode")
            self.crd_data["spec"]["workload"]["args"]["filesize"] = "1GiB"
            self.crd_data["spec"]["workload"]["args"]["storagesize"] = "5Gi"
            self.crd_data["spec"]["workload"]["args"]["servers"] = 2
            self.crd_data["spec"]["workload"]["args"]["samples"] = 2
            self.crd_data["spec"]["workload"]["args"]["read_runtime"] = 30
            self.crd_data["spec"]["workload"]["args"]["write_runtime"] = 30
            self.crd_data["spec"]["workload"]["args"]["bs"] = ["64KiB"]
            self.total_data_set = 20
            self.filesize = 3
            return

        ceph_cluster = CephCluster()
        ceph_capacity = ceph_cluster.get_ceph_capacity()
        log.info(f"Total storage capacity is {ceph_capacity} GiB")
        self.total_data_set = int(ceph_capacity * 0.4)
        self.filesize = int(
            self.crd_data["spec"]["workload"]["args"]["filesize"].replace(
                "GiB", ""))
        # To make sure the number of App pods will not be more than 50 in case
        # of a large data set, change the size of the file each pod will work on
        if self.total_data_set > 500:
            self.filesize = int(ceph_capacity * 0.008)
            self.crd_data["spec"]["workload"]["args"][
                "filesize"] = f"{self.filesize}GiB"
            # make sure that the storage size is larger than the file size
            self.crd_data["spec"]["workload"]["args"][
                "storagesize"] = f"{int(self.filesize * 1.2)}Gi"
        self.crd_data["spec"]["workload"]["args"]["servers"] = int(
            self.total_data_set / self.filesize)
        log.info(f"Total Data set to work on is : {self.total_data_set} GiB")
    def test_vdbench_workload(self, template, with_ocs, load, label_nodes,
                              ripsaw, servers, threads, blocksize, fileio,
                              samples, width, depth, files, file_size, runtime,
                              pause):
        """
        Run VDBench Workload

        Args :
            template (str) : Name of the yaml file that will be used as a template
            with_ocs (bool) : This parameter will indicate if the test will
                              run on the same nodes as the OCS
            load (int) : load to run on the storage in percentage of the capacity.
            label_nodes (fixture) : This fixture labels the worker node(s)
                                    that will be used for the App pod(s)
            ripsaw (fixture) : Fixture to deploy the ripsaw benchmarking operator
            servers (int) : Number of servers (pods) that will run the IO
            threads (int) : Number of threads that will run on each server
            blocksize (list - str): List of BlockSize - must add the 'K' to it
            fileio (str) : How to select file for the IO : random / sequential
            samples (int) : Number of time(s) to run each test
            width (int) : Width of directory tree to create
            depth (int) : Depth of directory tree to create
            files (int) : Number of files to create in each directory
            file_size (int) : File size (in MB) to create
            runtime (int) : Time (in Sec.) for each test iteration
            pause (int) : Time (in Min.) to pause between each test iteration.
        """
        log.info(f'going to use {template} as template')
        log.info("Apply Operator CRD")

        crd = 'resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml'
        ripsaw.apply_crd(crd)

        log.info('Running vdbench benchmark')
        if template:
            template = os.path.join(constants.TEMPLATE_VDBENCH_DIR, template)
        else:
            template = constants.VDBENCH_BENCHMARK_YAML
        sf_data = templating.load_yaml(template)

        target_results = template + 'Results'

        log.info('Calculating Storage size....')
        ceph_cluster = CephCluster()
        total_capacity = ceph_cluster.get_ceph_capacity()
        assert total_capacity > constants.VDBENCH_MIN_CAPACITY, (
            "Storage capacity is too low for performance testing")
        log.info(f'The total usable capacity is {total_capacity} GiB')

        if load:
            width = constants.VDBENCH_WIDTH
            depth = constants.VDBENCH_DEPTH
            file_size = constants.VDBENCH_FILE_SIZE
            capacity_per_pod = constants.VDBENCH_CAP_PER_POD
            total_dirs = width**depth
            log.info(f'The total dirs in the tree {total_dirs}')
            log.info(f'Going to run with {load} % of the capacity load.')
            tested_capacity = round(total_capacity * 1024 * load / 100)
            log.info(f'Tested capacity is {tested_capacity} MB')
            servers = round(tested_capacity / capacity_per_pod)
            """
            To spread the application pods evenly across all worker or
            application nodes, with at least 2 app pods per node.
            """
            nodes = len(
                node.get_typed_nodes(node_type=constants.WORKER_MACHINE))
            if not with_ocs:
                nodes = len(
                    machine.get_labeled_nodes(
                        f'node-role.kubernetes.io/app={constants.APP_NODE_LABEL}'
                    ))
            log.info(f'Going to use {nodes} nodes for the test !')
            servers = round(servers / nodes) * nodes
            if servers < (nodes * 2):
                servers = nodes * 2

            files = round(tested_capacity / servers / total_dirs)
            total_files = round(files * servers * total_dirs)
            log.info(f'number of pods is {servers}')
            log.info(f'Going to create {total_files} files !')
            log.info(f'number of files in dir is {files}')
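            # Worked example (all numbers hypothetical, the real values come
            # from the constants module): total_capacity = 1000 GiB, load = 30,
            # width = 4, depth = 2 -> total_dirs = 16,
            # tested_capacity = round(1000 * 1024 * 30 / 100) = 307200 MB.
            # With capacity_per_pod = 80000 MB, servers = round(307200 / 80000) = 4;
            # rounded to a multiple of 3 worker nodes that gives 3, which is
            # below the 2-pods-per-node minimum, so servers = 6, and
            # files per dir = round(307200 / 6 / 16) = 3200.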
        """
            Setting up the parameters for this test
        """
        if servers:
            sf_data['spec']['workload']['args']['servers'] = servers
            target_results = target_results + '-' + str(servers)
        if threads:
            sf_data['spec']['workload']['args']['threads'] = threads
            target_results = target_results + '-' + str(threads)
        if fileio:
            sf_data['spec']['workload']['args']['fileio'] = fileio
            target_results = target_results + '-' + str(fileio)
        if samples:
            sf_data['spec']['workload']['args']['samples'] = samples
            target_results = target_results + '-' + str(samples)
        if width:
            sf_data['spec']['workload']['args']['width'] = width
            target_results = target_results + '-' + str(width)
        if depth:
            sf_data['spec']['workload']['args']['depth'] = depth
            target_results = target_results + '-' + str(depth)
        if files:
            sf_data['spec']['workload']['args']['files'] = files
            target_results = target_results + '-' + str(files)
        if file_size:
            sf_data['spec']['workload']['args']['file_size'] = file_size
            target_results = target_results + '-' + str(file_size)
        if runtime:
            sf_data['spec']['workload']['args']['runtime'] = runtime
            target_results = target_results + '-' + str(runtime)
        if pause:
            sf_data['spec']['workload']['args']['pause'] = pause
            target_results = target_results + '-' + str(pause)
        if len(blocksize) > 0:
            sf_data['spec']['workload']['args']['bs'] = blocksize
            target_results = target_results + '-' + '_'.join(blocksize)
        if with_ocs:
            if sf_data['spec']['workload']['args']['pin_server']:
                del sf_data['spec']['workload']['args']['pin_server']
        """
            Calculating the size of the volume that needs to be tested; it
            should be larger than the total size of the files (30% headroom
            is added), and at least 100Gi.
            Since the file_size is in MB and the vol_size needs to be in Gi,
            more calculation is needed.
        """
        vol_size = int((files * total_dirs) * file_size * 1.3)
        log.info('number of files to create : {}'.format(
            int(files * (width**depth))))
        log.info(f'The size of all files is : {vol_size}MB')
        vol_size = int(vol_size / 1024)
        if vol_size < 100:
            vol_size = 100
        sf_data['spec']['workload']['args']['storagesize'] = f'{vol_size}Gi'
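        # Worked example (hypothetical numbers): files = 3200, total_dirs = 16,
        # file_size = 4 MB -> vol_size = int(3200 * 16 * 4 * 1.3) = 266240 MB,
        # then int(266240 / 1024) = 260 Gi, which is above the 100 Gi floor,
        # so storagesize is set to '260Gi'.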

        log.debug(f'output of configuration file is {sf_data}')

        timeout = 86400  # 3600 (1H) * 24 (1D) = one day

        sf_obj = OCS(**sf_data)
        sf_obj.create()
        # wait for benchmark pods to get created - takes a while
        for bench_pod in TimeoutSampler(300, 10, get_pod_name_by_pattern,
                                        'vdbench-client', 'my-ripsaw'):
            try:
                if bench_pod[0] is not None:
                    vdbench_client_pod = bench_pod[0]
                    break
            except IndexError:
                log.info('Benchmark client pod not ready yet')

        bench_pod = OCP(kind='pod', namespace='my-ripsaw')
        log.info('Waiting for VDBench benchmark to Run')
        assert bench_pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                           resource_name=vdbench_client_pod,
                                           sleep=30,
                                           timeout=600)
        start_time = time.time()
        while True:
            logs = bench_pod.exec_oc_cmd(f'logs {vdbench_client_pod}',
                                         out_yaml_format=False)
            if 'Test Run Finished' in logs:
                log.info('VdBench Benchmark Completed Successfully')
                break

            if timeout < (time.time() - start_time):
                raise TimeoutError(
                    'Timed out waiting for benchmark to complete')
            time.sleep(30)
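        # A generic sketch of the poll-until-marker-or-timeout pattern used
        # above ('fetch_logs' is a hypothetical callable, not a framework API):
        #
        #     def wait_for_marker(fetch_logs, marker, timeout_sec, interval=30):
        #         deadline = time.time() + timeout_sec
        #         while time.time() < deadline:
        #             if marker in fetch_logs():
        #                 return True
        #             time.sleep(interval)
        #         raise TimeoutError(f"'{marker}' not found in {timeout_sec}s")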

        # Getting the results file from the benchmark pod and put it with the
        # test logs.
        # TODO: find the place of the actual test log and not in the parent
        #       logs path
        target_results = '{}/{}.tgz'.format(ocsci_log_path(), target_results)
        pod_results = constants.VDBENCH_RESULTS_FILE
        retrive_files_from_pod(vdbench_client_pod, target_results, pod_results)
    def test_pvc_multiple_snapshot_performance(
        self,
        interface_iterate,
        teardown_factory,
        storageclass_factory,
        pvc_factory,
        pod_factory,
    ):
        """
        1. Creating PVC
           size depends on the storage capacity, but not less than 1 GiB
           it will use ~75% of the storage capacity, min storage capacity 1 TiB
        2. Fill the PVC with 80% of data
        3. Take a snapshot of the PVC and measure the time of creation.
        4. re-write the data on the PVC
        5. Take a snapshot of the PVC and measure the time of creation.
        6. Repeat steps 4-5 for the number of snapshots we want to take : 512
           this will be run by an external script for low memory consumption
        7. print all information.

        Raises:
            StorageNotSufficientException: in case of not enough capacity

        """
        # Number of snapshots for CephFS is 100 and for RBD is 512
        num_of_snaps = 100
        if interface_iterate == constants.CEPHBLOCKPOOL:
            num_of_snaps = 512

        # Getting the total Storage capacity
        ceph_cluster = CephCluster()
        ceph_capacity = int(ceph_cluster.get_ceph_capacity())

        # Use 70% of the storage capacity in the test
        capacity_to_use = int(ceph_capacity * 0.7)

        # since we do not want to use more than 65%, we add 35% to the needed
        # capacity, and the minimum PVC size is 1 GiB
        need_capacity = int((num_of_snaps + 2) * 1.35)
        # Test will run only on system with enough capacity
        if capacity_to_use < need_capacity:
            err_msg = (f"The system have only {ceph_capacity} GiB, "
                       f"we want to use only {capacity_to_use} GiB, "
                       f"and we need {need_capacity} GiB to run the test")
            log.error(err_msg)
            raise exceptions.StorageNotSufficientException(err_msg)

        # Calculating the PVC size in GiB
        pvc_size = int(capacity_to_use / (num_of_snaps + 2))
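        # Worked example (hypothetical capacity): with ceph_capacity = 1000 GiB
        # and num_of_snaps = 512, capacity_to_use = 700 GiB and
        # need_capacity = int(514 * 1.35) = 693 GiB, so the test can run and
        # pvc_size = int(700 / 514) = 1 GiB.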

        self.interface = interface_iterate
        self.sc_obj = storageclass_factory(self.interface)

        self.pvc_obj = pvc_factory(interface=self.interface,
                                   size=pvc_size,
                                   status=constants.STATUS_BOUND)

        self.pod_obj = pod_factory(interface=self.interface,
                                   pvc=self.pvc_obj,
                                   status=constants.STATUS_RUNNING)

        # Calculating the file size as 80% of the PVC size
        filesize = self.pvc_obj.size * 0.80
        # Change the file size to MB for the FIO function
        file_size = f"{int(filesize * constants.GB2MB)}M"
        file_name = self.pod_obj.name

        log.info(f"Total capacity size is : {ceph_capacity} GiB, "
                 f"Going to use {need_capacity} GiB, "
                 f"With {num_of_snaps} Snapshots to {pvc_size} GiB PVC. "
                 f"File size to be written is : {file_size} "
                 f"with the name of {file_name}")

        os.environ["SNAPNUM"] = f"{num_of_snaps}"
        os.environ["LOGPATH"] = f"{ocsci_log_path()}"
        os.environ["FILESIZE"] = file_size
        os.environ["NSPACE"] = self.pvc_obj.namespace
        os.environ["PODNAME"] = self.pod_obj.name
        os.environ["PVCNAME"] = self.pvc_obj.name
        os.environ["INTERFACE"] = self.interface

        main_script = "tests/e2e/performance/test_multi_snapshots.py"
        result = subprocess.run([main_script], stdout=subprocess.PIPE)
        log.info(f"Results from main script : {result.stdout.decode('utf-8')}")

        if "All results are" not in result.stdout.decode("utf-8"):
            log.error("Test did not completed")
            raise Exception("Test did not completed")
    def test_pvc_multiple_clone_performance(
        self,
        interface_iterate,
        teardown_factory,
        storageclass_factory,
        pvc_factory,
        pod_factory,
    ):
        """
        1. Creating PVC
           PVC size is calculated in the test and depends on the storage capacity, but not less than 1 GiB
           it will use ~75% of the storage capacity, min storage capacity 1 TiB
        2. Fill the PVC with 70% of data
        3. Take a clone of the PVC and measure time and speed of creation by reading start creation and end creation
            times from relevant logs
        4. Repeat the previous step a number of times (maximum num_of_clones is 512)
        5. Print all measured statistics for all the clones.

        Raises:
            StorageNotSufficientException: in case of not enough capacity on the cluster

        """
        num_of_clones = 512

        # Getting the total Storage capacity
        ceph_cluster = CephCluster()
        ceph_capacity = int(ceph_cluster.get_ceph_capacity())

        # Use 70% of the storage capacity in the test
        capacity_to_use = int(ceph_capacity * 0.7)

        # since we do not want to use more than 65%, we add 35% to the needed
        # capacity, and the minimum PVC size is 1 GiB
        need_capacity = int((num_of_clones + 2) * 1.35)
        # Test will run only on system with enough capacity
        if capacity_to_use < need_capacity:
            err_msg = (f"The system have only {ceph_capacity} GiB, "
                       f"we want to use only {capacity_to_use} GiB, "
                       f"and we need {need_capacity} GiB to run the test")
            log.error(err_msg)
            raise exceptions.StorageNotSufficientException(err_msg)

        # Calculating the PVC size in GiB
        pvc_size = int(capacity_to_use / (num_of_clones + 2))

        self.interface = interface_iterate
        self.sc_obj = storageclass_factory(self.interface)

        if self.interface == constants.CEPHFILESYSTEM:
            sc = "CephFS"
        if self.interface == constants.CEPHBLOCKPOOL:
            sc = "RBD"

        self.full_log_path = get_full_test_logs_path(cname=self)
        self.full_log_path += f"-{sc}"

        self.pvc_obj = pvc_factory(interface=self.interface,
                                   size=pvc_size,
                                   status=constants.STATUS_BOUND)

        self.pod_obj = pod_factory(interface=self.interface,
                                   pvc=self.pvc_obj,
                                   status=constants.STATUS_RUNNING)

        # Calculating the file size as 70% of the PVC size
        filesize = self.pvc_obj.size * 0.70
        # Change the file size to MB for the FIO function
        file_size = f"{int(filesize * constants.GB2MB)}M"
        file_name = self.pod_obj.name

        log.info(f"Total capacity size is : {ceph_capacity} GiB, "
                 f"Going to use {need_capacity} GiB, "
                 f"With {num_of_clones} clones to {pvc_size} GiB PVC. "
                 f"File size to be written is : {file_size} "
                 f"with the name of {file_name}")
        self.params = {}
        self.params["clonenum"] = f"{num_of_clones}"
        self.params["filesize"] = file_size
        self.params["ERRMSG"] = "Error in command"

        clone_yaml = self.build_params()
        performance_lib.write_fio_on_pod(self.pod_obj, file_size)

        # Running the test
        results = []
        for test_num in range(1, int(self.params["clonenum"]) + 1):
            log.info(f"Starting test number {test_num}")
            ct = self.create_clone(test_num, clone_yaml)
            speed = self.params["datasize"] / ct
            results.append({"Clone Num": test_num, "time": ct, "speed": speed})
            log.info(
                f"Results for clone number {test_num} are : "
                f"Creation time is {ct} secs, Creation speed {speed} MB/sec")

        for r in results:
            log.info(
                f"Clone number {r['Clone Num']} creation time is {r['time']} secs."
            )
            log.info(
                f"Clone number {r['Clone Num']} creation speed is {r['speed']} MB/sec."
            )

        creation_time_list = [r["time"] for r in results]
        average_creation_time = statistics.mean(creation_time_list)
        log.info(f"Average creation time is  {average_creation_time} secs.")

        creation_speed_list = [r["speed"] for r in results]
        average_creation_speed = statistics.mean(creation_speed_list)
        log.info(f"Average creation speed is  {average_creation_time} MB/sec.")

        self.results_path = get_full_test_logs_path(cname=self)
        # Produce ES report
        # Collecting environment information
        self.get_env_info()

        # Initialize the results doc file.
        full_results = self.init_full_results(
            ResultsAnalyse(
                self.uuid,
                self.crd_data,
                self.full_log_path,
                "pvc_multiple_clone_measurement",
            ))

        full_results.add_key("interface", self.interface)
        full_results.add_key("clones_num", num_of_clones)
        full_results.add_key("clone_size", pvc_size)
        full_results.add_key("multi_clone_creation_time", creation_time_list)
        full_results.add_key("multi_clone_creation_time_average",
                             average_creation_time)
        full_results.add_key("multi_clone_creation_speed", creation_speed_list)
        full_results.add_key("multi_clone_creation_speed_average",
                             average_creation_speed)

        # Write the test results into the ES server
        if full_results.es_write():
            res_link = full_results.results_link()
            log.info(f"The Result can be found at : {res_link}")

            # Create a text file with the results of all subtests (4 - according to the parameters)
            self.write_result_to_file(res_link)
    def test_pvc_multiple_clone_performance(
        self,
        interface_iterate,
        teardown_factory,
        storageclass_factory,
        pvc_factory,
        pod_factory,
    ):
        """
        1. Creating PVC
           PVC size is calculated in the test and depends on the storage capacity, but not less than 1 GiB
           it will use ~75% of the storage capacity, min storage capacity 1 TiB
        2. Fill the PVC with 70% of data
        3. Take a clone of the PVC and measure time and speed of creation by reading start creation and end creation
            times from relevant logs
        4. Repeat the previous step a number of times (maximum num_of_clones is 512)
        5. Print all measured statistics for all the clones.

        Raises:
            StorageNotSufficientException: in case of not enough capacity on the cluster

        """
        num_of_clones = 512

        # Getting the total Storage capacity
        ceph_cluster = CephCluster()
        ceph_capacity = int(ceph_cluster.get_ceph_capacity())

        # Use 70% of the storage capacity in the test
        capacity_to_use = int(ceph_capacity * 0.7)

        # since we do not want to use more than 65%, we add 35% to the needed
        # capacity, and the minimum PVC size is 1 GiB
        need_capacity = int((num_of_clones + 2) * 1.35)
        # Test will run only on system with enough capacity
        if capacity_to_use < need_capacity:
            err_msg = (f"The system have only {ceph_capacity} GiB, "
                       f"we want to use only {capacity_to_use} GiB, "
                       f"and we need {need_capacity} GiB to run the test")
            log.error(err_msg)
            raise exceptions.StorageNotSufficientException(err_msg)

        # Calculating the PVC size in GiB
        pvc_size = int(capacity_to_use / (num_of_clones + 2))

        self.interface = interface_iterate
        self.sc_obj = storageclass_factory(self.interface)

        self.pvc_obj = pvc_factory(interface=self.interface,
                                   size=pvc_size,
                                   status=constants.STATUS_BOUND)

        self.pod_obj = pod_factory(interface=self.interface,
                                   pvc=self.pvc_obj,
                                   status=constants.STATUS_RUNNING)

        # Calculating the file size as 70% of the PVC size
        filesize = self.pvc_obj.size * 0.70
        # Change the file size to MB for the FIO function
        file_size = f"{int(filesize * constants.GB2MB)}M"
        file_name = self.pod_obj.name

        log.info(f"Total capacity size is : {ceph_capacity} GiB, "
                 f"Going to use {need_capacity} GiB, "
                 f"With {num_of_clones} clones to {pvc_size} GiB PVC. "
                 f"File size to be written is : {file_size} "
                 f"with the name of {file_name}")
        self.params = {}
        self.params["clonenum"] = f"{num_of_clones}"
        self.params["filesize"] = file_size
        self.params["ERRMSG"] = "Error in command"

        clone_yaml = self.build_params()
        performance_lib.write_fio_on_pod(self.pod_obj, file_size)

        # Running the test
        results = []
        for test_num in range(1, int(self.params["clonenum"]) + 1):
            log.info(f"Starting test number {test_num}")
            ct = self.create_clone(test_num, clone_yaml)
            speed = self.params["datasize"] / ct
            results.append({"Clone Num": test_num, "time": ct, "speed": speed})
            log.info(
                f"Results for clone number {test_num} are : "
                f"Creation time is {ct} secs, Creation speed {speed} MB/sec")

        for r in results:
            log.info(
                f"Clone number {r['Clone Num']} creation time is {r['time']} secs."
            )
            log.info(
                f"Clone number {r['Clone Num']} creation speed is {r['speed']} MB/sec."
            )
    def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern):
        """
        This is a basic fio perf test

        """

        # Deploy the ripsaw operator
        log.info("Deploying ripsaw operator")
        ripsaw.apply_crd('resources/crds/' 'ripsaw_v1alpha1_ripsaw_crd.yaml')
        if interface == 'CephBlockPool':
            sc = constants.CEPHBLOCKPOOL_SC
        else:
            sc = constants.CEPHFILESYSTEM_SC

        # Create fio benchmark
        log.info("Create resource file for fio workload")
        fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

        # Saving the Original elastic-search IP and PORT - if defined in yaml
        if 'elasticsearch' in fio_cr['spec']:
            backup_es = fio_cr['spec']['elasticsearch']
        else:
            log.warning(
                'Elastic Search information does not exist in YAML file')
            fio_cr['spec']['elasticsearch'] = {}
            # keep backup_es defined so it can be restored later in the test
            backup_es = {}

        # Use the internally defined elastic-search server in the test - if it exists
        if es:
            fio_cr['spec']['elasticsearch'] = {
                'server': es.get_ip(),
                'port': es.get_port()
            }

        # Setting the data set to 40% of the total storage capacity
        ceph_cluster = CephCluster()
        ceph_capacity = ceph_cluster.get_ceph_capacity()
        total_data_set = int(ceph_capacity * 0.4)
        filesize = int(fio_cr['spec']['workload']['args']['filesize'].replace(
            'GiB', ''))
        # To make sure the number of App pods will not be more than 50 in case
        # of a large data set, change the size of the file each pod will work on
        if total_data_set > 500:
            filesize = int(ceph_capacity * 0.008)
            fio_cr['spec']['workload']['args']['filesize'] = f'{filesize}GiB'
            # make sure that the storage size is larger than the file size
            fio_cr['spec']['workload']['args'][
                'storagesize'] = f'{int(filesize * 1.2)}Gi'
        fio_cr['spec']['workload']['args']['servers'] = int(total_data_set /
                                                            filesize)
        log.info(f'Total Data set to work on is : {total_data_set} GiB')

        environment = get_environment_info()
        if not environment['user'] == '':
            fio_cr['spec']['test_user'] = environment['user']
        fio_cr['spec']['clustername'] = environment['clustername']

        log.debug(f'Environment information is : {environment}')

        fio_cr['spec']['workload']['args']['storageclass'] = sc
        if io_pattern == 'sequential':
            fio_cr['spec']['workload']['args']['jobs'] = ['write', 'read']
            fio_cr['spec']['workload']['args']['iodepth'] = 1
        log.info(f'The FIO CR file is {fio_cr}')
        fio_cr_obj = OCS(**fio_cr)
        fio_cr_obj.create()

        # Wait for fio client pod to be created
        for fio_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern,
                                      'fio-client',
                                      constants.RIPSAW_NAMESPACE):
            try:
                if fio_pod[0] is not None:
                    fio_client_pod = fio_pod[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        # Getting the start time of the test
        start_time = time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime())

        # Getting the UUID from inside the benchmark pod
        uuid = ripsaw.get_uuid(fio_client_pod)
        # Setting back the original elastic-search information
        fio_cr['spec']['elasticsearch'] = backup_es

        full_results = FIOResultsAnalyse(uuid, fio_cr)

        # Initialize the results doc file.
        for key in environment:
            full_results.add_key(key, environment[key])

        # Setting the global parameters of the test
        full_results.add_key('io_pattern', io_pattern)
        full_results.add_key('dataset', f'{total_data_set}GiB')
        full_results.add_key('file_size',
                             fio_cr['spec']['workload']['args']['filesize'])
        full_results.add_key('servers',
                             fio_cr['spec']['workload']['args']['servers'])
        full_results.add_key('samples',
                             fio_cr['spec']['workload']['args']['samples'])
        full_results.add_key('operations',
                             fio_cr['spec']['workload']['args']['jobs'])
        full_results.add_key('block_sizes',
                             fio_cr['spec']['workload']['args']['bs'])
        full_results.add_key('io_depth',
                             fio_cr['spec']['workload']['args']['iodepth'])
        full_results.add_key('jobs',
                             fio_cr['spec']['workload']['args']['numjobs'])
        full_results.add_key(
            'runtime', {
                'read': fio_cr['spec']['workload']['args']['read_runtime'],
                'write': fio_cr['spec']['workload']['args']['write_runtime']
            })
        full_results.add_key(
            'storageclass', fio_cr['spec']['workload']['args']['storageclass'])
        full_results.add_key('vol_size',
                             fio_cr['spec']['workload']['args']['storagesize'])

        # Wait for the fio pod to initialize and complete
        log.info("Waiting for fio_client to complete")
        pod_obj = OCP(kind='pod')
        pod_obj.wait_for_resource(
            condition='Completed',
            resource_name=fio_client_pod,
            timeout=18000,
            sleep=300,
        )

        # Getting the end time of the test
        end_time = time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime())
        full_results.add_key('test_time', {
            'start': start_time,
            'end': end_time
        })

        output = run_cmd(f'oc logs {fio_client_pod}')
        log.info(f'The Test log is : {output}')

        if 'Fio failed to execute' not in output:
            log.info("FIO has completed successfully")
        else:
            log.warning("FIO failed to complete")

        # Clean up fio benchmark
        log.info("Deleting FIO benchmark")
        fio_cr_obj.delete()

        log.debug(f'Full results is : {full_results.results}')

        # If an internal ES exists, copy all data from the internal ES to the main ES
        if es:
            log.info('Copy all data from Internal ES to Main ES')
            es._copy(full_results.es)
        # Adding this sleep between the copy and the analyzing of the results
        # since sometimes the results of the read (just after write) are empty
        time.sleep(30)
        full_results.analyze_results()  # Analyze the results
        # Writing the analyzed test results to the Elastic-Search server
        full_results.es_write()
        full_results.codespeed_push()  # Push results to codespeed
        # Creating full link to the results on the ES server
        log.info(f'The Result can be found at : {full_results.results_link()}')
    def test_pvc_snapshot_performance(self, pvc_size):
        """
        1. Run I/O on a pod file
        2. Calculate md5sum of the file
        3. Take a snapshot of the PVC
        4. Measure the total snapshot creation time and the CSI snapshot creation time
        5. Restore from the snapshot and measure the time
        6. Attach a new pod to it
        7. Verify that the file is present on the new pod also
        8. Verify that the md5sum of the file on the new pod matches
           with the md5sum of the file on the original pod

        This scenario runs 3 times and reports the average results of the 3 runs,
        sending them to the ES
        Args:
            pvc_size: the size of the PVC to be tested - parametrize

        """

        # Getting the total Storage capacity
        ceph_cluster = CephCluster()
        ceph_capacity = ceph_cluster.get_ceph_capacity()

        log.info(f"Total capacity size is : {ceph_capacity}")
        log.info(f"PVC Size is : {pvc_size}")
        log.info(f"Needed capacity is {int(int(pvc_size) * 5)}")
        if int(ceph_capacity) < int(pvc_size) * 5:
            log.error(
                f"PVC size is {pvc_size}GiB and it is too large for this system"
                f" which have only {ceph_capacity}GiB")
            return
        # Calculating the file size as 25% of the PVC size
        # in the end the PVC will be 75% full
        filesize = self.pvc_obj.size * 0.25
        # Change the file size to MB and from int to str
        file_size = f"{int(filesize * 1024)}M"

        all_results = []

        self.results_path = get_full_test_logs_path(cname=self)
        log.info(f"Logs file path name is : {self.full_log_path}")

        # Produce ES report
        # Collecting environment information
        self.get_env_info()

        # Initialize the results doc file.
        self.full_results = self.init_full_results(
            ResultsAnalyse(
                self.uuid,
                self.crd_data,
                self.full_log_path,
                "pvc_snapshot_perf",
            ))
        self.full_results.add_key("pvc_size", pvc_size + " GiB")
        self.full_results.add_key("interface", self.sc)
        self.full_results.all_results["creation_time"] = []
        self.full_results.all_results["csi_creation_time"] = []
        self.full_results.all_results["creation_speed"] = []
        self.full_results.all_results["restore_time"] = []
        self.full_results.all_results["restore_speed"] = []
        self.full_results.all_results["restore_csi_time"] = []
        for test_num in range(self.tests_numbers):
            test_results = {
                "test_num": test_num + 1,
                "dataset": (test_num + 1) * filesize * 1024,  # size in MiB
                "create": {
                    "time": None,
                    "csi_time": None,
                    "speed": None
                },
                "restore": {
                    "time": None,
                    "speed": None
                },
            }
            log.info(f"Starting test phase number {test_num}")
            # Step 1. Run I/O on a pod file.
            file_name = f"{self.pod_object.name}-{test_num}"
            log.info(f"Starting IO on the POD {self.pod_object.name}")
            # Going to run only write IO to fill the PVC for the snapshot
            self.pod_object.fillup_fs(size=file_size, fio_filename=file_name)

            # Wait for fio to finish
            fio_result = self.pod_object.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"IO error on pod {self.pod_object.name}. FIO result: {fio_result}"
            log.info("IO on the PVC Finished")

            # Verify presence of the file
            file_path = pod.get_file_path(self.pod_object, file_name)
            log.info(f"Actual file path on the pod {file_path}")
            assert pod.check_file_existence(
                self.pod_object, file_path), f"File {file_name} doesn't exist"
            log.info(f"File {file_name} exists in {self.pod_object.name}")

            # Step 2. Calculate md5sum of the file.
            orig_md5_sum = pod.cal_md5sum(self.pod_object, file_name)

            # Step 3. Take a snapshot of the PVC and measure the time of creation.
            snap_name = self.pvc_obj.name.replace("pvc-test",
                                                  f"snapshot-test{test_num}")
            log.info(f"Taking snapshot of the PVC {snap_name}")

            start_time = datetime.datetime.utcnow().strftime(
                "%Y-%m-%dT%H:%M:%SZ")

            test_results["create"]["time"] = self.measure_create_snapshot_time(
                pvc_name=self.pvc_obj.name,
                snap_name=snap_name,
                namespace=self.pod_object.namespace,
                interface=self.interface,
                start_time=start_time,
            )

            test_results["create"][
                "csi_time"] = performance_lib.measure_csi_snapshot_creation_time(
                    interface=self.interface,
                    snapshot_id=self.snap_uid,
                    start_time=start_time,
                )

            test_results["create"]["speed"] = int(
                test_results["dataset"] / test_results["create"]["time"])
            log.info(
                f' Test {test_num} dataset is {test_results["dataset"]} MiB')
            log.info(
                f"Snapshot name {snap_name} and id {self.snap_uid} creation time is"
                f' : {test_results["create"]["time"]} sec.')
            log.info(
                f"Snapshot name {snap_name} and id {self.snap_uid} csi creation time is"
                f' : {test_results["create"]["csi_time"]} sec.')
            log.info(
                f'Snapshot speed is : {test_results["create"]["speed"]} MB/sec'
            )

            # Step 4. Restore the PVC from the snapshot and measure the time
            # Same Storage class of the original PVC
            sc_name = self.pvc_obj.backed_sc

            # Size should be same as of the original PVC
            pvc_size = str(self.pvc_obj.size) + "Gi"

            # Create pvc out of the snapshot
            # Both, the snapshot and the restore PVC should be in same namespace

            log.info("Restoring from the Snapshot")
            restore_pvc_name = self.pvc_obj.name.replace(
                "pvc-test", f"restore-pvc{test_num}")
            restore_pvc_yaml = constants.CSI_RBD_PVC_RESTORE_YAML
            if self.interface == constants.CEPHFILESYSTEM:
                restore_pvc_yaml = constants.CSI_CEPHFS_PVC_RESTORE_YAML

            csi_start_time = self.get_time("csi")
            log.info("Restoring the PVC from Snapshot")
            restore_pvc_obj = pvc.create_restore_pvc(
                sc_name=sc_name,
                snap_name=self.snap_obj.name,
                namespace=self.snap_obj.namespace,
                size=pvc_size,
                pvc_name=restore_pvc_name,
                restore_pvc_yaml=restore_pvc_yaml,
            )
            helpers.wait_for_resource_state(
                restore_pvc_obj,
                constants.STATUS_BOUND,
                timeout=3600  # setting this to 60 Min.
                # since the restore can take a long time, and we want it to finish.
            )
            restore_pvc_obj.reload()
            log.info("PVC was restored from the snapshot")
            test_results["restore"][
                "time"] = helpers.measure_pvc_creation_time(
                    self.interface, restore_pvc_obj.name)

            test_results["restore"]["speed"] = int(
                test_results["dataset"] / test_results["restore"]["time"])
            log.info(
                f'Snapshot restore time is : {test_results["restore"]["time"]}'
            )
            log.info(
                f'restore speed is : {test_results["restore"]["speed"]} MB/sec'
            )

            test_results["restore"][
                "csi_time"] = performance_lib.csi_pvc_time_measure(
                    self.interface, restore_pvc_obj, "create", csi_start_time)
            log.info(
                f'Snapshot csi restore time is : {test_results["restore"]["csi_time"]}'
            )

            # Step 5. Attach a new pod to the restored PVC
            restore_pod_object = helpers.create_pod(
                interface_type=self.interface,
                pvc_name=restore_pvc_obj.name,
                namespace=self.snap_obj.namespace,
            )

            # Confirm that the pod is running
            helpers.wait_for_resource_state(resource=restore_pod_object,
                                            state=constants.STATUS_RUNNING)
            restore_pod_object.reload()

            # Step 6. Verify that the file is present on the new pod also.
            log.info(f"Checking the existence of {file_name} "
                     f"on restore pod {restore_pod_object.name}")
            assert pod.check_file_existence(
                restore_pod_object,
                file_path), f"File {file_name} doesn't exist"
            log.info(f"File {file_name} exists in {restore_pod_object.name}")

            # Step 7. Verify that the md5sum matches
            log.info(
                f"Verifying that md5sum of {file_name} "
                f"on pod {self.pod_object.name} matches with md5sum "
                f"of the same file on restore pod {restore_pod_object.name}")
            assert pod.verify_data_integrity(
                restore_pod_object, file_name,
                orig_md5_sum), "Data integrity check failed"
            log.info("Data integrity check passed, md5sum are same")

            restore_pod_object.delete()
            restore_pvc_obj.delete()

            all_results.append(test_results)

        # clean up the environment
        self.pod_object.delete()
        self.pvc_obj.delete()
        self.delete_test_project()

        # logging the test summary, all info in one place for easy log reading
        c_speed, c_runtime, c_csi_runtime, r_speed, r_runtime, r_csi_runtime = (
            0 for i in range(6))

        log.info("Test summary :")
        for tst in all_results:
            c_speed += tst["create"]["speed"]
            c_runtime += tst["create"]["time"]
            c_csi_runtime += tst["create"]["csi_time"]
            r_speed += tst["restore"]["speed"]
            r_runtime += tst["restore"]["time"]
            r_csi_runtime += tst["restore"]["csi_time"]

            self.full_results.all_results["creation_time"].append(
                tst["create"]["time"])
            self.full_results.all_results["csi_creation_time"].append(
                tst["create"]["csi_time"])
            self.full_results.all_results["creation_speed"].append(
                tst["create"]["speed"])
            self.full_results.all_results["restore_time"].append(
                tst["restore"]["time"])
            self.full_results.all_results["restore_speed"].append(
                tst["restore"]["speed"])
            self.full_results.all_results["restore_csi_time"].append(
                tst["restore"]["csi_time"])
            self.full_results.all_results["dataset_inMiB"] = tst["dataset"]
            log.info(
                f"Test {tst['test_num']} results : dataset is {tst['dataset']} MiB. "
                f"Take snapshot time is {tst['create']['time']} "
                f"at {tst['create']['speed']} MiB/Sec "
                f"Restore from snapshot time is {tst['restore']['time']} "
                f"at {tst['restore']['speed']} MiB/Sec ")

        avg_snap_c_time = c_runtime / self.tests_numbers
        avg_snap_csi_c_time = c_csi_runtime / self.tests_numbers
        avg_snap_c_speed = c_speed / self.tests_numbers
        avg_snap_r_time = r_runtime / self.tests_numbers
        avg_snap_r_speed = r_speed / self.tests_numbers
        avg_snap_r_csi_time = r_csi_runtime / self.tests_numbers
        log.info(f" Average snapshot creation time is {avg_snap_c_time} sec.")
        log.info(
            f" Average csi snapshot creation time is {avg_snap_csi_c_time} sec."
        )
        log.info(
            f" Average snapshot creation speed is {avg_snap_c_speed} MiB/sec")
        log.info(f" Average snapshot restore time is {avg_snap_r_time} sec.")
        log.info(
            f" Average snapshot restore speed is {avg_snap_r_speed} MiB/sec")
        log.info(
            f" Average snapshot restore csi time is {avg_snap_r_csi_time} sec."
        )

        self.full_results.add_key("avg_snap_creation_time_insecs",
                                  avg_snap_c_time)
        self.full_results.add_key("avg_snap_csi_creation_time_insecs",
                                  avg_snap_csi_c_time)
        self.full_results.add_key("avg_snap_creation_speed", avg_snap_c_speed)
        self.full_results.add_key("avg_snap_restore_time_insecs",
                                  avg_snap_r_time)
        self.full_results.add_key("avg_snap_restore_speed", avg_snap_r_speed)
        self.full_results.add_key("avg_snap_restore_csi_time_insecs",
                                  avg_snap_r_csi_time)

        # Write the test results into the ES server
        log.info("writing results to elastic search server")
        if self.full_results.es_write():
            res_link = self.full_results.results_link()

            # write the ES link to the test results in the test log.
            log.info(f"The result can be found at : {res_link}")

            self.write_result_to_file(res_link)
class TestFullClusterHealth(PASTest):
    """
    Test cluster health when storage is ~85% full
    """

    @pytest.fixture(autouse=True)
    def setup(self, request, nodes):
        """
        Setting up test parameters
        """

        def teardown():
            logger.info("cleanup the environment")
            nodes.restart_nodes_by_stop_and_start_teardown()

        request.addfinalizer(teardown)

        logger.info("Starting the test setup")
        self.percent_to_fill = 85.0
        self.ceph_cluster = CephCluster()
        self.nodes = None

        self.benchmark_name = "FIO"
        self.client_pod_name = "fio-client"

        self.sanity_helpers = sanity_helpers.Sanity()

        super(TestFullClusterHealth, self).setup()
        # deploy the benchmark-operator
        self.deploy_benchmark_operator()

    def run(self):
        """

        Run the test and wait until it finishes
        """

        self.deploy_and_wait_for_wl_to_start(timeout=900)
        self.wait_for_wl_to_finish(sleep=300)

        if "Fio failed to execute" not in self.test_logs:
            logger.info("FIO has completed successfully")
        else:
            logger.warning("FIO failed to complete")

    def calculate_crd_data(self):
        """
        Getting the storage capacity and calculate pod count and pvc size

        """

        ceph_used_capacity_percent = get_percent_used_capacity()
        logger.info(f"Ceph used capacity percent is {ceph_used_capacity_percent}%")

        ceph_capacity = self.ceph_cluster.get_ceph_capacity()
        logger.info(f"Total storage capacity is {ceph_capacity} GiB")

        self.percent_to_fill = self.percent_to_fill - ceph_used_capacity_percent
        logger.info(f"Percentage to fill is {self.percent_to_fill}%")

        self.total_data_set = int(ceph_capacity * (int(self.percent_to_fill) / 100))
        self.filesize = int(
            self.crd_data["spec"]["workload"]["args"]["filesize"].replace("GiB", "")
        )

        # Make sure that filesize>=10 and servers<=60
        self.servers = 60
        self.filesize = int(self.total_data_set / self.servers)
        if self.filesize < 10:
            self.filesize = 10
            self.servers = int(self.total_data_set / self.filesize)

        self.crd_data["spec"]["workload"]["args"]["filesize"] = f"{self.filesize}GiB"
        self.crd_data["spec"]["workload"]["args"][
            "storagesize"
        ] = f"{int(self.total_data_set)}Gi"
        self.crd_data["spec"]["workload"]["args"]["servers"] = self.servers
        self.crd_data["spec"]["workload"]["args"]["bs"] = "1024KiB"
        self.crd_data["spec"]["workload"]["args"]["jobs"] = ["write", "read"]
        self.crd_data["spec"]["workload"]["args"]["iodepth"] = 1

    def delete_pods(self):
        """
        Try to delete pods:
            - Rook operator
            - OSD
            - MGR
            - MON
        """
        pod_list = []
        rook_operator_pod = pod.get_ocs_operator_pod(
            ocs_label=constants.OPERATOR_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        pod_list.append(rook_operator_pod)

        osd_pods = pod.get_osd_pods()
        pod_list.extend(osd_pods)

        mgr_pods = pod.get_mgr_pods()
        pod_list.extend(mgr_pods)

        mon_pods = pod.get_mon_pods()
        pod_list.extend(mon_pods)

        logger.info(f"Deleting pods: {[p.name for p in pod_list]}")
        pod.delete_pods(pod_objs=pod_list)

    def ceph_not_health_error(self):
        """
        Check if Ceph is NOT in "HEALTH_ERR" state
        Warning state is ok since the cluster is low in storage space

        Returns:
            bool: True if Ceph state is NOT "HEALTH_ERR"
        """
        ceph_status = self.ceph_cluster.get_ceph_health()
        logger.info(f"Ceph status is: {ceph_status}")
        return ceph_status != "HEALTH_ERR"

    def mgr_pod_node_restart(self):
        """
        Restart node that runs mgr pod
        """
        mgr_pod_obj = pod.get_mgr_pods()
        mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

        self.nodes.restart_nodes([mgr_node_obj])

        wait_for_nodes_status()

        # Check for Ceph pods
        pod_obj = ocp.OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        assert pod_obj.wait_for_resource(
            condition="Running", selector="app=rook-ceph-mgr", timeout=600
        )
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-mon",
            resource_count=3,
            timeout=600,
        )
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-osd",
            resource_count=3,
            timeout=600,
        )

    def restart_ocs_operator_node(self):
        """
        Restart node that runs OCS operator pod
        """

        pod_obj = pod.get_ocs_operator_pod()
        node_obj = pod.get_pod_node(pod_obj)

        self.nodes.restart_nodes([node_obj])

        wait_for_nodes_status()

        pod.wait_for_pods_to_be_running(
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, pod_names=[pod_obj.name]
        )

    def is_cluster_healthy(self):
        """
        Wrapper function for cluster health check

        Returns:
            bool: True if ALL checks passed, False otherwise
        """
        return self.ceph_not_health_error() and pod.wait_for_pods_to_be_running()

    @system_test
    @polarion_id("OCS-2749")
    def test_full_cluster_health(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Verify that the cluster health is ok when the storage is ~85% full

        Steps:
          1. Deploy benchmark operator and run fio workload
          2. Check Ceph health before/after each operation:
            2.1 Osd node reboot
            2.2 Mgr node reboot
            2.3 OCS operator node reboot
            2.4 Delete Rook, OSD, MGR & MON pods
            2.5 Creation and deletion of resources

        """
        self.nodes = nodes

        self.full_log_path = get_full_test_logs_path(cname=self)
        logger.info(f"Logs file path name is : {self.full_log_path}")

        logger.info("Create resource file for fio workload")
        self.crd_data = templating.load_yaml(constants.FIO_CR_YAML)
        self.calculate_crd_data()

        self.set_storageclass(interface=constants.CEPHBLOCKPOOL)

        self.run()

        logger.info("Checking health before disruptive operations")
        assert self.is_cluster_healthy(), "Cluster is not healthy"

        osd_node_reboot()
        logger.info("Checking health after OSD node reboot")
        assert self.is_cluster_healthy(), "Cluster is not healthy"

        self.mgr_pod_node_restart()
        logger.info("Checking health after worker node shutdown")
        assert self.is_cluster_healthy(), "Cluster is not healthy"

        self.restart_ocs_operator_node()
        logger.info("Checking health after OCS operator node restart")
        assert self.is_cluster_healthy(), "Cluster is not healthy"

        self.delete_pods()
        logger.info("Checking health after Rook, OSD, MGR & MON pods deletion")
        assert self.is_cluster_healthy(), "Cluster is not healthy"

        # Create resources
        logger.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        logger.info("Resources Created")

        # Delete resources
        logger.info("Deleting resources")
        self.sanity_helpers.delete_resources()
        logger.info("Resources Deleted")

        logger.info(
            "Checking health after resources creation and deletion using sanity helpers"
        )
        assert self.is_cluster_healthy(), "Cluster is not healthy"
    def test_pvc_snapshot_performance(self, teardown_factory, pvc_size):
        """
        1. Run I/O on a pod file.
        2. Calculate md5sum of the file.
        3. Take a snapshot of the PVC and measure the time of creation.
        4. Restore From the snapshot and measure the time
        5. Attach a new pod to it.
        6. Verify that the file is present on the new pod also.
        7. Verify that the md5sum of the file on the new pod matches
           with the md5sum of the file on the original pod.

        This scenario runs 3 times and reports all results
        Args:
            teardown_factory: A fixture to destroy objects
            pvc_size: the size of the PVC to be tested - parametrize

        """

        # Getting the total Storage capacity
        ceph_cluster = CephCluster()
        ceph_capacity = ceph_cluster.get_ceph_capacity()

        log.info(f"Total capacity size is : {ceph_capacity}")
        log.info(f"PVC Size is : {pvc_size}")
        log.info(f"Needed capacity is {int(int(pvc_size) * 5)}")
        if int(ceph_capacity) < int(pvc_size) * 5:
            log.error(
                f"PVC size is {pvc_size}GiB and it is too large for this system"
                f" which have only {ceph_capacity}GiB")
            return
        # Calculating the file size as 25% of the PVC size
        # in the end the PVC will be 75% full
        filesize = self.pvc_obj.size * 0.25
        # Change the file size to MB and from int to str
        file_size = f"{int(filesize * 1024)}M"

        all_results = []

        for test_num in range(self.tests_numbers):
            test_results = {
                "test_num": test_num + 1,
                "dataset": (test_num + 1) * filesize * 1024,  # size in MiB
                "create": {
                    "time": None,
                    "speed": None
                },
                "restore": {
                    "time": None,
                    "speed": None
                },
            }
            log.info(f"Starting test phase number {test_num}")
            # Step 1. Run I/O on a pod file.
            file_name = f"{self.pod_obj.name}-{test_num}"
            log.info(f"Starting IO on the POD {self.pod_obj.name}")
            # Going to run only write IO to fill the PVC for the snapshot
            self.pod_obj.fillup_fs(size=file_size, fio_filename=file_name)

            # Wait for fio to finish
            fio_result = self.pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"IO error on pod {self.pod_obj.name}. FIO result: {fio_result}"
            log.info("IO on the PVC Finished")

            # Verify presence of the file
            file_path = pod.get_file_path(self.pod_obj, file_name)
            log.info(f"Actual file path on the pod {file_path}")
            assert pod.check_file_existence(
                self.pod_obj, file_path), f"File {file_name} doesn't exist"
            log.info(f"File {file_name} exists in {self.pod_obj.name}")

            # Step 2. Calculate md5sum of the file.
            orig_md5_sum = pod.cal_md5sum(self.pod_obj, file_name)

            # Step 3. Take a snapshot of the PVC and measure the time of creation.
            snap_name = self.pvc_obj.name.replace("pvc-test",
                                                  f"snapshot-test{test_num}")
            log.info(f"Taking snapshot of the PVC {snap_name}")

            test_results["create"]["time"] = self.measure_create_snapshot_time(
                pvc_name=self.pvc_obj.name,
                snap_name=snap_name,
                interface=self.interface,
            )
            test_results["create"]["speed"] = int(
                test_results["dataset"] / test_results["create"]["time"])
            log.info(
                f' Test {test_num} dataset is {test_results["dataset"]} MiB')
            log.info(
                f'Snapshot creation time is : {test_results["create"]["time"]} sec.'
            )
            log.info(
                f'Snapshot speed is : {test_results["create"]["speed"]} MiB/sec'
            )

            # Step 4. Restore the PVC from the snapshot and measure the time
            # Use the same storage class as the original PVC
            sc_name = self.pvc_obj.backed_sc

            # The size should be the same as the original PVC's
            pvc_size = str(self.pvc_obj.size) + "Gi"

            # Create pvc out of the snapshot
            # Both the snapshot and the restored PVC should be in the same namespace

            log.info("Restoring from the Snapshot")
            restore_pvc_name = self.pvc_obj.name.replace(
                "pvc-test", f"restore-pvc{test_num}")
            restore_pvc_yaml = constants.CSI_RBD_PVC_RESTORE_YAML
            if self.interface == constants.CEPHFILESYSTEM:
                restore_pvc_yaml = constants.CSI_CEPHFS_PVC_RESTORE_YAML

            log.info("Resorting the PVC from Snapshot")
            restore_pvc_obj = pvc.create_restore_pvc(
                sc_name=sc_name,
                snap_name=self.snap_obj.name,
                namespace=self.snap_obj.namespace,
                size=pvc_size,
                pvc_name=restore_pvc_name,
                restore_pvc_yaml=restore_pvc_yaml,
            )
            helpers.wait_for_resource_state(
                restore_pvc_obj,
                constants.STATUS_BOUND,
                timeout=3600  # setting this to 60 min.
                # since the restore can take a long time, and we want it to finish
            )
            teardown_factory(restore_pvc_obj)
            restore_pvc_obj.reload()
            log.info("PVC was restored from the snapshot")
            test_results["restore"][
                "time"] = helpers.measure_pvc_creation_time(
                    self.interface, restore_pvc_obj.name)
            test_results["restore"]["speed"] = int(
                test_results["dataset"] / test_results["restore"]["time"])
            log.info(
                f'Snapshot restore time is : {test_results["restore"]["time"]} sec.'
            )
            log.info(
                f'Restore speed is : {test_results["restore"]["speed"]} MiB/sec')

            # Step 5. Attach a new pod to the restored PVC
            restore_pod_obj = helpers.create_pod(
                interface_type=self.interface,
                pvc_name=restore_pvc_obj.name,
                namespace=self.snap_obj.namespace,
                pod_dict_path=constants.NGINX_POD_YAML,
            )

            # Confirm that the pod is running
            helpers.wait_for_resource_state(resource=restore_pod_obj,
                                            state=constants.STATUS_RUNNING)
            teardown_factory(restore_pod_obj)
            restore_pod_obj.reload()

            # Step 6. Verify that the file is present on the new pod also.
            log.info(f"Checking the existence of {file_name} "
                     f"on restore pod {restore_pod_obj.name}")
            assert pod.check_file_existence(
                restore_pod_obj, file_path), f"File {file_name} doesn't exist"
            log.info(f"File {file_name} exists in {restore_pod_obj.name}")

            # Step 7. Verify that the md5sum matches
            log.info(f"Verifying that md5sum of {file_name} "
                     f"on pod {self.pod_obj.name} matches with md5sum "
                     f"of the same file on restore pod {restore_pod_obj.name}")
            assert pod.verify_data_integrity(
                restore_pod_obj, file_name,
                orig_md5_sum), "Data integrity check failed"
            log.info("Data integrity check passed, md5sum are same")

            all_results.append(test_results)

        # Log the test summary - all info in one place for easy log reading
        c_speed, c_runtime, r_speed, r_runtime = (0 for i in range(4))
        log.info("Test summary:")
        for tst in all_results:
            c_speed += tst["create"]["speed"]
            c_runtime += tst["create"]["time"]
            r_speed += tst["restore"]["speed"]
            r_runtime += tst["restore"]["time"]
            log.info(
                f"Test {tst['test_num']} results : dataset is {tst['dataset']} MiB. "
                f"Snapshot creation time is {tst['create']['time']} sec "
                f"at {tst['create']['speed']} MiB/sec. "
                f"Restore from snapshot time is {tst['restore']['time']} sec "
                f"at {tst['restore']['speed']} MiB/sec."
            )
        log.info(
            f" Average snapshot creation time is {c_runtime / self.tests_numbers} sec."
        )
        log.info(
            f" Average snapshot creation speed is {c_speed / self.tests_numbers} MiB/sec"
        )
        log.info(
            f" Average snapshot restore time is {r_runtime / self.tests_numbers} sec."
        )
        log.info(
            f" Average snapshot restore speed is {r_speed / self.tests_numbers} MiB/sec"
        )

    def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern):
        """
        This is a basic fio perf test

        """

        # Deploy the ripsaw operator
        log.info("Deploying ripsaw operator")
        ripsaw.apply_crd("resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml")
        if interface == "CephBlockPool":
            sc = constants.CEPHBLOCKPOOL_SC
        else:
            sc = constants.CEPHFILESYSTEM_SC

        # Create fio benchmark
        log.info("Create resource file for fio workload")
        fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

        # Save the original elastic-search server and port - if defined in the yaml
        if "elasticsearch" in fio_cr["spec"]:
            backup_es = fio_cr["spec"]["elasticsearch"]
        else:
            log.warning(
                "Elastic Search information does not exist in the YAML file")
            backup_es = {}
            fio_cr["spec"]["elasticsearch"] = {}

        # Use the internally deployed elastic-search server in the test - if it exists
        if es:
            fio_cr["spec"]["elasticsearch"] = {
                "server": es.get_ip(),
                "port": es.get_port(),
            }

        # Setting the data set to 40% of the total storage capacity
        ceph_cluster = CephCluster()
        ceph_capacity = ceph_cluster.get_ceph_capacity()
        total_data_set = int(ceph_capacity * 0.4)
        filesize = int(fio_cr["spec"]["workload"]["args"]["filesize"].replace(
            "GiB", ""))
        # To make sure the number of App pods will not be more than 50, in case
        # of a large data set, change the size of the file each pod will work on
        if total_data_set > 500:
            filesize = int(ceph_capacity * 0.008)
            fio_cr["spec"]["workload"]["args"]["filesize"] = f"{filesize}GiB"
            # make sure that the storage size is larger than the file size
            fio_cr["spec"]["workload"]["args"][
                "storagesize"] = f"{int(filesize * 1.2)}Gi"
        fio_cr["spec"]["workload"]["args"]["servers"] = int(total_data_set /
                                                            filesize)
        log.info(f"Total Data set to work on is : {total_data_set} GiB")

        environment = get_environment_info()
        if not environment["user"] == "":
            fio_cr["spec"]["test_user"] = environment["user"]
        fio_cr["spec"]["clustername"] = environment["clustername"]

        log.debug(f"Environment information is : {environment}")

        fio_cr["spec"]["workload"]["args"]["storageclass"] = sc
        if io_pattern == "sequential":
            fio_cr["spec"]["workload"]["args"]["jobs"] = ["write", "read"]
            fio_cr["spec"]["workload"]["args"]["iodepth"] = 1
        log.info(f"The FIO CR file is {fio_cr}")
        fio_cr_obj = OCS(**fio_cr)
        fio_cr_obj.create()

        # Wait for fio client pod to be created
        for fio_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern,
                                      "fio-client",
                                      constants.RIPSAW_NAMESPACE):
            try:
                if fio_pod[0] is not None:
                    fio_client_pod = fio_pod[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        # Getting the start time of the test
        start_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())

        # Getting the UUID from inside the benchmark pod
        uuid = ripsaw.get_uuid(fio_client_pod)
        # Setting back the original elastic-search information
        fio_cr["spec"]["elasticsearch"] = backup_es

        full_results = FIOResultsAnalyse(uuid, fio_cr)

        # Initialize the results doc file.
        for key in environment:
            full_results.add_key(key, environment[key])

        # Setting the global parameters of the test
        full_results.add_key("io_pattern", io_pattern)
        full_results.add_key("dataset", f"{total_data_set}GiB")
        full_results.add_key("file_size",
                             fio_cr["spec"]["workload"]["args"]["filesize"])
        full_results.add_key("servers",
                             fio_cr["spec"]["workload"]["args"]["servers"])
        full_results.add_key("samples",
                             fio_cr["spec"]["workload"]["args"]["samples"])
        full_results.add_key("operations",
                             fio_cr["spec"]["workload"]["args"]["jobs"])
        full_results.add_key("block_sizes",
                             fio_cr["spec"]["workload"]["args"]["bs"])
        full_results.add_key("io_depth",
                             fio_cr["spec"]["workload"]["args"]["iodepth"])
        full_results.add_key("jobs",
                             fio_cr["spec"]["workload"]["args"]["numjobs"])
        full_results.add_key(
            "runtime",
            {
                "read": fio_cr["spec"]["workload"]["args"]["read_runtime"],
                "write": fio_cr["spec"]["workload"]["args"]["write_runtime"],
            },
        )
        full_results.add_key(
            "storageclass", fio_cr["spec"]["workload"]["args"]["storageclass"])
        full_results.add_key("vol_size",
                             fio_cr["spec"]["workload"]["args"]["storagesize"])

        # Wait for the fio pod to initialize and complete
        log.info("Waiting for fio_client to complete")
        pod_obj = OCP(kind="pod")
        pod_obj.wait_for_resource(
            condition="Completed",
            resource_name=fio_client_pod,
            timeout=18000,
            sleep=300,
        )

        # Getting the end time of the test
        end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())
        full_results.add_key("test_time", {
            "start": start_time,
            "end": end_time
        })

        output = run_cmd(f"oc logs {fio_client_pod}")
        log.info(f"The Test log is : {output}")

        if "Fio failed to execute" not in output:
            log.info("FIO has completed successfully")
        else:
            log.warning("FIO failed to complete")

        # Clean up fio benchmark
        log.info("Deleting FIO benchmark")
        fio_cr_obj.delete()

        log.debug(f"Full results is : {full_results.results}")

        # If an internal ES exists, copy all its data to the main ES
        if es:
            log.info("Copy all data from Internal ES to Main ES")
            es._copy(full_results.es)
        # Add this sleep between the copy and the analysis of the results,
        # since sometimes the results of the read (just after the write) are empty
        time.sleep(30)
        full_results.analyze_results()  # Analyze the results
        # Writing the analyzed test results to the Elastic-Search server
        full_results.es_write()
        full_results.codespeed_push()  # Push results to codespeed
        # Creating full link to the results on the ES server
        log.info(f"The Result can be found at ; {full_results.results_link()}")

class TestSmallFileWorkloadScale(E2ETest):
    """
    Deploy benchmark operator and run different scale tests.
    Call common small files workload routine to run SmallFile workload
    """

    def setup(self):
        """
        Initialize the test environment

        """
        # Deploy an internal ES server - no need to keep the results,
        # so don't use the production ES
        self.es = ElasticSearch()

        # Initialize the SmallFiles workload, based on the benchmark-operator
        self.small_files = SmallFiles(self.es)

        self.ceph_cluster = CephCluster()

        # Get the total storage capacity
        self.ceph_capacity = self.ceph_cluster.get_ceph_capacity()
        log.info(f"Total storage capacity is {self.ceph_capacity:,.2f} GiB")

        # Collect the pools usage before the test starts
        self.orig_data = self.get_cephfs_data()

    def teardown(self):
        """
        Teardown the test environment

        """
        self.small_files.cleanup()
        self.es.cleanup()

    def get_cephfs_data(self):
        """
        Look through the ceph pools and find the space usage of all cephfilesystem pools

        Returns:
            Dictionary of byte usage, indexed by pool name.
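
            Example (illustrative pool names and byte counts):
                {"ocs-storagecluster-cephfilesystem-data0": 4294967296,
                 "ocs-storagecluster-cephfilesystem-metadata": 104857600}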
        """
        ceph_status = self.ceph_cluster.toolbox.exec_ceph_cmd(ceph_cmd="ceph df")
        ret_value = {}
        for pool in ceph_status["pools"]:
            # Only the cephfilesystem pools are of interest; the metadata pool
            # is filtered out later, where only the written data matters
            if "cephfilesystem" in pool["name"]:
                ret_value[pool["name"]] = pool["stats"]["bytes_used"]
        return ret_value

    def display_ceph_usage(self, msg, data):
        """
        Display the pool usage in a pretty way

        Args:
            msg (str): the message string to display with the values
            data (dict): dictionary of pools -> capacity (in bytes)

        """
        log.info(f"The pools usage {msg} is :")
        for entry in data:
            log.info(f"{entry} now uses {data[entry]:,} bytes")

    @pytest.mark.parametrize(
        argnames=["file_size", "files", "threads", "interface"],
        argvalues=[
            # 500K Files, ~4GB
            pytest.param(*[8, 125000, 4, constants.CEPHFILESYSTEM]),
            # 5M Files, ~152GB
            pytest.param(*[32, 1250000, 4, constants.CEPHFILESYSTEM]),
        ],
    )
    def test_scale_smallfile_workload(self, file_size, files, threads, interface):
        # updating the benchmark parameters
        self.small_files.setup_storageclass(interface)
        self.small_files.setup_test_params(file_size, files, threads, 1)

        # Verify we have enough storage capacity to run the test.
        self.small_files.setup_vol_size(file_size, files, threads, self.ceph_capacity)

        # Run the benchmark to create files on the volume
        self.small_files.setup_operations("create")
        self.small_files.run()

        # Collect pools usage after creation is done.
        self.run_data = self.get_cephfs_data()

        # Delete the benchmark data
        self.small_files.delete()

        # Getting the usage capacity immediately after deletion
        self.now_data = self.get_cephfs_data()

        # Wait 3 minutes for the backend deletion to actually start.
        time.sleep(180)

        # Query the storage usage every 2 min.; if there is no difference between
        # two samples, the backend cleanup is done.
        still_going_down = True
        while still_going_down:
            log.info("Waiting for Ceph to finish cleaning up")
            time.sleep(120)
            self.new_data = self.get_cephfs_data()
            still_going_down = False
            for entry in self.new_data:
                if self.new_data[entry] < self.now_data[entry]:
                    still_going_down = True
                    self.now_data[entry] = self.new_data[entry]

        self.display_ceph_usage("Before ths test", self.orig_data)
        self.display_ceph_usage("After data creation", self.run_data)

        # Make sure that the test actually wrote at least 1 GiB of data
        # to the volume.
        for entry in self.run_data:
            if re.search("metadata", entry):
                # Since we are interested in the data written and not in the
                # metadata, skip the metadata pool
                continue
            written = self.run_data[entry] - self.orig_data[entry]
            check = written > constants.GB
            errmsg = (
                f"{written:,.2f} bytes was written to {entry} -"
                "This is not enough for the test"
            )
            assert check, errmsg

        self.display_ceph_usage("After data deletion", self.now_data)

        for entry in self.now_data:
            # A leak is indicated if over 20% more storage is used and more than 3 GiB.
            try:
                ratio = self.now_data[entry] / self.orig_data[entry]
            except ZeroDivisionError:
                ratio = self.now_data[entry]

            added_data = (self.now_data[entry] - self.orig_data[entry]) / constants.GB
            # in some cases (especially for metadata), it might be that after the
            # test there is less data in the pool than before the test.
            if added_data < 0:
                added_data = 0
                ratio = 1

            log.info(
                "The ratio between capacity before and after the test "
                f"on {entry} is : {ratio:.2f} ; {added_data:,.2f} GiB"
            )

            check = (ratio < 1.20) or (added_data < 3)
            errmsg = f"{entry} is over 20% (or 3 GiB) larger [{ratio} ; {added_data}]-- possible leak"
            assert check, errmsg
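
A minimal standalone sketch of the leak heuristic used at the end of the test above. The 20% ratio and 3 GiB thresholds come from the test itself; the GB constant and the byte counts in the usage examples are illustrative assumptions:

# Illustrative, standalone version of the leak check performed at the end of the test.
GB = 1024 ** 3  # assumption: stands in for constants.GB (bytes per GiB)


def possible_leak(before_bytes, after_bytes):
    """Return True when pool usage grew by more than 20% AND by more than 3 GiB."""
    try:
        ratio = after_bytes / before_bytes
    except ZeroDivisionError:
        ratio = after_bytes
    added_gib = (after_bytes - before_bytes) / GB
    if added_gib < 0:
        # less data after the test than before - clearly not a leak
        added_gib = 0
        ratio = 1
    return ratio >= 1.20 and added_gib >= 3


# 100 GiB -> 130 GiB : +30% and +30 GiB, flagged as a possible leak
assert possible_leak(100 * GB, 130 * GB)
# 100 GiB -> 102 GiB : only +2 GiB added, not flagged
assert not possible_leak(100 * GB, 102 * GB)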
Exemple #13
0
    def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern):
        """
        This is a basic fio perf test
        """
        # Deploy the ripsaw operator
        log.info("Deploying ripsaw operator")
        ripsaw.apply_crd('resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml')
        sc = 'ocs-storagecluster-ceph-rbd' if interface == 'CephBlockPool' else 'ocs-storagecluster-cephfs'

        # Create fio benchmark
        log.info("Create resource file for fio workload")
        fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

        # Save the original elastic-search server and port - if defined in the yaml
        es_server = ""
        es_port = ""
        if 'elasticsearch' in fio_cr['spec']:
            if 'server' in fio_cr['spec']['elasticsearch']:
                es_server = fio_cr['spec']['elasticsearch']['server']
            if 'port' in fio_cr['spec']['elasticsearch']:
                es_port = fio_cr['spec']['elasticsearch']['port']
        else:
            fio_cr['spec']['elasticsearch'] = {}

        # Use the internally deployed elastic-search server in the test
        fio_cr['spec']['elasticsearch'] = {
            'server': es.get_ip(),
            'port': es.get_port()
        }

        # Setting the data set to 40% of the total storage capacity but
        # not more than 600GiB
        ceph_cluster = CephCluster()
        total_data_set = int(ceph_cluster.get_ceph_capacity() * 0.4)
        filesize = int(fio_cr['spec']['workload']['args']['filesize'].replace(
            'GiB', ''))
        # To make sure the number of App pods will not be more than 50, in case
        # of a large data set, change the size of the file each pod will work on
        if total_data_set > 500:
            filesize = int(ceph_cluster.get_ceph_capacity() * 0.008)
            fio_cr['spec']['workload']['args']['filesize'] = f'{filesize}GiB'
            # make sure that the storage size is larger than the file size
            fio_cr['spec']['workload']['args'][
                'storagesize'] = f'{int(filesize * 1.2)}Gi'
        fio_cr['spec']['workload']['args']['servers'] = int(total_data_set /
                                                            filesize)
        log.info(f'Total Data set to work on is : {total_data_set} GiB')

        fio_cr['spec']['clustername'] = config.ENV_DATA[
            'platform'] + get_build() + get_ocs_version()
        fio_cr['spec']['test_user'] = get_ocs_version(
        ) + interface + io_pattern
        fio_cr['spec']['workload']['args']['storageclass'] = sc
        if io_pattern == 'sequential':
            fio_cr['spec']['workload']['args']['jobs'] = ['write', 'read']
        log.info(f'fio_cr: {fio_cr}')
        fio_cr_obj = OCS(**fio_cr)
        fio_cr_obj.create()

        # Wait for fio client pod to be created
        for fio_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern,
                                      'fio-client',
                                      constants.RIPSAW_NAMESPACE):
            try:
                if fio_pod[0] is not None:
                    fio_client_pod = fio_pod[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        # Wait for the fio pod to initialize and complete
        log.info("Waiting for fio_client to complete")
        pod_obj = OCP(kind='pod')
        pod_obj.wait_for_resource(
            condition='Completed',
            resource_name=fio_client_pod,
            timeout=18000,
            sleep=300,
        )

        output = run_cmd(f'oc logs {fio_client_pod}')

        if 'Fio failed to execute' not in output:
            log.info("FIO has completed successfully")
        else:
            log.warning("FIO failed to complete")

        # Clean up fio benchmark
        log.info("Deleting FIO benchmark")
        fio_cr_obj.delete()

        # Setting back the original elastic-search information
        fio_cr['spec']['elasticsearch'] = {
            'server': es_server,
            'port': es_port
        }
        analyze_regression(io_pattern,
                           sc,
                           es_username=fio_cr['spec']['test_user'])