def setting_storage_usage(self):
    """
    Getting the storage capacity, calculating the usage of the storage
    and setting the workload CR file parameters.
    """
    ceph_cluster = CephCluster()
    ceph_capacity = ceph_cluster.get_ceph_capacity()
    log.info(f"Total storage capacity is {ceph_capacity} GiB")
    self.total_data_set = int(ceph_capacity * 0.4)
    self.filesize = int(
        self.fio_cr["spec"]["workload"]["args"]["filesize"].replace(
            "GiB", ""))
    # To make sure the number of App pods will not be more than 50, in case
    # of a large data set, change the size of the file each pod will work on
    if self.total_data_set > 500:
        self.filesize = int(ceph_capacity * 0.008)
        self.fio_cr["spec"]["workload"]["args"][
            "filesize"] = f"{self.filesize}GiB"
    # make sure that the storage size is larger than the file size
    self.fio_cr["spec"]["workload"]["args"][
        "storagesize"] = f"{int(self.filesize * 1.2)}Gi"
    self.fio_cr["spec"]["workload"]["args"]["servers"] = int(
        self.total_data_set / self.filesize)
    log.info(f"Total Data set to work on is : {self.total_data_set} GiB")

def setting_storage_usage(self):
    """
    Getting the storage capacity, calculating the usage of the storage
    and setting the workload CR file parameters.
    """
    # for development mode - use parameters for a short test run
    if self.dev_mode:
        log.info("Setting up parameters for development mode")
        self.crd_data["spec"]["workload"]["args"]["filesize"] = "1GiB"
        self.crd_data["spec"]["workload"]["args"]["storagesize"] = "5Gi"
        self.crd_data["spec"]["workload"]["args"]["servers"] = 2
        self.crd_data["spec"]["workload"]["args"]["samples"] = 2
        self.crd_data["spec"]["workload"]["args"]["read_runtime"] = 30
        self.crd_data["spec"]["workload"]["args"]["write_runtime"] = 30
        self.crd_data["spec"]["workload"]["args"]["bs"] = ["64KiB"]
        self.total_data_set = 20
        self.filesize = 3
        return

    ceph_cluster = CephCluster()
    ceph_capacity = ceph_cluster.get_ceph_capacity()
    log.info(f"Total storage capacity is {ceph_capacity} GiB")
    self.total_data_set = int(ceph_capacity * 0.4)
    self.filesize = int(
        self.crd_data["spec"]["workload"]["args"]["filesize"].replace(
            "GiB", ""))
    # To make sure the number of App pods will not be more than 50, in case
    # of a large data set, change the size of the file each pod will work on
    if self.total_data_set > 500:
        self.filesize = int(ceph_capacity * 0.008)
        self.crd_data["spec"]["workload"]["args"][
            "filesize"] = f"{self.filesize}GiB"
    # make sure that the storage size is larger than the file size
    self.crd_data["spec"]["workload"]["args"][
        "storagesize"] = f"{int(self.filesize * 1.2)}Gi"
    self.crd_data["spec"]["workload"]["args"]["servers"] = int(
        self.total_data_set / self.filesize)
    log.info(f"Total Data set to work on is : {self.total_data_set} GiB")

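# The dataset-sizing arithmetic above is small enough to exercise in
# isolation. A minimal standalone sketch (the helper name and the sample
# capacity are illustrative, not from the repo):
def compute_fio_sizing(ceph_capacity_gib, default_filesize_gib=2):
    # 40% of the raw capacity becomes the data set
    total_data_set = int(ceph_capacity_gib * 0.4)
    filesize = default_filesize_gib
    # grow the per-pod file on large clusters so the pod count stays near 50
    if total_data_set > 500:
        filesize = int(ceph_capacity_gib * 0.008)
    return {
        "filesize": f"{filesize}GiB",
        # PVC padded 20% above the file it has to hold
        "storagesize": f"{int(filesize * 1.2)}Gi",
        # one server (pod) per file
        "servers": int(total_data_set / filesize),
    }

print(compute_fio_sizing(2000))
# {'filesize': '16GiB', 'storagesize': '19Gi', 'servers': 50}
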
def test_vdbench_workload(self, template, with_ocs, load, label_nodes,
                          ripsaw, servers, threads, blocksize, fileio,
                          samples, width, depth, files, file_size, runtime,
                          pause):
    """
    Run VDBench Workload

    Args:
        template (str) : Name of the yaml file that will be used as a template
        with_ocs (bool) : This parameter indicates if the test will run on
            the same nodes as the OCS
        load (int) : load to run on the storage, in percentage of the capacity.
        label_nodes (fixture) : This fixture labels the worker(s) that will
            be used for the App. pod(s)
        ripsaw (fixture) : Fixture to deploy the ripsaw benchmarking operator
        servers (int) : Number of servers (pods) that will run the IO
        threads (int) : Number of threads that will run on each server
        blocksize (list - str): List of BlockSize - must add the 'K' to it
        fileio (str) : How to select a file for the IO : random / sequential
        samples (int) : Number of time(s) to run each test
        width (int) : Width of the directory tree to create
        depth (int) : Depth of the directory tree to create
        files (int) : Number of files to create in each directory
        file_size (int) : File size (in MB) to create
        runtime (int) : Time (in sec.) for each test iteration
        pause (int) : Time (in min.) to pause between each test iteration.
    """
    log.info(f'going to use {template} as template')
    log.info("Apply Operator CRD")

    crd = 'resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml'
    ripsaw.apply_crd(crd)

    log.info('Running vdbench benchmark')
    if template:
        template = os.path.join(constants.TEMPLATE_VDBENCH_DIR, template)
    else:
        template = constants.VDBENCH_BENCHMARK_YAML
    sf_data = templating.load_yaml(template)

    target_results = template + 'Results'

    log.info('Calculating Storage size....')
    ceph_cluster = CephCluster()
    total_capacity = ceph_cluster.get_ceph_capacity()
    assert total_capacity > constants.VDBENCH_MIN_CAPACITY, (
        "Storage capacity is too low for performance testing")
    log.info(f'The Total usable capacity is {total_capacity}')

    if load:
        width = constants.VDBENCH_WIDTH
        depth = constants.VDBENCH_DEPTH
        file_size = constants.VDBENCH_FILE_SIZE
        capacity_per_pod = constants.VDBENCH_CAP_PER_POD
        total_dirs = width ** depth
        log.info(f'The total dirs in the tree {total_dirs}')
        log.info(f'Going to run with {load} % of the capacity load.')
        tested_capacity = round(total_capacity * 1024 * load / 100)
        log.info(f'Tested capacity is {tested_capacity} MB')
        servers = round(tested_capacity / capacity_per_pod)
        """
        To spread the application pods evenly on all workers or application
        nodes, with at least 2 app pods per node.
        """
        nodes = len(
            node.get_typed_nodes(node_type=constants.WORKER_MACHINE))
        if not with_ocs:
            nodes = len(
                machine.get_labeled_nodes(
                    f'node-role.kubernetes.io/app={constants.APP_NODE_LABEL}'
                ))
        log.info(f'Going to use {nodes} nodes for the test !')
        servers = round(servers / nodes) * nodes
        if servers < (nodes * 2):
            servers = nodes * 2

        files = round(tested_capacity / servers / total_dirs)
        total_files = round(files * servers * total_dirs)
        log.info(f'number of pods is {servers}')
        log.info(f'Going to create {total_files} files !')
        log.info(f'number of files in dir is {files}')

    """
    Setting up the parameters for this test
    """
    if servers:
        sf_data['spec']['workload']['args']['servers'] = servers
        target_results = target_results + '-' + str(servers)
    if threads:
        sf_data['spec']['workload']['args']['threads'] = threads
        target_results = target_results + '-' + str(threads)
    if fileio:
        sf_data['spec']['workload']['args']['fileio'] = fileio
        target_results = target_results + '-' + str(fileio)
    if samples:
        sf_data['spec']['workload']['args']['samples'] = samples
        target_results = target_results + '-' + str(samples)
    if width:
        sf_data['spec']['workload']['args']['width'] = width
        target_results = target_results + '-' + str(width)
    if depth:
        sf_data['spec']['workload']['args']['depth'] = depth
        target_results = target_results + '-' + str(depth)
    if files:
        sf_data['spec']['workload']['args']['files'] = files
        target_results = target_results + '-' + str(files)
    if file_size:
        sf_data['spec']['workload']['args']['file_size'] = file_size
        target_results = target_results + '-' + str(file_size)
    if runtime:
        sf_data['spec']['workload']['args']['runtime'] = runtime
        target_results = target_results + '-' + str(runtime)
    if pause:
        sf_data['spec']['workload']['args']['pause'] = pause
        target_results = target_results + '-' + str(pause)
    if len(blocksize) > 0:
        sf_data['spec']['workload']['args']['bs'] = blocksize
        target_results = target_results + '-' + '_'.join(blocksize)
    if with_ocs:
        if sf_data['spec']['workload']['args']['pin_server']:
            del sf_data['spec']['workload']['args']['pin_server']

    """
    Calculating the size of the volume that needs to be tested: it should
    be at least twice the total size of the files, and at least 100Gi.
    Since the file_size is in KB and the vol_size needs to be in GB, more
    calculation is needed.
    """
    vol_size = int((files * total_dirs) * file_size * 1.3)
    log.info('number of files to create : {}'.format(
        int(files * (width ** depth))))
    log.info(f'The size of all files is : {vol_size}MB')
    vol_size = int(vol_size / 1024)
    if vol_size < 100:
        vol_size = 100
    sf_data['spec']['workload']['args']['storagesize'] = f'{vol_size}Gi'

    log.debug(f'output of configuration file is {sf_data}')

    timeout = 86400  # 3600 (1H) * 24 (1D) = one day

    sf_obj = OCS(**sf_data)
    sf_obj.create()

    # wait for the benchmark pods to get created - takes a while
    for bench_pod in TimeoutSampler(300, 10, get_pod_name_by_pattern,
                                    'vdbench-client', 'my-ripsaw'):
        try:
            if bench_pod[0] is not None:
                vdbench_client_pod = bench_pod[0]
                break
        except IndexError:
            log.info('Benchmark client pod not ready yet')

    bench_pod = OCP(kind='pod', namespace='my-ripsaw')
    log.info('Waiting for VDBench benchmark to Run')
    assert bench_pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                       resource_name=vdbench_client_pod,
                                       sleep=30,
                                       timeout=600)

    start_time = time.time()
    while True:
        logs = bench_pod.exec_oc_cmd(f'logs {vdbench_client_pod}',
                                     out_yaml_format=False)
        if 'Test Run Finished' in logs:
            log.info('VdBench Benchmark Completed Successfully')
            break
        if timeout < (time.time() - start_time):
            raise TimeoutError(
                'Timed out waiting for benchmark to complete')
        time.sleep(30)

    # Getting the results file from the benchmark pod and putting it with
    # the test logs.
    # TODO: find the place of the actual test log and not in the parent
    # logs path
    target_results = '{}/{}.tgz'.format(ocsci_log_path(), target_results)
    pod_results = constants.VDBENCH_RESULTS_FILE
    retrive_files_from_pod(vdbench_client_pod, target_results, pod_results)

def test_pvc_multiple_snapshot_performance(
    self,
    interface_iterate,
    teardown_factory,
    storageclass_factory,
    pvc_factory,
    pod_factory,
):
    """
    1. Creating PVC
        size depends on the storage capacity, but is not less than 1 GiB
        it will use ~75% capacity of the Storage, Min storage capacity 1 TiB
    2. Fill the PVC with 80% of data
    3. Take a snapshot of the PVC and measure the time of creation.
    4. re-write the data on the PVC
    5. Take a snapshot of the PVC and measure the time of creation.
    6. repeat steps 4-5 for the number of snapshots we want to take : 512
        this will be run by an outside script for low memory consumption
    7. print all information.

    Raises:
        StorageNotSufficientException: in case of not enough capacity

    """
    # Number of snapshots for CephFS is 100 and for RBD is 512
    num_of_snaps = 100
    if interface_iterate == constants.CEPHBLOCKPOOL:
        num_of_snaps = 512

    # Getting the total Storage capacity
    ceph_cluster = CephCluster()
    ceph_capacity = int(ceph_cluster.get_ceph_capacity())

    # Use 70% of the storage capacity in the test
    capacity_to_use = int(ceph_capacity * 0.7)

    # since we do not want to use more than 65%, we add 35% to the needed
    # capacity, and the minimum PVC size is 1 GiB
    need_capacity = int((num_of_snaps + 2) * 1.35)
    # The test will run only on a system with enough capacity
    if capacity_to_use < need_capacity:
        err_msg = (f"The system has only {ceph_capacity} GiB, "
                   f"we want to use only {capacity_to_use} GiB, "
                   f"and we need {need_capacity} GiB to run the test")
        log.error(err_msg)
        raise exceptions.StorageNotSufficientException(err_msg)

    # Calculating the PVC size in GiB
    pvc_size = int(capacity_to_use / (num_of_snaps + 2))

    self.interface = interface_iterate
    self.sc_obj = storageclass_factory(self.interface)

    self.pvc_obj = pvc_factory(interface=self.interface,
                               size=pvc_size,
                               status=constants.STATUS_BOUND)

    self.pod_obj = pod_factory(interface=self.interface,
                               pvc=self.pvc_obj,
                               status=constants.STATUS_RUNNING)

    # Calculating the file size as 80% of the PVC size
    filesize = self.pvc_obj.size * 0.80
    # Change the file size to MB for the FIO function
    file_size = f"{int(filesize * constants.GB2MB)}M"
    file_name = self.pod_obj.name

    log.info(f"Total capacity size is : {ceph_capacity} GiB, "
             f"Going to use {need_capacity} GiB, "
             f"With {num_of_snaps} Snapshots to {pvc_size} GiB PVC. "
             f"File size to be written is : {file_size} "
             f"with the name of {file_name}")

    os.environ["SNAPNUM"] = f"{num_of_snaps}"
    os.environ["LOGPATH"] = f"{ocsci_log_path()}"
    os.environ["FILESIZE"] = file_size
    os.environ["NSPACE"] = self.pvc_obj.namespace
    os.environ["PODNAME"] = self.pod_obj.name
    os.environ["PVCNAME"] = self.pvc_obj.name
    os.environ["INTERFACE"] = self.interface

    main_script = "tests/e2e/performance/test_multi_snapshots.py"
    result = subprocess.run([main_script], stdout=subprocess.PIPE)
    log.info(f"Results from main script : {result.stdout.decode('utf-8')}")

    if "All results are" not in result.stdout.decode("utf-8"):
        log.error("Test did not complete")
        raise Exception("Test did not complete")

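# A sketch of the capacity planning above, extracted for clarity (the
# function and the exception text are hypothetical): each of the
# (num_of_snaps + 2) one-GiB-minimum chunks is padded by 35%, and the PVC
# gets whatever share of 70% of the cluster capacity fits.
def plan_snapshot_capacity(ceph_capacity_gib, num_of_snaps):
    capacity_to_use = int(ceph_capacity_gib * 0.7)
    need_capacity = int((num_of_snaps + 2) * 1.35)
    if capacity_to_use < need_capacity:
        raise RuntimeError(
            f"need {need_capacity} GiB but only {capacity_to_use} GiB usable")
    return int(capacity_to_use / (num_of_snaps + 2))  # PVC size in GiB

print(plan_snapshot_capacity(2048, 512))  # 2 (GiB per PVC)
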
def test_pvc_multiple_clone_performance(
    self,
    interface_iterate,
    teardown_factory,
    storageclass_factory,
    pvc_factory,
    pod_factory,
):
    """
    1. Creating PVC
        PVC size is calculated in the test and depends on the storage
        capacity, but is not less than 1 GiB
        it will use ~75% capacity of the Storage, Min storage capacity 1 TiB
    2. Fill the PVC with 70% of data
    3. Take a clone of the PVC and measure the time and speed of creation
        by reading start creation and end creation times from relevant logs
    4. Repeat the previous step a number of times (maximal num_of_clones
        is 512)
    5. Print all measured statistics for all the clones.

    Raises:
        StorageNotSufficientException: in case of not enough capacity on the
            cluster

    """
    num_of_clones = 512

    # Getting the total Storage capacity
    ceph_cluster = CephCluster()
    ceph_capacity = int(ceph_cluster.get_ceph_capacity())

    # Use 70% of the storage capacity in the test
    capacity_to_use = int(ceph_capacity * 0.7)

    # since we do not want to use more than 65%, we add 35% to the needed
    # capacity, and the minimum PVC size is 1 GiB
    need_capacity = int((num_of_clones + 2) * 1.35)
    # The test will run only on a system with enough capacity
    if capacity_to_use < need_capacity:
        err_msg = (f"The system has only {ceph_capacity} GiB, "
                   f"we want to use only {capacity_to_use} GiB, "
                   f"and we need {need_capacity} GiB to run the test")
        log.error(err_msg)
        raise exceptions.StorageNotSufficientException(err_msg)

    # Calculating the PVC size in GiB
    pvc_size = int(capacity_to_use / (num_of_clones + 2))

    self.interface = interface_iterate
    self.sc_obj = storageclass_factory(self.interface)

    if self.interface == constants.CEPHFILESYSTEM:
        sc = "CephFS"
    if self.interface == constants.CEPHBLOCKPOOL:
        sc = "RBD"
    self.full_log_path = get_full_test_logs_path(cname=self)
    self.full_log_path += f"-{sc}"

    self.pvc_obj = pvc_factory(interface=self.interface,
                               size=pvc_size,
                               status=constants.STATUS_BOUND)

    self.pod_obj = pod_factory(interface=self.interface,
                               pvc=self.pvc_obj,
                               status=constants.STATUS_RUNNING)

    # Calculating the file size as 70% of the PVC size
    filesize = self.pvc_obj.size * 0.70
    # Change the file size to MB for the FIO function
    file_size = f"{int(filesize * constants.GB2MB)}M"
    file_name = self.pod_obj.name

    log.info(f"Total capacity size is : {ceph_capacity} GiB, "
             f"Going to use {need_capacity} GiB, "
             f"With {num_of_clones} clones to {pvc_size} GiB PVC. "
             f"File size to be written is : {file_size} "
             f"with the name of {file_name}")

    self.params = {}
    self.params["clonenum"] = f"{num_of_clones}"
    self.params["filesize"] = file_size
    self.params["ERRMSG"] = "Error in command"

    clone_yaml = self.build_params()
    performance_lib.write_fio_on_pod(self.pod_obj, file_size)

    # Running the test
    results = []
    for test_num in range(1, int(self.params["clonenum"]) + 1):
        log.info(f"Starting test number {test_num}")
        ct = self.create_clone(test_num, clone_yaml)
        speed = self.params["datasize"] / ct
        results.append({"Clone Num": test_num, "time": ct, "speed": speed})
        log.info(
            f"Results for clone number {test_num} are : "
            f"Creation time is {ct} secs, Creation speed {speed} MB/sec")

    for r in results:
        log.info(
            f"Clone number {r['Clone Num']} creation time is {r['time']} secs."
        )
        log.info(
            f"Clone number {r['Clone Num']} creation speed is {r['speed']} MB/sec."
        )

    creation_time_list = [r["time"] for r in results]
    average_creation_time = statistics.mean(creation_time_list)
    log.info(f"Average creation time is {average_creation_time} secs.")

    creation_speed_list = [r["speed"] for r in results]
    average_creation_speed = statistics.mean(creation_speed_list)
    log.info(f"Average creation speed is {average_creation_speed} MB/sec.")

    self.results_path = get_full_test_logs_path(cname=self)

    # Produce ES report
    # Collecting environment information
    self.get_env_info()

    # Initialize the results doc file.
    full_results = self.init_full_results(
        ResultsAnalyse(
            self.uuid,
            self.crd_data,
            self.full_log_path,
            "pvc_multiple_clone_measurement",
        ))

    full_results.add_key("interface", self.interface)
    full_results.add_key("clones_num", num_of_clones)
    full_results.add_key("clone_size", pvc_size)
    full_results.add_key("multi_clone_creation_time", creation_time_list)
    full_results.add_key("multi_clone_creation_time_average",
                         average_creation_time)
    full_results.add_key("multi_clone_creation_speed", creation_speed_list)
    full_results.add_key("multi_clone_creation_speed_average",
                         average_creation_speed)

    # Write the test results into the ES server
    if full_results.es_write():
        res_link = full_results.results_link()
        log.info(f"The Result can be found at : {res_link}")

        # Create a text file with the results of all subtests
        # (4 - according to the parameters)
        self.write_result_to_file(res_link)

def test_pvc_multiple_clone_performance(
    self,
    interface_iterate,
    teardown_factory,
    storageclass_factory,
    pvc_factory,
    pod_factory,
):
    """
    1. Creating PVC
        PVC size is calculated in the test and depends on the storage
        capacity, but is not less than 1 GiB
        it will use ~75% capacity of the Storage, Min storage capacity 1 TiB
    2. Fill the PVC with 70% of data
    3. Take a clone of the PVC and measure the time and speed of creation
        by reading start creation and end creation times from relevant logs
    4. Repeat the previous step a number of times (maximal num_of_clones
        is 512)
    5. Print all measured statistics for all the clones.

    Raises:
        StorageNotSufficientException: in case of not enough capacity on the
            cluster

    """
    num_of_clones = 512

    # Getting the total Storage capacity
    ceph_cluster = CephCluster()
    ceph_capacity = int(ceph_cluster.get_ceph_capacity())

    # Use 70% of the storage capacity in the test
    capacity_to_use = int(ceph_capacity * 0.7)

    # since we do not want to use more than 65%, we add 35% to the needed
    # capacity, and the minimum PVC size is 1 GiB
    need_capacity = int((num_of_clones + 2) * 1.35)
    # The test will run only on a system with enough capacity
    if capacity_to_use < need_capacity:
        err_msg = (f"The system has only {ceph_capacity} GiB, "
                   f"we want to use only {capacity_to_use} GiB, "
                   f"and we need {need_capacity} GiB to run the test")
        log.error(err_msg)
        raise exceptions.StorageNotSufficientException(err_msg)

    # Calculating the PVC size in GiB
    pvc_size = int(capacity_to_use / (num_of_clones + 2))

    self.interface = interface_iterate
    self.sc_obj = storageclass_factory(self.interface)

    self.pvc_obj = pvc_factory(interface=self.interface,
                               size=pvc_size,
                               status=constants.STATUS_BOUND)

    self.pod_obj = pod_factory(interface=self.interface,
                               pvc=self.pvc_obj,
                               status=constants.STATUS_RUNNING)

    # Calculating the file size as 70% of the PVC size
    filesize = self.pvc_obj.size * 0.70
    # Change the file size to MB for the FIO function
    file_size = f"{int(filesize * constants.GB2MB)}M"
    file_name = self.pod_obj.name

    log.info(f"Total capacity size is : {ceph_capacity} GiB, "
             f"Going to use {need_capacity} GiB, "
             f"With {num_of_clones} clones to {pvc_size} GiB PVC. "
             f"File size to be written is : {file_size} "
             f"with the name of {file_name}")

    self.params = {}
    self.params["clonenum"] = f"{num_of_clones}"
    self.params["filesize"] = file_size
    self.params["ERRMSG"] = "Error in command"

    clone_yaml = self.build_params()
    performance_lib.write_fio_on_pod(self.pod_obj, file_size)

    # Running the test
    results = []
    for test_num in range(1, int(self.params["clonenum"]) + 1):
        log.info(f"Starting test number {test_num}")
        ct = self.create_clone(test_num, clone_yaml)
        speed = self.params["datasize"] / ct
        results.append({"Clone Num": test_num, "time": ct, "speed": speed})
        log.info(
            f"Results for clone number {test_num} are : "
            f"Creation time is {ct} secs, Creation speed {speed} MB/sec")

    for r in results:
        log.info(
            f"Clone number {r['Clone Num']} creation time is {r['time']} secs."
        )
        log.info(
            f"Clone number {r['Clone Num']} creation speed is {r['speed']} MB/sec."
        )

def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern):
    """
    This is a basic fio perf test
    """
    # Deploy the ripsaw operator
    log.info("Deploying the ripsaw operator")
    ripsaw.apply_crd('resources/crds/'
                     'ripsaw_v1alpha1_ripsaw_crd.yaml')

    if interface == 'CephBlockPool':
        sc = constants.CEPHBLOCKPOOL_SC
    else:
        sc = constants.CEPHFILESYSTEM_SC

    # Create fio benchmark
    log.info("Create resource file for fio workload")
    fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

    # Saving the original elastic-search IP and PORT - if defined in the yaml
    if 'elasticsearch' in fio_cr['spec']:
        backup_es = fio_cr['spec']['elasticsearch']
    else:
        log.warning(
            'Elastic-search information does not exist in the YAML file')
        fio_cr['spec']['elasticsearch'] = {}
        backup_es = {}  # avoid an undefined name when restoring below

    # Use the internally defined elastic-search server in the test - if it exists
    if es:
        fio_cr['spec']['elasticsearch'] = {
            'server': es.get_ip(),
            'port': es.get_port()
        }

    # Setting the data set to 40% of the total storage capacity
    ceph_cluster = CephCluster()
    ceph_capacity = ceph_cluster.get_ceph_capacity()
    total_data_set = int(ceph_capacity * 0.4)
    filesize = int(fio_cr['spec']['workload']['args']['filesize'].replace(
        'GiB', ''))
    # To make sure the number of App pods will not be more than 50, in case
    # of a large data set, change the size of the file each pod will work on
    if total_data_set > 500:
        filesize = int(ceph_capacity * 0.008)
        fio_cr['spec']['workload']['args']['filesize'] = f'{filesize}GiB'
    # make sure that the storage size is larger than the file size
    fio_cr['spec']['workload']['args'][
        'storagesize'] = f'{int(filesize * 1.2)}Gi'
    fio_cr['spec']['workload']['args']['servers'] = int(total_data_set /
                                                        filesize)
    log.info(f'Total Data set to work on is : {total_data_set} GiB')

    environment = get_environment_info()
    if not environment['user'] == '':
        fio_cr['spec']['test_user'] = environment['user']
    fio_cr['spec']['clustername'] = environment['clustername']

    log.debug(f'Environment information is : {environment}')

    fio_cr['spec']['workload']['args']['storageclass'] = sc
    if io_pattern == 'sequential':
        fio_cr['spec']['workload']['args']['jobs'] = ['write', 'read']
        fio_cr['spec']['workload']['args']['iodepth'] = 1
    log.info(f'The FIO CR file is {fio_cr}')
    fio_cr_obj = OCS(**fio_cr)
    fio_cr_obj.create()

    # Wait for the fio client pod to be created
    for fio_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern,
                                  'fio-client', constants.RIPSAW_NAMESPACE):
        try:
            if fio_pod[0] is not None:
                fio_client_pod = fio_pod[0]
                break
        except IndexError:
            log.info("Bench pod not ready yet")

    # Getting the start time of the test
    start_time = time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime())

    # Getting the UUID from inside the benchmark pod
    uuid = ripsaw.get_uuid(fio_client_pod)
    # Setting back the original elastic-search information
    fio_cr['spec']['elasticsearch'] = backup_es

    full_results = FIOResultsAnalyse(uuid, fio_cr)

    # Initialize the results doc file.
    for key in environment:
        full_results.add_key(key, environment[key])

    # Setting the global parameters of the test
    full_results.add_key('io_pattern', io_pattern)
    full_results.add_key('dataset', f'{total_data_set}GiB')
    full_results.add_key('file_size',
                         fio_cr['spec']['workload']['args']['filesize'])
    full_results.add_key('servers',
                         fio_cr['spec']['workload']['args']['servers'])
    full_results.add_key('samples',
                         fio_cr['spec']['workload']['args']['samples'])
    full_results.add_key('operations',
                         fio_cr['spec']['workload']['args']['jobs'])
    full_results.add_key('block_sizes',
                         fio_cr['spec']['workload']['args']['bs'])
    full_results.add_key('io_depth',
                         fio_cr['spec']['workload']['args']['iodepth'])
    full_results.add_key('jobs',
                         fio_cr['spec']['workload']['args']['numjobs'])
    full_results.add_key(
        'runtime', {
            'read': fio_cr['spec']['workload']['args']['read_runtime'],
            'write': fio_cr['spec']['workload']['args']['write_runtime']
        })
    full_results.add_key(
        'storageclass', fio_cr['spec']['workload']['args']['storageclass'])
    full_results.add_key('vol_size',
                         fio_cr['spec']['workload']['args']['storagesize'])

    # Wait for the fio pod to initialize and complete
    log.info("Waiting for fio_client to complete")
    pod_obj = OCP(kind='pod')
    pod_obj.wait_for_resource(
        condition='Completed',
        resource_name=fio_client_pod,
        timeout=18000,
        sleep=300,
    )

    # Getting the end time of the test
    end_time = time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime())
    full_results.add_key('test_time', {
        'start': start_time,
        'end': end_time
    })

    output = run_cmd(f'oc logs {fio_client_pod}')
    log.info(f'The Test log is : {output}')
    try:
        if 'Fio failed to execute' not in output:
            log.info("FIO has completed successfully")
    except IOError:
        log.info("FIO failed to complete")

    # Clean up fio benchmark
    log.info("Deleting FIO benchmark")
    fio_cr_obj.delete()
    log.debug(f'Full results is : {full_results.results}')

    # if an internal ES exists, copy all data from the internal to the main ES
    if es:
        log.info('Copy all data from Internal ES to Main ES')
        es._copy(full_results.es)
    # Adding this sleep between the copy and the analyzing of the results
    # since sometimes the results of the read (just after write) are empty
    time.sleep(30)
    full_results.analyze_results()  # Analyze the results
    # Writing the analyzed test results to the Elastic-Search server
    full_results.es_write()
    full_results.codespeed_push()  # Push results to codespeed

    # Creating a full link to the results on the ES server
    log.info(f'The Result can be found at : {full_results.results_link()}')

def test_pvc_snapshot_performance(self, pvc_size):
    """
    1. Run I/O on a pod file
    2. Calculate md5sum of the file
    3. Take a snapshot of the PVC and measure the total snapshot creation
       time and the CSI snapshot creation time
    4. Restore from the snapshot and measure the time
    5. Attach a new pod to it
    6. Verify that the file is present on the new pod also
    7. Verify that the md5sum of the file on the new pod matches with the
       md5sum of the file on the original pod

    This scenario runs 3 times, reports the average results of the 3 runs,
    and sends them to the ES.

    Args:
        pvc_size: the size of the PVC to be tested - parametrize

    """

    # Getting the total Storage capacity
    ceph_cluster = CephCluster()
    ceph_capacity = ceph_cluster.get_ceph_capacity()
    log.info(f"Total capacity size is : {ceph_capacity}")
    log.info(f"PVC Size is : {pvc_size}")
    log.info(f"Needed capacity is {int(int(pvc_size) * 5)}")
    if int(ceph_capacity) < int(pvc_size) * 5:
        log.error(
            f"PVC size is {pvc_size}GiB and it is too large for this system"
            f" which has only {ceph_capacity}GiB")
        return
    # Calculating the file size as 25% of the PVC size
    # in the end the PVC will be 75% full
    filesize = self.pvc_obj.size * 0.25
    # Change the file size to MB and from int to str
    file_size = f"{int(filesize * 1024)}M"

    all_results = []

    self.results_path = get_full_test_logs_path(cname=self)
    log.info(f"Logs file path name is : {self.full_log_path}")

    # Produce ES report
    # Collecting environment information
    self.get_env_info()

    # Initialize the results doc file.
    self.full_results = self.init_full_results(
        ResultsAnalyse(
            self.uuid,
            self.crd_data,
            self.full_log_path,
            "pvc_snapshot_perf",
        ))
    self.full_results.add_key("pvc_size", pvc_size + " GiB")
    self.full_results.add_key("interface", self.sc)
    self.full_results.all_results["creation_time"] = []
    self.full_results.all_results["csi_creation_time"] = []
    self.full_results.all_results["creation_speed"] = []
    self.full_results.all_results["restore_time"] = []
    self.full_results.all_results["restore_speed"] = []
    self.full_results.all_results["restore_csi_time"] = []

    for test_num in range(self.tests_numbers):
        test_results = {
            "test_num": test_num + 1,
            "dataset": (test_num + 1) * filesize * 1024,  # size in MiB
            "create": {"time": None, "csi_time": None, "speed": None},
            "restore": {"time": None, "speed": None},
        }
        log.info(f"Starting test phase number {test_num}")
        # Step 1. Run I/O on a pod file.
        file_name = f"{self.pod_object.name}-{test_num}"
        log.info(f"Starting IO on the POD {self.pod_object.name}")
        # Going to run only write IO to fill the PVC for the snapshot
        self.pod_object.fillup_fs(size=file_size, fio_filename=file_name)

        # Wait for fio to finish
        fio_result = self.pod_object.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"IO error on pod {self.pod_object.name}. FIO result: {fio_result}"
        log.info("IO on the PVC Finished")

        # Verify presence of the file
        file_path = pod.get_file_path(self.pod_object, file_name)
        log.info(f"Actual file path on the pod {file_path}")
        assert pod.check_file_existence(
            self.pod_object, file_path), f"File {file_name} doesn't exist"
        log.info(f"File {file_name} exists in {self.pod_object.name}")

        # Step 2. Calculate md5sum of the file.
        orig_md5_sum = pod.cal_md5sum(self.pod_object, file_name)

        # Step 3. Take a snapshot of the PVC and measure the time of creation.
        snap_name = self.pvc_obj.name.replace("pvc-test",
                                              f"snapshot-test{test_num}")
        log.info(f"Taking snapshot of the PVC {snap_name}")

        start_time = datetime.datetime.utcnow().strftime(
            "%Y-%m-%dT%H:%M:%SZ")

        test_results["create"]["time"] = self.measure_create_snapshot_time(
            pvc_name=self.pvc_obj.name,
            snap_name=snap_name,
            namespace=self.pod_object.namespace,
            interface=self.interface,
            start_time=start_time,
        )

        test_results["create"][
            "csi_time"] = performance_lib.measure_csi_snapshot_creation_time(
                interface=self.interface,
                snapshot_id=self.snap_uid,
                start_time=start_time,
            )

        test_results["create"]["speed"] = int(
            test_results["dataset"] / test_results["create"]["time"])
        log.info(
            f' Test {test_num} dataset is {test_results["dataset"]} MiB')
        log.info(
            f"Snapshot name {snap_name} and id {self.snap_uid} creation time is"
            f' : {test_results["create"]["time"]} sec.')
        log.info(
            f"Snapshot name {snap_name} and id {self.snap_uid} csi creation time is"
            f' : {test_results["create"]["csi_time"]} sec.')
        log.info(
            f'Snapshot speed is : {test_results["create"]["speed"]} MB/sec'
        )

        # Step 4. Restore the PVC from the snapshot and measure the time
        # Same Storage class of the original PVC
        sc_name = self.pvc_obj.backed_sc

        # Size should be the same as the original PVC
        pvc_size = str(self.pvc_obj.size) + "Gi"

        # Create a pvc out of the snapshot
        # Both the snapshot and the restored PVC should be in the same namespace
        log.info("Restoring from the Snapshot")
        restore_pvc_name = self.pvc_obj.name.replace(
            "pvc-test", f"restore-pvc{test_num}")
        restore_pvc_yaml = constants.CSI_RBD_PVC_RESTORE_YAML
        if self.interface == constants.CEPHFILESYSTEM:
            restore_pvc_yaml = constants.CSI_CEPHFS_PVC_RESTORE_YAML

        csi_start_time = self.get_time("csi")
        log.info("Restoring the PVC from Snapshot")
        restore_pvc_obj = pvc.create_restore_pvc(
            sc_name=sc_name,
            snap_name=self.snap_obj.name,
            namespace=self.snap_obj.namespace,
            size=pvc_size,
            pvc_name=restore_pvc_name,
            restore_pvc_yaml=restore_pvc_yaml,
        )
        helpers.wait_for_resource_state(
            restore_pvc_obj,
            constants.STATUS_BOUND,
            timeout=3600  # setting this to 60 min.
            # since restore can take a long time, and we want it to finish.
        )
        restore_pvc_obj.reload()
        log.info("PVC was restored from the snapshot")
        test_results["restore"][
            "time"] = helpers.measure_pvc_creation_time(
                self.interface, restore_pvc_obj.name)

        test_results["restore"]["speed"] = int(
            test_results["dataset"] / test_results["restore"]["time"])
        log.info(
            f'Snapshot restore time is : {test_results["restore"]["time"]}'
        )
        log.info(
            f'restore speed is : {test_results["restore"]["speed"]} MB/sec'
        )

        test_results["restore"][
            "csi_time"] = performance_lib.csi_pvc_time_measure(
                self.interface, restore_pvc_obj, "create", csi_start_time)
        log.info(
            f'Snapshot csi restore time is : {test_results["restore"]["csi_time"]}'
        )

        # Step 5. Attach a new pod to the restored PVC
        restore_pod_object = helpers.create_pod(
            interface_type=self.interface,
            pvc_name=restore_pvc_obj.name,
            namespace=self.snap_obj.namespace,
        )

        # Confirm that the pod is running
        helpers.wait_for_resource_state(resource=restore_pod_object,
                                        state=constants.STATUS_RUNNING)
        restore_pod_object.reload()

        # Step 6. Verify that the file is present on the new pod also.
        log.info(f"Checking the existence of {file_name} "
                 f"on restore pod {restore_pod_object.name}")
        assert pod.check_file_existence(
            restore_pod_object, file_path), f"File {file_name} doesn't exist"
        log.info(f"File {file_name} exists in {restore_pod_object.name}")

        # Step 7. Verify that the md5sum matches
        log.info(
            f"Verifying that md5sum of {file_name} "
            f"on pod {self.pod_object.name} matches with md5sum "
            f"of the same file on restore pod {restore_pod_object.name}")
        assert pod.verify_data_integrity(
            restore_pod_object, file_name,
            orig_md5_sum), "Data integrity check failed"
        log.info("Data integrity check passed, md5sums are the same")

        restore_pod_object.delete()
        restore_pvc_obj.delete()

        all_results.append(test_results)

    # clean the environment
    self.pod_object.delete()
    self.pvc_obj.delete()
    self.delete_test_project()

    # logging the test summary, all info in one place for easy log reading
    c_speed, c_runtime, c_csi_runtime, r_speed, r_runtime, r_csi_runtime = (
        0 for i in range(6))

    log.info("Test summary :")
    for tst in all_results:
        c_speed += tst["create"]["speed"]
        c_runtime += tst["create"]["time"]
        c_csi_runtime += tst["create"]["csi_time"]
        r_speed += tst["restore"]["speed"]
        r_runtime += tst["restore"]["time"]
        r_csi_runtime += tst["restore"]["csi_time"]

        self.full_results.all_results["creation_time"].append(
            tst["create"]["time"])
        self.full_results.all_results["csi_creation_time"].append(
            tst["create"]["csi_time"])
        self.full_results.all_results["creation_speed"].append(
            tst["create"]["speed"])
        self.full_results.all_results["restore_time"].append(
            tst["restore"]["time"])
        self.full_results.all_results["restore_speed"].append(
            tst["restore"]["speed"])
        self.full_results.all_results["restore_csi_time"].append(
            tst["restore"]["csi_time"])
        self.full_results.all_results["dataset_inMiB"] = tst["dataset"]
        log.info(
            f"Test {tst['test_num']} results : dataset is {tst['dataset']} MiB. "
            f"Take snapshot time is {tst['create']['time']} "
            f"at {tst['create']['speed']} MiB/Sec "
            f"Restore from snapshot time is {tst['restore']['time']} "
            f"at {tst['restore']['speed']} MiB/Sec ")

    avg_snap_c_time = c_runtime / self.tests_numbers
    avg_snap_csi_c_time = c_csi_runtime / self.tests_numbers
    avg_snap_c_speed = c_speed / self.tests_numbers
    avg_snap_r_time = r_runtime / self.tests_numbers
    avg_snap_r_speed = r_speed / self.tests_numbers
    avg_snap_r_csi_time = r_csi_runtime / self.tests_numbers
    log.info(f" Average snapshot creation time is {avg_snap_c_time} sec.")
    log.info(
        f" Average csi snapshot creation time is {avg_snap_csi_c_time} sec."
    )
    log.info(
        f" Average snapshot creation speed is {avg_snap_c_speed} MiB/sec")
    log.info(f" Average snapshot restore time is {avg_snap_r_time} sec.")
    log.info(
        f" Average snapshot restore speed is {avg_snap_r_speed} MiB/sec")
    log.info(
        f" Average snapshot restore csi time is {avg_snap_r_csi_time} sec."
    )

    self.full_results.add_key("avg_snap_creation_time_insecs",
                              avg_snap_c_time)
    self.full_results.add_key("avg_snap_csi_creation_time_insecs",
                              avg_snap_csi_c_time)
    self.full_results.add_key("avg_snap_creation_speed", avg_snap_c_speed)
    self.full_results.add_key("avg_snap_restore_time_insecs",
                              avg_snap_r_time)
    self.full_results.add_key("avg_snap_restore_speed", avg_snap_r_speed)
    self.full_results.add_key("avg_snap_restore_csi_time_insecs",
                              avg_snap_r_csi_time)

    # Write the test results into the ES server
    log.info("writing results to the elastic search server")
    if self.full_results.es_write():
        res_link = self.full_results.results_link()

        # write the ES link to the test results in the test log.
        log.info(f"The result can be found at : {res_link}")

        self.write_result_to_file(res_link)

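# The snapshot timing helpers above take a UTC timestamp string and derive
# the elapsed time from the CSI driver logs. A minimal sketch of just the
# timestamp format and arithmetic (elapsed_since is hypothetical; the real
# measurement parses start/end events out of the logs):
import datetime

TS_FMT = "%Y-%m-%dT%H:%M:%SZ"

def elapsed_since(start_str):
    """Seconds between a captured UTC timestamp string and now."""
    start = datetime.datetime.strptime(start_str, TS_FMT)
    return (datetime.datetime.utcnow() - start).total_seconds()

start_time = datetime.datetime.utcnow().strftime(TS_FMT)
# ... take the snapshot here ...
print(f"creation took ~{elapsed_since(start_time)} sec")
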
class TestFullClusterHealth(PASTest):
    """
    Test Cluster health when the storage is ~85% full
    """

    @pytest.fixture(autouse=True)
    def setup(self, request, nodes):
        """
        Setting up test parameters
        """

        def teardown():
            logger.info("cleanup the environment")
            nodes.restart_nodes_by_stop_and_start_teardown()

        request.addfinalizer(teardown)

        logger.info("Starting the test setup")
        self.percent_to_fill = 85.0
        self.ceph_cluster = CephCluster()
        self.nodes = None
        self.benchmark_name = "FIO"
        self.client_pod_name = "fio-client"
        self.sanity_helpers = sanity_helpers.Sanity()

        super(TestFullClusterHealth, self).setup()
        # deploy the benchmark-operator
        self.deploy_benchmark_operator()

    def run(self):
        """
        Run the test, and wait until it finishes
        """
        self.deploy_and_wait_for_wl_to_start(timeout=900)
        self.wait_for_wl_to_finish(sleep=300)

        try:
            if "Fio failed to execute" not in self.test_logs:
                logger.info("FIO has completed successfully")
        except IOError:
            logger.warning("FIO failed to complete")

    def calculate_crd_data(self):
        """
        Getting the storage capacity and calculating the pod count and pvc size
        """
        ceph_used_capacity_percent = get_percent_used_capacity()
        logger.info(f"Ceph used capacity percent is {ceph_used_capacity_percent}%")

        ceph_capacity = self.ceph_cluster.get_ceph_capacity()
        logger.info(f"Total storage capacity is {ceph_capacity} GiB")

        self.percent_to_fill = self.percent_to_fill - ceph_used_capacity_percent
        logger.info(f"Percentage to fill is {self.percent_to_fill}%")

        self.total_data_set = int(ceph_capacity * (int(self.percent_to_fill) / 100))
        self.filesize = int(
            self.crd_data["spec"]["workload"]["args"]["filesize"].replace("GiB", "")
        )

        # Make sure that filesize >= 10 and servers <= 60
        self.servers = 60
        self.filesize = int(self.total_data_set / self.servers)
        if self.filesize < 10:
            self.filesize = 10
            self.servers = int(self.total_data_set / self.filesize)

        self.crd_data["spec"]["workload"]["args"]["filesize"] = f"{self.filesize}GiB"
        self.crd_data["spec"]["workload"]["args"][
            "storagesize"
        ] = f"{int(self.total_data_set)}Gi"
        self.crd_data["spec"]["workload"]["args"]["servers"] = self.servers
        self.crd_data["spec"]["workload"]["args"]["bs"] = "1024KiB"
        self.crd_data["spec"]["workload"]["args"]["jobs"] = ["write", "read"]
        self.crd_data["spec"]["workload"]["args"]["iodepth"] = 1

    def delete_pods(self):
        """
        Try to delete pods:
            - Rook operator
            - OSD
            - MGR
            - MON
        """
        pod_list = []
        rook_operator_pod = pod.get_ocs_operator_pod(
            ocs_label=constants.OPERATOR_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        pod_list.append(rook_operator_pod)

        osd_pods = pod.get_osd_pods()
        pod_list.extend(osd_pods)

        mgr_pods = pod.get_mgr_pods()
        pod_list.extend(mgr_pods)

        mon_pods = pod.get_mon_pods()
        pod_list.extend(mon_pods)

        logger.info(f"Deleting pods: {[p.name for p in pod_list]}")
        pod.delete_pods(pod_objs=pod_list)

    def ceph_not_health_error(self):
        """
        Check that Ceph is NOT in the "HEALTH_ERR" state.
        The warning state is ok, since the cluster is low on storage space.

        Returns:
            bool: True if the Ceph state is NOT "HEALTH_ERR"
        """
        ceph_status = self.ceph_cluster.get_ceph_health()
        logger.info(f"Ceph status is: {ceph_status}")
        return ceph_status != "HEALTH_ERR"

    def mgr_pod_node_restart(self):
        """
        Restart the node that runs the mgr pod
        """
        mgr_pod_obj = pod.get_mgr_pods()
        mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

        self.nodes.restart_nodes([mgr_node_obj])

        wait_for_nodes_status()

        # Check for Ceph pods
        pod_obj = ocp.OCP(
            kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
        )
        assert pod_obj.wait_for_resource(
            condition="Running", selector="app=rook-ceph-mgr", timeout=600
        )
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-mon",
            resource_count=3,
            timeout=600,
        )
        assert pod_obj.wait_for_resource(
            condition="Running",
            selector="app=rook-ceph-osd",
            resource_count=3,
            timeout=600,
        )

    def restart_ocs_operator_node(self):
        """
        Restart the node that runs the OCS operator pod
        """
        pod_obj = pod.get_ocs_operator_pod()
        node_obj = pod.get_pod_node(pod_obj)

        self.nodes.restart_nodes([node_obj])

        wait_for_nodes_status()

        pod.wait_for_pods_to_be_running(
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, pod_names=[pod_obj.name]
        )

    def is_cluster_healthy(self):
        """
        Wrapper function for the cluster health check

        Returns:
            bool: True if ALL checks passed, False otherwise
        """
        return self.ceph_not_health_error() and pod.wait_for_pods_to_be_running()

    @system_test
    @polarion_id("OCS-2749")
    def test_full_cluster_health(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Verify that the cluster health is ok when the storage is ~85% full

        Steps:
          1. Deploy the benchmark operator and run a fio workload
          2. Check Ceph health before/after each operation:
             2.1 OSD node reboot
             2.2 Mgr node reboot
             2.3 OCS operator node reboot
             2.4 Delete Rook, OSD, MGR & MON pods
             2.5 Creation and deletion of resources
        """
        self.nodes = nodes

        self.full_log_path = get_full_test_logs_path(cname=self)
        logger.info(f"Logs file path name is : {self.full_log_path}")

        logger.info("Create resource file for fio workload")
        self.crd_data = templating.load_yaml(constants.FIO_CR_YAML)
        self.calculate_crd_data()

        self.set_storageclass(interface=constants.CEPHBLOCKPOOL)

        self.run()

        logger.info("Checking health before disruptive operations")
        assert self.is_cluster_healthy(), "Cluster is not healthy"

        osd_node_reboot()
        logger.info("Checking health after OSD node reboot")
        assert self.is_cluster_healthy(), "Cluster is not healthy"

        self.mgr_pod_node_restart()
        logger.info("Checking health after mgr node restart")
        assert self.is_cluster_healthy(), "Cluster is not healthy"

        self.restart_ocs_operator_node()
        logger.info("Checking health after OCS operator node restart")
        assert self.is_cluster_healthy(), "Cluster is not healthy"

        self.delete_pods()
        logger.info("Checking health after Rook, OSD, MGR & MON pods deletion")
        assert self.is_cluster_healthy(), "Cluster is not healthy"

        # Create resources
        logger.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        logger.info("Resources Created")

        # Delete resources
        logger.info("Deleting resources")
        self.sanity_helpers.delete_resources()
        logger.info("Resources Deleted")

        logger.info(
            "Checking health after resource creation and deletion using sanity helpers"
        )
        assert self.is_cluster_healthy(), "Cluster is not healthy"

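# The calculate_crd_data() sizing above clamps to filesize >= 10 GiB and at
# most 60 servers. A standalone sketch of that arithmetic (helper name and
# sample numbers are illustrative, not from the repo):
def size_fill_workload(capacity_gib, used_percent, target_percent=85.0):
    to_fill = target_percent - used_percent
    total = int(capacity_gib * (int(to_fill) / 100))
    servers = 60
    filesize = int(total / servers)
    if filesize < 10:
        # keep files at 10 GiB minimum and shrink the pod count instead
        filesize = 10
        servers = int(total / filesize)
    return {"filesize": f"{filesize}GiB", "servers": servers,
            "storagesize": f"{total}Gi"}

print(size_fill_workload(600, 20.0))
# {'filesize': '10GiB', 'servers': 39, 'storagesize': '390Gi'}
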
def test_pvc_snapshot_performance(self, teardown_factory, pvc_size):
    """
    1. Run I/O on a pod file.
    2. Calculate md5sum of the file.
    3. Take a snapshot of the PVC and measure the time of creation.
    4. Restore from the snapshot and measure the time.
    5. Attach a new pod to it.
    6. Verify that the file is present on the new pod also.
    7. Verify that the md5sum of the file on the new pod matches with the
       md5sum of the file on the original pod.

    This scenario runs 3 times and reports all results

    Args:
        teardown_factory: A fixture to destroy objects
        pvc_size: the size of the PVC to be tested - parametrize

    """

    # Getting the total Storage capacity
    ceph_cluster = CephCluster()
    ceph_capacity = ceph_cluster.get_ceph_capacity()

    log.info(f"Total capacity size is : {ceph_capacity}")
    log.info(f"PVC Size is : {pvc_size}")
    log.info(f"Needed capacity is {int(int(pvc_size) * 5)}")
    if int(ceph_capacity) < int(pvc_size) * 5:
        log.error(
            f"PVC size is {pvc_size}GiB and it is too large for this system"
            f" which has only {ceph_capacity}GiB")
        return
    # Calculating the file size as 25% of the PVC size
    # in the end the PVC will be 75% full
    filesize = self.pvc_obj.size * 0.25
    # Change the file size to MB and from int to str
    file_size = f"{int(filesize * 1024)}M"

    all_results = []
    for test_num in range(self.tests_numbers):
        test_results = {
            "test_num": test_num + 1,
            "dataset": (test_num + 1) * filesize * 1024,  # size in MiB
            "create": {"time": None, "speed": None},
            "restore": {"time": None, "speed": None},
        }
        log.info(f"Starting test phase number {test_num}")
        # Step 1. Run I/O on a pod file.
        file_name = f"{self.pod_obj.name}-{test_num}"
        log.info(f"Starting IO on the POD {self.pod_obj.name}")
        # Going to run only write IO to fill the PVC for the snapshot
        self.pod_obj.fillup_fs(size=file_size, fio_filename=file_name)

        # Wait for fio to finish
        fio_result = self.pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"IO error on pod {self.pod_obj.name}. FIO result: {fio_result}"
        log.info("IO on the PVC Finished")

        # Verify presence of the file
        file_path = pod.get_file_path(self.pod_obj, file_name)
        log.info(f"Actual file path on the pod {file_path}")
        assert pod.check_file_existence(
            self.pod_obj, file_path), f"File {file_name} doesn't exist"
        log.info(f"File {file_name} exists in {self.pod_obj.name}")

        # Step 2. Calculate md5sum of the file.
        orig_md5_sum = pod.cal_md5sum(self.pod_obj, file_name)

        # Step 3. Take a snapshot of the PVC and measure the time of creation.
        snap_name = self.pvc_obj.name.replace("pvc-test",
                                              f"snapshot-test{test_num}")
        log.info(f"Taking snapshot of the PVC {snap_name}")
        test_results["create"]["time"] = self.measure_create_snapshot_time(
            pvc_name=self.pvc_obj.name,
            snap_name=snap_name,
            interface=self.interface,
        )
        test_results["create"]["speed"] = int(
            test_results["dataset"] / test_results["create"]["time"])
        log.info(
            f' Test {test_num} dataset is {test_results["dataset"]} MiB')
        log.info(
            f'Snapshot creation time is : {test_results["create"]["time"]} sec.'
        )
        log.info(
            f'Snapshot speed is : {test_results["create"]["speed"]} MB/sec'
        )

        # Step 4. Restore the PVC from the snapshot and measure the time
        # Same Storage class of the original PVC
        sc_name = self.pvc_obj.backed_sc

        # Size should be the same as the original PVC
        pvc_size = str(self.pvc_obj.size) + "Gi"

        # Create a pvc out of the snapshot
        # Both the snapshot and the restored PVC should be in the same namespace
        log.info("Restoring from the Snapshot")
        restore_pvc_name = self.pvc_obj.name.replace(
            "pvc-test", f"restore-pvc{test_num}")
        restore_pvc_yaml = constants.CSI_RBD_PVC_RESTORE_YAML
        if self.interface == constants.CEPHFILESYSTEM:
            restore_pvc_yaml = constants.CSI_CEPHFS_PVC_RESTORE_YAML

        log.info("Restoring the PVC from Snapshot")
        restore_pvc_obj = pvc.create_restore_pvc(
            sc_name=sc_name,
            snap_name=self.snap_obj.name,
            namespace=self.snap_obj.namespace,
            size=pvc_size,
            pvc_name=restore_pvc_name,
            restore_pvc_yaml=restore_pvc_yaml,
        )
        helpers.wait_for_resource_state(
            restore_pvc_obj,
            constants.STATUS_BOUND,
            timeout=3600  # setting this to 60 min.
            # since restore can take a long time, and we want it to finish.
        )
        teardown_factory(restore_pvc_obj)
        restore_pvc_obj.reload()
        log.info("PVC was restored from the snapshot")
        test_results["restore"][
            "time"] = helpers.measure_pvc_creation_time(
                self.interface, restore_pvc_obj.name)
        test_results["restore"]["speed"] = int(
            test_results["dataset"] / test_results["restore"]["time"])
        log.info(
            f'Snapshot restore time is : {test_results["restore"]["time"]}'
        )
        log.info(
            f'restore speed is : {test_results["restore"]["speed"]} MB/sec')

        # Step 5. Attach a new pod to the restored PVC
        restore_pod_obj = helpers.create_pod(
            interface_type=self.interface,
            pvc_name=restore_pvc_obj.name,
            namespace=self.snap_obj.namespace,
            pod_dict_path=constants.NGINX_POD_YAML,
        )

        # Confirm that the pod is running
        helpers.wait_for_resource_state(resource=restore_pod_obj,
                                        state=constants.STATUS_RUNNING)
        teardown_factory(restore_pod_obj)
        restore_pod_obj.reload()

        # Step 6. Verify that the file is present on the new pod also.
        log.info(f"Checking the existence of {file_name} "
                 f"on restore pod {restore_pod_obj.name}")
        assert pod.check_file_existence(
            restore_pod_obj, file_path), f"File {file_name} doesn't exist"
        log.info(f"File {file_name} exists in {restore_pod_obj.name}")

        # Step 7. Verify that the md5sum matches
        log.info(f"Verifying that md5sum of {file_name} "
                 f"on pod {self.pod_obj.name} matches with md5sum "
                 f"of the same file on restore pod {restore_pod_obj.name}")
        assert pod.verify_data_integrity(
            restore_pod_obj, file_name,
            orig_md5_sum), "Data integrity check failed"
        log.info("Data integrity check passed, md5sums are the same")

        all_results.append(test_results)

    # logging the test summary, all info in one place for easy log reading
    c_speed, c_runtime, r_speed, r_runtime = (0 for i in range(4))
    log.info("Test summary :")
    for tst in all_results:
        c_speed += tst["create"]["speed"]
        c_runtime += tst["create"]["time"]
        r_speed += tst["restore"]["speed"]
        r_runtime += tst["restore"]["time"]
        log.info(
            f"Test {tst['test_num']} results : dataset is {tst['dataset']} MiB. "
            f"Take snapshot time is {tst['create']['time']} "
            f"at {tst['create']['speed']} MiB/Sec "
            f"Restore from snapshot time is {tst['restore']['time']} "
            f"at {tst['restore']['speed']} MiB/Sec ")
    log.info(
        f" Average snapshot creation time is {c_runtime / self.tests_numbers} sec."
    )
    log.info(
        f" Average snapshot creation speed is {c_speed / self.tests_numbers} MiB/sec"
    )
    log.info(
        f" Average snapshot restore time is {r_runtime / self.tests_numbers} sec."
    )
    log.info(
        f" Average snapshot restore speed is {r_speed / self.tests_numbers} MiB/sec"
    )

def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern):
    """
    This is a basic fio perf test
    """
    # Deploy the ripsaw operator
    log.info("Deploying the ripsaw operator")
    ripsaw.apply_crd("resources/crds/"
                     "ripsaw_v1alpha1_ripsaw_crd.yaml")

    if interface == "CephBlockPool":
        sc = constants.CEPHBLOCKPOOL_SC
    else:
        sc = constants.CEPHFILESYSTEM_SC

    # Create fio benchmark
    log.info("Create resource file for fio workload")
    fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

    # Saving the original elastic-search IP and PORT - if defined in the yaml
    if "elasticsearch" in fio_cr["spec"]:
        backup_es = fio_cr["spec"]["elasticsearch"]
    else:
        log.warning(
            "Elastic-search information does not exist in the YAML file")
        fio_cr["spec"]["elasticsearch"] = {}
        backup_es = {}  # avoid an undefined name when restoring below

    # Use the internally defined elastic-search server in the test - if it exists
    if es:
        fio_cr["spec"]["elasticsearch"] = {
            "server": es.get_ip(),
            "port": es.get_port(),
        }

    # Setting the data set to 40% of the total storage capacity
    ceph_cluster = CephCluster()
    ceph_capacity = ceph_cluster.get_ceph_capacity()
    total_data_set = int(ceph_capacity * 0.4)
    filesize = int(fio_cr["spec"]["workload"]["args"]["filesize"].replace(
        "GiB", ""))
    # To make sure the number of App pods will not be more than 50, in case
    # of a large data set, change the size of the file each pod will work on
    if total_data_set > 500:
        filesize = int(ceph_capacity * 0.008)
        fio_cr["spec"]["workload"]["args"]["filesize"] = f"{filesize}GiB"
    # make sure that the storage size is larger than the file size
    fio_cr["spec"]["workload"]["args"][
        "storagesize"] = f"{int(filesize * 1.2)}Gi"
    fio_cr["spec"]["workload"]["args"]["servers"] = int(total_data_set /
                                                        filesize)
    log.info(f"Total Data set to work on is : {total_data_set} GiB")

    environment = get_environment_info()
    if not environment["user"] == "":
        fio_cr["spec"]["test_user"] = environment["user"]
    fio_cr["spec"]["clustername"] = environment["clustername"]

    log.debug(f"Environment information is : {environment}")

    fio_cr["spec"]["workload"]["args"]["storageclass"] = sc
    if io_pattern == "sequential":
        fio_cr["spec"]["workload"]["args"]["jobs"] = ["write", "read"]
        fio_cr["spec"]["workload"]["args"]["iodepth"] = 1
    log.info(f"The FIO CR file is {fio_cr}")
    fio_cr_obj = OCS(**fio_cr)
    fio_cr_obj.create()

    # Wait for the fio client pod to be created
    for fio_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern,
                                  "fio-client", constants.RIPSAW_NAMESPACE):
        try:
            if fio_pod[0] is not None:
                fio_client_pod = fio_pod[0]
                break
        except IndexError:
            log.info("Bench pod not ready yet")

    # Getting the start time of the test
    start_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())

    # Getting the UUID from inside the benchmark pod
    uuid = ripsaw.get_uuid(fio_client_pod)
    # Setting back the original elastic-search information
    fio_cr["spec"]["elasticsearch"] = backup_es

    full_results = FIOResultsAnalyse(uuid, fio_cr)

    # Initialize the results doc file.
    for key in environment:
        full_results.add_key(key, environment[key])

    # Setting the global parameters of the test
    full_results.add_key("io_pattern", io_pattern)
    full_results.add_key("dataset", f"{total_data_set}GiB")
    full_results.add_key("file_size",
                         fio_cr["spec"]["workload"]["args"]["filesize"])
    full_results.add_key("servers",
                         fio_cr["spec"]["workload"]["args"]["servers"])
    full_results.add_key("samples",
                         fio_cr["spec"]["workload"]["args"]["samples"])
    full_results.add_key("operations",
                         fio_cr["spec"]["workload"]["args"]["jobs"])
    full_results.add_key("block_sizes",
                         fio_cr["spec"]["workload"]["args"]["bs"])
    full_results.add_key("io_depth",
                         fio_cr["spec"]["workload"]["args"]["iodepth"])
    full_results.add_key("jobs",
                         fio_cr["spec"]["workload"]["args"]["numjobs"])
    full_results.add_key(
        "runtime", {
            "read": fio_cr["spec"]["workload"]["args"]["read_runtime"],
            "write": fio_cr["spec"]["workload"]["args"]["write_runtime"],
        },
    )
    full_results.add_key(
        "storageclass", fio_cr["spec"]["workload"]["args"]["storageclass"])
    full_results.add_key("vol_size",
                         fio_cr["spec"]["workload"]["args"]["storagesize"])

    # Wait for the fio pod to initialize and complete
    log.info("Waiting for fio_client to complete")
    pod_obj = OCP(kind="pod")
    pod_obj.wait_for_resource(
        condition="Completed",
        resource_name=fio_client_pod,
        timeout=18000,
        sleep=300,
    )

    # Getting the end time of the test
    end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())
    full_results.add_key("test_time", {
        "start": start_time,
        "end": end_time
    })

    output = run_cmd(f"oc logs {fio_client_pod}")
    log.info(f"The Test log is : {output}")
    try:
        if "Fio failed to execute" not in output:
            log.info("FIO has completed successfully")
    except IOError:
        log.info("FIO failed to complete")

    # Clean up fio benchmark
    log.info("Deleting FIO benchmark")
    fio_cr_obj.delete()
    log.debug(f"Full results is : {full_results.results}")

    # if an internal ES exists, copy all data from the internal to the main ES
    if es:
        log.info("Copy all data from Internal ES to Main ES")
        es._copy(full_results.es)
    # Adding this sleep between the copy and the analyzing of the results
    # since sometimes the results of the read (just after write) are empty
    time.sleep(30)
    full_results.analyze_results()  # Analyze the results
    # Writing the analyzed test results to the Elastic-Search server
    full_results.es_write()
    full_results.codespeed_push()  # Push results to codespeed

    # Creating a full link to the results on the ES server
    log.info(f"The Result can be found at : {full_results.results_link()}")

class TestSmallFileWorkloadScale(E2ETest):
    """
    Deploy the benchmark operator and run different scale tests.
    Call the common small files workload routine to run the SmallFile workload.
    """

    def setup(self):
        """
        Initialize the test environment
        """
        # Deploy an internal ES server - no need to keep the results,
        # so don't use the production ES
        self.es = ElasticSearch()

        # Initialize the SmallFiles workload, based on benchmark-operator
        self.small_files = SmallFiles(self.es)

        self.ceph_cluster = CephCluster()

        # Get the total storage capacity
        self.ceph_capacity = self.ceph_cluster.get_ceph_capacity()
        log.info(f"Total storage capacity is {self.ceph_capacity:,.2f} GiB")

        # Collect the pools usage before the test starts
        self.orig_data = self.get_cephfs_data()

    def teardown(self):
        """
        Teardown the test environment
        """
        self.small_files.cleanup()
        self.es.cleanup()

    def get_cephfs_data(self):
        """
        Look through the ceph pools and find the space usage on all ceph
        filesystem pools

        Returns:
            dict: byte usage, indexed by pool name.
        """
        ceph_status = self.ceph_cluster.toolbox.exec_ceph_cmd(ceph_cmd="ceph df")
        ret_value = {}
        for pool in ceph_status["pools"]:
            # Only the data pool is of interest (not the metadata pool)
            if "cephfilesystem" in pool["name"]:
                ret_value[pool["name"]] = pool["stats"]["bytes_used"]
        return ret_value

    def display_ceph_usage(self, msg, data):
        """
        Display the pool usage in a pretty way

        Args:
            msg (str): the message string to display with the values
            data (dict): dictionary of pools -> capacity (in bytes)
        """
        log.info(f"The pools usage {msg} is :")
        for entry in data:
            log.info(f"{entry} now uses {data[entry]:,} bytes")

    @pytest.mark.parametrize(
        argnames=["file_size", "files", "threads", "interface"],
        argvalues=[
            # 500K files, ~4GB
            pytest.param(*[8, 125000, 4, constants.CEPHFILESYSTEM]),
            # 5M files, ~152GB
            pytest.param(*[32, 1250000, 4, constants.CEPHFILESYSTEM]),
        ],
    )
    def test_scale_smallfile_workload(self, file_size, files, threads, interface):
        # updating the benchmark parameters
        self.small_files.setup_storageclass(interface)
        self.small_files.setup_test_params(file_size, files, threads, 1)

        # Verify that we have enough storage capacity to run the test.
        self.small_files.setup_vol_size(file_size, files, threads, self.ceph_capacity)

        # Run the benchmark to create files on the volume
        self.small_files.setup_operations("create")
        self.small_files.run()

        # Collect the pools usage after the creation is done.
        self.run_data = self.get_cephfs_data()

        # Delete the benchmark data
        self.small_files.delete()

        # Get the usage capacity immediately after the deletion
        self.now_data = self.get_cephfs_data()

        # Wait 3 minutes for the backend deletion to actually start.
        time.sleep(180)

        # Query the storage usage every 2 min.; if there is no difference
        # between two samples, the backend cleanup is done.
        still_going_down = True
        while still_going_down:
            log.info("Waiting for Ceph to finish cleaning up")
            time.sleep(120)
            self.new_data = self.get_cephfs_data()
            still_going_down = False
            for entry in self.new_data:
                if self.new_data[entry] < self.now_data[entry]:
                    still_going_down = True
                    self.now_data[entry] = self.new_data[entry]

        self.display_ceph_usage("Before the test", self.orig_data)
        self.display_ceph_usage("After data creation", self.run_data)

        # Make sure that the test actually wrote data to the volume -
        # at least 1 GiB.
        for entry in self.run_data:
            if re.search("metadata", entry):
                # Since we are interested in the data written and not the
                # metadata, skip the metadata pool
                continue
            written = self.run_data[entry] - self.orig_data[entry]
            check = written > constants.GB
            errmsg = (
                f"{written:,.2f} bytes were written to {entry} - "
                "This is not enough for the test"
            )
            assert check, errmsg

        self.display_ceph_usage("After data deletion", self.now_data)

        for entry in self.now_data:
            # A leak is indicated if over 20% more storage is used and more
            # than 3 GiB.
            try:
                ratio = self.now_data[entry] / self.orig_data[entry]
            except ZeroDivisionError:
                ratio = self.now_data[entry]

            added_data = (self.now_data[entry] - self.orig_data[entry]) / constants.GB
            # in some cases (especially for metadata), it might be that after
            # the test there is less data in the pool than before the test.
            if added_data < 0:
                added_data = 0
                ratio = 1
            log.info(
                "The ratio between the capacity before and after the test "
                f"on {entry} is : {ratio:.2f} ; {added_data:,.2f} GiB"
            )

            check = (ratio < 1.20) or (added_data < 3)
            errmsg = (
                f"{entry} is over 20% (or 3 GiB) larger "
                f"[{ratio} ; {added_data}] -- possible leak"
            )
            assert check, errmsg

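# The cleanup wait above is a convergence poll: keep sampling the pool
# gauges until no value drops between two consecutive samples. A generic
# sketch of the same pattern (wait_until_stable is hypothetical):
import time

def wait_until_stable(sample_fn, interval=120):
    """Poll sample_fn() until no per-key value is still shrinking."""
    prev = sample_fn()
    while True:
        time.sleep(interval)
        cur = sample_fn()
        # done once nothing is still going down between two samples
        if all(cur[k] >= prev[k] for k in cur):
            return cur
        prev = {k: min(prev[k], cur[k]) for k in cur}
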
def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern):
    """
    This is a basic fio perf test
    """
    # Deploy the ripsaw operator
    log.info("Deploying the ripsaw operator")
    ripsaw.apply_crd('resources/crds/'
                     'ripsaw_v1alpha1_ripsaw_crd.yaml')
    sc = ('ocs-storagecluster-ceph-rbd' if interface == 'CephBlockPool'
          else 'ocs-storagecluster-cephfs')

    # Create fio benchmark
    log.info("Create resource file for fio workload")
    fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

    # Saving the original elastic-search IP and PORT - if defined in the yaml
    es_server = ""
    es_port = ""
    if 'elasticsearch' in fio_cr['spec']:
        if 'server' in fio_cr['spec']['elasticsearch']:
            es_server = fio_cr['spec']['elasticsearch']['server']
        if 'port' in fio_cr['spec']['elasticsearch']:
            es_port = fio_cr['spec']['elasticsearch']['port']
    else:
        fio_cr['spec']['elasticsearch'] = {}

    # Use the internally defined elastic-search server in the test
    fio_cr['spec']['elasticsearch'] = {
        'server': es.get_ip(),
        'port': es.get_port()
    }

    # Setting the data set to 40% of the total storage capacity but
    # not more than 600GiB
    ceph_cluster = CephCluster()
    total_data_set = int(ceph_cluster.get_ceph_capacity() * 0.4)
    filesize = int(fio_cr['spec']['workload']['args']['filesize'].replace(
        'GiB', ''))
    # To make sure the number of App pods will not be more than 50, in case
    # of a large data set, change the size of the file each pod will work on
    if total_data_set > 500:
        filesize = int(ceph_cluster.get_ceph_capacity() * 0.008)
        fio_cr['spec']['workload']['args']['filesize'] = f'{filesize}GiB'
    # make sure that the storage size is larger than the file size
    fio_cr['spec']['workload']['args'][
        'storagesize'] = f'{int(filesize * 1.2)}Gi'
    fio_cr['spec']['workload']['args']['servers'] = int(total_data_set /
                                                        filesize)
    log.info(f'Total Data set to work on is : {total_data_set} GiB')

    fio_cr['spec']['clustername'] = config.ENV_DATA[
        'platform'] + get_build() + get_ocs_version()
    fio_cr['spec']['test_user'] = get_ocs_version(
    ) + interface + io_pattern
    fio_cr['spec']['workload']['args']['storageclass'] = sc
    if io_pattern == 'sequential':
        fio_cr['spec']['workload']['args']['jobs'] = ['write', 'read']
    log.info(f'fio_cr: {fio_cr}')
    fio_cr_obj = OCS(**fio_cr)
    fio_cr_obj.create()

    # Wait for the fio client pod to be created
    for fio_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern,
                                  'fio-client', constants.RIPSAW_NAMESPACE):
        try:
            if fio_pod[0] is not None:
                fio_client_pod = fio_pod[0]
                break
        except IndexError:
            log.info("Bench pod not ready yet")

    # Wait for the fio pod to initialize and complete
    log.info("Waiting for fio_client to complete")
    pod_obj = OCP(kind='pod')
    pod_obj.wait_for_resource(
        condition='Completed',
        resource_name=fio_client_pod,
        timeout=18000,
        sleep=300,
    )

    output = run_cmd(f'oc logs {fio_client_pod}')
    try:
        if 'Fio failed to execute' not in output:
            log.info("FIO has completed successfully")
    except IOError:
        log.info("FIO failed to complete")

    # Clean up fio benchmark
    log.info("Deleting FIO benchmark")
    fio_cr_obj.delete()

    # Setting back the original elastic-search information
    fio_cr['spec']['elasticsearch'] = {
        'server': es_server,
        'port': es_port
    }

    analyze_regression(io_pattern,
                       sc,
                       es_username=fio_cr['spec']['test_user'])

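# All the fio variants above stash the elastic-search section of the CR,
# point it at a scratch server for the run, and put it back before the
# results are reported. A sketch of that save-and-restore pattern
# (run_with_es_override and run_fn are hypothetical names):
def run_with_es_override(cr, server, port, run_fn):
    backup = cr["spec"].get("elasticsearch", {})
    cr["spec"]["elasticsearch"] = {"server": server, "port": port}
    try:
        return run_fn(cr)
    finally:
        # restore the original destination even if the run fails
        cr["spec"]["elasticsearch"] = backup
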