def test_pvc_multiple_clone_performance(
    self,
    interface_iterate,
    teardown_factory,
    storageclass_factory,
    pvc_factory,
    pod_factory,
):
    """
    1. Create a PVC.
       The PVC size is calculated in the test and depends on the storage capacity,
       but is not less than 1 GiB. The test uses ~70% of the storage capacity;
       minimum storage capacity is 1 TiB.
    2. Fill the PVC with 70% of data.
    3. Take a clone of the PVC and measure the time and speed of its creation by
       reading the start and end creation times from the relevant logs.
    4. Repeat the previous step a number of times (maximal num_of_clones is 512).
    5. Print all measured statistics for all the clones.

    Raises:
        StorageNotSufficientException: in case of not enough capacity on the cluster

    """
    num_of_clones = 512

    # Getting the total Storage capacity
    ceph_cluster = CephCluster()
    ceph_capacity = int(ceph_cluster.get_ceph_capacity())

    # Use 70% of the storage capacity in the test
    capacity_to_use = int(ceph_capacity * 0.7)

    # Since we do not want to use more than 65%, we add 35% to the needed
    # capacity, and the minimum PVC size is 1 GiB
    need_capacity = int((num_of_clones + 2) * 1.35)

    # The test will run only on a system with enough capacity
    if capacity_to_use < need_capacity:
        err_msg = (f"The system has only {ceph_capacity} GiB, "
                   f"we want to use only {capacity_to_use} GiB, "
                   f"and we need {need_capacity} GiB to run the test")
        log.error(err_msg)
        raise exceptions.StorageNotSufficientException(err_msg)

    # Calculating the PVC size in GiB
    pvc_size = int(capacity_to_use / (num_of_clones + 2))

    self.interface = interface_iterate
    self.sc_obj = storageclass_factory(self.interface)

    if self.interface == constants.CEPHFILESYSTEM:
        sc = "CephFS"
    if self.interface == constants.CEPHBLOCKPOOL:
        sc = "RBD"
    self.full_log_path = get_full_test_logs_path(cname=self)
    self.full_log_path += f"-{sc}"

    self.pvc_obj = pvc_factory(interface=self.interface,
                               size=pvc_size,
                               status=constants.STATUS_BOUND)
    self.pod_obj = pod_factory(interface=self.interface,
                               pvc=self.pvc_obj,
                               status=constants.STATUS_RUNNING)

    # Calculating the file size as 70% of the PVC size
    filesize = self.pvc_obj.size * 0.70
    # Change the file size to MB for the FIO function
    file_size = f"{int(filesize * constants.GB2MB)}M"
    file_name = self.pod_obj.name

    log.info(f"Total capacity size is : {ceph_capacity} GiB, "
             f"Going to use {need_capacity} GiB, "
             f"With {num_of_clones} clones to {pvc_size} GiB PVC. "
             f"File size to be written is : {file_size} "
             f"with the name of {file_name}")

    self.params = {}
    self.params["clonenum"] = f"{num_of_clones}"
    self.params["filesize"] = file_size
    self.params["ERRMSG"] = "Error in command"

    clone_yaml = self.build_params()
    performance_lib.write_fio_on_pod(self.pod_obj, file_size)

    # Running the test
    results = []
    for test_num in range(1, int(self.params["clonenum"]) + 1):
        log.info(f"Starting test number {test_num}")
        ct = self.create_clone(test_num, clone_yaml)
        speed = self.params["datasize"] / ct
        results.append({"Clone Num": test_num, "time": ct, "speed": speed})
        log.info(f"Results for clone number {test_num} are : "
                 f"Creation time is {ct} secs, Creation speed {speed} MB/sec")

    for r in results:
        log.info(f"Clone number {r['Clone Num']} creation time is {r['time']} secs.")
        log.info(f"Clone number {r['Clone Num']} creation speed is {r['speed']} MB/sec.")

    creation_time_list = [r["time"] for r in results]
    average_creation_time = statistics.mean(creation_time_list)
    log.info(f"Average creation time is {average_creation_time} secs.")

    creation_speed_list = [r["speed"] for r in results]
    average_creation_speed = statistics.mean(creation_speed_list)
    log.info(f"Average creation speed is {average_creation_speed} MB/sec.")

    self.results_path = get_full_test_logs_path(cname=self)

    # Produce ES report
    # Collecting environment information
    self.get_env_info()

    # Initialize the results doc file.
    full_results = self.init_full_results(
        ResultsAnalyse(
            self.uuid,
            self.crd_data,
            self.full_log_path,
            "pvc_multiple_clone_measurement",
        ))
    full_results.add_key("interface", self.interface)
    full_results.add_key("clones_num", num_of_clones)
    full_results.add_key("clone_size", pvc_size)
    full_results.add_key("multi_clone_creation_time", creation_time_list)
    full_results.add_key("multi_clone_creation_time_average", average_creation_time)
    full_results.add_key("multi_clone_creation_speed", creation_speed_list)
    full_results.add_key("multi_clone_creation_speed_average", average_creation_speed)

    # Write the test results into the ES server
    if full_results.es_write():
        res_link = full_results.results_link()
        log.info(f"The Result can be found at : {res_link}")

        # Create a text file with the results of all subtests (4 - according to the parameters)
        self.write_result_to_file(res_link)
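# A minimal, hedged sketch of the capacity arithmetic used in the test above, with an
# assumed 2 TiB cluster; in the real test ceph_capacity comes from
# CephCluster().get_ceph_capacity(), and 1024 stands in for constants.GB2MB.
def _example_clone_sizing():
    num_of_clones = 512
    ceph_capacity = 2048  # GiB, illustrative value only
    capacity_to_use = int(ceph_capacity * 0.7)  # 1433 GiB usable for the test
    need_capacity = int((num_of_clones + 2) * 1.35)  # 693 GiB minimum needed
    assert capacity_to_use >= need_capacity  # otherwise StorageNotSufficientException
    pvc_size = int(capacity_to_use / (num_of_clones + 2))  # 2 GiB per PVC here
    file_size = f"{int(pvc_size * 0.70 * 1024)}M"  # 70% of the PVC, converted to MB
    return pvc_size, file_size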
class Sanity: """ Class for cluster health and functional validations """ def __init__(self): """ Initializer for Sanity class - Init CephCluster() in order to set the cluster status before starting the tests """ self.pvc_objs = list() self.pod_objs = list() self.obj_data = "" self.ceph_cluster = CephCluster() def health_check(self, cluster_check=True, tries=20): """ Perform Ceph and cluster health checks """ wait_for_cluster_connectivity(tries=400) logger.info("Checking cluster and Ceph health") node.wait_for_nodes_status(timeout=300) ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'], tries=tries) if cluster_check: self.ceph_cluster.cluster_health_check(timeout=60) def create_resources(self, pvc_factory, pod_factory, run_io=True): """ Sanity validation - Create resources (FS and RBD) and run IO Args: pvc_factory (function): A call to pvc_factory function pod_factory (function): A call to pod_factory function run_io (bool): True for run IO, False otherwise """ logger.info( "Creating resources and running IO as a sanity functional validation" ) for interface in [constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM]: pvc_obj = pvc_factory(interface) self.pvc_objs.append(pvc_obj) self.pod_objs.append(pod_factory(pvc=pvc_obj, interface=interface)) if run_io: for pod in self.pod_objs: pod.run_io('fs', '1G', runtime=30) for pod in self.pod_objs: get_fio_rw_iops(pod) self.create_obc() self.verify_obc() def create_obc(self): """ OBC creation for RGW and Nooba """ if config.ENV_DATA['platform'] in constants.ON_PREM_PLATFORMS: obc_rgw = templating.load_yaml(constants.RGW_OBC_YAML) obc_rgw_data_yaml = tempfile.NamedTemporaryFile( mode='w+', prefix='obc_rgw_data', delete=False) templating.dump_data_to_temp_yaml(obc_rgw, obc_rgw_data_yaml.name) logger.info("Creating OBC for rgw") run_cmd(f"oc create -f {obc_rgw_data_yaml.name}", timeout=2400) self.obc_rgw = obc_rgw['metadata']['name'] obc_nooba = templating.load_yaml(constants.MCG_OBC_YAML) obc_mcg_data_yaml = tempfile.NamedTemporaryFile(mode='w+', prefix='obc_mcg_data', delete=False) templating.dump_data_to_temp_yaml(obc_nooba, obc_mcg_data_yaml.name) logger.info("create OBC for mcg") run_cmd(f"oc create -f {obc_mcg_data_yaml.name}", timeout=2400) self.obc_mcg = obc_nooba['metadata']['name'] def delete_obc(self): """ Clenaup OBC resources created above """ if config.ENV_DATA['platform'] in constants.ON_PREM_PLATFORMS: logger.info(f"Deleting rgw obc {self.obc_rgw}") obcrgw = OCP(kind='ObjectBucketClaim', resource_name=f'{self.obc_rgw}') run_cmd(f"oc delete obc/{self.obc_rgw}") obcrgw.wait_for_delete(resource_name=f'{self.obc_rgw}', timeout=300) logger.info(f"Deleting mcg obc {self.obc_mcg}") obcmcg = OCP(kind='ObjectBucketClaim', resource_name=f'{self.obc_mcg}') run_cmd(f"oc delete obc/{self.obc_mcg} -n " f"{defaults.ROOK_CLUSTER_NAMESPACE}") obcmcg.wait_for_delete(resource_name=f'{self.obc_mcg}', timeout=300) def verify_obc(self): """ OBC verification from external cluster perspective, we will check 2 OBCs """ sample = TimeoutSampler(300, 5, self.ceph_cluster.noobaa_health_check) sample.wait_for_func_status(True) def delete_resources(self): """ Sanity validation - Delete resources (FS and RBD) """ logger.info("Deleting resources as a sanity functional validation") self.delete_obc() for pod_obj in self.pod_objs: pod_obj.delete() for pod_obj in self.pod_objs: pod_obj.ocp.wait_for_delete(pod_obj.name) for pvc_obj in self.pvc_objs: pvc_obj.delete() for pvc_obj in self.pvc_objs: pvc_obj.ocp.wait_for_delete(pvc_obj.name) @ignore_leftovers def 
create_pvc_delete(self, multi_pvc_factory, project=None): """ Creates and deletes all types of PVCs """ # Create rbd pvcs pvc_objs_rbd = create_pvcs(multi_pvc_factory=multi_pvc_factory, interface='CephBlockPool', project=project, status="", storageclass=None) # Create cephfs pvcs pvc_objs_cephfs = create_pvcs(multi_pvc_factory=multi_pvc_factory, interface='CephFileSystem', project=project, status="", storageclass=None) all_pvc_to_delete = pvc_objs_rbd + pvc_objs_cephfs # Check pvc status for pvc_obj in all_pvc_to_delete: helpers.wait_for_resource_state(resource=pvc_obj, state=constants.STATUS_BOUND, timeout=300) # Start deleting PVC delete_pvcs(all_pvc_to_delete) # Check PVCs are deleted for pvc_obj in all_pvc_to_delete: pvc_obj.ocp.wait_for_delete(resource_name=pvc_obj.name) logger.info("All PVCs are deleted as expected") def obc_put_obj_create_delete(self, mcg_obj, bucket_factory): """ Creates bucket then writes, reads and deletes objects """ bucket_name = bucket_factory(amount=1, interface='OC')[0].name self.obj_data = "A string data" for i in range(0, 30): key = 'Object-key-' + f"{i}" logger.info(f"Write, read and delete object with key: {key}") assert s3_put_object(mcg_obj, bucket_name, key, self.obj_data), f"Failed: Put object, {key}" assert s3_get_object(mcg_obj, bucket_name, key), f"Failed: Get object, {key}" assert s3_delete_object(mcg_obj, bucket_name, key), f"Failed: Delete object, {key}"
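# Hedged usage sketch for the OBC helper above; mcg_obj and bucket_factory are assumed
# to be the standard pytest fixtures, and Sanity is the class defined above.
def test_obc_sanity_example(mcg_obj, bucket_factory):
    sanity_helpers = Sanity()
    # Creates one OC bucket and writes, reads and deletes 30 objects in it.
    sanity_helpers.obc_put_obj_create_delete(mcg_obj, bucket_factory)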
def test_respin_mcg_pod_and_check_data_integrity_crd( self, mcg_obj, cld_mgr, awscli_pod_session, namespace_store_factory, bucket_factory, test_directory_setup, mcg_pod, ): """ Test Write to ns bucket using CRDs and read directly from AWS. Respin one of mcg pods when data are uploaded. """ logger.info("Create the namespace resources and verify health") nss_tup = ("oc", {"aws": [(1, self.DEFAULT_REGION)]}) ns_store = namespace_store_factory(*nss_tup)[0] logger.info( "Create the namespace bucket on top of the namespace stores") bucketclass_dict = { "interface": "OC", "namespace_policy_dict": { "type": "Single", "namespacestores": [ns_store], }, } logger.info( "Create the namespace bucket on top of the namespace resource") ns_bucket = bucket_factory( amount=1, interface=bucketclass_dict["interface"], bucketclass=bucketclass_dict, )[0].name s3_creds = { "access_key_id": cld_mgr.aws_client.access_key, "access_key": cld_mgr.aws_client.secret_key, "endpoint": constants.MCG_NS_AWS_ENDPOINT, "region": self.DEFAULT_REGION, } original_folder = test_directory_setup.origin_dir result_folder = test_directory_setup.result_dir logger.info("Upload files to NS bucket") self.write_files_to_pod_and_upload( mcg_obj, awscli_pod_session, bucket_to_write=ns_bucket, original_dir=original_folder, amount=3, ) logger.info(f"Respin mcg resource {mcg_pod}") noobaa_pods = pod.get_noobaa_pods() pod_obj = [pod for pod in noobaa_pods if pod.name.startswith(mcg_pod)][0] pod_obj.delete(force=True) logger.info("Wait for noobaa pods to come up") assert pod_obj.ocp.wait_for_resource( condition="Running", selector="app=noobaa", resource_count=len(noobaa_pods), timeout=1000, ) logger.info("Wait for noobaa health to be OK") ceph_cluster_obj = CephCluster() ceph_cluster_obj.wait_for_noobaa_health_ok() logger.info("Read files directly from AWS") self.download_files( mcg_obj, awscli_pod_session, bucket_to_read=ns_store.uls_name, download_dir=result_folder, s3_creds=s3_creds, ) logger.info("Compare between uploaded files and downloaded files") assert self.compare_dirs( awscli_pod_session, origin=original_folder, destination=result_folder, amount=3, )
def test_upgrade_ocp(self): """ Tests OCS stability when upgrading OCP """ ceph_cluster = CephCluster() with CephHealthMonitor(ceph_cluster): ocp_channel = config.UPGRADE.get('ocp_channel', ocp.get_ocp_upgrade_channel()) ocp_upgrade_version = config.UPGRADE.get('ocp_upgrade_version') if not ocp_upgrade_version: ocp_upgrade_version = get_latest_ocp_version( channel=ocp_channel) ocp_arch = config.UPGRADE['ocp_arch'] target_image = f"{ocp_upgrade_version}-{ocp_arch}" elif ocp_upgrade_version.endswith(".nightly"): target_image = expose_ocp_version(ocp_upgrade_version) logger.info(f"Target image; {target_image}") image_path = config.UPGRADE['ocp_upgrade_path'] cluster_operators = ocp.get_all_cluster_operators() logger.info(f" oc version: {ocp.get_current_oc_version()}") # Verify Upgrade subscription channel: ocp.patch_ocp_upgrade_channel(ocp_channel) for sampler in TimeoutSampler(timeout=250, sleep=15, func=ocp.verify_ocp_upgrade_channel, channel_variable=ocp_channel): if sampler: logger.info(f"OCP Channel:{ocp_channel}") break # Upgrade OCP logger.info(f"full upgrade path: {image_path}:{target_image}") ocp.upgrade_ocp(image=target_image, image_path=image_path) # Wait for upgrade for ocp_operator in cluster_operators: logger.info(f"Checking upgrade status of {ocp_operator}:") # ############ Workaround for issue 2624 ####### name_changed_between_versions = ( 'service-catalog-apiserver', 'service-catalog-controller-manager') if ocp_operator in name_changed_between_versions: logger.info(f"{ocp_operator} upgrade will not be verified") continue # ############ End of Workaround ############### ver = ocp.get_cluster_operator_version(ocp_operator) logger.info(f"current {ocp_operator} version: {ver}") for sampler in TimeoutSampler( timeout=2700, sleep=60, func=ocp.confirm_cluster_operator_version, target_version=target_image, cluster_operator=ocp_operator): if sampler: logger.info(f"{ocp_operator} upgrade completed!") break else: logger.info( f"{ocp_operator} upgrade did not completed yet!") # post upgrade validation: check cluster operator status cluster_operators = ocp.get_all_cluster_operators() for ocp_operator in cluster_operators: logger.info(f"Checking cluster status of {ocp_operator}") for sampler in TimeoutSampler( timeout=2700, sleep=60, func=ocp.verify_cluster_operator_status, cluster_operator=ocp_operator): if sampler: break else: logger.info(f"{ocp_operator} status is not valid") # Post upgrade validation: check cluster version status logger.info("Checking clusterversion status") for sampler in TimeoutSampler( timeout=900, sleep=15, func=ocp.validate_cluster_version_status): if sampler: logger.info("Upgrade Completed Successfully!") break
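# The upgrade flow above relies on the TimeoutSampler polling pattern throughout.
# A minimal sketch of that pattern with a stand-in predicate (the import path is an
# assumption; the real code polls helpers such as ocp.verify_ocp_upgrade_channel).
from ocs_ci.utility.utils import TimeoutSampler


def _example_poll_until_ready(is_ready):
    # Poll is_ready() every 15 seconds for up to 250 seconds and stop on the first
    # truthy result; TimeoutSampler raises if the timeout expires.
    for sampler in TimeoutSampler(timeout=250, sleep=15, func=is_ready):
        if sampler:
            logger.info("Condition met")
            break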
def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern): """ This is a basic fio perf test """ # Deployment ripsaw log.info("Deploying ripsaw operator") ripsaw.apply_crd('resources/crds/' 'ripsaw_v1alpha1_ripsaw_crd.yaml') if interface == 'CephBlockPool': sc = constants.CEPHBLOCKPOOL_SC else: sc = constants.CEPHFILESYSTEM_SC # Create fio benchmark log.info("Create resource file for fio workload") fio_cr = templating.load_yaml(constants.FIO_CR_YAML) # Saving the Original elastic-search IP and PORT - if defined in yaml if 'elasticsearch' in fio_cr['spec']: backup_es = fio_cr['spec']['elasticsearch'] else: log.warning( 'Elastic Search information does not exists in YAML file') fio_cr['spec']['elasticsearch'] = {} # Use the internal define elastic-search server in the test - if exist if es: fio_cr['spec']['elasticsearch'] = { 'server': es.get_ip(), 'port': es.get_port() } # Setting the data set to 40% of the total storage capacity ceph_cluster = CephCluster() ceph_capacity = ceph_cluster.get_ceph_capacity() total_data_set = int(ceph_capacity * 0.4) filesize = int(fio_cr['spec']['workload']['args']['filesize'].replace( 'GiB', '')) # To make sure the number of App pods will not be more then 50, in case # of large data set, changing the size of the file each pod will work on if total_data_set > 500: filesize = int(ceph_capacity * 0.008) fio_cr['spec']['workload']['args']['filesize'] = f'{filesize}GiB' # make sure that the storage size is larger then the file size fio_cr['spec']['workload']['args'][ 'storagesize'] = f'{int(filesize * 1.2)}Gi' fio_cr['spec']['workload']['args']['servers'] = int(total_data_set / filesize) log.info(f'Total Data set to work on is : {total_data_set} GiB') environment = get_environment_info() if not environment['user'] == '': fio_cr['spec']['test_user'] = environment['user'] fio_cr['spec']['clustername'] = environment['clustername'] log.debug(f'Environment information is : {environment}') fio_cr['spec']['workload']['args']['storageclass'] = sc if io_pattern == 'sequential': fio_cr['spec']['workload']['args']['jobs'] = ['write', 'read'] fio_cr['spec']['workload']['args']['iodepth'] = 1 log.info(f'The FIO CR file is {fio_cr}') fio_cr_obj = OCS(**fio_cr) fio_cr_obj.create() # Wait for fio client pod to be created for fio_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern, 'fio-client', constants.RIPSAW_NAMESPACE): try: if fio_pod[0] is not None: fio_client_pod = fio_pod[0] break except IndexError: log.info("Bench pod not ready yet") # Getting the start time of the test start_time = time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime()) # Getting the UUID from inside the benchmark pod uuid = ripsaw.get_uuid(fio_client_pod) # Setting back the original elastic-search information fio_cr['spec']['elasticsearch'] = backup_es full_results = FIOResultsAnalyse(uuid, fio_cr) # Initialize the results doc file. 
for key in environment: full_results.add_key(key, environment[key]) # Setting the global parameters of the test full_results.add_key('io_pattern', io_pattern) full_results.add_key('dataset', f'{total_data_set}GiB') full_results.add_key('file_size', fio_cr['spec']['workload']['args']['filesize']) full_results.add_key('servers', fio_cr['spec']['workload']['args']['servers']) full_results.add_key('samples', fio_cr['spec']['workload']['args']['samples']) full_results.add_key('operations', fio_cr['spec']['workload']['args']['jobs']) full_results.add_key('block_sizes', fio_cr['spec']['workload']['args']['bs']) full_results.add_key('io_depth', fio_cr['spec']['workload']['args']['iodepth']) full_results.add_key('jobs', fio_cr['spec']['workload']['args']['numjobs']) full_results.add_key( 'runtime', { 'read': fio_cr['spec']['workload']['args']['read_runtime'], 'write': fio_cr['spec']['workload']['args']['write_runtime'] }) full_results.add_key( 'storageclass', fio_cr['spec']['workload']['args']['storageclass']) full_results.add_key('vol_size', fio_cr['spec']['workload']['args']['storagesize']) # Wait for fio pod to initialized and complete log.info("Waiting for fio_client to complete") pod_obj = OCP(kind='pod') pod_obj.wait_for_resource( condition='Completed', resource_name=fio_client_pod, timeout=18000, sleep=300, ) # Getting the end time of the test end_time = time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime()) full_results.add_key('test_time', { 'start': start_time, 'end': end_time }) output = run_cmd(f'oc logs {fio_client_pod}') log.info(f'The Test log is : {output}') try: if 'Fio failed to execute' not in output: log.info("FIO has completed successfully") except IOError: log.info("FIO failed to complete") # Clean up fio benchmark log.info("Deleting FIO benchmark") fio_cr_obj.delete() log.debug(f'Full results is : {full_results.results}') # if Internal ES is exists, Copy all data from the Internal to main ES if es: log.info('Copy all data from Internal ES to Main ES') es._copy(full_results.es) # Adding this sleep between the copy and the analyzing of the results # since sometimes the results of the read (just after write) are empty time.sleep(30) full_results.analyze_results() # Analyze the results # Writing the analyzed test results to the Elastic-Search server full_results.es_write() full_results.codespeed_push() # Push results to codespeed # Creating full link to the results on the ES server log.info(f'The Result can be found at ; {full_results.results_link()}')
class PASTest(BaseTest): """ Base class for QPAS team - Performance and Scale tests This class contain functions which used by performance and scale test, and also can be used by E2E test which used the benchmark-operator (ripsaw) """ def setup(self): """ Setting up the environment for each performance and scale test Args: name (str): The test name that will use in the performance dashboard """ log.info("Setting up test environment") self.es = None # place holder for the incluster deployment elasticsearch self.es_backup = None # place holder for the elasticsearch backup self.main_es = None # place holder for the main elasticsearch object self.benchmark_obj = None # place holder for the benchmark object self.client_pod = None # Place holder for the client pod object self.dev_mode = config.RUN["cli_params"].get("dev_mode") self.pod_obj = OCP(kind="pod", namespace=benchmark_operator.BMO_NAME) self.initialize_test_crd() # Place holders for test results file (all sub-tests together) self.results_file = "" # All tests need a uuid for the ES results, benchmark-operator base test # will overrite it with uuid pulling from the benchmark pod self.uuid = uuid4().hex # Getting the full path for the test logs self.full_log_path = os.environ.get("PYTEST_CURRENT_TEST").split( " ")[0] self.full_log_path = (self.full_log_path.replace("::", "/").replace( "[", "-").replace("]", "")) self.full_log_path = os.path.join(ocsci_log_path(), self.full_log_path) log.info(f"Logs file path name is : {self.full_log_path}") # Getting the results path as a list self.results_path = self.full_log_path.split("/") self.results_path.pop() # List of test(s) for checking the results self.workloads = [] # Collecting all Environment configuration Software & Hardware # for the performance report. self.environment = get_environment_info() self.environment["clusterID"] = get_running_cluster_id() self.ceph_cluster = CephCluster() self.used_capacity = self.get_cephfs_data() self.get_osd_info() self.get_node_info(node_type="master") self.get_node_info(node_type="worker") def teardown(self): if hasattr(self, "operator"): self.operator.cleanup() now_data = self.get_cephfs_data() # Wait 1 minutes for the backend deletion actually start. log.info("Waiting for Ceph to finish cleaning up") time.sleep(60) # Quarry the storage usage every 2 Min. if no difference between two # samples, the backend cleanup is done. still_going_down = True while still_going_down: new_data = self.get_cephfs_data() # no deletion operation is in progress if abs(now_data - new_data) < 1: still_going_down = False # up to 2% inflation of usage is acceptable if new_data > (self.used_capacity * 1.02): log.warning( f"usage capacity after the test ({new_data:.2f} GiB) " f"is more then in the begining of it ({self.used_capacity:.2f} GiB)" ) else: log.info(f"Last usage : {now_data}, Current usage {new_data}") now_data = new_data log.info("Waiting for Ceph to finish cleaning up") time.sleep(120) still_going_down = True log.info("Storage usage was cleandup") def initialize_test_crd(self): """ Initializing the test CRD file. this include the Elasticsearch info, cluster name and user name which run the test """ self.crd_data = { "spec": { "test_user": "******", # place holde only will be change in the test. "clustername": "test_cluster", # place holde only will be change in the test. 
"elasticsearch": { "server": config.PERF.get("production_es_server"), "port": config.PERF.get("production_es_port"), "url": f"http://{config.PERF.get('production_es_server')}:{config.PERF.get('production_es_port')}", }, } } # during development use the dev ES so the data in the Production ES will be clean. if self.dev_mode: self.crd_data["spec"]["elasticsearch"] = { "server": config.PERF.get("dev_es_server"), "port": config.PERF.get("dev_es_port"), "url": f"http://{config.PERF.get('dev_es_server')}:{config.PERF.get('dev_es_port')}", } def create_new_pool(self, pool_name): """ Creating new Storage pool for RBD / CephFS to use in a test so it can be deleted in the end of the test for fast cleanup Args: pool_name (str): the name of the pool to create """ if self.interface == constants.CEPHBLOCKPOOL: self.ceph_cluster.create_new_blockpool(pool_name=pool_name) self.ceph_cluster.set_pgs(poolname=pool_name, pgs=128) elif self.interface == constants.CEPHFILESYSTEM: self.ceph_cluster.create_new_filesystem(fs_name=pool_name) self.ceph_cluster.toolbox.exec_ceph_cmd( f"ceph fs subvolumegroup create {pool_name} csi") self.ceph_cluster.set_pgs(poolname=f"{pool_name}-data0", pgs=128) self.ceph_cluster.set_target_ratio( poolname="ocs-storagecluster-cephblockpool", ratio=0.24) self.ceph_cluster.set_target_ratio( poolname="ocs-storagecluster-cephfilesystem-data0", ratio=0.24) return def delete_ceph_pool(self, pool_name): """ Delete Storage pool (RBD / CephFS) that was created for the test for fast cleanup. Args: pool_name (str): the name of the pool to be delete """ if self.interface == constants.CEPHBLOCKPOOL: self.ceph_cluster.delete_blockpool(pool_name=pool_name) elif self.interface == constants.CEPHFILESYSTEM: self.ceph_cluster.delete_filesystem(fs_name=pool_name) self.ceph_cluster.set_target_ratio( poolname="ocs-storagecluster-cephblockpool", ratio=0.49) self.ceph_cluster.set_target_ratio( poolname="ocs-storagecluster-cephfilesystem-data0", ratio=0.49) return def get_cephfs_data(self): """ Look through ceph pods and find space usage on all ceph pools Returns: int: total used capacity in GiB. """ ceph_status = self.ceph_cluster.toolbox.exec_ceph_cmd( ceph_cmd="ceph df") total_used = 0 for pool in ceph_status["pools"]: total_used += pool["stats"]["bytes_used"] return total_used / constants.GB def get_osd_info(self): """ Getting the OSD's information and update the main environment dictionary. """ ct_pod = pod.get_ceph_tools_pod() osd_info = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd df") self.environment["osd_size"] = osd_info.get("nodes")[0].get( "crush_weight") self.environment["osd_num"] = len(osd_info.get("nodes")) self.environment["total_capacity"] = osd_info.get("summary").get( "total_kb_avail") self.environment["ocs_nodes_num"] = len(node.get_ocs_nodes()) def get_node_info(self, node_type="master"): """ Getting node type hardware information and update the main environment dictionary. 
Args: node_type (str): the node type to collect data about, can be : master / worker - the default is master """ if node_type == "master": nodes = node.get_master_nodes() elif node_type == "worker": nodes = node.get_worker_nodes() else: log.warning(f"Node type ({node_type}) is invalid") return oc_cmd = OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE) self.environment[f"{node_type}_nodes_num"] = len(nodes) self.environment[ f"{node_type}_nodes_cpu_num"] = oc_cmd.exec_oc_debug_cmd( node=nodes[0], cmd_list=["lscpu | grep '^CPU(s):' | awk '{print $NF}'"], ).rstrip() self.environment[ f"{node_type}_nodes_memory"] = oc_cmd.exec_oc_debug_cmd( node=nodes[0], cmd_list=["free | grep Mem | awk '{print $2}'"]).rstrip() def deploy_benchmark_operator(self): """ Deploy the benchmark operator """ self.operator = benchmark_operator.BenchmarkOperator() self.operator.deploy() def es_info_backup(self, elasticsearch): """ Saving the Original elastic-search IP and PORT - if defined in yaml Args: elasticsearch (obj): elasticsearch object """ self.crd_data["spec"]["elasticsearch"] = {} # for development mode use the Dev ES server if self.dev_mode and config.PERF.get("dev_lab_es"): log.info("Using the development ES server") self.crd_data["spec"]["elasticsearch"] = { "server": config.PERF.get("dev_es_server"), "port": config.PERF.get("dev_es_port"), "url": f"http://{config.PERF.get('dev_es_server')}:{config.PERF.get('dev_es_port')}", "parallel": True, } # for production mode use the Lab ES server if not self.dev_mode and config.PERF.get("production_es"): self.crd_data["spec"]["elasticsearch"] = { "server": config.PERF.get("production_es_server"), "port": config.PERF.get("production_es_port"), "url": f"http://{config.PERF.get('production_es_server')}:{config.PERF.get('production_es_port')}", "parallel": True, } # backup the Main ES info (if exists) if not self.crd_data["spec"]["elasticsearch"] == {}: self.backup_es = self.crd_data["spec"]["elasticsearch"] log.info( f"Creating object for the Main ES server on {self.backup_es['url']}" ) self.main_es = Elasticsearch([self.backup_es["url"]], verify_certs=True) else: log.warning( "Elastic Search information does not exists for this test") # Use the internal define elastic-search server in the test - if exist if elasticsearch: if not isinstance(elasticsearch, dict): # elasticsearch is an internally deployed server (obj) ip = elasticsearch.get_ip() port = elasticsearch.get_port() else: # elasticsearch is an existing server (dict) ip = elasticsearch.get("server") port = elasticsearch.get("port") self.crd_data["spec"]["elasticsearch"] = { "server": ip, "port": port, "url": f"http://{ip}:{port}", "parallel": True, } log.info( f"Going to use the ES : {self.crd_data['spec']['elasticsearch']}" ) elif config.PERF.get("internal_es_server"): # use an in-cluster elastic-search (not deployed by the test) self.crd_data["spec"]["elasticsearch"] = { "server": config.PERF.get("internal_es_server"), "port": config.PERF.get("internal_es_port"), "url": f"http://{config.PERF.get('internal_es_server')}:{config.PERF.get('internal_es_port')}", "parallel": True, } def set_storageclass(self, interface): """ Setting the benchmark CRD storageclass Args: interface (str): The interface which will used in the test """ if interface == constants.CEPHBLOCKPOOL: storageclass = constants.DEFAULT_STORAGECLASS_RBD else: storageclass = constants.DEFAULT_STORAGECLASS_CEPHFS log.info(f"Using [{storageclass}] Storageclass") self.crd_data["spec"]["workload"]["args"][ "storageclass"] = storageclass def 
get_env_info(self): """ Getting the environment information and update the workload RC if necessary. """ if not self.environment["user"] == "": self.crd_data["spec"]["test_user"] = self.environment["user"] else: # since full results object need this parameter, initialize it from CR file self.environment["user"] = self.crd_data["spec"]["test_user"] self.crd_data["spec"]["clustername"] = self.environment["clustername"] log.debug(f"Environment information is : {self.environment}") def deploy_and_wait_for_wl_to_start(self, timeout=300, sleep=20): """ Deploy the workload and wait until it start working Args: timeout (int): time in second to wait until the benchmark start sleep (int): Sleep interval seconds """ log.debug(f"The {self.benchmark_name} CR file is {self.crd_data}") self.benchmark_obj = OCS(**self.crd_data) self.benchmark_obj.create() # This time is only for reporting - when the benchmark started. self.start_time = self.get_time() # Wait for benchmark client pod to be created log.info(f"Waiting for {self.client_pod_name} to Start") for bm_pod in TimeoutSampler( timeout, sleep, get_pod_name_by_pattern, self.client_pod_name, benchmark_operator.BMO_NAME, ): try: if bm_pod[0] is not None: self.client_pod = bm_pod[0] break except IndexError: log.info("Bench pod is not ready yet") # Sleeping for 15 sec for the client pod to be fully accessible time.sleep(15) log.info(f"The benchmark pod {self.client_pod_name} is Running") def wait_for_wl_to_finish(self, timeout=18000, sleep=300): """ Waiting until the workload is finished and get the test log Args: timeout (int): time in second to wait until the benchmark start sleep (int): Sleep interval seconds Raise: exception for too much restarts of the test. ResourceWrongStatusException : test Failed / Error TimeoutExpiredError : test did not completed on time. """ log.info(f"Waiting for {self.client_pod_name} to complete") Finished = 0 restarts = 0 total_time = timeout while not Finished and total_time > 0: results = run_oc_command( "get pod --no-headers -o custom-columns=:metadata.name,:status.phase", namespace=benchmark_operator.BMO_NAME, ) (fname, status) = ["", ""] for name in results: # looking for the pod which run the benchmark (not the IO) # this pod contain the `client` in his name, and there is only one # pod like this, other pods have the `server` in the name. (fname, status) = name.split() if re.search("client", fname): break else: (fname, status) = ["", ""] if fname == "": # there is no `client` pod ! err_msg = f"{self.client_pod} Failed to run !!!" log.error(err_msg) raise Exception(err_msg) if not fname == self.client_pod: # The client pod name is different from previous check, it was restarted log.info( f"The pod {self.client_pod} was restart. the new client pod is {fname}" ) self.client_pod = fname restarts += 1 # in case of restarting the benchmark, reset the timeout as well total_time = timeout if restarts > 3: # we are tolerating only 3 restarts err_msg = f"Too much restarts of the benchmark ({restarts})" log.error(err_msg) raise Exception(err_msg) if status == "Succeeded": # Getting the end time of the benchmark - for reporting. self.end_time = self.get_time() self.test_logs = self.pod_obj.exec_oc_cmd( f"logs {self.client_pod}", out_yaml_format=False) log.info(f"{self.client_pod} completed successfully") Finished = 1 elif (status != constants.STATUS_RUNNING and status != constants.STATUS_PENDING): # if the benchmark pod is not in Running state (and not Completed/Pending), # no need to wait for timeout. 
# Note: the pod can be in pending state in case of restart. err_msg = f"{self.client_pod} Failed to run - ({status})" log.error(err_msg) raise exceptions.ResourceWrongStatusException( self.client_pod, describe_out=err_msg, column="Status", expected="Succeeded", got=status, ) else: log.info( f"{self.client_pod} is in {status} State, and wait to Succeeded State." f" wait another {sleep} sec. for benchmark to complete") time.sleep(sleep) total_time -= sleep if not Finished: err_msg = (f"{self.client_pod} did not completed on time, " f"maybe timeout ({timeout}) need to be increase") log.error(err_msg) raise exceptions.TimeoutExpiredError(self.client_pod, custom_message=err_msg) # Saving the benchmark internal log into a file at the logs directory log_file_name = f"{self.full_log_path}/test-pod.log" try: with open(log_file_name, "w") as f: f.write(self.test_logs) log.info(f"The Test log can be found at : {log_file_name}") except Exception: log.warning(f"Cannot write the log to the file {log_file_name}") log.info(f"The {self.benchmark_name} benchmark complete") def copy_es_data(self, elasticsearch): """ Copy data from Internal ES (if exists) to the main ES Args: elasticsearch (obj): elasticsearch object (if exits) """ log.info(f"In copy_es_data Function - {elasticsearch}") if elasticsearch: log.info("Copy all data from Internal ES to Main ES") log.info("Dumping data from the Internal ES to tar ball file") elasticsearch.dumping_all_data(self.full_log_path) es_connection = self.backup_es es_connection["host"] = es_connection.pop("server") es_connection.pop("url") if elasticsearch_load(self.main_es, self.full_log_path): # Adding this sleep between the copy and the analyzing of the results # since sometimes the results of the read (just after write) are empty time.sleep(10) log.info( f"All raw data for tests results can be found at : {self.full_log_path}" ) return True else: log.warning("Cannot upload data into the Main ES server") return False def read_from_es(self, es, index, uuid): """ Reading all results from elasticsearch server Args: es (dict): dictionary with elasticsearch info {server, port} index (str): the index name to read from the elasticsearch server uuid (str): the test UUID to find in the elasticsearch server Returns: list : list of all results """ con = Elasticsearch([{"host": es["server"], "port": es["port"]}]) query = {"size": 1000, "query": {"match": {"uuid": uuid}}} try: results = con.search(index=index, body=query) full_data = [] for res in results["hits"]["hits"]: full_data.append(res["_source"]) return full_data except Exception as e: log.warning(f"{index} Not found in the Internal ES. ({e})") return [] def es_connect(self): """ Create elasticsearch connection to the server Return: bool : True if there is a connection to the ES, False if not. """ OK = True # the return value try: log.info( f"try to connect the ES : {self.es['server']}:{self.es['port']}" ) self.es_con = Elasticsearch([{ "host": self.es["server"], "port": self.es["port"] }]) except Exception: log.error(f"Cannot connect to ES server {self.es}") OK = False # Testing the connection to the elastic-search if not self.es_con.ping(): log.error(f"Cannot connect to ES server {self.es}") OK = False return OK def get_kibana_indexid(self, server, name): """ Get the kibana Index ID by its name. Args: server (str): the IP (or name) of the Kibana server name (str): the name of the index Returns: str : the index ID of the given name return None if the index does not exist. 
""" port = 5601 http_link = f"http://{server}:{port}/api/saved_objects" search_string = f"_find?type=index-pattern&search_fields=title&search='{name}'" log.info(f"Connecting to Kibana {server} on port {port}") try: res = requests.get(f"{http_link}/{search_string}") res = json.loads(res.content.decode()) for ind in res.get("saved_objects"): if ind.get("attributes").get("title") in [name, f"{name}*"]: log.info( f"The Kibana indexID for {name} is {ind.get('id')}") return ind.get("id") except esexp.ConnectionError: log.warning("Cannot connect to Kibana server {}:{}".format( server, port)) log.warning(f"Can not find the Kibana index : {name}") return None def write_result_to_file(self, res_link): """ Write the results link into file, to combine all sub-tests results together in one file, so it can be easily pushed into the performance dashboard Args: res_link (str): http link to the test results in the ES server """ if not os.path.exists(self.results_path): os.makedirs(self.results_path) self.results_file = os.path.join(self.results_path, "all_results.txt") log.info(f"Try to push results into : {self.results_file}") try: with open(self.results_file, "a+") as f: f.write(f"{res_link}\n") f.close() except FileNotFoundError: log.info("The file does not exist, so create new one.") with open(self.results_file, "w+") as f: f.write(f"{res_link}\n") f.close() except OSError as err: log.error(f"OS error: {err}") @staticmethod def get_time(time_format=None): """ Getting the current GMT time in a specific format for the ES report, or for seeking in the containers log Args: time_format (str): which thime format to return - None / CSI Returns: str : current date and time in formatted way """ formated = "%Y-%m-%dT%H:%M:%SGMT" if time_format and time_format.lower() == "csi": formated = "%Y-%m-%dT%H:%M:%SZ" return time.strftime(formated, time.gmtime()) def check_tests_results(self): """ Check that all sub-tests (test multiplication by parameters) finished and pushed the data to the ElastiSearch server. It also generate the es link to push into the performance dashboard. 
""" es_links = [] try: with open(self.results_file, "r") as f: data = f.read().split("\n") data.pop() # remove the last empty element if len(data) != self.number_of_tests: log.error("Not all tests finished") raise exceptions.BenchmarkTestFailed() else: log.info( "All test finished OK, and the results can be found at :") for res in data: log.info(res) es_links.append(res) except OSError as err: log.error(f"OS error: {err}") raise err self.es_link = ",".join(es_links) def push_to_dashboard(self, test_name): """ Pushing the test results into the performance dashboard, if exist Args: test_name (str): the test name as defined in the performance dashboard Returns: None in case of pushing the results to the dashboard failed """ try: db = PerfDash() except MissingRequiredConfigKeyError as ex: log.error( f"Results cannot be pushed to the performance dashboard, no connection [{ex}]" ) return None log.info(f"Full version is : {self.environment.get('ocs_build')}") version = self.environment.get("ocs_build").split("-")[0] try: build = self.environment.get("ocs_build").split("-")[1] build = build.split(".")[0] except Exception: build = "GA" # Getting the topology from the cluster az = node.get_odf_zone_count() if az == 0: az = 1 topology = f"{az}-AZ" # Check if it is Arbiter cluster my_obj = OCP(kind="StorageCluster", namespace=constants.OPENSHIFT_STORAGE_NAMESPACE) arbiter = (my_obj.data.get("items")[0].get("spec").get("arbiter").get( "enable", False)) if arbiter: topology = "Strech-Arbiter" # Check if run on LSO try: ns = OCP(kind="namespace", resource_name=defaults.LOCAL_STORAGE_NAMESPACE) ns.get() platform = f"{self.environment.get('platform')}-LSO" except Exception: platform = self.environment.get("platform") # Check if encrypted cluster encrypt = ( my_obj.data.get("items")[0].get("spec").get("encryption").get( "enable", False)) kms = (my_obj.data.get("items")[0].get("spec").get("encryption").get( "kms").get("enable", False)) if kms: platform = f"{platform}-KMS" elif encrypt: platform = f"{platform}-Enc" # Check the base storageclass on AWS if self.environment.get("platform").upper() == "AWS": osd_pod_list = pod.get_osd_pods() osd_pod = osd_pod_list[0].pod_data["metadata"]["name"] osd_pod_obj = OCP( kind="POD", resource_name=osd_pod, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, ) log.info(f"The First OSD pod nams is {osd_pod}") osd_pvc_name = osd_pod_obj.get( )["spec"]["initContainers"][0]["volumeDevices"][0]["name"] log.info(f"The First OSD name is : {osd_pvc_name}") osd_pvc_obj = OCP( kind="PersistentVolumeClaim", resource_name=osd_pvc_name, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, ) odf_back_storage = osd_pvc_obj.get()["spec"]["storageClassName"] log.info( f"The ODF deployment use {odf_back_storage} as back storage") if odf_back_storage != "gp2": platform = f"{platform}-{odf_back_storage}" if self.dev_mode: port = "8181" else: port = "8080" try: log.info("Trying to push :" f"version={version}," f"build={build}," f"platform={platform}," f"topology={topology}," f"test={test_name}," f"eslink={self.es_link}, logfile=None") db.add_results( version=version, build=build, platform=platform, topology=topology, test=test_name, eslink=self.es_link, logfile=None, ) resultslink = (f"http://{db.creds['host']}:{port}/index.php?" 
f"version1={db.get_version_id(version)}" f"&build1={db.get_build_id(version, build)}" f"&platform1={db.get_platform_id(platform)}" f"&az_topology1={db.get_topology_id(topology)}" f"&test_name%5B%5D={db.get_test_id(test_name)}" "&submit=Choose+options") log.info(f"Full results report can be found at : {resultslink}") except Exception as ex: log.error( f"Can not push results into the performance Dashboard! [{ex}]") db.cleanup() def add_test_to_results_check(self, test, test_count, test_name): """ Adding test information to list of test(s) that we want to check the results and push them to the dashboard. Args: test (str): the name of the test function that we want to check test_count (int): number of test(s) that need to run - according to parametize test_name (str): the test name in the Performance dashboard """ self.workloads.append({ "name": test, "tests": test_count, "test_name": test_name }) def check_results_and_push_to_dashboard(self): """ Checking test(s) results - that all test(s) are finished OK, and push the results into the performance dashboard """ for wl in self.workloads: self.number_of_tests = wl["tests"] self.results_file = os.path.join("/", *self.results_path, wl["name"], "all_results.txt") log.info( f"Check results for [{wl['name']}] in : {self.results_file}") self.check_tests_results() self.push_to_dashboard(test_name=wl["test_name"]) def create_test_project(self): """ Creating new project (namespace) for performance test """ self.namespace = "pas-test-namespace" log.info(f"Creating new namespace ({self.namespace}) for the test") try: self.proj = helpers.create_project(project_name=self.namespace) except CommandFailed as ex: if str(ex).find("(AlreadyExists)"): log.warning("The namespace already exists !") log.error("Cannot create new project") raise CommandFailed(f"{self.namespace} was not created") def delete_test_project(self): """ Deleting the performance test project (namespace) """ log.info(f"Deleting the test namespace : {self.namespace}") switch_to_default_rook_cluster_project() try: self.proj.delete(resource_name=self.namespace) self.proj.wait_for_delete(resource_name=self.namespace, timeout=60, sleep=10) except CommandFailed: log.error(f"Cannot delete project {self.namespace}") raise CommandFailed(f"{self.namespace} was not created") def set_results_path_and_file(self, func_name): """ Setting the results_path and results_file parameter for a specific test Args: func_name (str): the name of the function which use for the test """ self.results_path = os.path.join("/", *self.results_path, func_name) self.results_file = os.path.join(self.results_path, "all_results.txt")
def test_upgrade_ocp(self): """ Tests OCS stability when upgrading OCP """ ceph_cluster = CephCluster() with CephHealthMonitor(ceph_cluster): ocp_upgrade_version = config.UPGRADE.get('ocp_upgrade_version') if not ocp_upgrade_version: ocp_channel = config.UPGRADE['ocp_channel'] ocp_upgrade_version = get_latest_ocp_version( channel=ocp_channel) ocp_arch = config.UPGRADE['ocp_arch'] target_image = f"{ocp_upgrade_version}-{ocp_arch}" elif ocp_upgrade_version.endswith(".nightly"): target_image = expose_ocp_version(ocp_upgrade_version) logger.info(f"Target image; {target_image}") image_path = config.UPGRADE['ocp_upgrade_path'] self.cluster_operators = ocp.get_all_cluster_operators() logger.info(f" oc version: {ocp.get_current_oc_version()}") # Verify Upgrade subscription channel: ocp.patch_ocp_upgrade_channel(ocp_channel) for sampler in TimeoutSampler(timeout=250, sleep=15, func=ocp.verify_ocp_upgrade_channel, channel_variable=ocp_channel): if sampler: logger.info(f"OCP Channel:{ocp_channel}") break # Upgrade OCP logger.info(f"full upgrade path: {image_path}:{target_image}") ocp.upgrade_ocp(image=target_image, image_path=image_path) # Wait for upgrade for ocp_operator in self.cluster_operators: logger.info(f"Checking upgrade status of {ocp_operator}:") ver = ocp.get_cluster_operator_version(ocp_operator) logger.info(f"current {ocp_operator} version: {ver}") for sampler in TimeoutSampler( timeout=2700, sleep=60, func=ocp.confirm_cluster_operator_version, target_version=target_image, cluster_operator=ocp_operator): logger.info( f"ClusterOperator upgrade " f"{'completed!' if sampler else 'did not completed yet!'}" ) if sampler: break # post upgrade validation: check cluster operator status for ocp_operator in self.cluster_operators: logger.info(f"Checking cluster status of {ocp_operator}") for sampler in TimeoutSampler( timeout=2700, sleep=60, func=ocp.verify_cluster_operator_status, cluster_operator=ocp_operator): logger.info( f"ClusterOperator status is " f"{'valid' if sampler else 'status is not valid'}") if sampler: break # Post upgrade validation: check cluster version status logger.info("Checking clusterversion status") for sampler in TimeoutSampler( timeout=900, sleep=15, func=ocp.validate_cluster_version_status): if sampler: logger.info("Upgrade Completed Successfully!") break
class Sanity: """ Class for cluster health and functional validations """ def __init__(self): """ Initializer for Sanity class - Init CephCluster() in order to set the cluster status before starting the tests """ self.pvc_objs = list() self.pod_objs = list() self.obj_data = "" self.ceph_cluster = CephCluster() def health_check(self, cluster_check=True, tries=20): """ Perform Ceph and cluster health checks """ wait_for_cluster_connectivity(tries=400) logger.info("Checking cluster and Ceph health") node.wait_for_nodes_status(timeout=300) ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'], tries=tries) if cluster_check: self.ceph_cluster.cluster_health_check(timeout=60) def create_resources(self, pvc_factory, pod_factory, run_io=True): """ Sanity validation - Create resources (FS and RBD) and run IO Args: pvc_factory (function): A call to pvc_factory function pod_factory (function): A call to pod_factory function run_io (bool): True for run IO, False otherwise """ logger.info( "Creating resources and running IO as a sanity functional validation" ) for interface in [constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM]: pvc_obj = pvc_factory(interface) self.pvc_objs.append(pvc_obj) self.pod_objs.append(pod_factory(pvc=pvc_obj, interface=interface)) if run_io: for pod in self.pod_objs: pod.run_io('fs', '1G', runtime=30) for pod in self.pod_objs: get_fio_rw_iops(pod) def delete_resources(self): """ Sanity validation - Delete resources (FS and RBD) """ logger.info("Deleting resources as a sanity functional validation") for pod_obj in self.pod_objs: pod_obj.delete() for pod_obj in self.pod_objs: pod_obj.ocp.wait_for_delete(pod_obj.name) for pvc_obj in self.pvc_objs: pvc_obj.delete() for pvc_obj in self.pvc_objs: pvc_obj.ocp.wait_for_delete(pvc_obj.name) @ignore_leftovers def create_pvc_delete(self, multi_pvc_factory, project=None): """ Creates and deletes all types of PVCs """ # Create rbd pvcs pvc_objs_rbd = create_pvcs(multi_pvc_factory=multi_pvc_factory, interface='CephBlockPool', project=project, status="", storageclass=None) # Create cephfs pvcs pvc_objs_cephfs = create_pvcs(multi_pvc_factory=multi_pvc_factory, interface='CephFileSystem', project=project, status="", storageclass=None) all_pvc_to_delete = pvc_objs_rbd + pvc_objs_cephfs # Check pvc status for pvc_obj in all_pvc_to_delete: helpers.wait_for_resource_state(resource=pvc_obj, state=constants.STATUS_BOUND, timeout=300) # Start deleting PVC delete_pvcs(all_pvc_to_delete) # Check PVCs are deleted for pvc_obj in all_pvc_to_delete: pvc_obj.ocp.wait_for_delete(resource_name=pvc_obj.name) logger.info("All PVCs are deleted as expected") def obc_put_obj_create_delete(self, mcg_obj, bucket_factory): """ Creates bucket then writes, reads and deletes objects """ bucket_name = bucket_factory(amount=1, interface='OC')[0].name self.obj_data = "A string data" for i in range(0, 30): key = 'Object-key-' + f"{i}" logger.info(f"Write, read and delete object with key: {key}") assert s3_put_object(mcg_obj, bucket_name, key, self.obj_data), f"Failed: Put object, {key}" assert s3_get_object(mcg_obj, bucket_name, key), f"Failed: Get object, {key}" assert s3_delete_object(mcg_obj, bucket_name, key), f"Failed: Delete object, {key}"
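# A minimal sketch of the intended Sanity flow in a functional test; pvc_factory and
# pod_factory are assumed to be the usual pytest fixtures.
def test_cluster_sanity_example(pvc_factory, pod_factory):
    sanity_helpers = Sanity()
    sanity_helpers.health_check()  # node, Ceph and cluster health
    sanity_helpers.create_resources(pvc_factory, pod_factory, run_io=True)
    sanity_helpers.delete_resources()  # remove the PVCs and pods created above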
def unset_noout():
    """
    Unset the Ceph 'noout' flag, with 10 retries and a delay of 10 seconds.
    """
    ceph = CephCluster()
    ceph.unset_noout()
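# A hedged example of wiring the helper above into a pytest finalizer so the Ceph
# 'noout' flag is always cleared even when the test fails; the fixture shape is an
# assumption, only unset_noout() comes from the code above.
import pytest


@pytest.fixture()
def clear_noout(request):
    request.addfinalizer(unset_noout)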
class TestSmallFileWorkloadScale(E2ETest): """ Deploy benchmark operator and run different scale tests. Call common small files workload routine to run SmallFile workload """ def setup(self): """ Initialize the test environment """ # Deploy internal ES server - not need to keep results, # so don't use production ES self.es = ElasticSearch() # Initial the Small Files workload, based on benchmark-operator self.small_files = SmallFiles(self.es) self.ceph_cluster = CephCluster() # Get the total storage capacity self.ceph_capacity = self.ceph_cluster.get_ceph_capacity() log.info(f"Total storage capacity is {self.ceph_capacity:,.2f} GiB") # Collect the pulls usage before the test is starting self.orig_data = self.get_cephfs_data() def teardown(self): """ Teardown the test environment """ self.small_files.cleanup() self.es.cleanup() def get_cephfs_data(self): """ Look through ceph pods and find space usage on all ceph filesystem pods Returns: Dictionary of byte usage, indexed by pod name. """ ceph_status = self.ceph_cluster.toolbox.exec_ceph_cmd(ceph_cmd="ceph df") ret_value = {} for pool in ceph_status["pools"]: # Only the data pool is in our interest (not metadata) if "cephfilesystem" in pool["name"]: ret_value[pool["name"]] = pool["stats"]["bytes_used"] return ret_value def display_ceph_usage(self, msg, data): """ Display the pool usage in a pretty way Args: msg (str): the message string to display with the values data (dict): dictionary of pools -> capacity (in bytes) """ log.info(f"The pools usage {msg} is :") for entry in data: log.info(f"{entry} now uses {data[entry]:,} bytes") @pytest.mark.parametrize( argnames=["file_size", "files", "threads", "interface"], argvalues=[ # 500K Files, ~4GB pytest.param(*[8, 125000, 4, constants.CEPHFILESYSTEM]), # 5M Files, ~152GB pytest.param(*[32, 1250000, 4, constants.CEPHFILESYSTEM]), ], ) def test_scale_smallfile_workload(self, file_size, files, threads, interface): # updating the benchmark parameters self.small_files.setup_storageclass(interface) self.small_files.setup_test_params(file_size, files, threads, 1) # Verify we have enough storage capacity to run the test. self.small_files.setup_vol_size(file_size, files, threads, self.ceph_capacity) # Run the benchmark to create files on the volume self.small_files.setup_operations("create") self.small_files.run() # Collect pools usage after creation is done. self.run_data = self.get_cephfs_data() # Delete the benchmark data self.small_files.delete() # Getting the usage capacity immediately after deletion self.now_data = self.get_cephfs_data() # Wait 3 minutes for the backend deletion actually start. time.sleep(180) # Quarry the storage usage every 2 Min. if no difference between two # samples, the backend cleanup is done. still_going_down = True while still_going_down: log.info("Waiting for Ceph to finish cleaning up") time.sleep(120) self.new_data = self.get_cephfs_data() still_going_down = False for entry in self.new_data: if self.new_data[entry] < self.now_data[entry]: still_going_down = True self.now_data[entry] = self.new_data[entry] self.display_ceph_usage("Before ths test", self.orig_data) self.display_ceph_usage("After data creation", self.run_data) # Make sure that the test actually wrote data to the volume # at least 1GiB. 
        for entry in self.run_data:
            if re.search("metadata", entry):
                # Since we are interested in the data written and not the metadata,
                # skip the metadata pool
                continue
            written = self.run_data[entry] - self.orig_data[entry]
            check = written > constants.GB
            errmsg = (f"{written:,.2f} bytes were written to {entry} - "
                      "This is not enough for the test")
            assert check, errmsg

        self.display_ceph_usage("After data deletion", self.now_data)

        for entry in self.now_data:
            # A leak is indicated if over 20% more storage is used and more than 3 GiB.
            try:
                ratio = self.now_data[entry] / self.orig_data[entry]
            except ZeroDivisionError:
                ratio = self.now_data[entry]
            added_data = (self.now_data[entry] - self.orig_data[entry]) / constants.GB
            # In some cases (especially for metadata), it might be that after the
            # test there is less data in the pool than before the test.
            if added_data < 0:
                added_data = 0
                ratio = 1
            log.info(
                "The ratio between capacity before and after the test "
                f"on {entry} is : {ratio:.2f} ; {added_data:,.2f} GiB"
            )
            check = (ratio < 1.20) or (added_data < 3)
            errmsg = (f"{entry} is over 20% (or 3 GiB) larger "
                      f"[{ratio} ; {added_data}] -- possible leak")
            assert check, errmsg
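# A worked, illustrative example of the leak criterion above with made-up pool usage;
# 1024 ** 3 stands in for constants.GB (bytes per GiB).
def _example_leak_check():
    gib = 1024 ** 3
    orig, now = 40 * gib, 140 * gib  # usage before / after the test, in bytes
    ratio = now / orig  # 3.50 -> more than 20% growth
    added_data = (now - orig) / gib  # 100.00 GiB -> more than the 3 GiB allowance
    leak_suspected = not ((ratio < 1.20) or (added_data < 3))
    return ratio, added_data, leak_suspected  # (3.5, 100.0, True)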
def test_upgrade_ocp(self, reduce_and_resume_cluster_load): """ Tests OCS stability when upgrading OCP """ cluster_ver = ocp.run_cmd("oc get clusterversions/version -o yaml") logger.debug(f"Cluster versions before upgrade:\n{cluster_ver}") ceph_cluster = CephCluster() with CephHealthMonitor(ceph_cluster): ocp_channel = config.UPGRADE.get("ocp_channel", ocp.get_ocp_upgrade_channel()) ocp_upgrade_version = config.UPGRADE.get("ocp_upgrade_version") if not ocp_upgrade_version: ocp_upgrade_version = get_latest_ocp_version( channel=ocp_channel) ocp_arch = config.UPGRADE["ocp_arch"] target_image = f"{ocp_upgrade_version}-{ocp_arch}" elif ocp_upgrade_version.endswith(".nightly"): target_image = expose_ocp_version(ocp_upgrade_version) logger.info(f"Target image; {target_image}") image_path = config.UPGRADE["ocp_upgrade_path"] cluster_operators = ocp.get_all_cluster_operators() logger.info(f" oc version: {ocp.get_current_oc_version()}") # Verify Upgrade subscription channel: ocp.patch_ocp_upgrade_channel(ocp_channel) for sampler in TimeoutSampler( timeout=250, sleep=15, func=ocp.verify_ocp_upgrade_channel, channel_variable=ocp_channel, ): if sampler: logger.info(f"OCP Channel:{ocp_channel}") break # Upgrade OCP logger.info(f"full upgrade path: {image_path}:{target_image}") ocp.upgrade_ocp(image=target_image, image_path=image_path) # Wait for upgrade for ocp_operator in cluster_operators: logger.info(f"Checking upgrade status of {ocp_operator}:") # ############ Workaround for issue 2624 ####### name_changed_between_versions = ( "service-catalog-apiserver", "service-catalog-controller-manager", ) if ocp_operator in name_changed_between_versions: logger.info(f"{ocp_operator} upgrade will not be verified") continue # ############ End of Workaround ############### ver = ocp.get_cluster_operator_version(ocp_operator) logger.info(f"current {ocp_operator} version: {ver}") for sampler in TimeoutSampler( timeout=2700, sleep=60, func=ocp.confirm_cluster_operator_version, target_version=target_image, cluster_operator=ocp_operator, ): if sampler: logger.info(f"{ocp_operator} upgrade completed!") break else: logger.info( f"{ocp_operator} upgrade did not completed yet!") # post upgrade validation: check cluster operator status cluster_operators = ocp.get_all_cluster_operators() for ocp_operator in cluster_operators: logger.info(f"Checking cluster status of {ocp_operator}") for sampler in TimeoutSampler( timeout=2700, sleep=60, func=ocp.verify_cluster_operator_status, cluster_operator=ocp_operator, ): if sampler: break else: logger.info(f"{ocp_operator} status is not valid") # Post upgrade validation: check cluster version status logger.info("Checking clusterversion status") for sampler in TimeoutSampler( timeout=900, sleep=15, func=ocp.validate_cluster_version_status): if sampler: logger.info("Upgrade Completed Successfully!") break cluster_ver = ocp.run_cmd("oc get clusterversions/version -o yaml") logger.debug(f"Cluster versions post upgrade:\n{cluster_ver}") # load new config file self.load_ocp_version_config_file(ocp_upgrade_version) new_ceph_cluster = CephCluster() new_ceph_cluster.wait_for_rebalance(timeout=1800) ceph_health_check(tries=90, delay=30)
def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern): """ This is a basic fio perf test """ # Deployment ripsaw log.info("Deploying ripsaw operator") ripsaw.apply_crd('resources/crds/' 'ripsaw_v1alpha1_ripsaw_crd.yaml') sc = 'ocs-storagecluster-ceph-rbd' if interface == 'CephBlockPool' else 'ocs-storagecluster-cephfs' # Create fio benchmark log.info("Create resource file for fio workload") fio_cr = templating.load_yaml(constants.FIO_CR_YAML) # Saving the Original elastic-search IP and PORT - if defined in yaml es_server = "" es_port = "" if 'elasticsearch' in fio_cr['spec']: if 'server' in fio_cr['spec']['elasticsearch']: es_server = fio_cr['spec']['elasticsearch']['server'] if 'port' in fio_cr['spec']['elasticsearch']: es_port = fio_cr['spec']['elasticsearch']['port'] else: fio_cr['spec']['elasticsearch'] = {} # Use the internal define elastic-search server in the test fio_cr['spec']['elasticsearch'] = { 'server': es.get_ip(), 'port': es.get_port() } # Setting the data set to 40% of the total storage capacity but # not more then 600GiB ceph_cluster = CephCluster() total_data_set = int(ceph_cluster.get_ceph_capacity() * 0.4) filesize = int(fio_cr['spec']['workload']['args']['filesize'].replace( 'GiB', '')) # To make sure the number of App pods will not be more then 50, in case # of large data set, changing the size of the file each pod will work on if total_data_set > 500: filesize = int(ceph_cluster.get_ceph_capacity() * 0.008) fio_cr['spec']['workload']['args']['filesize'] = f'{filesize}GiB' # make sure that the storage size is larger then the file size fio_cr['spec']['workload']['args'][ 'storagesize'] = f'{int(filesize * 1.2)}Gi' fio_cr['spec']['workload']['args']['servers'] = int(total_data_set / filesize) log.info(f'Total Data set to work on is : {total_data_set} GiB') fio_cr['spec']['clustername'] = config.ENV_DATA[ 'platform'] + get_build() + get_ocs_version() fio_cr['spec']['test_user'] = get_ocs_version( ) + interface + io_pattern fio_cr['spec']['workload']['args']['storageclass'] = sc if io_pattern == 'sequential': fio_cr['spec']['workload']['args']['jobs'] = ['write', 'read'] log.info(f'fio_cr: {fio_cr}') fio_cr_obj = OCS(**fio_cr) fio_cr_obj.create() # Wait for fio client pod to be created for fio_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern, 'fio-client', constants.RIPSAW_NAMESPACE): try: if fio_pod[0] is not None: fio_client_pod = fio_pod[0] break except IndexError: log.info("Bench pod not ready yet") # Wait for fio pod to initialized and complete log.info("Waiting for fio_client to complete") pod_obj = OCP(kind='pod') pod_obj.wait_for_resource( condition='Completed', resource_name=fio_client_pod, timeout=18000, sleep=300, ) output = run_cmd(f'oc logs {fio_client_pod}') try: if 'Fio failed to execute' not in output: log.info("FIO has completed successfully") except IOError: log.info("FIO failed to complete") # Clean up fio benchmark log.info("Deleting FIO benchmark") fio_cr_obj.delete() # Setting back the original elastic-search information fio_cr['spec']['elasticsearch'] = { 'server': es_server, 'port': es_port } analyze_regression(io_pattern, sc, es_username=fio_cr['spec']['test_user'])
def test_add_capacity_with_resource_delete(
    self,
    workload_storageutilization_rbd,
    resource_name,
    resource_id,
    is_kill_resource_repeatedly,
):
    """
    The function gets the resource name and id, adds capacity to the cluster,
    and then deletes the resource while the storage capacity is being increased.

    Args:
        resource_name (str): the name of the resource to delete
        resource_id (int): the id of the resource to delete
        is_kill_resource_repeatedly (bool): If True, kill the resource
            repeatedly. If False, delete the resource only once.

    """
    used_percentage = get_percent_used_capacity()
    logging.info(
        f"storageutilization is completed. used capacity = {used_percentage}"
    )

    osd_pods_before = pod_helpers.get_osd_pods()
    number_of_osd_pods_before = len(osd_pods_before)
    if number_of_osd_pods_before >= constants.MAX_OSDS:
        pytest.skip("We have the maximum number of OSDs in the cluster")

    d = Disruptions()
    d.set_resource(resource_name)

    self.new_pods_in_status_running = False

    osd_size = storage_cluster.get_osd_size()
    logging.info(f"Adding one new set of OSDs. osd size = {osd_size}")
    storagedeviceset_count = storage_cluster.add_capacity(osd_size)
    logging.info("Adding one new set of OSDs was issued without problems")

    # Wait for the new OSDs to come up. After the first new OSD is in status Init,
    # delete the resource. After deleting the resource we expect all the new OSDs
    # to be in status Running, and the deleted resource to be running again as well.
    pod_helpers.wait_for_new_osd_pods_to_come_up(number_of_osd_pods_before)
    logging.info(
        f"Delete a {resource_name} pod while storage capacity is getting increased"
    )
    if is_kill_resource_repeatedly:
        with ThreadPoolExecutor() as executor:
            executor.submit(
                self.kill_resource_repeatedly, resource_name, resource_id
            )
            self.wait_for_osd_pods_to_be_running(storagedeviceset_count)
    else:
        d.delete_resource(resource_id)
        self.wait_for_osd_pods_to_be_running(storagedeviceset_count)

    self.new_pods_in_status_running = True
    logging.info(
        "Finished verifying add capacity when one of the pods gets deleted"
    )
    logging.info("Waiting for the ceph health check to finish...")
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=90)
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=1800
    ), "Data re-balance failed to complete"
class Sanity: """ Class for cluster health and functional validations """ def __init__(self): """ Initializer for Sanity class - Init CephCluster() in order to set the cluster status before starting the tests """ self.pvc_objs = list() self.pod_objs = list() self.ceph_cluster = CephCluster() def health_check(self, cluster_check=True, tries=20): """ Perform Ceph and cluster health checks """ wait_for_cluster_connectivity(tries=400) logger.info("Checking cluster and Ceph health") node.wait_for_nodes_status(timeout=300) ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'], tries=tries) if cluster_check: self.ceph_cluster.cluster_health_check(timeout=60) def create_resources(self, pvc_factory, pod_factory, run_io=True): """ Sanity validation - Create resources (FS and RBD) and run IO Args: pvc_factory (function): A call to pvc_factory function pod_factory (function): A call to pod_factory function run_io (bool): True for run IO, False otherwise """ logger.info("Creating resources and running IO as a sanity functional validation") for interface in [constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM]: pvc_obj = pvc_factory(interface) self.pvc_objs.append(pvc_obj) self.pod_objs.append(pod_factory(pvc=pvc_obj, interface=interface)) if run_io: for pod in self.pod_objs: pod.run_io('fs', '1G') for pod in self.pod_objs: get_fio_rw_iops(pod) def delete_resources(self): """ Sanity validation - Delete resources (FS and RBD) """ logger.info("Deleting resources as a sanity functional validation") for pod_obj in self.pod_objs: pod_obj.delete() for pod_obj in self.pod_objs: pod_obj.ocp.wait_for_delete(pod_obj.name) for pvc_obj in self.pvc_objs: pvc_obj.delete() for pvc_obj in self.pvc_objs: pvc_obj.ocp.wait_for_delete(pvc_obj.name)
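# A hypothetical pytest-style sketch of how the Sanity helper above is typically
# driven from a test body (assuming the ocs-ci pvc_factory and pod_factory fixtures
# referenced in create_resources(); the test name is illustrative only): validate
# health, create and exercise FS/RBD resources, then clean them up.
def test_cluster_sanity_sketch(pvc_factory, pod_factory):
    sanity_helpers = Sanity()

    # Ceph and node health gate before doing anything else
    sanity_helpers.health_check()

    # One CephFS and one RBD PVC/pod pair, with a short fio run on each
    sanity_helpers.create_resources(pvc_factory, pod_factory, run_io=True)

    # Remove the pods and PVCs created above and wait for their deletion
    sanity_helpers.delete_resources()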
def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern): """ This is a basic fio perf test """ # Deployment ripsaw log.info("Deploying ripsaw operator") ripsaw.apply_crd("resources/crds/" "ripsaw_v1alpha1_ripsaw_crd.yaml") if interface == "CephBlockPool": sc = constants.CEPHBLOCKPOOL_SC else: sc = constants.CEPHFILESYSTEM_SC # Create fio benchmark log.info("Create resource file for fio workload") fio_cr = templating.load_yaml(constants.FIO_CR_YAML) # Saving the Original elastic-search IP and PORT - if defined in yaml if "elasticsearch" in fio_cr["spec"]: backup_es = fio_cr["spec"]["elasticsearch"] else: log.warning( "Elastic Search information does not exists in YAML file") fio_cr["spec"]["elasticsearch"] = {} # Use the internal define elastic-search server in the test - if exist if es: fio_cr["spec"]["elasticsearch"] = { "server": es.get_ip(), "port": es.get_port(), } # Setting the data set to 40% of the total storage capacity ceph_cluster = CephCluster() ceph_capacity = ceph_cluster.get_ceph_capacity() total_data_set = int(ceph_capacity * 0.4) filesize = int(fio_cr["spec"]["workload"]["args"]["filesize"].replace( "GiB", "")) # To make sure the number of App pods will not be more then 50, in case # of large data set, changing the size of the file each pod will work on if total_data_set > 500: filesize = int(ceph_capacity * 0.008) fio_cr["spec"]["workload"]["args"]["filesize"] = f"{filesize}GiB" # make sure that the storage size is larger then the file size fio_cr["spec"]["workload"]["args"][ "storagesize"] = f"{int(filesize * 1.2)}Gi" fio_cr["spec"]["workload"]["args"]["servers"] = int(total_data_set / filesize) log.info(f"Total Data set to work on is : {total_data_set} GiB") environment = get_environment_info() if not environment["user"] == "": fio_cr["spec"]["test_user"] = environment["user"] fio_cr["spec"]["clustername"] = environment["clustername"] log.debug(f"Environment information is : {environment}") fio_cr["spec"]["workload"]["args"]["storageclass"] = sc if io_pattern == "sequential": fio_cr["spec"]["workload"]["args"]["jobs"] = ["write", "read"] fio_cr["spec"]["workload"]["args"]["iodepth"] = 1 log.info(f"The FIO CR file is {fio_cr}") fio_cr_obj = OCS(**fio_cr) fio_cr_obj.create() # Wait for fio client pod to be created for fio_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern, "fio-client", constants.RIPSAW_NAMESPACE): try: if fio_pod[0] is not None: fio_client_pod = fio_pod[0] break except IndexError: log.info("Bench pod not ready yet") # Getting the start time of the test start_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime()) # Getting the UUID from inside the benchmark pod uuid = ripsaw.get_uuid(fio_client_pod) # Setting back the original elastic-search information fio_cr["spec"]["elasticsearch"] = backup_es full_results = FIOResultsAnalyse(uuid, fio_cr) # Initialize the results doc file. 
for key in environment: full_results.add_key(key, environment[key]) # Setting the global parameters of the test full_results.add_key("io_pattern", io_pattern) full_results.add_key("dataset", f"{total_data_set}GiB") full_results.add_key("file_size", fio_cr["spec"]["workload"]["args"]["filesize"]) full_results.add_key("servers", fio_cr["spec"]["workload"]["args"]["servers"]) full_results.add_key("samples", fio_cr["spec"]["workload"]["args"]["samples"]) full_results.add_key("operations", fio_cr["spec"]["workload"]["args"]["jobs"]) full_results.add_key("block_sizes", fio_cr["spec"]["workload"]["args"]["bs"]) full_results.add_key("io_depth", fio_cr["spec"]["workload"]["args"]["iodepth"]) full_results.add_key("jobs", fio_cr["spec"]["workload"]["args"]["numjobs"]) full_results.add_key( "runtime", { "read": fio_cr["spec"]["workload"]["args"]["read_runtime"], "write": fio_cr["spec"]["workload"]["args"]["write_runtime"], }, ) full_results.add_key( "storageclass", fio_cr["spec"]["workload"]["args"]["storageclass"]) full_results.add_key("vol_size", fio_cr["spec"]["workload"]["args"]["storagesize"]) # Wait for fio pod to initialized and complete log.info("Waiting for fio_client to complete") pod_obj = OCP(kind="pod") pod_obj.wait_for_resource( condition="Completed", resource_name=fio_client_pod, timeout=18000, sleep=300, ) # Getting the end time of the test end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime()) full_results.add_key("test_time", { "start": start_time, "end": end_time }) output = run_cmd(f"oc logs {fio_client_pod}") log.info(f"The Test log is : {output}") try: if "Fio failed to execute" not in output: log.info("FIO has completed successfully") except IOError: log.info("FIO failed to complete") # Clean up fio benchmark log.info("Deleting FIO benchmark") fio_cr_obj.delete() log.debug(f"Full results is : {full_results.results}") # if Internal ES is exists, Copy all data from the Internal to main ES if es: log.info("Copy all data from Internal ES to Main ES") es._copy(full_results.es) # Adding this sleep between the copy and the analyzing of the results # since sometimes the results of the read (just after write) are empty time.sleep(30) full_results.analyze_results() # Analyze the results # Writing the analyzed test results to the Elastic-Search server full_results.es_write() full_results.codespeed_push() # Push results to codespeed # Creating full link to the results on the ES server log.info(f"The Result can be found at ; {full_results.results_link()}")
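# Illustrative arithmetic for the fio data-set sizing above (the cluster size is an
# assumed example, not a measured value): the test targets 40% of the raw capacity,
# and once that exceeds 500 GiB the per-server file size is recomputed as 0.8% of
# capacity so the number of fio client pods stays bounded at about 50.
ceph_capacity = 1500                       # GiB, assumed for the example
total_data_set = int(ceph_capacity * 0.4)  # 600 GiB to be written in total
filesize = int(ceph_capacity * 0.008)      # 12 GiB per fio server (since 600 > 500)
servers = int(total_data_set / filesize)   # 50 fio client pods
storagesize = int(filesize * 1.2)          # 14 Gi PVC, leaving headroom over the file
print(f"{servers} servers x {filesize} GiB files on {storagesize} Gi PVCs")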
def setup( self, request, scenario, num_of_nodes, num_of_fail_nodes, disrupt_provisioner, project_factory, multi_pvc_factory, dc_pod_factory ): """ Identify the nodes and start DeploymentConfig based app pods using PVC with ReadWriteOnce (RWO) access mode on selected nodes Args: scenario (str): Scenario of app pods running on OCS or dedicated nodes (eg., 'colocated', 'dedicated') num_of_nodes (int): number of nodes required for running test num_of_fail_nodes (int): number of nodes to make unresponsive during test disrupt_provisioner (bool): True to disrupt the leader provisioner pods if not running on selected nodes, else False project_factory: A fixture to create new project multi_pvc_factory: A fixture create a set of new PVCs dc_pod_factory: A fixture to create deploymentconfig pods Returns: tuple: containing the params used in test cases """ ocs_nodes, non_ocs_nodes = self.identify_and_add_nodes( scenario, num_of_nodes ) test_nodes = ocs_nodes if (scenario == "colocated") else non_ocs_nodes logger.info(f"Using nodes {test_nodes} for running test") def finalizer(): helpers.remove_label_from_worker_node( node_list=test_nodes, label_key="nodetype" ) request.addfinalizer(finalizer) if len(ocs_nodes) > 4 and float(config.ENV_DATA['ocs_version']) >= 4.3: pod_obj = ocp.OCP( kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'] ) assert pod_obj.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.MON_APP_LABEL, resource_count=5, timeout=900 ) ceph_cluster = CephCluster() project = project_factory() # Select nodes for running app pods and inducing network failure later app_pod_nodes = self.select_nodes_for_app_pods( scenario, ceph_cluster, ocs_nodes, non_ocs_nodes, num_of_fail_nodes ) # Create multiple RBD and CephFS backed PVCs with RWO accessmode num_of_pvcs = self.num_of_app_pods_per_node * num_of_fail_nodes rbd_pvcs = multi_pvc_factory( interface=constants.CEPHBLOCKPOOL, project=project, size=self.pvc_size, access_modes=[constants.ACCESS_MODE_RWO], num_of_pvc=num_of_pvcs ) cephfs_pvcs = multi_pvc_factory( interface=constants.CEPHFILESYSTEM, project=project, size=self.pvc_size, access_modes=[constants.ACCESS_MODE_RWO], num_of_pvc=num_of_pvcs ) # Create deploymentconfig based pods dc_pods = [] # Start app-pods on selected node(s) for node_name in app_pod_nodes: logger.info(f"Starting app pods on the node {node_name}") helpers.label_worker_node( node_list=[node_name], label_key="nodetype", label_value="app-pod" ) for num in range(self.num_of_app_pods_per_node): dc_pods.append( dc_pod_factory( interface=constants.CEPHBLOCKPOOL, pvc=rbd_pvcs.pop(0), node_selector={'nodetype': 'app-pod'} ) ) assert pod.verify_node_name(dc_pods[-1], node_name), ( f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}" ) dc_pods.append( dc_pod_factory( interface=constants.CEPHFILESYSTEM, pvc=cephfs_pvcs.pop(0), node_selector={'nodetype': 'app-pod'} ) ) assert pod.verify_node_name(dc_pods[-1], node_name), ( f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}" ) helpers.remove_label_from_worker_node( node_list=[node_name], label_key="nodetype" ) # Label other test nodes to be able to run app pods later helpers.label_worker_node( node_list=test_nodes, label_key="nodetype", label_value="app-pod" ) # Get ceph mon,osd pods running on selected node if colocated scenario # and extra OCS nodes are present ceph_pods = [] if scenario == "colocated" and len(test_nodes) > len(ceph_cluster.osds): pods_to_check = ceph_cluster.osds # Skip mon pods if mon_count is 5 as 
there may not be enough nodes # for all mons to run after multiple node failures if ceph_cluster.mon_count == 3: pods_to_check.extend(ceph_cluster.mons) for pod_obj in pods_to_check: if pod.get_pod_node(pod_obj).name in app_pod_nodes[0]: ceph_pods.append(pod_obj) logger.info( f"Colocated Mon, OSD pods: {[pod_obj.name for pod_obj in ceph_pods]}" ) disruptor = [] if disrupt_provisioner: disruptor = self.disrupt_plugin_provisioner_pods(app_pod_nodes) return ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor
def run_ocs_upgrade(operation=None, *operation_args, **operation_kwargs): """ Run upgrade procedure of OCS cluster Args: operation: (function): Function to run operation_args: (iterable): Function's arguments operation_kwargs: (map): Function's keyword arguments """ ceph_cluster = CephCluster() upgrade_ocs = OCSUpgrade( namespace=config.ENV_DATA["cluster_namespace"], version_before_upgrade=config.ENV_DATA.get("ocs_version"), ocs_registry_image=config.UPGRADE.get("upgrade_ocs_registry_image"), upgrade_in_current_source=config.UPGRADE.get( "upgrade_in_current_source", False), ) upgrade_version = upgrade_ocs.get_upgrade_version() assert ( upgrade_ocs.get_parsed_versions()[1] >= upgrade_ocs.get_parsed_versions()[0]), ( f"Version you would like to upgrade to: {upgrade_version} " f"is not higher or equal to the version you currently running: " f"{upgrade_ocs.version_before_upgrade}") csv_name_pre_upgrade = upgrade_ocs.get_csv_name_pre_upgrade() pre_upgrade_images = upgrade_ocs.get_pre_upgrade_image( csv_name_pre_upgrade) upgrade_ocs.load_version_config_file(upgrade_version) if config.DEPLOYMENT.get("disconnected"): upgrade_ocs.ocs_registry_image = prepare_disconnected_ocs_deployment( upgrade=True) log.info( f"Disconnected upgrade - new image: {upgrade_ocs.ocs_registry_image}" ) with CephHealthMonitor(ceph_cluster): channel = upgrade_ocs.set_upgrade_channel() upgrade_ocs.set_upgrade_images() upgrade_ocs.update_subscription(channel) if operation: log.info(f"Calling test function: {operation}") _ = operation(*operation_args, **operation_kwargs) # Workaround for issue #2531 time.sleep(30) # End of workaround for sample in TimeoutSampler( timeout=725, sleep=5, func=upgrade_ocs.check_if_upgrade_completed, channel=channel, csv_name_pre_upgrade=csv_name_pre_upgrade, ): try: if sample: log.info("Upgrade success!") break except TimeoutException: raise TimeoutException("No new CSV found after upgrade!") old_image = upgrade_ocs.get_images_post_upgrade( channel, pre_upgrade_images, upgrade_version) verify_image_versions( old_image, upgrade_ocs.get_parsed_versions()[1], upgrade_ocs.version_before_upgrade, ) ocs_install_verification( timeout=600, skip_osd_distribution_check=True, ocs_registry_image=upgrade_ocs.ocs_registry_image, post_upgrade_verification=True, version_before_upgrade=upgrade_ocs.version_before_upgrade, )
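# A minimal caller sketch (assumed, not from the suite) showing how run_ocs_upgrade()
# accepts an optional callable that is executed after the subscription is updated but
# before the upgrade-completion polling starts. The workload function below is
# hypothetical; any callable plus its positional and keyword arguments can be passed.
def run_io_during_upgrade(namespace, size="1G"):
    """Hypothetical workload to keep I/O running while the operator upgrades."""
    log.info(f"Running {size} of I/O in {namespace} during the upgrade")


# Plain upgrade, no extra operation:
run_ocs_upgrade()

# Upgrade while the callable above runs with its arguments:
run_ocs_upgrade(run_io_during_upgrade, "openshift-storage", size="2G")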
def test_pvc_multiple_clone_performance( self, interface_iterate, teardown_factory, storageclass_factory, pvc_factory, pod_factory, ): """ 1. Creating PVC PVC size is calculated in the test and depends on the storage capacity, but not less then 1 GiB it will use ~75% capacity of the Storage, Min storage capacity 1 TiB 2. Fill the PVC with 70% of data 3. Take a clone of the PVC and measure time and speed of creation by reading start creation and end creation times from relevant logs 4. Repeat the previous step number of times (maximal num_of_clones is 512) 5. Print all measured statistics for all the clones. Raises: StorageNotSufficientException: in case of not enough capacity on the cluster """ num_of_clones = 512 # Getting the total Storage capacity ceph_cluster = CephCluster() ceph_capacity = int(ceph_cluster.get_ceph_capacity()) # Use 70% of the storage capacity in the test capacity_to_use = int(ceph_capacity * 0.7) # since we do not want to use more then 65%, we add 35% to the needed # capacity, and minimum PVC size is 1 GiB need_capacity = int((num_of_clones + 2) * 1.35) # Test will run only on system with enough capacity if capacity_to_use < need_capacity: err_msg = (f"The system have only {ceph_capacity} GiB, " f"we want to use only {capacity_to_use} GiB, " f"and we need {need_capacity} GiB to run the test") log.error(err_msg) raise exceptions.StorageNotSufficientException(err_msg) # Calculating the PVC size in GiB pvc_size = int(capacity_to_use / (num_of_clones + 2)) self.interface = interface_iterate self.sc_obj = storageclass_factory(self.interface) self.pvc_obj = pvc_factory(interface=self.interface, size=pvc_size, status=constants.STATUS_BOUND) self.pod_obj = pod_factory(interface=self.interface, pvc=self.pvc_obj, status=constants.STATUS_RUNNING) # Calculating the file size as 70% of the PVC size filesize = self.pvc_obj.size * 0.70 # Change the file size to MB for the FIO function file_size = f"{int(filesize * constants.GB2MB)}M" file_name = self.pod_obj.name log.info(f"Total capacity size is : {ceph_capacity} GiB, " f"Going to use {need_capacity} GiB, " f"With {num_of_clones} clones to {pvc_size} GiB PVC. " f"File size to be written is : {file_size} " f"with the name of {file_name}") self.params = {} self.params["clonenum"] = f"{num_of_clones}" self.params["filesize"] = file_size self.params["ERRMSG"] = "Error in command" clone_yaml = self.build_params() performance_lib.write_fio_on_pod(self.pod_obj, file_size) # Running the test results = [] for test_num in range(1, int(self.params["clonenum"]) + 1): log.info(f"Starting test number {test_num}") ct = self.create_clone(test_num, clone_yaml) speed = self.params["datasize"] / ct results.append({"Clone Num": test_num, "time": ct, "speed": speed}) log.info( f"Results for clone number {test_num} are : " f"Creation time is {ct} secs, Creation speed {speed} MB/sec") for r in results: log.info( f"Clone number {r['Clone Num']} creation time is {r['time']} secs." ) log.info( f"Clone number {r['Clone Num']} creation speed is {r['speed']} MB/sec." )
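# Worked example of the capacity check above for the documented 1 TiB minimum
# (illustrative arithmetic only): with 512 clones the test needs room for the
# original PVC, its data and every clone, padded by 35%, and each PVC ends up at
# the 1 GiB floor.
ceph_capacity = 1024                                   # GiB, assumed minimum cluster size
num_of_clones = 512
capacity_to_use = int(ceph_capacity * 0.7)             # 716 GiB available to the test
need_capacity = int((num_of_clones + 2) * 1.35)        # 693 GiB required -> test may run
pvc_size = int(capacity_to_use / (num_of_clones + 2))  # 1 GiB per PVC and per clone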
def test_upgrade(): ceph_cluster = CephCluster() with CephHealthMonitor(ceph_cluster): namespace = config.ENV_DATA['cluster_namespace'] version_before_upgrade = config.ENV_DATA.get("ocs_version") upgrade_version = config.UPGRADE.get("upgrade_ocs_version", version_before_upgrade) ocs_registry_image = config.UPGRADE.get('upgrade_ocs_registry_image') if ocs_registry_image: upgrade_version = get_ocs_version_from_image(ocs_registry_image) parsed_version_before_upgrade = parse_version(version_before_upgrade) parsed_upgrade_version = parse_version(upgrade_version) assert parsed_upgrade_version >= parsed_version_before_upgrade, ( f"Version you would like to upgrade to: {upgrade_version} " f"is not higher or equal to the version you currently running: " f"{version_before_upgrade}") operator_selector = get_selector_for_ocs_operator() package_manifest = PackageManifest( resource_name=OCS_OPERATOR_NAME, selector=operator_selector, ) channel = config.DEPLOYMENT.get('ocs_csv_channel') csv_name_pre_upgrade = package_manifest.get_current_csv(channel) log.info(f"CSV name before upgrade is: {csv_name_pre_upgrade}") csv_pre_upgrade = CSV(resource_name=csv_name_pre_upgrade, namespace=namespace) pre_upgrade_images = get_images(csv_pre_upgrade.get()) version_change = parsed_upgrade_version > parsed_version_before_upgrade if version_change: version_config_file = os.path.join(constants.CONF_DIR, 'ocs_version', f'ocs-{upgrade_version}.yaml') load_config_file(version_config_file) ocs_catalog = CatalogSource( resource_name=constants.OPERATOR_CATALOG_SOURCE_NAME, namespace=constants.MARKETPLACE_NAMESPACE, ) upgrade_in_current_source = config.UPGRADE.get( 'upgrade_in_current_source', False) if not upgrade_in_current_source: if not ocs_catalog.is_exist() and not upgrade_in_current_source: log.info("OCS catalog source doesn't exist. Creating new one.") create_catalog_source(ocs_registry_image, ignore_upgrade=True) image_url = ocs_catalog.get_image_url() image_tag = ocs_catalog.get_image_name() log.info(f"Current image is: {image_url}, tag: {image_tag}") if ocs_registry_image: image_url, new_image_tag = ocs_registry_image.split(':') elif (config.UPGRADE.get('upgrade_to_latest', True) or version_change): new_image_tag = get_latest_ds_olm_tag() else: new_image_tag = get_next_version_available_for_upgrade( image_tag) cs_data = deepcopy(ocs_catalog.data) image_for_upgrade = ':'.join([image_url, new_image_tag]) log.info(f"Image: {image_for_upgrade} will be used for upgrade.") cs_data['spec']['image'] = image_for_upgrade with NamedTemporaryFile() as cs_yaml: dump_data_to_temp_yaml(cs_data, cs_yaml.name) ocs_catalog.apply(cs_yaml.name) # Wait for the new package manifest for upgrade. 
operator_selector = get_selector_for_ocs_operator() package_manifest = PackageManifest( resource_name=OCS_OPERATOR_NAME, selector=operator_selector, ) package_manifest.wait_for_resource() channel = config.DEPLOYMENT.get('ocs_csv_channel') if not channel: channel = package_manifest.get_default_channel() # update subscription subscription = OCP( resource_name=constants.OCS_SUBSCRIPTION, kind='subscription', namespace=config.ENV_DATA['cluster_namespace'], ) current_ocs_source = subscription.data['spec']['source'] log.info(f"Current OCS subscription source: {current_ocs_source}") ocs_source = current_ocs_source if upgrade_in_current_source else ( constants.OPERATOR_CATALOG_SOURCE_NAME) patch_subscription_cmd = ( f'oc patch subscription {constants.OCS_SUBSCRIPTION} ' f'-n {namespace} --type merge -p \'{{"spec":{{"channel": ' f'"{channel}", "source": "{ocs_source}"}}}}\'') run_cmd(patch_subscription_cmd) subscription_plan_approval = config.DEPLOYMENT.get( 'subscription_plan_approval') if subscription_plan_approval == 'Manual': wait_for_install_plan_and_approve(namespace) attempts = 145 for attempt in range(1, attempts + 1): log.info(f"Attempt {attempt}/{attempts} to check CSV upgraded.") csv_name_post_upgrade = package_manifest.get_current_csv(channel) if csv_name_post_upgrade == csv_name_pre_upgrade: log.info(f"CSV is still: {csv_name_post_upgrade}") sleep(5) else: log.info(f"CSV now upgraded to: {csv_name_post_upgrade}") break if attempts == attempt: raise TimeoutException("No new CSV found after upgrade!") csv_post_upgrade = CSV(resource_name=csv_name_post_upgrade, namespace=namespace) log.info( f"Waiting for CSV {csv_name_post_upgrade} to be in succeeded state" ) if version_before_upgrade == '4.2' and upgrade_version == '4.3': log.info("Force creating Ceph toolbox after upgrade 4.2 -> 4.3") setup_ceph_toolbox(force_setup=True) csv_post_upgrade.wait_for_phase("Succeeded", timeout=600) post_upgrade_images = get_images(csv_post_upgrade.get()) old_images, _, _ = get_upgrade_image_info(pre_upgrade_images, post_upgrade_images) verify_image_versions(old_images, parsed_upgrade_version) ocs_install_verification( timeout=600, skip_osd_distribution_check=True, ocs_registry_image=ocs_registry_image, post_upgrade_verification=True, )
def test_pvc_snapshot_performance(self, pvc_size): """ 1. Run I/O on a pod file 2. Calculate md5sum of the file 3. Take a snapshot of the PVC 4. Measure the total snapshot creation time and the CSI snapshot creation time 4. Restore From the snapshot and measure the time 5. Attach a new pod to it 6. Verify that the file is present on the new pod also 7. Verify that the md5sum of the file on the new pod matches with the md5sum of the file on the original pod This scenario run 3 times and report all the average results of the 3 runs and will send them to the ES Args: pvc_size: the size of the PVC to be tested - parametrize """ # Getting the total Storage capacity ceph_cluster = CephCluster() ceph_capacity = ceph_cluster.get_ceph_capacity() log.info(f"Total capacity size is : {ceph_capacity}") log.info(f"PVC Size is : {pvc_size}") log.info(f"Needed capacity is {int(int(pvc_size) * 5)}") if int(ceph_capacity) < int(pvc_size) * 5: log.error( f"PVC size is {pvc_size}GiB and it is too large for this system" f" which have only {ceph_capacity}GiB") return # Calculating the file size as 25% of the PVC size # in the end the PVC will be 75% full filesize = self.pvc_obj.size * 0.25 # Change the file size to MB and from int to str file_size = f"{int(filesize * 1024)}M" all_results = [] self.results_path = get_full_test_logs_path(cname=self) log.info(f"Logs file path name is : {self.full_log_path}") # Produce ES report # Collecting environment information self.get_env_info() # Initialize the results doc file. self.full_results = self.init_full_results( ResultsAnalyse( self.uuid, self.crd_data, self.full_log_path, "pvc_snapshot_perf", )) self.full_results.add_key("pvc_size", pvc_size + " GiB") self.full_results.add_key("interface", self.sc) self.full_results.all_results["creation_time"] = [] self.full_results.all_results["csi_creation_time"] = [] self.full_results.all_results["creation_speed"] = [] self.full_results.all_results["restore_time"] = [] self.full_results.all_results["restore_speed"] = [] self.full_results.all_results["restore_csi_time"] = [] for test_num in range(self.tests_numbers): test_results = { "test_num": test_num + 1, "dataset": (test_num + 1) * filesize * 1024, # size in MiB "create": { "time": None, "csi_time": None, "speed": None }, "restore": { "time": None, "speed": None }, } log.info(f"Starting test phase number {test_num}") # Step 1. Run I/O on a pod file. file_name = f"{self.pod_object.name}-{test_num}" log.info(f"Starting IO on the POD {self.pod_object.name}") # Going to run only write IO to fill the PVC for the snapshot self.pod_object.fillup_fs(size=file_size, fio_filename=file_name) # Wait for fio to finish fio_result = self.pod_object.get_fio_results() err_count = fio_result.get("jobs")[0].get("error") assert ( err_count == 0 ), f"IO error on pod {self.pod_object.name}. FIO result: {fio_result}" log.info("IO on the PVC Finished") # Verify presence of the file file_path = pod.get_file_path(self.pod_object, file_name) log.info(f"Actual file path on the pod {file_path}") assert pod.check_file_existence( self.pod_object, file_path), f"File {file_name} doesn't exist" log.info(f"File {file_name} exists in {self.pod_object.name}") # Step 2. Calculate md5sum of the file. orig_md5_sum = pod.cal_md5sum(self.pod_object, file_name) # Step 3. Take a snapshot of the PVC and measure the time of creation. 
snap_name = self.pvc_obj.name.replace("pvc-test", f"snapshot-test{test_num}") log.info(f"Taking snapshot of the PVC {snap_name}") start_time = datetime.datetime.utcnow().strftime( "%Y-%m-%dT%H:%M:%SZ") test_results["create"]["time"] = self.measure_create_snapshot_time( pvc_name=self.pvc_obj.name, snap_name=snap_name, namespace=self.pod_object.namespace, interface=self.interface, start_time=start_time, ) test_results["create"][ "csi_time"] = performance_lib.measure_csi_snapshot_creation_time( interface=self.interface, snapshot_id=self.snap_uid, start_time=start_time, ) test_results["create"]["speed"] = int( test_results["dataset"] / test_results["create"]["time"]) log.info( f' Test {test_num} dataset is {test_results["dataset"]} MiB') log.info( f"Snapshot name {snap_name} and id {self.snap_uid} creation time is" f' : {test_results["create"]["time"]} sec.') log.info( f"Snapshot name {snap_name} and id {self.snap_uid} csi creation time is" f' : {test_results["create"]["csi_time"]} sec.') log.info( f'Snapshot speed is : {test_results["create"]["speed"]} MB/sec' ) # Step 4. Restore the PVC from the snapshot and measure the time # Same Storage class of the original PVC sc_name = self.pvc_obj.backed_sc # Size should be same as of the original PVC pvc_size = str(self.pvc_obj.size) + "Gi" # Create pvc out of the snapshot # Both, the snapshot and the restore PVC should be in same namespace log.info("Restoring from the Snapshot") restore_pvc_name = self.pvc_obj.name.replace( "pvc-test", f"restore-pvc{test_num}") restore_pvc_yaml = constants.CSI_RBD_PVC_RESTORE_YAML if self.interface == constants.CEPHFILESYSTEM: restore_pvc_yaml = constants.CSI_CEPHFS_PVC_RESTORE_YAML csi_start_time = self.get_time("csi") log.info("Restoring the PVC from Snapshot") restore_pvc_obj = pvc.create_restore_pvc( sc_name=sc_name, snap_name=self.snap_obj.name, namespace=self.snap_obj.namespace, size=pvc_size, pvc_name=restore_pvc_name, restore_pvc_yaml=restore_pvc_yaml, ) helpers.wait_for_resource_state( restore_pvc_obj, constants.STATUS_BOUND, timeout=3600 # setting this to 60 Min. # since it can be take long time to restore, and we want it to finished. ) restore_pvc_obj.reload() log.info("PVC was restored from the snapshot") test_results["restore"][ "time"] = helpers.measure_pvc_creation_time( self.interface, restore_pvc_obj.name) test_results["restore"]["speed"] = int( test_results["dataset"] / test_results["restore"]["time"]) log.info( f'Snapshot restore time is : {test_results["restore"]["time"]}' ) log.info( f'restore speed is : {test_results["restore"]["speed"]} MB/sec' ) test_results["restore"][ "csi_time"] = performance_lib.csi_pvc_time_measure( self.interface, restore_pvc_obj, "create", csi_start_time) log.info( f'Snapshot csi restore time is : {test_results["restore"]["csi_time"]}' ) # Step 5. Attach a new pod to the restored PVC restore_pod_object = helpers.create_pod( interface_type=self.interface, pvc_name=restore_pvc_obj.name, namespace=self.snap_obj.namespace, ) # Confirm that the pod is running helpers.wait_for_resource_state(resource=restore_pod_object, state=constants.STATUS_RUNNING) restore_pod_object.reload() # Step 6. Verify that the file is present on the new pod also. log.info(f"Checking the existence of {file_name} " f"on restore pod {restore_pod_object.name}") assert pod.check_file_existence( restore_pod_object, file_path), f"File {file_name} doesn't exist" log.info(f"File {file_name} exists in {restore_pod_object.name}") # Step 7. 
Verify that the md5sum matches log.info( f"Verifying that md5sum of {file_name} " f"on pod {self.pod_object.name} matches with md5sum " f"of the same file on restore pod {restore_pod_object.name}") assert pod.verify_data_integrity( restore_pod_object, file_name, orig_md5_sum), "Data integrity check failed" log.info("Data integrity check passed, md5sum are same") restore_pod_object.delete() restore_pvc_obj.delete() all_results.append(test_results) # clean the enviroment self.pod_object.delete() self.pvc_obj.delete() self.delete_test_project() # logging the test summary, all info in one place for easy log reading c_speed, c_runtime, c_csi_runtime, r_speed, r_runtime, r_csi_runtime = ( 0 for i in range(6)) log.info("Test summary :") for tst in all_results: c_speed += tst["create"]["speed"] c_runtime += tst["create"]["time"] c_csi_runtime += tst["create"]["csi_time"] r_speed += tst["restore"]["speed"] r_runtime += tst["restore"]["time"] r_csi_runtime += tst["restore"]["csi_time"] self.full_results.all_results["creation_time"].append( tst["create"]["time"]) self.full_results.all_results["csi_creation_time"].append( tst["create"]["csi_time"]) self.full_results.all_results["creation_speed"].append( tst["create"]["speed"]) self.full_results.all_results["restore_time"].append( tst["restore"]["time"]) self.full_results.all_results["restore_speed"].append( tst["restore"]["speed"]) self.full_results.all_results["restore_csi_time"].append( tst["restore"]["csi_time"]) self.full_results.all_results["dataset_inMiB"] = tst["dataset"] log.info( f"Test {tst['test_num']} results : dataset is {tst['dataset']} MiB. " f"Take snapshot time is {tst['create']['time']} " f"at {tst['create']['speed']} MiB/Sec " f"Restore from snapshot time is {tst['restore']['time']} " f"at {tst['restore']['speed']} MiB/Sec ") avg_snap_c_time = c_runtime / self.tests_numbers avg_snap_csi_c_time = c_csi_runtime / self.tests_numbers avg_snap_c_speed = c_speed / self.tests_numbers avg_snap_r_time = r_runtime / self.tests_numbers avg_snap_r_speed = r_speed / self.tests_numbers avg_snap_r_csi_time = r_csi_runtime / self.tests_numbers log.info(f" Average snapshot creation time is {avg_snap_c_time} sec.") log.info( f" Average csi snapshot creation time is {avg_snap_csi_c_time} sec." ) log.info( f" Average snapshot creation speed is {avg_snap_c_speed} MiB/sec") log.info(f" Average snapshot restore time is {avg_snap_r_time} sec.") log.info( f" Average snapshot restore speed is {avg_snap_r_speed} MiB/sec") log.info( f" Average snapshot restore csi time is {avg_snap_r_csi_time} sec." ) self.full_results.add_key("avg_snap_creation_time_insecs", avg_snap_c_time) self.full_results.add_key("avg_snap_csi_creation_time_insecs", avg_snap_csi_c_time) self.full_results.add_key("avg_snap_creation_speed", avg_snap_c_speed) self.full_results.add_key("avg_snap_restore_time_insecs", avg_snap_r_time) self.full_results.add_key("avg_snap_restore_speed", avg_snap_r_speed) self.full_results.add_key("avg_snap_restore_csi_time_insecs", avg_snap_r_csi_time) # Write the test results into the ES server log.info("writing results to elastic search server") if self.full_results.es_write(): res_link = self.full_results.results_link() # write the ES link to the test results in the test log. log.info(f"The result can be found at : {res_link}") self.write_result_to_file(res_link)
def test_pvc_multiple_snapshot_performance(
    self,
    interface_iterate,
    teardown_factory,
    storageclass_factory,
    pvc_factory,
    pod_factory,
):
    """
    1. Create a PVC. The PVC size depends on the storage capacity, but is not
       less than 1 GiB; the test will use ~75% of the storage capacity.
       Minimum storage capacity is 1 TiB.
    2. Fill the PVC with 80% of data
    3. Take a snapshot of the PVC and measure the time of creation.
    4. Re-write the data on the PVC
    5. Take a snapshot of the PVC and measure the time of creation.
    6. Repeat steps 4-5 for the number of snapshots we want to take: 512.
       This is run by an external script to keep memory consumption low.
    7. Print all measured information.

    Raises:
        StorageNotSufficientException: in case of not enough capacity

    """
    # The number of snapshots for CephFS is 100 and for RBD is 512
    num_of_snaps = 100
    if interface_iterate == constants.CEPHBLOCKPOOL:
        num_of_snaps = 512

    # Getting the total Storage capacity
    ceph_cluster = CephCluster()
    ceph_capacity = int(ceph_cluster.get_ceph_capacity())
    # Use 70% of the storage capacity in the test
    capacity_to_use = int(ceph_capacity * 0.7)

    # Since we do not want to use more than 65%, we add 35% to the needed
    # capacity, and the minimum PVC size is 1 GiB
    need_capacity = int((num_of_snaps + 2) * 1.35)
    # The test will run only on a system with enough capacity
    if capacity_to_use < need_capacity:
        err_msg = (
            f"The system has only {ceph_capacity} GiB, "
            f"we want to use only {capacity_to_use} GiB, "
            f"and we need {need_capacity} GiB to run the test"
        )
        log.error(err_msg)
        raise exceptions.StorageNotSufficientException(err_msg)

    # Calculating the PVC size in GiB
    pvc_size = int(capacity_to_use / (num_of_snaps + 2))

    self.interface = interface_iterate
    self.sc_obj = storageclass_factory(self.interface)

    self.pvc_obj = pvc_factory(
        interface=self.interface, size=pvc_size, status=constants.STATUS_BOUND
    )
    self.pod_obj = pod_factory(
        interface=self.interface, pvc=self.pvc_obj, status=constants.STATUS_RUNNING
    )

    # Calculating the file size as 80% of the PVC size
    filesize = self.pvc_obj.size * 0.80
    # Change the file size to MB for the FIO function
    file_size = f"{int(filesize * constants.GB2MB)}M"
    file_name = self.pod_obj.name

    log.info(
        f"Total capacity size is : {ceph_capacity} GiB, "
        f"Going to use {need_capacity} GiB, "
        f"With {num_of_snaps} Snapshots to {pvc_size} GiB PVC. "
        f"File size to be written is : {file_size} "
        f"with the name of {file_name}"
    )

    os.environ["SNAPNUM"] = f"{num_of_snaps}"
    os.environ["LOGPATH"] = f"{ocsci_log_path()}"
    os.environ["FILESIZE"] = file_size
    os.environ["NSPACE"] = self.pvc_obj.namespace
    os.environ["PODNAME"] = self.pod_obj.name
    os.environ["PVCNAME"] = self.pvc_obj.name
    os.environ["INTERFACE"] = self.interface

    main_script = "tests/e2e/performance/test_multi_snapshots.py"
    result = subprocess.run([main_script], stdout=subprocess.PIPE)
    log.info(f"Results from main script : {result.stdout.decode('utf-8')}")

    if "All results are" not in result.stdout.decode("utf-8"):
        log.error("Test did not complete")
        raise Exception("Test did not complete")
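# A hypothetical sketch of how tests/e2e/performance/test_multi_snapshots.py could
# read the parameters handed over above. The actual script is not shown in this
# document; only the environment-variable contract set by the test (SNAPNUM, LOGPATH,
# FILESIZE, NSPACE, PODNAME, PVCNAME, INTERFACE) is taken from the source.
import os

num_of_snaps = int(os.environ["SNAPNUM"])
log_path = os.environ["LOGPATH"]
file_size = os.environ["FILESIZE"]
namespace = os.environ["NSPACE"]
pod_name = os.environ["PODNAME"]
pvc_name = os.environ["PVCNAME"]
interface = os.environ["INTERFACE"]

print(
    f"Taking {num_of_snaps} snapshots of {pvc_name} ({interface}) in {namespace}, "
    f"rewriting {file_size} from {pod_name} between snapshots; logs under {log_path}"
)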
def validate_cluster(self, resources, instances): """ Perform cluster validation - nodes readiness, Ceph cluster health check and functional resources tests """ instances_names = list(instances.values()) assert ocp.wait_for_nodes_ready(instances_names), ( "Not all nodes reached status Ready" ) ceph_cluster = CephCluster() assert ceph_health_check( namespace=config.ENV_DATA['cluster_namespace'] ) ceph_cluster.cluster_health_check(timeout=60) # Create resources and run IO for both FS and RBD # Unpack resources projects, secrets, pools, storageclasses, pvcs, pods = resources[:6] # Project projects.append(helpers.create_project()) # Secrets secrets.append(helpers.create_secret(constants.CEPHBLOCKPOOL)) secrets.append(helpers.create_secret(constants.CEPHFILESYSTEM)) # Pools pools.append(helpers.create_ceph_block_pool()) pools.append(helpers.get_cephfs_data_pool_name()) # Storageclasses storageclasses.append( helpers.create_storage_class( interface_type=constants.CEPHBLOCKPOOL, interface_name=pools[0].name, secret_name=secrets[0].name ) ) storageclasses.append( helpers.create_storage_class( interface_type=constants.CEPHFILESYSTEM, interface_name=pools[1], secret_name=secrets[1].name ) ) # PVCs pvcs.append(helpers.create_pvc( sc_name=storageclasses[0].name, namespace=projects[0].namespace) ) pvcs.append(helpers.create_pvc( sc_name=storageclasses[1].name, namespace=projects[0].namespace) ) # Pods pods.append( helpers.create_pod( interface_type=constants.CEPHBLOCKPOOL, pvc_name=pvcs[0].name, namespace=projects[0].namespace ) ) pods.append( helpers.create_pod( interface_type=constants.CEPHFILESYSTEM, pvc_name=pvcs[1].name, namespace=projects[0].namespace ) ) # Run IO for pod in pods: pod.run_io('fs', '1G') for pod in pods: fio_result = pod.get_fio_results() logger.info(f"IOPs after FIO for pod {pod.name}:") logger.info( f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}" ) logger.info( f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}" )
def test_pvc_snapshot_performance(self, teardown_factory, pvc_size): """ 1. Run I/O on a pod file. 2. Calculate md5sum of the file. 3. Take a snapshot of the PVC and measure the time of creation. 4. Restore From the snapshot and measure the time 5. Attach a new pod to it. 6. Verify that the file is present on the new pod also. 7. Verify that the md5sum of the file on the new pod matches with the md5sum of the file on the original pod. This scenario run 3 times and report all results Args: teardown_factory: A fixture to destroy objects pvc_size: the size of the PVC to be tested - parametrize """ # Getting the total Storage capacity ceph_cluster = CephCluster() ceph_capacity = ceph_cluster.get_ceph_capacity() log.info(f"Total capacity size is : {ceph_capacity}") log.info(f"PVC Size is : {pvc_size}") log.info(f"Needed capacity is {int(int(pvc_size) * 5)}") if int(ceph_capacity) < int(pvc_size) * 5: log.error( f"PVC size is {pvc_size}GiB and it is too large for this system" f" which have only {ceph_capacity}GiB") return # Calculating the file size as 25% of the PVC size # in the end the PVC will be 75% full filesize = self.pvc_obj.size * 0.25 # Change the file size to MB and from int to str file_size = f"{int(filesize * 1024)}M" all_results = [] for test_num in range(self.tests_numbers): test_results = { "test_num": test_num + 1, "dataset": (test_num + 1) * filesize * 1024, # size in MiB "create": { "time": None, "speed": None }, "restore": { "time": None, "speed": None }, } log.info(f"Starting test phase number {test_num}") # Step 1. Run I/O on a pod file. file_name = f"{self.pod_obj.name}-{test_num}" log.info(f"Starting IO on the POD {self.pod_obj.name}") # Going to run only write IO to fill the PVC for the snapshot self.pod_obj.fillup_fs(size=file_size, fio_filename=file_name) # Wait for fio to finish fio_result = self.pod_obj.get_fio_results() err_count = fio_result.get("jobs")[0].get("error") assert ( err_count == 0 ), f"IO error on pod {self.pod_obj.name}. FIO result: {fio_result}" log.info("IO on the PVC Finished") # Verify presence of the file file_path = pod.get_file_path(self.pod_obj, file_name) log.info(f"Actual file path on the pod {file_path}") assert pod.check_file_existence( self.pod_obj, file_path), f"File {file_name} doesn't exist" log.info(f"File {file_name} exists in {self.pod_obj.name}") # Step 2. Calculate md5sum of the file. orig_md5_sum = pod.cal_md5sum(self.pod_obj, file_name) # Step 3. Take a snapshot of the PVC and measure the time of creation. snap_name = self.pvc_obj.name.replace("pvc-test", f"snapshot-test{test_num}") log.info(f"Taking snapshot of the PVC {snap_name}") test_results["create"]["time"] = self.measure_create_snapshot_time( pvc_name=self.pvc_obj.name, snap_name=snap_name, interface=self.interface, ) test_results["create"]["speed"] = int( test_results["dataset"] / test_results["create"]["time"]) log.info( f' Test {test_num} dataset is {test_results["dataset"]} MiB') log.info( f'Snapshot creation time is : {test_results["create"]["time"]} sec.' ) log.info( f'Snapshot speed is : {test_results["create"]["speed"]} MB/sec' ) # Step 4. 
Restore the PVC from the snapshot and measure the time # Same Storage class of the original PVC sc_name = self.pvc_obj.backed_sc # Size should be same as of the original PVC pvc_size = str(self.pvc_obj.size) + "Gi" # Create pvc out of the snapshot # Both, the snapshot and the restore PVC should be in same namespace log.info("Restoring from the Snapshot") restore_pvc_name = self.pvc_obj.name.replace( "pvc-test", f"restore-pvc{test_num}") restore_pvc_yaml = constants.CSI_RBD_PVC_RESTORE_YAML if self.interface == constants.CEPHFILESYSTEM: restore_pvc_yaml = constants.CSI_CEPHFS_PVC_RESTORE_YAML log.info("Restoring the PVC from Snapshot") restore_pvc_obj = pvc.create_restore_pvc( sc_name=sc_name, snap_name=self.snap_obj.name, namespace=self.snap_obj.namespace, size=pvc_size, pvc_name=restore_pvc_name, restore_pvc_yaml=restore_pvc_yaml, ) helpers.wait_for_resource_state( restore_pvc_obj, constants.STATUS_BOUND, timeout=3600 # setting this to 60 Min. # since it can take a long time to restore, and we want it to finish. ) teardown_factory(restore_pvc_obj) restore_pvc_obj.reload() log.info("PVC was restored from the snapshot") test_results["restore"][ "time"] = helpers.measure_pvc_creation_time( self.interface, restore_pvc_obj.name) test_results["restore"]["speed"] = int( test_results["dataset"] / test_results["restore"]["time"]) log.info( f'Snapshot restore time is : {test_results["restore"]["time"]}' ) log.info( f'restore speed is : {test_results["restore"]["speed"]} MB/sec') # Step 5. Attach a new pod to the restored PVC restore_pod_obj = helpers.create_pod( interface_type=self.interface, pvc_name=restore_pvc_obj.name, namespace=self.snap_obj.namespace, pod_dict_path=constants.NGINX_POD_YAML, ) # Confirm that the pod is running helpers.wait_for_resource_state(resource=restore_pod_obj, state=constants.STATUS_RUNNING) teardown_factory(restore_pod_obj) restore_pod_obj.reload() # Step 6. Verify that the file is present on the new pod also. log.info(f"Checking the existence of {file_name} " f"on restore pod {restore_pod_obj.name}") assert pod.check_file_existence( restore_pod_obj, file_path), f"File {file_name} doesn't exist" log.info(f"File {file_name} exists in {restore_pod_obj.name}") # Step 7. Verify that the md5sum matches log.info(f"Verifying that md5sum of {file_name} " f"on pod {self.pod_obj.name} matches with md5sum " f"of the same file on restore pod {restore_pod_obj.name}") assert pod.verify_data_integrity( restore_pod_obj, file_name, orig_md5_sum), "Data integrity check failed" log.info("Data integrity check passed, md5sums are the same") all_results.append(test_results) # logging the test summary, all info in one place for easy log reading c_speed, c_runtime, r_speed, r_runtime = (0 for i in range(4)) log.info("Test summary :") for tst in all_results: c_speed += tst["create"]["speed"] c_runtime += tst["create"]["time"] r_speed += tst["restore"]["speed"] r_runtime += tst["restore"]["time"] log.info( f"Test {tst['test_num']} results : dataset is {tst['dataset']} MiB. " f"Take snapshot time is {tst['create']['time']} " f"at {tst['create']['speed']} MiB/Sec " f"Restore from snapshot time is {tst['restore']['time']} " f"at {tst['restore']['speed']} MiB/Sec ") log.info( f" Average snapshot creation time is {c_runtime / self.tests_numbers} sec." ) log.info( f" Average snapshot creation speed is {c_speed / self.tests_numbers} MiB/sec" ) log.info( f" Average snapshot restore time is {r_runtime / self.tests_numbers} sec."
) log.info( f" Average snapshot restore speed is {r_speed / self.tests_numbers} MiB/sec" )
def run_ocs_upgrade(operation=None, *operation_args, **operation_kwargs): """ Run upgrade procedure of OCS cluster Args: operation: (function): Function to run operation_args: (iterable): Function's arguments operation_kwargs: (map): Function's keyword arguments """ ceph_cluster = CephCluster() original_ocs_version = config.ENV_DATA.get("ocs_version") upgrade_in_current_source = config.UPGRADE.get("upgrade_in_current_source", False) upgrade_ocs = OCSUpgrade( namespace=config.ENV_DATA["cluster_namespace"], version_before_upgrade=original_ocs_version, ocs_registry_image=config.UPGRADE.get("upgrade_ocs_registry_image"), upgrade_in_current_source=upgrade_in_current_source, ) upgrade_version = upgrade_ocs.get_upgrade_version() assert ( upgrade_ocs.get_parsed_versions()[1] >= upgrade_ocs.get_parsed_versions()[0]), ( f"Version you would like to upgrade to: {upgrade_version} " f"is not higher or equal to the version you currently running: " f"{upgrade_ocs.version_before_upgrade}") # create external cluster object if config.DEPLOYMENT["external_mode"]: host = config.EXTERNAL_MODE["external_cluster_node_roles"]["node1"][ "ip_address"] user = config.EXTERNAL_MODE["login"]["username"] password = config.EXTERNAL_MODE["login"]["password"] external_cluster = ExternalCluster(host, user, password) # For external cluster , create the secrets if upgraded version is 4.8 if (config.DEPLOYMENT["external_mode"] and original_ocs_version == "4.7" and upgrade_version == "4.8"): external_cluster.create_object_store_user() access_key = config.EXTERNAL_MODE.get("access_key_rgw-admin-ops-user", "") secret_key = config.EXTERNAL_MODE.get("secret_key_rgw-admin-ops-user", "") if not (access_key and secret_key): raise ExternalClusterRGWAdminOpsUserException( "Access and secret key for rgw-admin-ops-user not found") cmd = ( f'oc create secret generic --type="kubernetes.io/rook"' f' "rgw-admin-ops-user" --from-literal=accessKey={access_key} --from-literal=secretKey={secret_key}' ) exec_cmd(cmd) csv_name_pre_upgrade = upgrade_ocs.get_csv_name_pre_upgrade() pre_upgrade_images = upgrade_ocs.get_pre_upgrade_image( csv_name_pre_upgrade) upgrade_ocs.load_version_config_file(upgrade_version) if config.DEPLOYMENT.get("disconnected") and not config.DEPLOYMENT.get( "disconnected_env_skip_image_mirroring"): upgrade_ocs.ocs_registry_image = prepare_disconnected_ocs_deployment( upgrade=True) log.info( f"Disconnected upgrade - new image: {upgrade_ocs.ocs_registry_image}" ) with CephHealthMonitor(ceph_cluster): channel = upgrade_ocs.set_upgrade_channel() upgrade_ocs.set_upgrade_images() ui_upgrade_supported = False if config.UPGRADE.get("ui_upgrade"): if (version.get_semantic_ocp_version_from_config() == version.VERSION_4_9 and original_ocs_version == "4.8" and upgrade_version == "4.9"): ui_upgrade_supported = True else: log.warning( "UI upgrade combination is not supported. It will fallback to CLI upgrade" ) if ui_upgrade_supported: ocs_odf_upgrade_ui() else: if (config.ENV_DATA["platform"] == constants.IBMCLOUD_PLATFORM ) and not (upgrade_in_current_source): create_ocs_secret(config.ENV_DATA["cluster_namespace"]) if upgrade_version != "4.9": # In the case of upgrade to ODF 4.9, the ODF operator should upgrade # OCS automatically. 
upgrade_ocs.update_subscription(channel) if original_ocs_version == "4.8" and upgrade_version == "4.9": deployment = Deployment() deployment.subscribe_ocs() else: # In the case upgrade is not from 4.8 to 4.9 and we have manual approval strategy # we need to wait and approve install plan, otherwise it's approved in the # subscribe_ocs method. subscription_plan_approval = config.DEPLOYMENT.get( "subscription_plan_approval") if subscription_plan_approval == "Manual": wait_for_install_plan_and_approve( config.ENV_DATA["cluster_namespace"]) if (config.ENV_DATA["platform"] == constants.IBMCLOUD_PLATFORM ) and not (upgrade_in_current_source): for attempt in range(2): # We need to do it twice, because some of the SA are updated # after the first load of OCS pod after upgrade. So we need to # link updated SA again. log.info(f"Sleep 1 minute before attempt: {attempt + 1}/2 " "of linking secret/SAs") time.sleep(60) link_all_sa_and_secret_and_delete_pods( constants.OCS_SECRET, config.ENV_DATA["cluster_namespace"]) if operation: log.info(f"Calling test function: {operation}") _ = operation(*operation_args, **operation_kwargs) # Workaround for issue #2531 time.sleep(30) # End of workaround for sample in TimeoutSampler( timeout=725, sleep=5, func=upgrade_ocs.check_if_upgrade_completed, channel=channel, csv_name_pre_upgrade=csv_name_pre_upgrade, ): try: if sample: log.info("Upgrade success!") break except TimeoutException: raise TimeoutException("No new CSV found after upgrade!") old_image = upgrade_ocs.get_images_post_upgrade( channel, pre_upgrade_images, upgrade_version) verify_image_versions( old_image, upgrade_ocs.get_parsed_versions()[1], upgrade_ocs.version_before_upgrade, ) # update external secrets if config.DEPLOYMENT["external_mode"]: upgrade_version = version.get_semantic_version(upgrade_version, True) if upgrade_version >= version.VERSION_4_10: external_cluster.update_permission_caps() else: external_cluster.update_permission_caps(EXTERNAL_CLUSTER_USER) external_cluster.get_external_cluster_details() # update the external cluster details in secrets log.info("updating external cluster secret") external_cluster_details = NamedTemporaryFile( mode="w+", prefix="external-cluster-details-", delete=False, ) with open(external_cluster_details.name, "w") as fd: decoded_external_cluster_details = decode( config.EXTERNAL_MODE["external_cluster_details"]) fd.write(decoded_external_cluster_details) cmd = ( f"oc set data secret/rook-ceph-external-cluster-details -n {constants.OPENSHIFT_STORAGE_NAMESPACE} " f"--from-file=external_cluster_details={external_cluster_details.name}" ) exec_cmd(cmd) ocs_install_verification( timeout=600, skip_osd_distribution_check=True, ocs_registry_image=upgrade_ocs.ocs_registry_image, post_upgrade_verification=True, version_before_upgrade=upgrade_ocs.version_before_upgrade, )
class TestFullClusterHealth(PASTest): """ Test Cluster health when storage is ~85% """ @pytest.fixture(autouse=True) def setup(self, request, nodes): """ Setting up test parameters """ def teardown(): logger.info("cleanup the environment") nodes.restart_nodes_by_stop_and_start_teardown() request.addfinalizer(teardown) logger.info("Starting the test setup") self.percent_to_fill = 85.0 self.ceph_cluster = CephCluster() self.nodes = None self.benchmark_name = "FIO" self.client_pod_name = "fio-client" self.sanity_helpers = sanity_helpers.Sanity() super(TestFullClusterHealth, self).setup() # deploy the benchmark-operator self.deploy_benchmark_operator() def run(self): """ Run the test, and wait until it finished """ self.deploy_and_wait_for_wl_to_start(timeout=900) self.wait_for_wl_to_finish(sleep=300) try: if "Fio failed to execute" not in self.test_logs: logger.info("FIO has completed successfully") except IOError: logger.warning("FIO failed to complete") def calculate_crd_data(self): """ Getting the storage capacity and calculate pod count and pvc size """ ceph_used_capacity_percent = get_percent_used_capacity() logger.info(f"Ceph used capacity percent is {ceph_used_capacity_percent}%") ceph_capacity = self.ceph_cluster.get_ceph_capacity() logger.info(f"Total storage capacity is {ceph_capacity} GiB") self.percent_to_fill = self.percent_to_fill - ceph_used_capacity_percent logger.info(f"Percentage to fill is {self.percent_to_fill}%") self.total_data_set = int(ceph_capacity * (int(self.percent_to_fill) / 100)) self.filesize = int( self.crd_data["spec"]["workload"]["args"]["filesize"].replace("GiB", "") ) # Make sure that filesize>=10 and servers<=60 self.servers = 60 self.filesize = int(self.total_data_set / self.servers) if self.filesize < 10: self.filesize = 10 self.servers = int(self.total_data_set / self.filesize) self.crd_data["spec"]["workload"]["args"]["filesize"] = f"{self.filesize}GiB" self.crd_data["spec"]["workload"]["args"][ "storagesize" ] = f"{int(self.total_data_set)}Gi" self.crd_data["spec"]["workload"]["args"]["servers"] = self.servers self.crd_data["spec"]["workload"]["args"]["bs"] = "1024KiB" self.crd_data["spec"]["workload"]["args"]["jobs"] = ["write", "read"] self.crd_data["spec"]["workload"]["args"]["iodepth"] = 1 def delete_pods(self): """ Try to delete pods: - Rook operator - OSD - MGR - MON """ pod_list = [] rook_operator_pod = pod.get_ocs_operator_pod( ocs_label=constants.OPERATOR_LABEL, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, ) pod_list.append(rook_operator_pod) osd_pods = pod.get_osd_pods() pod_list.extend(osd_pods) mgr_pods = pod.get_mgr_pods() pod_list.extend(mgr_pods) mon_pods = pod.get_mon_pods() pod_list.extend(mon_pods) logger.info(f"Deleting pods: {[p.name for p in pod_list]}") pod.delete_pods(pod_objs=pod_list) def ceph_not_health_error(self): """ Check if Ceph is NOT in "HEALTH_ERR" state Warning state is ok since the cluster is low in storage space Returns: bool: True if Ceph state is NOT "HEALTH_ERR" """ ceph_status = self.ceph_cluster.get_ceph_health() logger.info(f"Ceph status is: {ceph_status}") return ceph_status != "HEALTH_ERR" def mgr_pod_node_restart(self): """ Restart node that runs mgr pod """ mgr_pod_obj = pod.get_mgr_pods() mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0]) self.nodes.restart_nodes([mgr_node_obj]) wait_for_nodes_status() # Check for Ceph pods pod_obj = ocp.OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE) assert pod_obj.wait_for_resource( condition="Running", selector="app=rook-ceph-mgr", timeout=600 ) 
assert pod_obj.wait_for_resource( condition="Running", selector="app=rook-ceph-mon", resource_count=3, timeout=600, ) assert pod_obj.wait_for_resource( condition="Running", selector="app=rook-ceph-osd", resource_count=3, timeout=600, ) def restart_ocs_operator_node(self): """ Restart node that runs OCS operator pod """ pod_obj = pod.get_ocs_operator_pod() node_obj = pod.get_pod_node(pod_obj) self.nodes.restart_nodes([node_obj]) wait_for_nodes_status() pod.wait_for_pods_to_be_running( namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, pod_names=[pod_obj.name] ) def is_cluster_healthy(self): """ Wrapper function for cluster health check Returns: bool: True if ALL checks passed, False otherwise """ return self.ceph_not_health_error() and pod.wait_for_pods_to_be_running() @system_test @polarion_id("OCS-2749") def test_full_cluster_health( self, nodes, pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory, ): """ Verify that the cluster health is ok when the storage is ~85% full Steps: 1. Deploy benchmark operator and run fio workload 2. Check Ceph health before/after each operation: 2.1 Osd node reboot 2.2 Mgr node reboot 2.3 OCS operator node reboot 2.4 Delete Rook, OSD, MGR & MON pods 2.5 Creation and deletion of resources """ self.nodes = nodes self.full_log_path = get_full_test_logs_path(cname=self) logger.info(f"Logs file path name is : {self.full_log_path}") logger.info("Create resource file for fio workload") self.crd_data = templating.load_yaml(constants.FIO_CR_YAML) self.calculate_crd_data() self.set_storageclass(interface=constants.CEPHBLOCKPOOL) self.run() logger.info("Checking health before disruptive operations") assert self.is_cluster_healthy(), "Cluster is not healthy" osd_node_reboot() logger.info("Checking health after OSD node reboot") assert self.is_cluster_healthy(), "Cluster is not healthy" self.mgr_pod_node_restart() logger.info("Checking health after worker node shutdown") assert self.is_cluster_healthy(), "Cluster is not healthy" self.restart_ocs_operator_node() logger.info("Checking health after OCS operator node restart") assert self.is_cluster_healthy(), "Cluster is not healthy" self.delete_pods() logger.info("Checking health after Rook, OSD, MGR & MON pods deletion") assert self.is_cluster_healthy(), "Cluster is not healthy" # Create resources logger.info("Creating Resources using sanity helpers") self.sanity_helpers.create_resources( pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory ) logger.info("Resources Created") # Delete resources logger.info("Deleting resources") self.sanity_helpers.delete_resources() logger.info("Resources Deleted") logger.info( "Checking health after resources creation and deletion using sanity helpers" ) assert self.is_cluster_healthy(), "Cluster is not healthy"
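# The sizing arithmetic from calculate_crd_data() above, pulled out as a pure
# function so the numbers are easy to sanity-check by hand. This is only an
# illustrative sketch; the function name and the example figures are
# hypothetical, not part of the test class.
def fio_sizing(ceph_capacity_gib, used_percent, target_percent=85.0,
               max_servers=60, min_filesize_gib=10):
    """Return (filesize_gib, servers, total_data_set_gib) using the same
    rules as calculate_crd_data: fill the cluster up to the target
    percentage, keep file size >= 10 GiB and the server count <= 60."""
    percent_to_fill = target_percent - used_percent
    total_data_set = int(ceph_capacity_gib * (int(percent_to_fill) / 100))
    servers = max_servers
    filesize = int(total_data_set / servers)
    if filesize < min_filesize_gib:
        filesize = min_filesize_gib
        servers = int(total_data_set / filesize)
    return filesize, servers, total_data_set


# e.g. a 1024 GiB cluster that is already 12% used:
assert fio_sizing(1024, 12) == (12, 60, 747)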
def test_upgrade():
    ceph_cluster = CephCluster()
    ceph_cluster.enable_health_monitor()
    namespace = config.ENV_DATA['cluster_namespace']
    ocs_catalog = CatalogSource(
        resource_name=constants.OPERATOR_CATALOG_SOURCE_NAME,
        namespace=constants.MARKETPLACE_NAMESPACE,
    )
    version_before_upgrade = config.ENV_DATA.get("ocs_version")
    upgrade_version = config.UPGRADE.get(
        "upgrade_ocs_version", version_before_upgrade
    )
    parsed_version_before_upgrade = parse_version(version_before_upgrade)
    parsed_upgrade_version = parse_version(upgrade_version)
    assert parsed_upgrade_version >= parsed_version_before_upgrade, (
        f"Version you would like to upgrade to: {upgrade_version} "
        f"is not higher than or equal to the version you are currently "
        f"running: {version_before_upgrade}"
    )
    version_change = parsed_upgrade_version > parsed_version_before_upgrade
    if version_change:
        version_config_file = os.path.join(
            constants.CONF_DIR, 'ocs_version', f'ocs-{upgrade_version}.yaml'
        )
        assert os.path.exists(version_config_file), (
            f"OCS version config file {version_config_file} doesn't exist!"
        )
        with open(
            os.path.abspath(os.path.expanduser(version_config_file))
        ) as file_stream:
            custom_config_data = yaml.safe_load(file_stream)
            config.update(custom_config_data)
    image_url = ocs_catalog.get_image_url()
    image_tag = ocs_catalog.get_image_name()
    log.info(f"Current image is: {image_url}, tag: {image_tag}")
    ocs_registry_image = config.UPGRADE.get('upgrade_ocs_registry_image')
    if ocs_registry_image:
        image_url, new_image_tag = ocs_registry_image.split(':')
    elif config.UPGRADE.get('upgrade_to_latest', True) or version_change:
        new_image_tag = get_latest_ds_olm_tag()
    else:
        new_image_tag = get_next_version_available_for_upgrade(image_tag)
    cs_data = deepcopy(ocs_catalog.data)
    image_for_upgrade = ':'.join([image_url, new_image_tag])
    log.info(f"Image: {image_for_upgrade} will be used for upgrade.")
    cs_data['spec']['image'] = image_for_upgrade
    operator_selector = get_selector_for_ocs_operator()
    package_manifest = PackageManifest(
        resource_name=OCS_OPERATOR_NAME,
        selector=operator_selector,
    )
    csv_name_pre_upgrade = package_manifest.get_current_csv()
    log.info(f"CSV name before upgrade is: {csv_name_pre_upgrade}")
    csv_pre_upgrade = CSV(
        resource_name=csv_name_pre_upgrade,
        namespace=namespace
    )
    pre_upgrade_images = get_images(csv_pre_upgrade.get())
    with NamedTemporaryFile() as cs_yaml:
        dump_data_to_temp_yaml(cs_data, cs_yaml.name)
        ocs_catalog.apply(cs_yaml.name)
    # Wait for the package manifest to be ready
    package_manifest.wait_for_resource()
    subscription_plan_approval = config.DEPLOYMENT.get(
        'subscription_plan_approval'
    )
    if subscription_plan_approval == 'Manual':
        wait_for_install_plan_and_approve(namespace)
    # Poll until the package manifest reports a new CSV; the for/else ensures
    # we fail loudly if no new CSV ever shows up.
    attempts = 145
    for attempt in range(1, attempts + 1):
        log.info(f"Attempt {attempt}/{attempts} to check CSV upgraded.")
        package_manifest.reload_data()
        csv_name_post_upgrade = package_manifest.get_current_csv()
        if csv_name_post_upgrade == csv_name_pre_upgrade:
            log.info(f"CSV is still: {csv_name_post_upgrade}")
            sleep(5)
        else:
            log.info(f"CSV now upgraded to: {csv_name_post_upgrade}")
            break
    else:
        raise TimeoutException("No new CSV found after upgrade!")
    csv_post_upgrade = CSV(
        resource_name=csv_name_post_upgrade,
        namespace=namespace
    )
    log.info(
        f"Waiting for CSV {csv_name_post_upgrade} to be in succeeded state"
    )
    if version_before_upgrade == '4.2' and upgrade_version == '4.3':
        log.info("Force creating Ceph toolbox after upgrade 4.2 -> 4.3")
        setup_ceph_toolbox(force_setup=True)
    csv_post_upgrade.wait_for_phase("Succeeded", timeout=600)
    post_upgrade_images
= get_images(csv_post_upgrade.get())
    old_images, _, _ = get_upgrade_image_info(
        pre_upgrade_images, post_upgrade_images
    )
    verify_image_versions(old_images, parsed_upgrade_version)
    ocs_install_verification(timeout=600, skip_osd_distribution_check=True)
    ceph_cluster.disable_health_monitor()
    if ceph_cluster.health_error_status:
        raise CephHealthException(
            f"During upgrade hit Ceph HEALTH_ERROR: "
            f"{ceph_cluster.health_error_status}"
        )
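# The CSV polling in test_upgrade relies on a bounded retry loop; Python's
# for/else keeps the "exhausted without success" case explicit, since the
# else branch runs only when the loop was never broken out of. A minimal
# sketch of that pattern follows; `wait_for_change` and `CsvTimeout` are
# hypothetical names, not ocs-ci helpers.
from time import sleep


class CsvTimeout(Exception):
    pass


def wait_for_change(get_value, old_value, attempts=145, delay=5):
    for _ in range(1, attempts + 1):
        current = get_value()
        if current != old_value:
            break
        sleep(delay)
    else:
        # Runs only when the loop finished without hitting the break above.
        raise CsvTimeout(f"Value did not change after {attempts} attempts")
    return current


# Example usage: returns "new" on the first attempt.
assert wait_for_change(lambda: "new", "old", attempts=3, delay=0) == "new"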
def setup( self, request, scenario, num_of_nodes, num_of_fail_nodes, disrupt_provisioner, project_factory, multi_pvc_factory, dc_pod_factory, ): """ Identify the nodes and start DeploymentConfig based app pods using PVC with ReadWriteOnce (RWO) access mode on selected nodes Args: scenario (str): Scenario of app pods running on OCS or dedicated nodes (eg., 'colocated', 'dedicated') num_of_nodes (int): number of nodes required for running test num_of_fail_nodes (int): number of nodes to make unresponsive during test disrupt_provisioner (bool): True to disrupt the leader provisioner pods if not running on selected nodes, else False project_factory: A fixture to create new project multi_pvc_factory: A fixture create a set of new PVCs dc_pod_factory: A fixture to create deploymentconfig pods Returns: tuple: containing the params used in test cases """ ocs_nodes, non_ocs_nodes = self.identify_and_add_nodes( scenario, num_of_nodes) test_nodes = ocs_nodes if (scenario == "colocated") else non_ocs_nodes logger.info(f"Using nodes {test_nodes} for running test") def finalizer(): helpers.remove_label_from_worker_node(node_list=test_nodes, label_key="nodetype") request.addfinalizer(finalizer) ceph_cluster = CephCluster() project = project_factory() # Wait for mon pods to reach expected count # Bug 1778273 - [RFE]: Configure 5 MONs for OCS cluster with 5 or more nodes # This wait is required for some of the previous OCS versions (< 4.5) current_mon_count = int( ceph_cluster.CEPHCLUSTER.get_resource(resource_name="", column="MONCOUNT")) assert ceph_cluster.POD.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.MON_APP_LABEL, resource_count=current_mon_count, timeout=900, ) ceph_cluster.mons = [] ceph_cluster.scan_cluster() # Select nodes for running app pods and inducing network failure later app_pod_nodes = self.select_nodes_for_app_pods(scenario, ceph_cluster, ocs_nodes, non_ocs_nodes, num_of_fail_nodes) # Create multiple RBD and CephFS backed PVCs with RWO accessmode num_of_pvcs = self.num_of_app_pods_per_node * num_of_fail_nodes rbd_pvcs = multi_pvc_factory( interface=constants.CEPHBLOCKPOOL, project=project, size=self.pvc_size, access_modes=[constants.ACCESS_MODE_RWO], num_of_pvc=num_of_pvcs, ) cephfs_pvcs = multi_pvc_factory( interface=constants.CEPHFILESYSTEM, project=project, size=self.pvc_size, access_modes=[constants.ACCESS_MODE_RWO], num_of_pvc=num_of_pvcs, ) # Create deploymentconfig based pods dc_pods = [] # Start app-pods on selected node(s) for node_name in app_pod_nodes: logger.info(f"Starting app pods on the node {node_name}") helpers.label_worker_node(node_list=[node_name], label_key="nodetype", label_value="app-pod") for num in range(self.num_of_app_pods_per_node): dc_pods.append( dc_pod_factory( interface=constants.CEPHBLOCKPOOL, pvc=rbd_pvcs.pop(0), node_selector={"nodetype": "app-pod"}, )) assert pod.verify_node_name( dc_pods[-1], node_name ), f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}" dc_pods.append( dc_pod_factory( interface=constants.CEPHFILESYSTEM, pvc=cephfs_pvcs.pop(0), node_selector={"nodetype": "app-pod"}, )) assert pod.verify_node_name( dc_pods[-1], node_name ), f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}" helpers.remove_label_from_worker_node(node_list=[node_name], label_key="nodetype") # Label other test nodes to be able to run app pods later helpers.label_worker_node(node_list=test_nodes, label_key="nodetype", label_value="app-pod") # Get ceph mon,osd pods running on selected node if colocated 
scenario
        # and extra OCS nodes are present
        # Recovery steps for MON and OSDS not required from OCS 4.4 onwards
        # Refer to BZ 1830015 and BZ 1835908
        ceph_pods = []
        if float(config.ENV_DATA["ocs_version"]) < 4.4 and (
                scenario == "colocated" and len(test_nodes) > 3):
            pods_to_check = ceph_cluster.osds
            # Skip mon pods if mon_count is 5 as there may not be enough nodes
            # for all mons to run after multiple node failures
            if ceph_cluster.mon_count == 3:
                pods_to_check.extend(ceph_cluster.mons)
            # Collect the Ceph pods colocated on any of the nodes selected
            # for app pods (the nodes that will later be made unresponsive)
            for pod_obj in pods_to_check:
                if pod.get_pod_node(pod_obj).name in app_pod_nodes:
                    ceph_pods.append(pod_obj)
            logger.info(
                f"Colocated Mon, OSD pods: {[pod_obj.name for pod_obj in ceph_pods]}"
            )

        disruptor = []
        if disrupt_provisioner:
            disruptor = self.disrupt_plugin_provisioner_pods(app_pod_nodes)

        return ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor
def test_scale_osds_reboot_nodes(self, interface, project_factory, multi_pvc_factory, dc_pod_factory): """ Check storage utilization, if its less then runs IO, Scale osds from 3-6, check for rebalance and reboot workers """ current_osd_count = count_cluster_osd() proj_obj = project_factory() if current_osd_count == 3: while not validate_osd_utilization(osd_used=10): # Create pvc pvc_objs = multi_pvc_factory(project=proj_obj, interface=interface, size=self.pvc_size, num_of_pvc=self.num_of_pvcs) dc_pod_objs = list() for pvc_obj in pvc_objs: dc_pod_objs.append(dc_pod_factory(pvc=pvc_obj)) wait_for_dc_app_pods_to_reach_running_state(dc_pod_objs, timeout=1200) for pod_obj in dc_pod_objs: pod_obj.run_io(storage_type='fs', size='3G', runtime='60', fio_filename=f'{pod_obj.name}_io') # Add capacity osd_size = storage_cluster.get_osd_size() count = storage_cluster.add_capacity(osd_size) pod = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace']) pod.wait_for_resource(timeout=300, condition=constants.STATUS_RUNNING, selector='app=rook-ceph-osd', resource_count=count * 3) assert ceph_health_check( delay=120, tries=50), "New OSDs failed to reach running state" cluster = CephCluster() # Get rebalance status rebalance_status = cluster.get_rebalance_status() logger.info(rebalance_status) if rebalance_status: time_taken = cluster.time_taken_to_complete_rebalance() logger.info(f"The time taken to complete rebalance {time_taken}") # Rolling reboot on worker nodes worker_nodes = get_typed_nodes(node_type='worker') factory = platform_nodes.PlatformNodesFactory() nodes = factory.get_nodes_platform() for node in worker_nodes: nodes.restart_nodes(nodes=[node]) wait_for_nodes_status() assert ceph_health_check( delay=180), "Failed, Ceph health bad after nodes reboot"
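# The wait in the test above expects `count * 3` rook-ceph-osd pods. A small
# sketch of that expectation, under the assumption that add_capacity() returns
# the new storage device-set count and that each device set backs 3 OSDs (one
# per replica), which is the usual default. `expected_osd_pods` is a
# hypothetical helper, not part of the test.
def expected_osd_pods(device_set_count, osds_per_device_set=3):
    """Number of rook-ceph-osd pods expected once capacity has been added."""
    return device_set_count * osds_per_device_set


# e.g. growing to 2 device sets should end with 6 OSD pods running.
assert expected_osd_pods(2) == 6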
def test_vdbench_workload(self, template, with_ocs, load, label_nodes, ripsaw, servers, threads, blocksize, fileio, samples, width, depth, files, file_size, runtime, pause): """ Run VDBench Workload Args : template (str) : Name of yaml file that will used as a template with_ocs (bool) : This parameter will indicate if the test will run on the same nodes as the OCS load (int) : load to run on the storage in percentage of the capacity. label_nodes (fixture) : This fixture is labeling the worker(s) that will used for App. pod(s) ripsaw (fixture) : Fixture to deploy the ripsaw benchmarking operator servers (int) : Number of servers (pods) that will run the IO threads (int) : Number of threads that will run on each server blocksize (list - str): List of BlockSize - must add the 'K' to it fileio (str) : How to select file for the IO : random / sequential samples (int) : Number of time(s) to run each test width (int) : Width of directory tree to create depth (int) : Depth of directory tree to create files (int) : Number of files to create in each directory file_size (int) : File size (in MB) to create runtime (int) : Time (in Sec.) for each test iteration pause (int) : Time (in Min.) to pause between each test iteration. """ log.info(f'going to use {template} as template') log.info("Apply Operator CRD") crd = 'resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml' ripsaw.apply_crd(crd) log.info('Running vdbench benchmark') if template: template = os.path.join(constants.TEMPLATE_VDBENCH_DIR, template) else: template = constants.VDBENCH_BENCHMARK_YAML sf_data = templating.load_yaml(template) target_results = template + 'Results' log.info('Calculating Storage size....') ceph_cluster = CephCluster() total_capacity = ceph_cluster.get_ceph_capacity() assert total_capacity > constants.VDBENCH_MIN_CAPACITY, ( "Storage capacity is too low for performance testing") log.info(f'The Total usable capacity is {total_capacity}') if load: width = constants.VDBENCH_WIDTH depth = constants.VDBENCH_DEPTH file_size = constants.VDBENCH_FILE_SIZE capacity_per_pod = constants.VDBENCH_CAP_PER_POD total_dirs = width**depth log.info(f'The total dirs in the tree {total_dirs}') log.info(f'Going to run with {load} % of the capacity load.') tested_capacity = round(total_capacity * 1024 * load / 100) log.info(f'Tested capacity is {tested_capacity} MB') servers = round(tested_capacity / capacity_per_pod) """ To spread the application pods evenly on all workers or application nodes and at least 2 app pods per node. 
""" nodes = len( node.get_typed_nodes(node_type=constants.WORKER_MACHINE)) if not with_ocs: nodes = len( machine.get_labeled_nodes( f'node-role.kubernetes.io/app={constants.APP_NODE_LABEL}' )) log.info(f'Going to use {nodes} nodes for the test !') servers = round(servers / nodes) * nodes if servers < (nodes * 2): servers = nodes * 2 files = round(tested_capacity / servers / total_dirs) total_files = round(files * servers * total_dirs) log.info(f'number of pods is {servers}') log.info(f'Going to create {total_files} files !') log.info(f'number of files in dir is {files}') """ Setting up the parameters for this test """ if servers: sf_data['spec']['workload']['args']['servers'] = servers target_results = target_results + '-' + str(servers) if threads: sf_data['spec']['workload']['args']['threads'] = threads target_results = target_results + '-' + str(threads) if fileio: sf_data['spec']['workload']['args']['fileio'] = fileio target_results = target_results + '-' + str(fileio) if samples: sf_data['spec']['workload']['args']['samples'] = samples target_results = target_results + '-' + str(samples) if width: sf_data['spec']['workload']['args']['width'] = width target_results = target_results + '-' + str(width) if depth: sf_data['spec']['workload']['args']['depth'] = depth target_results = target_results + '-' + str(depth) if files: sf_data['spec']['workload']['args']['files'] = files target_results = target_results + '-' + str(files) if file_size: sf_data['spec']['workload']['args']['file_size'] = file_size target_results = target_results + '-' + str(file_size) if runtime: sf_data['spec']['workload']['args']['runtime'] = runtime target_results = target_results + '-' + str(runtime) if pause: sf_data['spec']['workload']['args']['pause'] = pause target_results = target_results + '-' + str(pause) if len(blocksize) > 0: sf_data['spec']['workload']['args']['bs'] = blocksize target_results = target_results + '-' + '_'.join(blocksize) if with_ocs: if sf_data['spec']['workload']['args']['pin_server']: del sf_data['spec']['workload']['args']['pin_server'] """ Calculating the size of the volume that need to be test, it should be at least twice in the size then the size of the files, and at least 100Gi. since the file_size is in Kb and the vol_size need to be in Gb, more calculation is needed. 
""" vol_size = int((files * total_dirs) * file_size * 1.3) log.info('number of files to create : {}'.format( int(files * (width**depth)))) log.info(f'The size of all files is : {vol_size}MB') vol_size = int(vol_size / 1024) if vol_size < 100: vol_size = 100 sf_data['spec']['workload']['args']['storagesize'] = f'{vol_size}Gi' log.debug(f'output of configuration file is {sf_data}') timeout = 86400 # 3600 (1H) * 24 (1D) = one days sf_obj = OCS(**sf_data) sf_obj.create() # wait for benchmark pods to get created - takes a while for bench_pod in TimeoutSampler(300, 10, get_pod_name_by_pattern, 'vdbench-client', 'my-ripsaw'): try: if bench_pod[0] is not None: vdbench_client_pod = bench_pod[0] break except IndexError: log.info('Benchmark client pod not ready yet') bench_pod = OCP(kind='pod', namespace='my-ripsaw') log.info('Waiting for VDBench benchmark to Run') assert bench_pod.wait_for_resource(condition=constants.STATUS_RUNNING, resource_name=vdbench_client_pod, sleep=30, timeout=600) start_time = time.time() while True: logs = bench_pod.exec_oc_cmd(f'logs {vdbench_client_pod}', out_yaml_format=False) if 'Test Run Finished' in logs: log.info('VdBench Benchmark Completed Successfully') break if timeout < (time.time() - start_time): raise TimeoutError( 'Timed out waiting for benchmark to complete') time.sleep(30) # Getting the results file from the benchmark pod and put it with the # test logs. # TODO: find the place of the actual test log and not in the parent # logs path target_results = '{}/{}.tgz'.format(ocsci_log_path(), target_results) pod_results = constants.VDBENCH_RESULTS_FILE retrive_files_from_pod(vdbench_client_pod, target_results, pod_results)