def test_scale_mcg_obc_creation(self, tmp_path, timeout=60):
    """
    MCG OBC creation using Noobaa storage class
    """
    log.info(
        f"Start creating {self.scale_obc_count} "
        f"OBCs in a batch of {self.num_obc_batch}"
    )
    for i in range(int(self.scale_obc_count / self.num_obc_batch)):
        obc_dict_list = (
            scale_noobaa_lib.construct_obc_creation_yaml_bulk_for_kube_job(
                no_of_obc=self.num_obc_batch,
                sc_name=self.sc_name,
                namespace=self.namespace,
            )
        )
        # Create job profile
        job_file = ObjectConfFile(
            name="job_profile",
            obj_dict_list=obc_dict_list,
            project=self.namespace,
            tmp_path=tmp_path,
        )
        # Create kube_job
        job_file.create(namespace=self.namespace)
        time.sleep(timeout * 5)
        # Check that all the OBCs reached Bound state
        obc_bound_list = (
            scale_noobaa_lib.check_all_obc_reached_bound_state_in_kube_job(
                kube_job_obj=job_file,
                namespace=self.namespace,
                no_of_obc=self.num_obc_batch,
            )
        )
        log.info(f"Number of OBCs in Bound state: {len(obc_bound_list)}")

    # Delete the OBCs on the cluster
    scale_noobaa_lib.cleanup(self.namespace)
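
# Note: `int(self.scale_obc_count / self.num_obc_batch)` above floors the
# batch count, so a remainder of OBCs is silently skipped whenever the total
# is not an exact multiple of the batch size. A minimal, hedged sketch of a
# remainder-safe batching loop follows; `iter_batch_sizes` is illustrative,
# not an existing ocs-ci helper.
import math


def iter_batch_sizes(total, batch_size):
    """Yield per-batch counts covering all `total` items, remainder included."""
    for i in range(math.ceil(total / batch_size)):
        yield min(batch_size, total - i * batch_size)


# e.g. 500 OBCs in batches of 150 -> 150, 150, 150, 50
assert list(iter_batch_sizes(500, 150)) == [150, 150, 150, 50]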
def test_scale_mcg_rgw_obc_creation(self, tmp_path, timeout=60):
    """
    OBC creation for both MCG and RGW storage class
    This test case only runs on vSphere cluster deployment
    """
    log.info(
        f"Start creating {self.scale_obc_count} OBCs in a batch of {self.num_obc_batch}"
    )
    for i in range(int(self.scale_obc_count / self.num_obc_batch)):
        obc_dict_list1 = (
            scale_noobaa_lib.construct_obc_creation_yaml_bulk_for_kube_job(
                no_of_obc=int(self.num_obc_batch / 2),
                sc_name=self.sc_name,
                namespace=self.namespace,
            )
        )
        obc_dict_list2 = (
            scale_noobaa_lib.construct_obc_creation_yaml_bulk_for_kube_job(
                no_of_obc=int(self.num_obc_batch / 2),
                sc_name=self.sc_rgw_name,
                namespace=self.namespace,
            )
        )
        # Create job profiles
        job_file1 = ObjectConfFile(
            name="job_profile1",
            obj_dict_list=obc_dict_list1,
            project=self.namespace,
            tmp_path=tmp_path,
        )
        job_file2 = ObjectConfFile(
            name="job_profile2",
            obj_dict_list=obc_dict_list2,
            project=self.namespace,
            tmp_path=tmp_path,
        )
        # Create kube_jobs
        job_file1.create(namespace=self.namespace)
        time.sleep(timeout * 3)
        job_file2.create(namespace=self.namespace)
        time.sleep(timeout * 3)
        # Check that all the OBCs reached Bound state
        obc_mcg_bound_list = (
            scale_noobaa_lib.check_all_obc_reached_bound_state_in_kube_job(
                kube_job_obj=job_file1,
                namespace=self.namespace,
                no_of_obc=int(self.num_obc_batch / 2),
            )
        )
        obc_rgw_bound_list = (
            scale_noobaa_lib.check_all_obc_reached_bound_state_in_kube_job(
                kube_job_obj=job_file2,
                namespace=self.namespace,
                no_of_obc=int(self.num_obc_batch / 2),
            )
        )
        log.info(
            f"Number of OBCs in Bound state MCG: {len(obc_mcg_bound_list)},"
            f" RGW: {len(obc_rgw_bound_list)}"
        )

    # Delete the OBCs on the cluster
    scale_noobaa_lib.cleanup(self.namespace)
def create_fio_pod(
    project,
    interface,
    pvc_factory,
    storageclass,
    access_mode,
    fio_job_dict,
    fio_configmap_dict,
    tmp_path,
    volume_mode=None,
    pvc_size=10,
):
    """
    Create pods for upgrade testing.

    Args:
        project (obj): Project in which to create resources
        interface (str): CephBlockPool or CephFileSystem
        pvc_factory (function): Function for creating PVCs
        storageclass (obj): Storageclass to use
        access_mode (str): ReadWriteOnce, ReadOnlyMany or ReadWriteMany.
            This decides the access mode to be used for the PVC
        fio_job_dict (dict): fio job dictionary to use
        fio_configmap_dict (dict): fio configmap dictionary to use
        tmp_path (obj): reference to tmp_path fixture object
        volume_mode (str): Volume mode for rbd RWO PVC
        pvc_size (int): Size of PVC in GiB

    Returns:
        Pod: The pod that mounts the created PVC

    """
    log.info(
        f"Creating pod via {interface} using {access_mode}"
        f" access mode, {volume_mode} volume mode and {storageclass.name}"
        f" storageclass"
    )
    pvc = pvc_factory(
        project=project,
        storageclass=storageclass,
        access_mode=access_mode,
        volume_mode=volume_mode,
        size=pvc_size,
        status=None,
    )
    helpers.wait_for_resource_state(pvc, constants.STATUS_BOUND, timeout=600)

    job_volume = fio_job_dict['spec']['template']['spec']['volumes'][0]
    job_volume['persistentVolumeClaim']['claimName'] = pvc.name
    fio_objs = [fio_configmap_dict, fio_job_dict]
    job_file = ObjectConfFile("fio_continuous", fio_objs, project, tmp_path)

    # deploy the Job to the cluster and start it
    job_file.create()

    ocp_pod_obj = ocp.OCP(kind=constants.POD, namespace=project.namespace)
    pods = ocp_pod_obj.get()['items']
    pod_data = None
    for pod in pods:
        pod_volume = pod['spec']['volumes'][0]
        if pod_volume['persistentVolumeClaim']['claimName'] == pvc.name:
            pod_data = pod
            break
    # guard against an unbound pod_data when no pod has mounted the PVC yet
    if pod_data is None:
        raise RuntimeError(f"No pod mounting PVC {pvc.name} was found")
    return Pod(**pod_data)
def test_start_fio_job(
    tmp_path,
    fio_pvc_dict,
    fio_job_dict,
    fio_configmap_dict,
):
    """
    Start a fio job performing IO load, check that it's running,
    and keep it running even after the test finishes.
    """
    # creating the project directly to set its name and prevent its deletion
    project = ocp.OCP(kind="Project", namespace=TEST_NS)
    project.new_project(TEST_NS)

    # size of the volume for fio
    pvc_size = 10  # GiB
    # the test uses a cephfs based volume; this could be either parametrized,
    # or we can try to start more jobs
    storage_class_name = "ocs-storagecluster-cephfs"
    # fio config file: random mixed read and write IO will be running for one
    # day (we expect that the other test will stop it), only 1/2 of the volume
    # is used, we don't need to utilize the PV 100%
    fio_size = int(pvc_size / 2)  # GiB
    fio_conf = textwrap.dedent(
        f"""
        [readwrite]
        readwrite=randrw
        buffered=1
        blocksize=4k
        ioengine=libaio
        directory=/mnt/target
        size={fio_size}G
        time_based
        runtime=24h
        """
    )

    # put the dicts together into a yaml file of the Job
    fio_configmap_dict["data"]["workload.fio"] = fio_conf
    fio_pvc_dict["spec"]["storageClassName"] = storage_class_name
    fio_pvc_dict["spec"]["resources"]["requests"]["storage"] = f"{pvc_size}Gi"
    fio_objs = [fio_pvc_dict, fio_configmap_dict, fio_job_dict]
    job_file = ObjectConfFile("fio_continuous", fio_objs, project, tmp_path)

    # deploy the Job to the cluster and start it
    job_file.create()

    # wait for a pod for the job to be deployed and running
    ocp_pod = ocp.OCP(kind="Pod", namespace=project.namespace)
    try:
        ocp_pod.wait_for_resource(
            resource_count=1,
            condition=constants.STATUS_RUNNING,
            timeout=300,
            sleep=30,
        )
    except TimeoutExpiredError:
        logger.error("pod for fio job wasn't deployed properly")
        raise
def test_scale_obc_creation_noobaa_pod_respin(
    self, tmp_path, pod_name, sc_name, mcg_job_factory
):
    """
    OBC creation using the given storage class, with a restart of the node
    running the given noobaa pod; verifies that all OBCs remain in Bound
    state after the node restart.
    """
    # Create OBCs with FIO running using mcg_job_factory();
    # keep the references so the fixture finalizers don't fire early
    jobs = [mcg_job_factory() for _ in range(self.scale_obc_count_io)]
    log.info(f"Created {len(jobs)} MCG jobs with FIO running")

    log.info(
        f"Start creating {self.scale_obc_count} "
        f"OBCs in a batch of {self.num_obc_batch}"
    )
    for i in range(int(self.scale_obc_count / self.num_obc_batch)):
        obc_dict_list = (
            scale_noobaa_lib.construct_obc_creation_yaml_bulk_for_kube_job(
                no_of_obc=self.num_obc_batch,
                sc_name=sc_name,
                namespace=self.namespace,
            )
        )
        # Create job profile
        job_file = ObjectConfFile(
            name="job_profile",
            obj_dict_list=obc_dict_list,
            project=self.namespace,
            tmp_path=tmp_path,
        )
        # Create kube_job
        job_file.create(namespace=self.namespace)
        # Check that all the OBCs reached Bound state
        obc_bound_list = (
            scale_noobaa_lib.check_all_obc_reached_bound_state_in_kube_job(
                kube_job_obj=job_file,
                namespace=self.namespace,
                no_of_obc=self.num_obc_batch,
            )
        )
        log.info(f"Number of OBCs in Bound state: {len(obc_bound_list)}")

    # Restart the node which the noobaa pods are running on,
    # and validate that the noobaa pods are re-spun and in Running state
    scale_noobaa_lib.noobaa_running_node_restart(pod_name=pod_name)

    # Verify all OBCs are in Bound state after the node restart
    log.info("Verify all OBCs are in Bound state after the node restart")
    obc_status_list = scale_noobaa_lib.check_all_obcs_status(
        namespace=self.namespace
    )
    log.info(
        f"Number of OBCs in Bound state after node restart: "
        f"{len(obc_status_list[0])}"
    )
    assert (
        len(obc_status_list[0]) == self.scale_obc_count
    ), "Not all OBCs are in Bound state"
def create_workload_job(
    job_name,
    bucket,
    project,
    mcg_obj,
    resource_path,
    custom_options=None,
):
    """
    Creates a kubernetes job that should utilize an MCG bucket.

    Args:
        job_name (str): Name of the job
        bucket (obj): MCG bucket with S3 interface
        project (obj): OCP object representing the OCP project which will be
            used for the job
        mcg_obj (obj): instance of MCG class
        resource_path (str): path to the directory where resources should be
            created
        custom_options (dict): Dictionary of lists containing tuples with
            additional configuration for fio in format:
            {'section': [('option', 'value'),...],...}
            e.g. {'global':[('name','bucketname')],'create':[('time_based','1'),('runtime','48h')]}
            These values are added to the config or overwrite already
            existing values

    Returns:
        obj: Job object

    """
    fio_job_dict = get_job_dict(job_name)
    fio_configmap_dict = get_configmap_dict(
        fio_job_dict, mcg_obj, bucket, custom_options
    )
    fio_objs = [fio_configmap_dict, fio_job_dict]

    log.info(f"Creating MCG workload job {job_name}")
    job_file = ObjectConfFile("fio_continuous", fio_objs, project, resource_path)

    # deploy the Job to the cluster and start it
    job_file.create()
    log.info(f"Job {job_name} created")

    # get the job object
    ocp_job_obj = ocp.OCP(kind=constants.JOB, namespace=project.namespace)
    job = OCS(**ocp_job_obj.get(resource_name=job_name))

    return job
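
# For illustration, a hedged example of calling create_workload_job() with
# custom_options in the documented fio format. The bucket, project, mcg_obj
# and tmp_path objects are assumed to come from the usual fixtures; this
# call site is hypothetical.
def example_mcg_workload(bucket, project, mcg_obj, tmp_path):
    """Hypothetical call site showing the custom_options format."""
    custom_options = {
        "global": [("name", bucket.name)],
        "create": [("time_based", "1"), ("runtime", "48h")],
    }
    return create_workload_job(
        job_name="mcg-workload",
        bucket=bucket,
        project=project,
        mcg_obj=mcg_obj,
        resource_path=tmp_path,
        custom_options=custom_options,
    )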
def test_scale_obc_pre_upgrade(tmp_path, timeout=60):
    """
    Create scaled MCG OBCs using the Noobaa storage class before upgrade.
    Save the scaled OBC data in a file for post-upgrade validation.
    """
    obc_scaled_list = []
    log.info(
        f"Start creating {scale_obc_count} OBCs in a batch of {num_obc_batch}"
    )
    for i in range(int(scale_obc_count / num_obc_batch)):
        obc_dict_list = scale_noobaa_lib.construct_obc_creation_yaml_bulk_for_kube_job(
            no_of_obc=num_obc_batch,
            sc_name=sc_name,
            namespace=namespace,
        )
        # Create job profile
        job_file = ObjectConfFile(
            name="job_profile",
            obj_dict_list=obc_dict_list,
            project=namespace,
            tmp_path=tmp_path,
        )
        # Create kube_job
        job_file.create(namespace=namespace)
        time.sleep(timeout * 5)
        # Check that all the OBCs reached Bound state
        obc_bound_list = scale_noobaa_lib.check_all_obc_reached_bound_state_in_kube_job(
            kube_job_obj=job_file,
            namespace=namespace,
            no_of_obc=num_obc_batch,
        )
        obc_scaled_list.extend(obc_bound_list)

    log.info(f"Number of OBCs in scaled list: {len(obc_scaled_list)}")

    # Write the namespace and OBC data to OBC_SCALE_DATA_FILE, which
    # will be used during the post-upgrade validation tests
    with open(obc_scaled_data_file, "a+") as w_obj:
        w_obj.write("# Scale Data File\n")
        w_obj.write(f"NAMESPACE: {namespace}\n")
        w_obj.write(f"OBC_SCALE_LIST: {obc_scaled_list}\n")
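
# The post-upgrade validation presumably reads the scale data file back.
# A minimal parsing sketch, assuming the file keeps the simple `KEY: value`
# layout written above; `parse_scale_data_file` is illustrative, not an
# existing ocs-ci helper.
import ast


def parse_scale_data_file(path):
    """Parse the scale data file written by test_scale_obc_pre_upgrade."""
    data = {}
    with open(path) as f:
        for line in f:
            if line.startswith("#") or ":" not in line:
                continue
            key, _, value = line.partition(":")
            data[key.strip()] = value.strip()
    # OBC_SCALE_LIST was written via str(list), so literal_eval restores it
    data["OBC_SCALE_LIST"] = ast.literal_eval(data["OBC_SCALE_LIST"])
    return data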
def mcg_workload_job(
    fio_job_dict_mcg,
    fio_configmap_dict_mcg,
    fio_conf_mcg,
    fio_project_mcg,
    tmp_path,
    request,
):
    """
    Creates kubernetes job that should utilize MCG during upgrade.

    Returns:
        object: Job object

    """
    fio_configmap_dict_mcg["data"]["workload.fio"] = fio_conf_mcg
    fio_objs = [fio_configmap_dict_mcg, fio_job_dict_mcg]
    job_name = fio_job_dict_mcg['metadata']['name']

    log.info(f"Creating job {job_name}")
    job_file = ObjectConfFile("fio_continuous", fio_objs, fio_project_mcg, tmp_path)

    # deploy the Job to the cluster and start it
    job_file.create()
    log.info(f"Job {job_name} created")

    # get job object
    ocp_job_obj = ocp.OCP(kind=constants.JOB, namespace=fio_project_mcg.namespace)
    job = OCS(**ocp_job_obj.get(resource_name=job_name))

    def teardown():
        """
        Delete mcg job
        """
        job.delete()
        job.ocp.wait_for_delete(job.name)

    request.addfinalizer(teardown)
    return job
def test_log_reader_writer_parallel(project, tmp_path):
    """
    Write and read logfile stored on cephfs volume, from all worker nodes of a
    cluster via k8s Deployment, while fetching content of the stored data via
    oc rsync to check the data locally.

    Reproduces BZ 1989301. Test failure means new blocker high priority bug.
    """
    pvc_dict = get_pvc_dict()
    # we need to mount the volume on every worker node, so RWX/cephfs
    pvc_dict["metadata"]["name"] = "logwriter-cephfs-many"
    pvc_dict["spec"]["accessModes"] = [constants.ACCESS_MODE_RWX]
    if (
        config.ENV_DATA["platform"].lower() not in constants.MANAGED_SERVICE_PLATFORMS
    ) and storagecluster_independent_check():
        sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS
    else:
        sc_name = constants.CEPHFILESYSTEM_SC
    pvc_dict["spec"]["storageClassName"] = sc_name
    # there is no need for a lot of storage capacity for this test
    pvc_dict["spec"]["resources"]["requests"]["storage"] = "1Gi"

    # get deployment dict for the reproducer logwriter workload
    with open(constants.LOGWRITER_CEPHFS_REPRODUCER, "r") as deployment_file:
        deploy_dict = yaml.safe_load(deployment_file.read())
    # if we are running in disconnected environment, we need to mirror the
    # container image first, and then use the mirror instead of the original
    if config.DEPLOYMENT.get("disconnected"):
        update_container_with_mirrored_image(deploy_dict["spec"]["template"])
    # we need to match deployment replicas with number of worker nodes
    deploy_dict["spec"]["replicas"] = len(get_worker_nodes())
    # drop topology spread constraints related to zones
    topology.drop_topology_constraint(
        deploy_dict["spec"]["template"]["spec"], topology.ZONE_LABEL
    )
    # and link the deployment with the pvc
    try:
        link_spec_volume(
            deploy_dict["spec"]["template"]["spec"],
            "logwriter-cephfs-volume",
            pvc_dict["metadata"]["name"],
        )
    except Exception as ex:
        error_msg = "LOGWRITER_CEPHFS_REPRODUCER no longer matches code of this test"
        raise Exception(error_msg) from ex

    # prepare k8s yaml file for deployment
    workload_file = ObjectConfFile(
        "log_reader_writer_parallel", [pvc_dict, deploy_dict], project, tmp_path
    )
    # deploy the workload, starting the log reader/writer pods
    logger.info(
        "starting log reader/writer workload via Deployment, one pod per worker"
    )
    workload_file.create()

    logger.info("waiting for all pods of the workload Deployment to run")
    ocp_pod = ocp.OCP(kind="Pod", namespace=project.namespace)
    try:
        ocp_pod.wait_for_resource(
            resource_count=deploy_dict["spec"]["replicas"],
            condition=constants.STATUS_RUNNING,
            error_condition=constants.STATUS_ERROR,
            timeout=300,
            sleep=30,
        )
    except Exception as ex:
        # this is not a problem with the feature under test, but with infra,
        # cluster configuration or an unrelated bug which must have happened
        # before this test case
        error_msg = "unexpected problem with start of the workload, cluster is either misconfigured or broken"
        logger.exception(error_msg)
        logger.debug(workload_file.describe())
        raise exceptions.UnexpectedBehaviour(error_msg) from ex

    # while the workload is running, we will try to fetch and validate data
    # from the cephfs volume of the workload 120 times (this number of retries
    # is a bit larger than the usual number required to reproduce the bug from
    # BZ 1989301, but we need to be sure here)
    number_of_fetches = 120
    # if a given fetch fails, we will ignore the failure unless the number of
    # failures is too high (this has no direct impact on the feature under
    # test, we should be able to detect the bug even with 10% of rsync
    # failures, since data corruption doesn't simply go away ...)
    number_of_failures = 0
    allowed_failures = 12

    is_local_data_ok = True
    local_dir = tmp_path / "logwriter"
    local_dir.mkdir()
    workload_pods = ocp_pod.get()
    workload_pod_name = workload_pods["items"][0]["metadata"]["name"]

    logger.info(
        "while the workload is running, we will fetch and check data from the cephfs volume %d times",
        number_of_fetches,
    )
    for _ in range(number_of_fetches):
        # fetch data from the cephfs volume into the local dir
        oc_cmd = [
            "oc",
            "rsync",
            "--loglevel=4",
            "-n",
            project.namespace,
            f"pod/{workload_pod_name}:/mnt/target",
            local_dir,
        ]
        try:
            run_cmd(cmd=oc_cmd, timeout=300)
        except Exception as ex:
            number_of_failures += 1
            # in case this fails, we are going to fetch extra evidence; that
            # said, such a failure is most likely related to OCP or infrastructure
            error_msg = "oc rsync failed: something is wrong with the cluster"
            logger.exception(error_msg)
            logger.debug(workload_file.describe())
            oc_rpm_debug = [
                "oc",
                "rsh",
                "-n",
                project.namespace,
                f"pod/{workload_pod_name}",
                "bash",
                "-c",
                ";".join(
                    [
                        "rpm -qa",
                        "rpm -qaV",
                        "type -a tar",
                        "tar --version",
                        "type -a rsync",
                        "rsync --version",
                    ]
                ),
            ]
            try:
                run_cmd(cmd=oc_rpm_debug, timeout=600)
            except Exception:
                # if the fetch of additional evidence fails, log and ignore the
                # exception (so that we can retry if needed)
                logger.exception("failed to fetch additional evidence")
            # in case the rsync run failed because of a container restart,
            # we assume the pod name hasn't changed, and just wait for the
            # container to be running again - unless the number of rsync
            # failures is too high
            if number_of_failures > allowed_failures:
                logger.error("number of ignored rsync failures is too high")
            else:
                ocp_pod.wait_for_resource(
                    resource_count=deploy_dict["spec"]["replicas"],
                    condition=constants.STATUS_RUNNING,
                    error_condition=constants.STATUS_ERROR,
                    timeout=300,
                    sleep=30,
                )
                continue
            logger.debug(
                "before this failure, we ignored %d previous failures",
                number_of_failures,
            )
            raise exceptions.UnexpectedBehaviour(error_msg) from ex
        # look for null bytes in the just fetched local files in the target
        # dir, and if these binary bytes are found, the test failed (the bug
        # was reproduced)
        target_dir = os.path.join(local_dir, "target")
        for file_name in os.listdir(target_dir):
            with open(os.path.join(target_dir, file_name), "r") as fo:
                data = fo.read()
            if "\0" in data:
                is_local_data_ok = False
                logger.error(
                    "file %s is corrupted: null byte found in a text file",
                    file_name,
                )
        assert is_local_data_ok, "data corruption detected"
        time.sleep(2)
    logger.debug("number of ignored rsync failures: %d", number_of_failures)

    # if no obvious problem was detected, run the logreader job to validate
    # checksums in the log files (so that we are 100% sure that nothing went
    # wrong with the IO or the data)
    with open(constants.LOGWRITER_CEPHFS_READER, "r") as job_file:
        job_dict = yaml.safe_load(job_file.read())
    # mirroring for disconnected environment, if necessary
    if config.DEPLOYMENT.get("disconnected"):
        update_container_with_mirrored_image(job_dict["spec"]["template"])
    # drop topology spread constraints related to zones
    topology.drop_topology_constraint(
        job_dict["spec"]["template"]["spec"], topology.ZONE_LABEL
    )
    # we need to match the number of jobs with the number used in the workload
    job_dict["spec"]["completions"] = deploy_dict["spec"]["replicas"]
    job_dict["spec"]["parallelism"] = deploy_dict["spec"]["replicas"]
    # and refer to the correct pvc name
    try:
        link_spec_volume(
            job_dict["spec"]["template"]["spec"],
            "logwriter-cephfs-volume",
            pvc_dict["metadata"]["name"],
        )
    except Exception as ex:
        error_msg = "LOGWRITER_CEPHFS_READER no longer matches code of this test"
        raise Exception(error_msg) from ex

    # prepare k8s yaml file for the job
    job_file = ObjectConfFile("log_reader", [job_dict], project, tmp_path)
    # deploy the job, starting the log reader pods
    logger.info(
        "starting log reader data validation job to fully check the log data",
    )
    job_file.create()
    # wait for the logreader job to complete (this should be rather quick)
    try:
        job.wait_for_job_completion(
            job_name=job_dict["metadata"]["name"],
            namespace=project.namespace,
            timeout=300,
            sleep_time=30,
        )
    except exceptions.TimeoutExpiredError:
        error_msg = (
            "verification failed to complete in time: data loss or broken cluster?"
        )
        logger.exception(error_msg)
    # and then check that the job completed with success
    logger.info("checking the result of data validation job")
    logger.debug(job_file.describe())
    ocp_job = ocp.OCP(
        kind="Job",
        namespace=project.namespace,
        resource_name=job_dict["metadata"]["name"],
    )
    job_status = ocp_job.get()["status"]
    logger.info("last status of data verification job: %s", job_status)
    if (
        "failed" in job_status
        or job_status["succeeded"] != deploy_dict["spec"]["replicas"]
    ):
        error_msg = "possible data corruption: data verification job failed!"
        logger.error(error_msg)
        job.log_output_of_job_pods(
            job_name=job_dict["metadata"]["name"], namespace=project.namespace
        )
        raise Exception(error_msg)
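
# The corruption check above reads the fetched files in text mode; if a file
# contained bytes that are invalid in the default encoding, read() itself
# would raise before the null-byte check runs. A hedged, binary-safe
# alternative sketch (illustrative only; the text-mode behavior above is
# kept as-is):
import os


def find_corrupted_files(target_dir):
    """Return names of files in target_dir that contain a null byte."""
    corrupted = []
    for file_name in os.listdir(target_dir):
        with open(os.path.join(target_dir, file_name), "rb") as fo:
            if b"\0" in fo.read():
                corrupted.append(file_name)
    return corrupted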
def test_bulk_clone_performance(self, namespace, tmp_path, pod_factory):
    """
    Creates number of PVCs in a bulk using kube job
    Write 60% of PVC capacity to each one of the created PVCs
    Creates 1 clone per each PVC altogether in a bulk
    Measuring time for bulk of clones creation
    """
    pvc_count = 50
    log.info(f"Start creating {self.interface} {pvc_count} PVCs")
    if self.interface == constants.CEPHBLOCKPOOL:
        sc_name = constants.DEFAULT_STORAGECLASS_RBD
        clone_yaml = constants.CSI_RBD_PVC_CLONE_YAML
    elif self.interface == constants.CEPHFILESYSTEM:
        sc_name = constants.DEFAULT_STORAGECLASS_CEPHFS
        clone_yaml = constants.CSI_CEPHFS_PVC_CLONE_YAML

    pvc_dict_list = scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job(
        no_of_pvc=pvc_count,
        access_mode=constants.ACCESS_MODE_RWO,
        sc_name=sc_name,
        pvc_size="5Gi",
    )
    job_pvc_file = ObjectConfFile(
        name="job_profile_pvc",
        obj_dict_list=pvc_dict_list,
        project=self.namespace,
        tmp_path=tmp_path,
    )
    # Create kube_job
    job_pvc_file.create(namespace=self.namespace)

    # Check that all the PVCs reached Bound state
    pvc_bound_list = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
        kube_job_obj=job_pvc_file,
        namespace=self.namespace,
        no_of_pvc=pvc_count,
    )
    log.info(f"Number of PVCs in Bound state: {len(pvc_bound_list)}")

    total_files_size = self.run_fio_on_pvcs(pvc_dict_list, pod_factory)

    clone_dict_list = scale_lib.construct_pvc_clone_yaml_bulk_for_kube_job(
        pvc_dict_list, clone_yaml, sc_name
    )
    log.info("Created clone dict list")

    job_clone_file = ObjectConfFile(
        name="job_profile_clone",
        obj_dict_list=clone_dict_list,
        project=self.namespace,
        tmp_path=tmp_path,
    )
    # Create kube_job that creates the clones
    job_clone_file.create(namespace=self.namespace)
    log.info("Going to check bound status for clones")

    # Check that all the clones reached Bound state
    clone_bound_list = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
        kube_job_obj=job_clone_file,
        namespace=self.namespace,
        no_of_pvc=pvc_count,
        timeout=200,
    )
    log.info(f"Number of clones in Bound state: {len(clone_bound_list)}")

    clone_objs = []
    all_pvc_objs = pvc.get_all_pvc_objs(namespace=self.namespace)
    for clone_dict in clone_dict_list:
        name = clone_dict["metadata"]["name"]
        size = clone_dict["spec"]["resources"]["requests"]["storage"]
        log.info(f"Clone {name} of size {size} created")
        for pvc_obj in all_pvc_objs:
            if pvc_obj.name == name:
                clone_objs.append(pvc_obj)

    assert len(clone_bound_list) == len(
        clone_objs
    ), "Not all clones reached BOUND state, cannot measure time"

    start_time = helpers.get_provision_time(self.interface, clone_objs, status="start")
    end_time = helpers.get_provision_time(self.interface, clone_objs, status="end")
    total_time = (end_time - start_time).total_seconds()
    speed = round(total_files_size / total_time, 2)
    log.info(
        f"Total creation time = {total_time} secs, data size = {total_files_size} MB, "
        f"speed = {speed} MB/sec "
        f"for {self.interface} clone in bulk of {pvc_count} clones."
    )
class TestBulkPodAttachPerformance(PASTest):
    """
    Test to measure performance of attaching pods to pvc in a bulk
    """

    pvc_size = "1Gi"

    def setup(self):
        """
        Setting up test parameters
        """
        log.info("Starting the test setup")
        super(TestBulkPodAttachPerformance, self).setup()
        self.benchmark_name = "bulk_pod_attach_time"
        self.create_test_project()

        # Pull the pod image to the worker nodes, so that the image pull time
        # is not counted in the total attach time
        helpers.pull_images(constants.PERF_IMAGE)

        # Initializing some parameters
        self.pvc_objs = list()
        self.pods_obj = None

    def teardown(self):
        """
        Cleanup the test environment
        """
        log.info("Starting the test environment cleanup")

        # Deleting all POD(s)
        log.info("Try to delete all created PODs")
        try:
            self.pods_obj.delete(namespace=self.namespace)
        except Exception as ex:
            log.warning(f"Failed to delete POD(s) [{ex}]")

        log.info("Wait for all PODs to be deleted")
        # wait until no pods are left in Running state
        performance_lib.wait_for_resource_bulk_status(
            "pod", 0, self.namespace, constants.STATUS_RUNNING,
            len(self.pvc_objs) * 2, 10
        )
        log.info("All POD(s) were deleted")

        # Deleting PVC(s) for deletion time measurement
        log.info("Try to delete all created PVCs")
        for pvc_obj in self.pvc_objs:
            pvc_obj.delete()
        log.info("Wait for all PVC(s) to be deleted")
        performance_lib.wait_for_resource_bulk_status(
            "pvc", 0, self.namespace, constants.STATUS_BOUND,
            len(self.pvc_objs) * 2, 10
        )
        log.info("All PVC(s) were deleted")

        log.info("Wait for all PVC(s) backed PV(s) to be deleted")
        # Timeout for each PV to be deleted is 20 sec.
        # wait until no PVs are left in Bound state
        performance_lib.wait_for_resource_bulk_status(
            "pv", 0, self.namespace, constants.STATUS_BOUND,
            len(self.pvc_objs) * 20, 10
        )
        log.info("All backed PV(s) were deleted")

        # Delete the test project (namespace)
        self.delete_test_project()

        super(TestBulkPodAttachPerformance, self).teardown()

    @pytest.mark.parametrize(
        argnames=["interface_type", "bulk_size"],
        argvalues=[
            pytest.param(*[constants.CEPHBLOCKPOOL, 120]),
            pytest.param(*[constants.CEPHBLOCKPOOL, 240]),
            pytest.param(*[constants.CEPHFILESYSTEM, 120]),
            pytest.param(*[constants.CEPHFILESYSTEM, 240]),
        ],
    )
    @polarion_id("OCS-1620")
    def test_bulk_pod_attach_performance(self, interface_type, bulk_size):
        """
        Measures pods attachment time in bulk_size bulk

        Args:
            interface_type (str): The interface type to be tested -
                CephBlockPool / CephFileSystem.
            bulk_size (int): Size of the bulk to be tested

        """
        self.interface = interface_type

        if self.dev_mode:
            bulk_size = 3

        # Initialize some variables
        timeout = bulk_size * 5
        pvc_names_list = list()
        pod_data_list = list()

        # Getting the test start time
        test_start_time = self.get_time()
        csi_start_time = self.get_time("csi")

        log.info(f"Start creating bulk of new {bulk_size} PVCs")
        self.pvc_objs, _ = helpers.create_multiple_pvcs(
            sc_name=Interfaces_info[self.interface]["sc"],
            namespace=self.namespace,
            number_of_pvc=bulk_size,
            size=self.pvc_size,
            burst=True,
            do_reload=False,
        )
        log.info("Wait for all of the PVCs to be in Bound state")
        performance_lib.wait_for_resource_bulk_status(
            "pvc", bulk_size, self.namespace, constants.STATUS_BOUND, timeout, 10
        )
        # In case of a creation failure, wait_for_resource_bulk_status will
        # raise an exception, so at this point the creation succeeded
        log.info("All PVCs were created and are in Bound state.")

        # Reload all PVC(s) information
        for pvc_obj in self.pvc_objs:
            pvc_obj.reload()
            pvc_names_list.append(pvc_obj.name)
        log.debug(f"The PVC names are: {pvc_names_list}")

        # Create kube_job for pod creation
        pod_data_list.extend(
            scale_lib.attach_multiple_pvc_to_pod_dict(
                pvc_list=pvc_names_list,
                namespace=self.namespace,
                pvcs_per_pod=1,
            )
        )
        self.pods_obj = ObjectConfFile(
            name="pod_kube_obj",
            obj_dict_list=pod_data_list,
            project=self.namespace,
            tmp_path=pathlib.Path(ocsci_log_path()),
        )
        log.debug(f"PODs data list is : {json.dumps(pod_data_list, indent=3)}")

        log.info(f"{self.interface} : Before pod attach")
        bulk_start_time = time.time()
        self.pods_obj.create(namespace=self.namespace)
        # Check that all the PODs reached Running state
        log.info("Checking that pods are running")
        performance_lib.wait_for_resource_bulk_status(
            "pod", bulk_size, self.namespace, constants.STATUS_RUNNING, timeout, 2
        )
        log.info("All POD(s) are in Running State.")
        bulk_end_time = time.time()
        bulk_total_time = bulk_end_time - bulk_start_time
        log.info(
            f"Bulk attach time of {bulk_size} pods is {bulk_total_time} seconds"
        )

        csi_bulk_total_time = performance_lib.pod_bulk_attach_csi_time(
            self.interface, self.pvc_objs, csi_start_time, self.namespace
        )

        # Collecting environment information
        self.get_env_info()

        # Initialize the results doc file.
        full_results = self.init_full_results(
            ResultsAnalyse(
                self.uuid, self.crd_data, self.full_log_path, "pod_bulk_attachtime"
            )
        )
        full_results.add_key("storageclass", Interfaces_info[self.interface]["name"])
        full_results.add_key("pod_bulk_attach_time", bulk_total_time)
        full_results.add_key("pod_csi_bulk_attach_time", csi_bulk_total_time)
        full_results.add_key("pvc_size", self.pvc_size)
        full_results.add_key("bulk_size", bulk_size)

        # Getting the test end time
        test_end_time = self.get_time()

        # Add the test time to the ES report
        full_results.add_key(
            "test_time", {"start": test_start_time, "end": test_end_time}
        )

        # Write the test results into the ES server
        self.results_path = helpers.get_full_test_logs_path(cname=self)
        if full_results.es_write():
            res_link = full_results.results_link()
            # write the ES link to the test results in the test log.
            log.info(f"The result can be found at : {res_link}")
            # Create text file with results of all subtests (4 - according to the parameters)
            self.write_result_to_file(res_link)

    def test_bulk_pod_attach_results(self):
        """
        This is not a test - it only checks that the previous test ran and
        finished as expected, and reports the full results (links in the ES)
        of the previous tests (4)
        """
        self.add_test_to_results_check(
            test="test_bulk_pod_attach_performance",
            test_count=4,
            test_name="Bulk Pod Attach Time",
        )
        self.check_results_and_push_to_dashboard()

    def init_full_results(self, full_results):
        """
        Initialize the full results object which will be sent to the ES server

        Args:
            full_results (obj): an empty ResultsAnalyse object

        Returns:
            ResultsAnalyse (obj): the input object filled with data

        """
        for key in self.environment:
            full_results.add_key(key, self.environment[key])
        return full_results
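
# performance_lib.wait_for_resource_bulk_status is used throughout as a bulk
# readiness gate. A simplified sketch of what such a poller typically does;
# this is an assumption about its semantics, not the real implementation.
import time


def wait_for_bulk_status(get_count, expected_count, timeout, sleep_time):
    """Poll get_count() until it equals expected_count, or raise on timeout.

    get_count is a hypothetical callable returning how many resources are
    currently in the desired state.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        if get_count() == expected_count:
            return True
        time.sleep(sleep_time)
    raise TimeoutError(
        f"expected {expected_count} resources in the desired state, got {get_count()}"
    )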
def fetch_and_validate_data(self):
    """
    While the workload is running, try to validate the data
    from the cephfs volume of the workload.

    Raises:
        NotFoundError: When the given volume is not found in the given spec
        Exception: When the data verification job failed

    """
    # if no obvious problem was detected, run the logreader job to validate
    # checksums in the log files (so that we are 100% sure that nothing went
    # wrong with the IO or the data)
    with open(constants.LOGWRITER_CEPHFS_READER, "r") as job_file:
        job_dict = yaml.safe_load(job_file.read())

    # if we are running in a disconnected environment, we need to mirror the
    # container image first, and then use the mirror instead of the original
    if config.DEPLOYMENT.get("disconnected"):
        update_container_with_mirrored_image(job_dict["spec"]["template"])

    # drop topology spread constraints related to zones
    topology.drop_topology_constraint(
        job_dict["spec"]["template"]["spec"], topology.ZONE_LABEL
    )

    # we need to match the number of jobs with the number used in the workload
    job_dict["spec"]["completions"] = self.deploy_dict["spec"]["replicas"]
    job_dict["spec"]["parallelism"] = self.deploy_dict["spec"]["replicas"]

    # and refer to the correct pvc name
    try:
        link_spec_volume(
            job_dict["spec"]["template"]["spec"],
            "logwriter-cephfs-volume",
            self.pvc_dict["metadata"]["name"],
        )
    except (exceptions.NotFoundError, KeyError) as ex:
        logger.warning(
            "Failed to link the job with the pvc. We may need to check if "
            "LOGWRITER_CEPHFS_READER still matches the code of this test"
        )
        raise ex

    # prepare k8s yaml file for the job
    job_file = ObjectConfFile("log_reader", [job_dict], self.project, self.tmp_path)

    # deploy the job, starting the log reader pods
    logger.info(
        "starting log reader data validation job to fully check the log data",
    )
    job_file.create()

    # wait for the logreader job to complete (this should be rather quick)
    try:
        job.wait_for_job_completion(
            job_name=job_dict["metadata"]["name"],
            namespace=self.project.namespace,
            timeout=300,
            sleep_time=30,
        )
    except exceptions.TimeoutExpiredError:
        error_msg = "verification failed to complete in time: probably data loss or broken cluster"
        raise Exception(error_msg)

    # and then check that the job completed with success
    logger.info("checking the result of data validation job")
    logger.debug(job_file.describe())
    ocp_job = ocp.OCP(
        kind="Job",
        namespace=self.project.namespace,
        resource_name=job_dict["metadata"]["name"],
    )
    job_status = ocp_job.get()["status"]
    logger.info("last status of data verification job: %s", job_status)
    if (
        "failed" in job_status
        or job_status["succeeded"] != self.deploy_dict["spec"]["replicas"]
    ):
        error_msg = "possible data corruption: data verification job failed!"
        logger.error(error_msg)
        job.log_output_of_job_pods(
            job_name=job_dict["metadata"]["name"], namespace=self.project.namespace
        )
        raise Exception(error_msg)
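
# The success check above relies on the Job's `status` block. For reference,
# a minimal sketch of evaluating such a status dict; the shape is assumed
# from the Kubernetes Job API and mirrors the condition used above.
def job_succeeded(job_status, expected_completions):
    """Return True if a k8s Job status dict reports full success."""
    if "failed" in job_status:
        return False
    return job_status.get("succeeded", 0) == expected_completions


assert job_succeeded({"succeeded": 3}, 3)
assert not job_succeeded({"failed": 1, "succeeded": 2}, 3)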
def test_all_4_type_pvc_creation_deletion_scale(self, namespace, tmp_path):
    """
    Measure PVC creation time while scaling PVCs of all 4 types
    (RBD and CephFS, each with RWO and RWX access modes). A total of
    scale_lib.get_max_pvc_count() PVCs is created (500 per worker node,
    e.g. 375 of each type on a 3-worker cluster), split evenly across
    the 4 types. Measure PVC deletion time in the scale env.
    """
    scale_pvc_count = scale_lib.get_max_pvc_count()
    log.info(f"Start creating {scale_pvc_count} PVCs of all 4 types")
    cephfs_sc_obj = constants.DEFAULT_STORAGECLASS_CEPHFS
    rbd_sc_obj = constants.DEFAULT_STORAGECLASS_RBD

    # Get pvc_dict_list, append all the pvc.yaml dicts to pvc_dict_list
    rbd_pvc_dict_list, cephfs_pvc_dict_list = ([] for i in range(2))
    for mode in [constants.ACCESS_MODE_RWO, constants.ACCESS_MODE_RWX]:
        rbd_pvc_dict_list.extend(
            scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job(
                no_of_pvc=int(scale_pvc_count / 4),
                access_mode=mode,
                sc_name=rbd_sc_obj,
            )
        )
        cephfs_pvc_dict_list.extend(
            scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job(
                no_of_pvc=int(scale_pvc_count / 4),
                access_mode=mode,
                sc_name=cephfs_sc_obj,
            )
        )

    # There are 2 kube_jobs, one for the CephFS PVCs and one for the RBD PVCs
    job_file_rbd = ObjectConfFile(
        name="rbd_pvc_job",
        obj_dict_list=rbd_pvc_dict_list,
        project=self.namespace,
        tmp_path=tmp_path,
    )
    job_file_cephfs = ObjectConfFile(
        name="cephfs_pvc_job",
        obj_dict_list=cephfs_pvc_dict_list,
        project=self.namespace,
        tmp_path=tmp_path,
    )

    # Create kube_jobs
    job_file_rbd.create(namespace=self.namespace)
    job_file_cephfs.create(namespace=self.namespace)

    # Check that all the PVCs reached Bound state
    rbd_pvc_name = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
        kube_job_obj=job_file_rbd,
        namespace=self.namespace,
        no_of_pvc=int(scale_pvc_count / 2),
    )
    fs_pvc_name = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
        kube_job_obj=job_file_cephfs,
        namespace=self.namespace,
        no_of_pvc=int(scale_pvc_count / 2),
    )

    # Get the pvc objs from the namespace, which are used to identify the backend pv
    rbd_pvc_obj, cephfs_pvc_obj = ([] for i in range(2))
    pvc_objs = pvc.get_all_pvc_objs(namespace=self.namespace)
    for pvc_obj in pvc_objs:
        if pvc_obj.backed_sc == constants.DEFAULT_STORAGECLASS_RBD:
            rbd_pvc_obj.append(pvc_obj)
        elif pvc_obj.backed_sc == constants.DEFAULT_STORAGECLASS_CEPHFS:
            cephfs_pvc_obj.append(pvc_obj)

    # Get PVC creation time
    fs_pvc_create_time = helpers.measure_pvc_creation_time_bulk(
        interface=constants.CEPHFS_INTERFACE, pvc_name_list=fs_pvc_name
    )
    rbd_pvc_create_time = helpers.measure_pvc_creation_time_bulk(
        interface=constants.CEPHBLOCKPOOL, pvc_name_list=rbd_pvc_name
    )
    fs_pvc_create_time.update(rbd_pvc_create_time)

    # TODO: Update below code with google API, to record value in spreadsheet
    # TODO: For now observing Google API limit to write more than 100 writes
    log_path = f"{ocsci_log_path()}/All-type-PVC"
    with open(f"{log_path}-creation-time.csv", "w") as fd:
        csv_obj = csv.writer(fd)
        for k, v in fs_pvc_create_time.items():
            csv_obj.writerow([k, v])
    log.info(f"Creation data is present in the {log_path}-creation-time.csv file")

    # Get pv_name; the pv_name is required to fetch deletion time data from the logs
    rbd_pv_list, fs_pv_list = ([] for i in range(2))
    get_rbd_kube_job = job_file_rbd.get(namespace=self.namespace)
    for i in range(int(scale_pvc_count / 2)):
        rbd_pv_list.append(get_rbd_kube_job["items"][i]["spec"]["volumeName"])
    get_fs_kube_job = job_file_cephfs.get(namespace=self.namespace)
    for i in range(int(scale_pvc_count / 2)):
        fs_pv_list.append(get_fs_kube_job["items"][i]["spec"]["volumeName"])

    # Delete the kube_jobs
    job_file_rbd.delete(namespace=self.namespace)
    job_file_cephfs.delete(namespace=self.namespace)

    # Add a 1 minute wait for the PVC deletion logs to be updated.
    # Failures were observed when the logs are checked for pvc delete time
    # immediately: https://github.com/red-hat-storage/ocs-ci/issues/3371
    time.sleep(60)

    # Get PV deletion time
    fs_pvc_deletion_time = helpers.measure_pv_deletion_time_bulk(
        interface=constants.CEPHFS_INTERFACE, pv_name_list=fs_pv_list
    )
    rbd_pvc_deletion_time = helpers.measure_pv_deletion_time_bulk(
        interface=constants.CEPHBLOCKPOOL, pv_name_list=rbd_pv_list
    )
    fs_pvc_deletion_time.update(rbd_pvc_deletion_time)

    # TODO: Update below code with google API, to record value in spreadsheet
    # TODO: For now observing Google API limit to write more than 100 writes
    with open(f"{log_path}-deletion-time.csv", "w") as fd:
        csv_obj = csv.writer(fd)
        for k, v in fs_pvc_deletion_time.items():
            csv_obj.writerow([k, v])
    log.info(f"Deletion data is present in the {log_path}-deletion-time.csv file")

    end_time = default_timer()
    log.info(f"Elapsed time -- {end_time - self.start_time} seconds")
def test_scale_obc_create_delete_time(self, tmp_path):
    """
    MCG OBC creation and deletion using Noobaa MCG storage class
    """
    log.info(
        f"Start creating {self.scale_obc_count} "
        f"OBCs in a batch of {self.num_obc_batch}"
    )
    obc_create = dict()
    obc_delete = dict()
    for i in range(int(self.scale_obc_count / self.num_obc_batch)):
        obc_dict_list = (
            scale_noobaa_lib.construct_obc_creation_yaml_bulk_for_kube_job(
                no_of_obc=self.num_obc_batch,
                sc_name=constants.NOOBAA_SC,
                namespace=self.namespace,
            )
        )
        # Create job profile
        job_file = ObjectConfFile(
            name="job_profile",
            obj_dict_list=obc_dict_list,
            project=self.namespace,
            tmp_path=tmp_path,
        )
        # Create kube_job
        job_file.create(namespace=self.namespace)
        # Check that all the OBCs reached Bound state
        obc_bound_list = (
            scale_noobaa_lib.check_all_obc_reached_bound_state_in_kube_job(
                kube_job_obj=job_file,
                namespace=self.namespace,
                no_of_obc=self.num_obc_batch,
            )
        )
        log.info(f"Number of OBCs in Bound state: {len(obc_bound_list)}")
        # Measure the OBC creation time
        obc_creation_time = scale_noobaa_lib.measure_obc_creation_time(
            obc_name_list=obc_bound_list
        )
        obc_create.update(obc_creation_time)

    # Delete all OBCs in batches (slice size matches the batch size used
    # for creation) and measure the deletion time
    obc_name_list = list(oc_get_all_obc_names())
    new_list = [
        obc_name_list[i : i + self.num_obc_batch]
        for i in range(0, len(obc_name_list), self.num_obc_batch)
    ]
    for i in range(len(new_list)):
        scale_noobaa_lib.cleanup(self.namespace, obc_count=new_list[i])
        obc_deletion_time = scale_noobaa_lib.measure_obc_deletion_time(
            obc_name_list=new_list[i]
        )
        obc_delete.update(obc_deletion_time)

    # Store the OBC creation times in a csv file
    log_path = f"{ocsci_log_path()}/obc-creation"
    with open(f"{log_path}-{constants.NOOBAA_SC}.csv", "w") as fd:
        csv_obj = csv.writer(fd)
        for k, v in obc_create.items():
            csv_obj.writerow([k, v])
    log.info(
        f"OBC creation data is present in {log_path}-{constants.NOOBAA_SC}.csv"
    )

    # Store the OBC deletion times in a csv file
    log_path = f"{ocsci_log_path()}/obc-deletion"
    with open(f"{log_path}-{constants.NOOBAA_SC}.csv", "w") as fd:
        csv_obj = csv.writer(fd)
        for k, v in obc_delete.items():
            csv_obj.writerow([k, v])
    log.info(
        f"OBC deletion data is present in {log_path}-{constants.NOOBAA_SC}.csv"
    )
def test_bulk_clone_performance(self, tmp_path, interface_iterate):
    """
    Creates number of PVCs in a bulk using kube job
    Write 60% of PVC capacity to each one of the created PVCs
    Creates 1 clone per each PVC altogether in a bulk
    Measuring total and csi creation times for bulk of clones
    """
    self.interface = interface_iterate
    job_pod_file, job_pvc_file, job_clone_file = [None, None, None]
    log.info(f"Start creating {self.interface} {self.pvc_count} PVCs")

    try:
        pvc_dict_list = scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job(
            no_of_pvc=self.pvc_count,
            access_mode=Interfaces_info[self.interface]["accessmode"],
            sc_name=Interfaces_info[self.interface]["sc_name"],
            pvc_size=self.vol_size,
        )
        job_pvc_file = ObjectConfFile(
            name="job_profile_pvc",
            obj_dict_list=pvc_dict_list,
            project=self.namespace,
            tmp_path=tmp_path,
        )
        # Create kube_job
        job_pvc_file.create(namespace=self.namespace)

        # Check that all the PVCs reached Bound state
        performance_lib.wait_for_resource_bulk_status(
            resource="pvc",
            resource_count=self.pvc_count,
            namespace=self.namespace,
            status=constants.STATUS_BOUND,
            timeout=120,
            sleep_time=5,
        )
        log.info(
            f"All the PVCs ({self.pvc_count}) were created and are in Bound state"
        )

        # Getting the list of the PVC names
        pvc_bound_list = [
            p.name for p in pvc.get_all_pvc_objs(namespace=self.namespace)
        ]

        # Kube_job to create the pods
        log.info("Attaching PODs to the PVCs and filling them with data (60%)")
        pod_dict_list = self.attach_pvcs_to_pod_dict(pvc_bound_list)
        job_pod_file = ObjectConfFile(
            name="job_profile_pod",
            obj_dict_list=pod_dict_list,
            project=self.namespace,
            tmp_path=tmp_path,
        )
        job_pod_file.create(namespace=self.namespace)

        # Check that all the PODs are in Completed state
        performance_lib.wait_for_resource_bulk_status(
            resource="pod",
            resource_count=self.pvc_count,
            namespace=self.namespace,
            status=constants.STATUS_COMPLETED,
            timeout=1200,
            sleep_time=30,
        )
        log.info("All the PODs completed writing data to the PVCs")

        clone_dict_list = scale_lib.construct_pvc_clone_yaml_bulk_for_kube_job(
            pvc_dict_list,
            Interfaces_info[self.interface]["clone_yaml"],
            Interfaces_info[self.interface]["sc_name"],
        )
        log.info("Created clone dict list")

        csi_bulk_start_time = self.get_time(time_format="csi")

        job_clone_file = ObjectConfFile(
            name="job_profile_clone",
            obj_dict_list=clone_dict_list,
            project=self.namespace,
            tmp_path=tmp_path,
        )
        # Create kube_job that creates the clones
        job_clone_file.create(namespace=self.namespace)
        log.info("Going to check bound status for the clones")

        # Check that all the clones reached Bound state
        try:
            performance_lib.wait_for_resource_bulk_status(
                resource="pvc",
                resource_count=self.pvc_count * 2,
                namespace=self.namespace,
                status=constants.STATUS_BOUND,
                timeout=1200,
                sleep_time=30,
            )
        except Exception as ex:
            log.error("Failed to create clones for the PVCs")
            raise ex
        log.info(
            f"All the clones ({self.pvc_count}) were created and are in Bound state"
        )

        all_pvc_objs = pvc.get_all_pvc_objs(namespace=self.namespace)
        clone_objs = [cl for cl in all_pvc_objs if re.match("clone", cl.name)]
        for clone_dict in clone_dict_list:
            name = clone_dict["metadata"]["name"]
            size = clone_dict["spec"]["resources"]["requests"]["storage"]
            log.info(f"Clone {name} of size {size} created")

        start_time = get_provision_time(self.interface, clone_objs, status="start")
        end_time = get_provision_time(self.interface, clone_objs, status="end")
        total_time = (end_time - start_time).total_seconds()
        speed = round(self.total_files_size / total_time, 2)

        csi_creation_time = performance_lib.csi_bulk_pvc_time_measure(
            self.interface, clone_objs, "create", csi_bulk_start_time
        )

        log.info(
            f"Total creation time = {total_time} secs, csi creation time = {csi_creation_time},"
            f" data size = {self.total_files_size} MB, speed = {speed} MB/sec "
            f"for {self.interface} clone in bulk of {self.pvc_count} clones."
        )

        # Produce the ES report
        # Collecting environment information
        self.get_env_info()

        # Initialize the results doc file.
        full_results = self.init_full_results(
            ResultsAnalyse(
                self.uuid,
                self.crd_data,
                self.full_log_path,
                "bulk_clone_perf_fullres",
            )
        )
        full_results.add_key("interface", self.interface)
        full_results.add_key("bulk_size", self.pvc_count)
        full_results.add_key("clone_size", self.vol_size)
        full_results.add_key("bulk_creation_time", total_time)
        full_results.add_key("bulk_csi_creation_time", csi_creation_time)
        full_results.add_key("data_size(MB)", self.total_files_size)
        full_results.add_key("speed", speed)
        full_results.add_key("es_results_link", full_results.results_link())

        # Write the test results into the ES server
        full_results.es_write()
        self.results_path = get_full_test_logs_path(cname=self)
        res_link = full_results.results_link()
        # write the ES link to the test results in the test log.
        log.info(f"The result can be found at : {res_link}")

        # Create text file with results of all subtests (3 - according to the parameters)
        self.write_result_to_file(res_link)

    # The finally block cleans up the resources created during the test;
    # it is executed whether the try block passed or failed.
    finally:
        # Cleanup activities
        log.info("Cleanup of all the resources created during test execution")
        for object_file in [job_pod_file, job_clone_file, job_pvc_file]:
            if object_file:
                object_file.delete(namespace=self.namespace)
                try:
                    object_file.wait_for_delete(
                        resource_name=object_file.name, namespace=self.namespace
                    )
                except Exception:
                    log.error(f"{object_file.name} was not deleted!")

        # Check ceph health status
        utils.ceph_health_check(tries=20)
def test_workload_with_checksum_verify(
    tmp_path,
    project,
    fio_pvc_dict,
    fio_job_dict,
    fio_configmap_dict,
):
    """
    Verify that data written by fio during the workload storageutilization
    fixture are still present on the persistent volume.

    This test case assumes that test case ``test_workload_with_checksum``
    (which uses the fixture) has been executed already, and that the PV it
    created is still around (the PV is identified via its label, which
    references the fixture). There is no direct binding between these tests or
    fixtures, so that one can run ``test_workload_with_checksum`` first, then
    do some cluster wide temporary disruptive operation such as reboot,
    temporary shutdown or upgrade, and finally after that run this
    verification test to check that the data are still there.

    Note/TODO: this test doesn't delete the PV created by the previous test
    on purpose, so that this test can be executed multiple times (which is an
    important feature of this test, eg. it is possible to run it at different
    stages of the cluster wide disruptions). We may need to come up with a way
    to track it and delete it when it's no longer needed though.
    """
    fixture_name = "workload_storageutilization_checksum_rbd"
    storage_class_name = "ocs-storagecluster-ceph-rbd"
    pv_label = f'fixture={fixture_name}'

    # find the volume where the data are stored
    ocp_pv = ocp.OCP(kind=constants.PV, namespace=project.namespace)
    logger.info("Searching for PV with label %s, where fio stored data", pv_label)
    pv_data = ocp_pv.get(selector=pv_label)
    assert pv_data['kind'] == "List"
    pv_exists_msg = (
        f"Single PV with label {pv_label} should exist, "
        "so that the test can identify where to verify the data."
    )
    assert len(pv_data['items']) == 1, pv_exists_msg
    pv_dict = pv_data['items'][0]
    pv_name = pv_dict['metadata']['name']
    logger.info("PV %s was identified, the test can continue.", pv_name)

    # We need to check the PV size so that we can ask for the same via PVC
    capacity = pv_dict['spec']['capacity']['storage']
    logger.info("Capacity of PV %s is %s.", pv_name, capacity)
    # Convert the storage capacity spec into a number of GiB
    unit = capacity[-2:]
    assert unit in ("Gi", "Ti"), "PV size should be within a reasonable range"
    if capacity.endswith("Gi"):
        pvc_size = int(capacity[0:-2])
    elif capacity.endswith("Ti"):
        pvc_size = int(capacity[0:-2]) * 2**10

    # And we need to drop the claimRef, so that the PV will become available again
    if "claimRef" in pv_dict['spec']:
        logger.info("Dropping claimRef from PV %s.", pv_name)
        patch_success = ocp_pv.patch(
            resource_name=pv_name,
            params='[{ "op": "remove", "path": "/spec/claimRef" }]',
            format_type='json',
        )
        patch_error_msg = (
            "claimRef should be dropped with success, "
            f"otherwise the test can't continue to reuse PV {pv_name}"
        )
        assert patch_success, patch_error_msg
    else:
        logger.info("PV %s is already without claimRef.", pv_name)

    # The job won't be running fio, it will run the sha1sum check only.
    container = fio_job_dict['spec']['template']['spec']['containers'][0]
    container['command'] = ["/usr/bin/sha1sum", "-c", "/mnt/target/fio.sha1sum"]

    # we need to use the same PVC configuration to reuse the PV
    fio_pvc_dict["spec"]["storageClassName"] = storage_class_name
    fio_pvc_dict["spec"]["resources"]["requests"]["storage"] = capacity

    # put the dicts together into a yaml file of the Job
    fio_objs = [fio_pvc_dict, fio_configmap_dict, fio_job_dict]
    job_file = ObjectConfFile(fixture_name, fio_objs, project, tmp_path)

    # compute the timeout based on the minimal write speed
    fio_min_mbps = config.ENV_DATA['fio_storageutilization_min_mbps']
    job_timeout = fiojob.get_timeout(fio_min_mbps, pvc_size)
    # expand the job timeout, because during the execution of this test there
    # is a high probability that more workload is being executed (from the
    # upgrade tests), which slows down the write time
    # TODO(fbalak): calculate this from the actual work being executed
    job_timeout = job_timeout * 4

    # deploy the Job to the cluster and start it
    job_file.create()

    # Wait for the job to verify the data on the volume. If this fails in any
    # way, the job won't finish with success in the given time, and the error
    # message below will be reported via an exception.
    error_msg = (
        "Checksum verification job failed. We weren't able to verify that "
        "data previously written on the PV are still there."
    )
    pod_name = fiojob.wait_for_job_completion(project.namespace, job_timeout, error_msg)

    # provide clear evidence of the verification in the logs
    ocp_pod = ocp.OCP(kind="Pod", namespace=project.namespace)
    sha1sum_output = ocp_pod.exec_oc_cmd(f"logs {pod_name}", out_yaml_format=False)
    logger.info("sha1sum output: %s", sha1sum_output)
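
# fiojob.get_timeout computes the job timeout from the minimal expected
# write speed. A minimal sketch of that relationship under assumed units
# (GiB volume size, MiB/s minimal speed, with a safety margin); the real
# helper may differ.
def get_timeout_sketch(fio_min_mbps, pvc_size_gib, margin=2):
    """Estimate seconds needed to write pvc_size_gib at fio_min_mbps MiB/s."""
    write_seconds = (pvc_size_gib * 2**10) / fio_min_mbps  # GiB -> MiB
    return int(write_seconds * margin)


# e.g. a 10 GiB volume at 10 MiB/s minimal speed, doubled for safety
assert get_timeout_sketch(10, 10) == 2048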
def test_multiple_pvc_creation_deletion_scale(
    self, namespace, tmp_path, access_mode, interface
):
    """
    Measure PVC creation time while scaling PVCs.
    Measure PVC deletion time after the creation test.
    """
    scale_pvc_count = scale_lib.get_max_pvc_count()
    log.info(f"Start creating {access_mode}-{interface} {scale_pvc_count} PVCs")
    if interface == constants.CEPHBLOCKPOOL:
        sc_name = constants.DEFAULT_STORAGECLASS_RBD
    elif interface == constants.CEPHFS_INTERFACE:
        sc_name = constants.DEFAULT_STORAGECLASS_CEPHFS

    # Get pvc_dict_list, append all the pvc.yaml dicts to pvc_dict_list
    pvc_dict_list1 = scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job(
        no_of_pvc=int(scale_pvc_count / 2),
        access_mode=access_mode,
        sc_name=sc_name,
    )
    pvc_dict_list2 = scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job(
        no_of_pvc=int(scale_pvc_count / 2),
        access_mode=access_mode,
        sc_name=sc_name,
    )

    # There are 2 kube_jobs to reduce the load; time_out problems were
    # observed during the delete process of a single kube_job under heavy load.
    job_file1 = ObjectConfFile(
        name="job_profile_1",
        obj_dict_list=pvc_dict_list1,
        project=self.namespace,
        tmp_path=tmp_path,
    )
    job_file2 = ObjectConfFile(
        name="job_profile_2",
        obj_dict_list=pvc_dict_list2,
        project=self.namespace,
        tmp_path=tmp_path,
    )

    # Create kube_jobs
    job_file1.create(namespace=self.namespace)
    job_file2.create(namespace=self.namespace)

    # Check that all the PVCs reached Bound state
    pvc_bound_list = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
        kube_job_obj=job_file1,
        namespace=self.namespace,
        no_of_pvc=int(scale_pvc_count / 2),
    )
    pvc_bound_list.extend(
        scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
            kube_job_obj=job_file2,
            namespace=self.namespace,
            no_of_pvc=int(scale_pvc_count / 2),
        )
    )
    log.info(f"Number of PVCs in Bound state: {len(pvc_bound_list)}")

    # Get PVC creation time
    pvc_create_time = helpers.measure_pvc_creation_time_bulk(
        interface=interface,
        pvc_name_list=pvc_bound_list,
        wait_time=300,
    )

    # TODO: Update below code with google API, to record value in spreadsheet
    # TODO: For now observing Google API limit to write more than 100 writes
    log_path = f"{ocsci_log_path()}/{interface}-{access_mode}"
    with open(f"{log_path}-creation-time.csv", "w") as fd:
        csv_obj = csv.writer(fd)
        for k, v in pvc_create_time.items():
            csv_obj.writerow([k, v])
    log.info(f"Creation data is present in the {log_path}-creation-time.csv file")

    # Get pv_name; the pv_name is required to fetch deletion time data from the logs
    pv_name_list = list()
    get_kube_job_1 = job_file1.get(namespace=self.namespace)
    for i in range(int(scale_pvc_count / 2)):
        pv_name_list.append(get_kube_job_1["items"][i]["spec"]["volumeName"])
    get_kube_job_2 = job_file2.get(namespace=self.namespace)
    for i in range(int(scale_pvc_count / 2)):
        pv_name_list.append(get_kube_job_2["items"][i]["spec"]["volumeName"])

    # Delete the kube_jobs
    job_file1.delete(namespace=self.namespace)
    job_file2.delete(namespace=self.namespace)

    # Add a 1 minute wait for the PVC deletion logs to be updated.
    # Failures were observed when the logs are checked for pvc delete time
    # immediately: https://github.com/red-hat-storage/ocs-ci/issues/3371
    time.sleep(60)

    # Get PVC deletion time
    pvc_deletion_time = helpers.measure_pv_deletion_time_bulk(
        interface=interface, pv_name_list=pv_name_list
    )

    # Update the results in a csv file.
    # TODO: Update below code with google API, to record value in spreadsheet
    # TODO: For now observing Google API limit to write more than 100 writes
    with open(f"{log_path}-deletion-time.csv", "w") as fd:
        csv_obj = csv.writer(fd)
        for k, v in pvc_deletion_time.items():
            csv_obj.writerow([k, v])
    log.info(f"Deletion data is present in the {log_path}-deletion-time.csv file")

    end_time = default_timer()
    log.info(f"Elapsed time -- {end_time - self.start_time} seconds")
def test_bulk_clone_performance(self, namespace, tmp_path):
    """
    Creates number of PVCs in a bulk using kube job
    Write 60% of PVC capacity to each one of the created PVCs
    Creates 1 clone per each PVC altogether in a bulk
    Measuring total and csi creation times for bulk of clones
    """
    pvc_count = 50
    vol_size = "5Gi"
    job_pod_file, job_pvc_file, job_clone_file = [None, None, None]
    log.info(f"Start creating {self.interface} {pvc_count} PVCs")
    if self.interface == constants.CEPHBLOCKPOOL:
        sc_name = constants.DEFAULT_STORAGECLASS_RBD
        clone_yaml = constants.CSI_RBD_PVC_CLONE_YAML
    elif self.interface == constants.CEPHFILESYSTEM:
        sc_name = constants.DEFAULT_STORAGECLASS_CEPHFS
        clone_yaml = constants.CSI_CEPHFS_PVC_CLONE_YAML

    try:
        pvc_dict_list = scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job(
            no_of_pvc=pvc_count,
            access_mode=constants.ACCESS_MODE_RWO,
            sc_name=sc_name,
            pvc_size=vol_size,
        )
        job_pvc_file = ObjectConfFile(
            name="job_profile_pvc",
            obj_dict_list=pvc_dict_list,
            project=self.namespace,
            tmp_path=tmp_path,
        )
        # Create kube_job
        job_pvc_file.create(namespace=self.namespace)

        # Check that all the PVCs reached Bound state
        pvc_bound_list = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
            kube_job_obj=job_pvc_file,
            namespace=self.namespace,
            no_of_pvc=pvc_count,
        )
        log.info(f"Number of PVCs in Bound state: {len(pvc_bound_list)}")

        # Kube_job to create the pods
        pod_dict_list = scale_lib.attach_multiple_pvc_to_pod_dict(
            pvc_list=pvc_bound_list,
            namespace=self.namespace,
            pvcs_per_pod=1,
            start_io=False,
            pod_yaml=constants.NGINX_POD_YAML,
        )
        job_pod_file = ObjectConfFile(
            name="job_profile_pod",
            obj_dict_list=pod_dict_list,
            project=self.namespace,
            tmp_path=tmp_path,
        )
        job_pod_file.create(namespace=self.namespace)

        # Check that all the PODs reached Running state
        scale_lib.check_all_pod_reached_running_state_in_kube_job(
            kube_job_obj=job_pod_file,
            namespace=self.namespace,
            no_of_pod=len(pod_dict_list),
            timeout=90,
        )
        log.info(f"Number of PODs in Running state: {len(pod_dict_list)}")

        total_files_size = self.run_fio_on_pvcs(vol_size)

        clone_dict_list = scale_lib.construct_pvc_clone_yaml_bulk_for_kube_job(
            pvc_dict_list, clone_yaml, sc_name
        )
        log.info("Created clone dict list")

        csi_bulk_start_time = self.get_time(time_format="csi")

        job_clone_file = ObjectConfFile(
            name="job_profile_clone",
            obj_dict_list=clone_dict_list,
            project=self.namespace,
            tmp_path=tmp_path,
        )
        # Create kube_job that creates the clones
        job_clone_file.create(namespace=self.namespace)
        log.info("Going to check bound status for the clones")

        # Check that all the clones reached Bound state
        clone_bound_list = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
            kube_job_obj=job_clone_file,
            namespace=self.namespace,
            no_of_pvc=pvc_count,
            timeout=180,
        )
        log.info(f"Number of clones in Bound state: {len(clone_bound_list)}")

        clone_objs = []
        all_pvc_objs = pvc.get_all_pvc_objs(namespace=self.namespace)
        for clone_dict in clone_dict_list:
            name = clone_dict["metadata"]["name"]
            size = clone_dict["spec"]["resources"]["requests"]["storage"]
            log.info(f"Clone {name} of size {size} created")
            for pvc_obj in all_pvc_objs:
                if pvc_obj.name == name:
                    clone_objs.append(pvc_obj)

        assert len(clone_bound_list) == len(
            clone_objs
        ), "Not all clones reached BOUND state, cannot measure time"

        start_time = helpers.get_provision_time(
            self.interface, clone_objs, status="start"
        )
        end_time = helpers.get_provision_time(
            self.interface, clone_objs, status="end"
        )
        total_time = (end_time - start_time).total_seconds()
        speed = round(total_files_size / total_time, 2)
        csi_creation_time = performance_lib.csi_bulk_pvc_time_measure(
            self.interface, clone_objs, "create", csi_bulk_start_time
        )

        log.info(
            f"Total creation time = {total_time} secs, csi creation time = {csi_creation_time},"
            f" data size = {total_files_size} MB, speed = {speed} MB/sec "
            f"for {self.interface} clone in bulk of {pvc_count} clones."
        )

        # Produce the ES report
        # Collecting environment information
        self.get_env_info()

        # Initialize the results doc file.
        full_results = self.init_full_results(
            ResultsAnalyse(
                self.uuid,
                self.crd_data,
                self.full_log_path,
                "bulk_clone_perf_fullres",
            )
        )
        full_results.add_key("interface", self.interface)
        full_results.add_key("bulk_size", pvc_count)
        full_results.add_key("clone_size", vol_size)
        full_results.add_key("bulk_creation_time", total_time)
        full_results.add_key("bulk_csi_creation_time", csi_creation_time)
        full_results.add_key("data_size(MB)", total_files_size)
        full_results.add_key("speed", speed)
        full_results.add_key("es_results_link", full_results.results_link())

        # Write the test results into the ES server
        full_results.es_write()
        self.results_path = get_full_test_logs_path(cname=self)
        res_link = full_results.results_link()
        # write the ES link to the test results in the test log.
        log.info(f"The result can be found at : {res_link}")

        # Create text file with results of all subtests (3 - according to the parameters)
        self.write_result_to_file(res_link)

    # The finally block cleans up the resources created during the test;
    # it is executed whether the try block passed or failed.
    finally:
        # Cleanup activities
        log.info("Cleanup of all the resources created during test execution")
        if job_pod_file:
            job_pod_file.delete(namespace=self.namespace)
            job_pod_file.wait_for_delete(
                resource_name=job_pod_file.name, namespace=self.namespace
            )
        if job_clone_file:
            job_clone_file.delete(namespace=self.namespace)
            job_clone_file.wait_for_delete(
                resource_name=job_clone_file.name, namespace=self.namespace
            )
        if job_pvc_file:
            job_pvc_file.delete(namespace=self.namespace)
            job_pvc_file.wait_for_delete(
                resource_name=job_pvc_file.name, namespace=self.namespace
            )

        # Check ceph health status
        utils.ceph_health_check(tries=20)
class LogReaderWriterParallel(object):
    """
    Write and read a logfile stored on a cephfs volume, from all worker nodes
    of a cluster via a k8s Deployment, while fetching content of the stored
    data via oc rsync to check the data locally.

    TODO: Update the test after the issue
    https://github.com/red-hat-storage/ocs-ci/issues/5724 is completed.
    """

    def __init__(
        self,
        project,
        tmp_path,
        storage_size=2,
    ):
        """
        Init of the LogReaderWriterParallel object

        Args:
            project (pytest fixture): The project fixture.
            tmp_path (pytest fixture): The tmp_path fixture.
            storage_size (int): The size of the storage in GiB. The default
                value is 2 GiB.
        """
        self.project = project
        self.tmp_path = tmp_path

        self.pvc_dict = get_pvc_dict()
        # we need to mount the volume on every worker node, so RWX/cephfs
        self.pvc_dict["metadata"]["name"] = "logwriter-cephfs-many"
        self.pvc_dict["spec"]["accessModes"] = [constants.ACCESS_MODE_RWX]
        if storagecluster_independent_check() and not is_managed_service_cluster():
            sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS
        else:
            sc_name = constants.CEPHFILESYSTEM_SC
        logger.info(f"Storage class name = {sc_name}")
        self.pvc_dict["spec"]["storageClassName"] = sc_name
        self.pvc_dict["spec"]["resources"]["requests"]["storage"] = f"{storage_size}Gi"

        self.deploy_dict = {}
        self.workload_file = None
        self.ocp_pod = None

        self.local_dir = self.tmp_path / "logwriter"
        self.local_dir.mkdir()

    def log_reader_writer_parallel(self):
        """
        Write and read a logfile stored on a cephfs volume, from all worker
        nodes of a cluster via a k8s Deployment.

        Raises:
            NotFoundError: When the given volume is not found in the given spec
            UnexpectedBehaviour: When an unexpected problem with starting the
                workload occurred
        """
        # get deployment dict for the reproducer logwriter workload
        with open(constants.LOGWRITER_CEPHFS_REPRODUCER, "r") as deployment_file:
            self.deploy_dict = yaml.safe_load(deployment_file.read())
        # if we are running in a disconnected environment, we need to mirror
        # the container image first, and then use the mirror instead of the
        # original
        if config.DEPLOYMENT.get("disconnected"):
            update_container_with_mirrored_image(self.deploy_dict["spec"]["template"])
        # we need to match deployment replicas with the number of worker nodes
        self.deploy_dict["spec"]["replicas"] = len(get_worker_nodes())
        # drop topology spread constraints related to zones
        topology.drop_topology_constraint(
            self.deploy_dict["spec"]["template"]["spec"], topology.ZONE_LABEL
        )
        # and link the deployment with the pvc
        try:
            link_spec_volume(
                self.deploy_dict["spec"]["template"]["spec"],
                "logwriter-cephfs-volume",
                self.pvc_dict["metadata"]["name"],
            )
        except (exceptions.NotFoundError, KeyError) as ex:
            logger.warning(
                "Failed to link the deployment with the pvc. We may need to "
                "check if the LOGWRITER_CEPHFS_REPRODUCER still matches the "
                "code of this test"
            )
            raise ex

        # prepare k8s yaml file for deployment
        self.workload_file = ObjectConfFile(
            "log_reader_writer_parallel",
            [self.pvc_dict, self.deploy_dict],
            self.project,
            self.tmp_path,
        )
        # deploy the workload, starting the log reader/writer pods
        logger.info(
            "starting log reader/writer workload via Deployment, one pod per worker"
        )
        self.workload_file.create()

        logger.info("waiting for all pods of the workload Deployment to run")
        self.ocp_pod = ocp.OCP(kind="Pod", namespace=self.project.namespace)
        try:
            self.ocp_pod.wait_for_resource(
                resource_count=self.deploy_dict["spec"]["replicas"],
                condition=constants.STATUS_RUNNING,
                error_condition=constants.STATUS_ERROR,
                timeout=300,
                sleep=30,
            )
        except Exception as ex:
            # this is not a problem with the feature under test, but with the
            # infra, cluster configuration or an unrelated bug which must have
            # happened before this test case
            error_msg = (
                "unexpected problem with start of the workload, cluster is "
                "either misconfigured or broken"
            )
            logger.exception(error_msg)
            logger.debug(self.workload_file.describe())
            raise exceptions.UnexpectedBehaviour(error_msg) from ex

    def fetch_and_validate_data(self):
        """
        While the workload is running, try to validate the data from the
        cephfs volume of the workload.

        Raises:
            NotFoundError: When the given volume is not found in the given spec
            Exception: When the data verification job failed
        """
        # if no obvious problem was detected, run the logreader job to validate
        # checksums in the log files (so that we are 100% sure that nothing went
        # wrong with the IO or the data)
        with open(constants.LOGWRITER_CEPHFS_READER, "r") as job_file:
            job_dict = yaml.safe_load(job_file.read())
        # if we are running in a disconnected environment, we need to mirror
        # the container image of the job first, and then use the mirror
        # instead of the original
        if config.DEPLOYMENT.get("disconnected"):
            update_container_with_mirrored_image(job_dict["spec"]["template"])
        # drop topology spread constraints related to zones
        topology.drop_topology_constraint(
            job_dict["spec"]["template"]["spec"], topology.ZONE_LABEL
        )
        # we need to match the number of jobs with the number used in the workload
        job_dict["spec"]["completions"] = self.deploy_dict["spec"]["replicas"]
        job_dict["spec"]["parallelism"] = self.deploy_dict["spec"]["replicas"]
        # and refer to the correct pvc name
        try:
            link_spec_volume(
                job_dict["spec"]["template"]["spec"],
                "logwriter-cephfs-volume",
                self.pvc_dict["metadata"]["name"],
            )
        except (exceptions.NotFoundError, KeyError) as ex:
            logger.warning(
                "Failed to link the job with the pvc. We may need to check if "
                "the LOGWRITER_CEPHFS_READER still matches the code of this test"
            )
            raise ex

        # prepare k8s yaml file for the job
        job_file = ObjectConfFile("log_reader", [job_dict], self.project, self.tmp_path)
        # deploy the job, starting the log reader pods
        logger.info(
            "starting log reader data validation job to fully check the log data",
        )
        job_file.create()
        # wait for the logreader job to complete (this should be rather quick)
        try:
            job.wait_for_job_completion(
                job_name=job_dict["metadata"]["name"],
                namespace=self.project.namespace,
                timeout=300,
                sleep_time=30,
            )
        except exceptions.TimeoutExpiredError:
            error_msg = (
                "verification failed to complete in time: probably data loss "
                "or broken cluster"
            )
            raise Exception(error_msg)
        # and then check that the job completed with success
        logger.info("checking the result of data validation job")
        logger.debug(job_file.describe())
        ocp_job = ocp.OCP(
            kind="Job",
            namespace=self.project.namespace,
            resource_name=job_dict["metadata"]["name"],
        )
        job_status = ocp_job.get()["status"]
        logger.info("last status of data verification job: %s", job_status)
        if (
            "failed" in job_status
            or job_status["succeeded"] != self.deploy_dict["spec"]["replicas"]
        ):
            error_msg = "possible data corruption: data verification job failed!"
            logger.error(error_msg)
            job.log_output_of_job_pods(
                job_name=job_dict["metadata"]["name"],
                namespace=self.project.namespace,
            )
            raise Exception(error_msg)
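# Hypothetical usage sketch (not part of the test suite): the intended call
# order for LogReaderWriterParallel, assuming the standard ocs-ci ``project``
# and ``tmp_path`` pytest fixtures used throughout this module.
def _example_log_reader_writer_flow(project, tmp_path):
    workload = LogReaderWriterParallel(project, tmp_path, storage_size=2)
    # start the logwriter Deployment, one pod per worker node
    workload.log_reader_writer_parallel()
    # then run the reader Job to verify checksums of the written log data
    workload.fetch_and_validate_data()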
def setup_netsplit(
    tmp_path, master_zones, worker_zones, x_addr_list=None, arbiter_zone=None
):
    """
    Deploy machineconfig with network split scripts and configuration,
    tailored for the current cluster state.

    Args:
        tmp_path (pathlib.Path): Directory where a temporary yaml file will be
            created. In test context, use pytest fixture ``tmp_path``.
        master_zones (list[str]): zones where master nodes are placed
        worker_zones (list[str]): zones where worker nodes are placed
        x_addr_list (list[str]): IP addresses of external services (zone x)
        arbiter_zone (str): name of the arbiter zone if an arbiter deployment
            is used

    Raises:
        UnexpectedDeploymentConfiguration: in case of an invalid cluster
            configuration, which prevents deployment of network split scripts
        ValueError: in case the given zone configuration doesn't make sense
    """
    logger.info("going to deploy ocpnetsplit scripts")
    # checking assumptions: each node has a zone label
    if not are_zone_labels_present():
        msg = "to use network_split_setup, all nodes need a zone label"
        logger.error(msg)
        raise exceptions.UnexpectedDeploymentConfiguration(msg)
    # check zone assumptions: all worker zones are master zones as well
    worker_zones_without_master = set(worker_zones).difference(set(master_zones))
    if len(worker_zones_without_master) != 0:
        msg = (
            "there are zones which contain worker nodes, "
            f"but no master nodes: {worker_zones_without_master}"
        )
        logger.error(msg)
        raise exceptions.UnexpectedDeploymentConfiguration(msg)
    if (arbiter_zone is not None) and (arbiter_zone not in master_zones):
        msg = "given arbiter zone not found among master zones"
        logger.error(msg)
        raise ValueError(msg)
    if len(master_zones) == 3:
        zone_a, zone_b, zone_c = master_zones
        # handle arbiter (so that zone a is always the arbiter) if specified
        if arbiter_zone is not None:
            zone_a = arbiter_zone
            other_zones = master_zones.copy()
            other_zones.remove(arbiter_zone)
            zone_b, zone_c = other_zones
    else:
        msg = "ocpnetsplit can handle only 3 zones, setup can't continue"
        logger.error(msg)
        raise exceptions.UnexpectedDeploymentConfiguration(msg)
    # we assume that there are just 2 machine config pools: master and worker
    mcp_h = OCP(kind="MachineConfigPool", namespace="openshift-config")
    mcp_objects = mcp_h.get()
    mcp_names = [i["metadata"]["name"] for i in mcp_objects["items"]]
    if len(mcp_names) != 2:
        msg = (
            "ocpnetsplit can handle only 2 machine config pools, "
            f"but there are {mcp_names}"
        )
        logger.error(msg)
        raise exceptions.UnexpectedDeploymentConfiguration(msg)
    for exp_pool in ("master", "worker"):
        if exp_pool not in mcp_names:
            msg = f"MachineConfigPool/{exp_pool} not found"
            logger.error(msg)
            raise exceptions.UnexpectedDeploymentConfiguration(msg)
    # generate zone config (list of node ip addresses for each zone)
    zone_config = ocpnetsplit.main.get_zone_config(zone_a, zone_b, zone_c, x_addr_list)
    zone_env = zone_config.get_env_file()
    # get machineconfig for network split firewall scripts
    mc = ocpnetsplit.main.get_networksplit_mc_spec(zone_env)
    # deploy it within openshift-config namespace
    mc_file = ObjectConfFile("network-split", mc, None, tmp_path)
    mc_file.create(namespace="openshift-config")
    # now let's make sure the MCO (machine config operator) noticed the
    # just-deployed network-split machine config and started to process it
    logger.info(
        "waiting for both machineconfigpools to be updating "
        "as a result of deployment of network-split machineconfig"
    )
    mcp_h.wait_for_resource(
        resource_count=2,
        condition="True",
        column="UPDATING",
        sleep=5,
        timeout=120,
    )
    # and now wait for MachineConfigPools to be updated and ready
    logger.info("waiting for both machineconfigpools to be updated and ready")
    mcp_h.wait_for_resource(
        resource_count=2,
        condition="True",
        column="UPDATED",
        sleep=60,
        timeout=1800,
    )
    # also check that no pools are degraded
    mcp_h.wait_for_resource(
        resource_count=2,
        condition="False",
        column="DEGRADED",
        sleep=10,
        timeout=120,
    )