def read_logs(self, kind, namespace, start_time): """ Reading the csi-driver logs, since we use different logs for the start time for end time (creation snapshot), we call this function twice. Args: kind (str): the kind of logs to read 'start' or 'end' namespace (str): in which namespace the pod exists start_time (time): the start time of the specific test, so we dont need to read the full log Returns: list : the contant of all read logs(s) - can be more then one log """ logs = [] # The pod with the logs for 'start' creation time have only one container container = "" if kind == "end": # The pod with the logs for 'end' creation time have more then one container container = "-c csi-snapshotter" for l in self.log_names[kind]: logs.append( run_oc_command( f"logs {l} {container} --since-time={start_time}", namespace=namespace, )) return logs
def wait_for_clone_creation(self, clone_name, timeout=600): """ Waits for creation of clone for defined period of time Raises exception and fails the test if clone was not created during that time Args: clone_name: name of the clone being created timeout: optional argument, time period in seconds to wait for creation """ while timeout > 0: res = performance_lib.run_oc_command( f"get pvc {clone_name} -o yaml", self.params["nspace"]) if self.params["ERRMSG"] not in res[0]: res = yaml.safe_load("\n".join(res)) log.info(f"Result yaml is {res}") if res["status"]["phase"] == "Bound": log.info(f"{clone_name} Created and ready to use") break else: log.info( f"{clone_name} is not ready yet, sleep 5 sec before re-check" ) time.sleep(5) timeout -= 5 else: raise Exception(f"Can not get clone status {res}") if timeout <= 0: raise Exception( f"Clone {clone_name} for {self.interface} interface was not created for 600 seconds" )
def cleanup_testing_pvc(self): try: pv = self.pvc_obj.get("spec")["spec"]["volumeName"] self.pvc_obj.delete() # Wait for the PVC to be deleted performance_lib.wait_for_resource_bulk_status( "pvc", 0, self.namespace, constants.STATUS_BOUND, 60, 5 ) log.info("The PVC was deleted successfully") except Exception: log.warning("The PVC failed to delete") pass # Delete the backend PV of the PVC try: log.info(f"Try to delete the backend PV : {pv}") performance_lib.run_oc_command(f"delete pv {pv}") except Exception as ex: err_msg = f"cannot delete PV [{ex}]" log.error(err_msg)
def build_params(self): log.info("Start building params") self.params["nspace"] = self.pvc_obj.namespace self.params["pvcname"] = self.pvc_obj.name log_file_name = os.path.basename(__file__).replace(".py", ".log") full_log = f"{ocsci_log_path()}/{log_file_name}" logging.basicConfig( filename=full_log, level=logging.INFO, format=constants.LOG_FORMAT ) self.params["datasize"] = int(self.params["filesize"].replace("M", "")) self.params["clone_yaml"] = constants.CSI_CEPHFS_PVC_CLONE_YAML if self.interface == constants.CEPHBLOCKPOOL: self.params["clone_yaml"] = constants.CSI_RBD_PVC_CLONE_YAML output = performance_lib.run_oc_command( cmd=f"get pod {self.pod_obj.name} -o yaml", namespace=self.params["nspace"] ) results = yaml.safe_load("\n".join(output)) self.params["path"] = results["spec"]["containers"][0]["volumeMounts"][0][ "mountPath" ] log.info(f"path - {self.params['path']}") fd, tmpfile = tempfile.mkstemp(suffix=".yaml", prefix="Clone") self.params["tmpfile"] = tmpfile # reading template of clone yaml file with open(self.params["clone_yaml"], "r") as stream: try: clone_yaml = yaml.safe_load(stream) clone_yaml["spec"]["storageClassName"] = self.pvc_obj.backed_sc clone_yaml["spec"]["dataSource"]["name"] = self.params["pvcname"] clone_yaml["spec"]["resources"]["requests"]["storage"] = ( str(self.pvc_obj.size) + "Gi" ) except yaml.YAMLError as exc: log.error(f"Can not read template yaml file {exc}") log.info( f'Clone yaml file : {self.params["clone_yaml"]} ' f"Content of clone yaml file {clone_yaml}" ) return clone_yaml
def get_csi_pod(self, namespace): """ Getting pod list in specific namespace, for the provision logs Args: namespace (str): the namespace where the pod is deployed. Returns: list : list of lines from the output of the command. """ results = run_oc_command(cmd="get pod", namespace=namespace) if ERRMSG in results: err_msg = "Can not get the CSI controller pod" log.error(err_msg) raise Exception(err_msg) return results
def create_clone(self, clone_num, clone_yaml): """ Creating clone for pvc, measure the creation time Args: clone_num (int) the number of clones to create clone_yaml : a template of clone yaml Returns: int: the creation time of the clone (in secs.) """ log.info( f"Creating clone number {clone_num} for interface {self.interface}" ) clone_name = f"pvc-clone-{clone_num}-" clone_name += self.params["pvcname"].split("-")[-1] clone_yaml["metadata"]["name"] = clone_name tmpfile = self.params["tmpfile"] log.info(f"Going to create {tmpfile}") with open(tmpfile, "w") as f: yaml.dump(clone_yaml, f, default_flow_style=False) start_time = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") log.info(f"Clone yaml file is {clone_yaml}") res = performance_lib.run_oc_command(f"create -f {tmpfile}", self.params["nspace"]) if self.params["ERRMSG"] in res[0]: raise Exception(f"Can not create clone : {res}") # wait until clone is ready self.wait_for_clone_creation(clone_name) create_time = performance_lib.measure_pvc_creation_time( self.interface, clone_name, start_time) log.info(f"Creation time of clone {clone_name} is {create_time} secs.") return create_time
def measure_pod_creation_time(namespace, num_of_pods): """ Measures and Logs the POD Creation Time of all the PODs. Args: namespace (str) : Namespace in which the PODs are created. num_of_pods (int) : Number of PODs created. Logs: POD Creation Time of all the PODs. """ logs = performance_lib.run_oc_command( "get events --sort-by='{.lastTimestamp}'", namespace, ) scheduled_time = None pod_no = num_of_pods accepted_creation_time = 12 for line in logs: log.info(line) if "Scheduled" in line: scheduled_time = int(line.split()[0][:-1]) elif "Created" in line: created_time = int(line.split()[0][:-1]) creation_time = scheduled_time - created_time if creation_time <= accepted_creation_time: log.info( f"POD number {pod_no} was created in {creation_time} seconds." ) else: log.error( f"POD creation time is {creation_time} and is greater than " f"{accepted_creation_time} seconds.") pod_no -= 1
def wait_for_wl_to_finish(self, timeout=18000, sleep=300): """ Waiting until the workload is finished and get the test log Args: timeout (int): time in second to wait until the benchmark start sleep (int): Sleep interval seconds Raise: exception for too much restarts of the test. ResourceWrongStatusException : test Failed / Error TimeoutExpiredError : test did not completed on time. """ log.info(f"Waiting for {self.client_pod_name} to complete") Finished = 0 restarts = 0 total_time = timeout while not Finished and total_time > 0: results = run_oc_command( "get pod --no-headers -o custom-columns=:metadata.name,:status.phase", namespace=benchmark_operator.BMO_NAME, ) (fname, status) = ["", ""] for name in results: # looking for the pod which run the benchmark (not the IO) # this pod contain the `client` in his name, and there is only one # pod like this, other pods have the `server` in the name. (fname, status) = name.split() if re.search("client", fname): break else: (fname, status) = ["", ""] if fname == "": # there is no `client` pod ! err_msg = f"{self.client_pod} Failed to run !!!" log.error(err_msg) raise Exception(err_msg) if not fname == self.client_pod: # The client pod name is different from previous check, it was restarted log.info( f"The pod {self.client_pod} was restart. the new client pod is {fname}" ) self.client_pod = fname restarts += 1 # in case of restarting the benchmark, reset the timeout as well total_time = timeout if restarts > 3: # we are tolerating only 3 restarts err_msg = f"Too much restarts of the benchmark ({restarts})" log.error(err_msg) raise Exception(err_msg) if status == "Succeeded": # Getting the end time of the benchmark - for reporting. self.end_time = self.get_time() self.test_logs = self.pod_obj.exec_oc_cmd( f"logs {self.client_pod}", out_yaml_format=False ) log.info(f"{self.client_pod} completed successfully") Finished = 1 elif ( status != constants.STATUS_RUNNING and status != constants.STATUS_PENDING ): # if the benchmark pod is not in Running state (and not Completed/Pending), # no need to wait for timeout. # Note: the pod can be in pending state in case of restart. err_msg = f"{self.client_pod} Failed to run - ({status})" log.error(err_msg) raise exceptions.ResourceWrongStatusException( self.client_pod, describe_out=err_msg, column="Status", expected="Succeeded", got=status, ) else: log.info( f"{self.client_pod} is in {status} State, and wait to Succeeded State." f" wait another {sleep} sec. for benchmark to complete" ) time.sleep(sleep) total_time -= sleep if not Finished: err_msg = ( f"{self.client_pod} did not completed on time, " f"maybe timeout ({timeout}) need to be increase" ) log.error(err_msg) raise exceptions.TimeoutExpiredError( self.client_pod, custom_message=err_msg ) # Saving the benchmark internal log into a file at the logs directory log_file_name = f"{self.full_log_path}/test-pod.log" try: with open(log_file_name, "w") as f: f.write(self.test_logs) log.info(f"The Test log can be found at : {log_file_name}") except Exception: log.warning(f"Cannot write the log to the file {log_file_name}") log.info(f"The {self.benchmark_name} benchmark complete")
def teardown(self): """ Cleaning up the environment : Delete all snapshot Delete the POD Delete the PVC and the PV Delete the StorageClass Delete the VolumeSnapshotClass Delete the data pool Switch to the default namespace Delete the tested namespace """ log.info("Cleanup the test environment") if self.full_teardown: # Getting the name of the PCV's backed PV try: pv = self.pvc_obj.get("spec")["spec"]["volumeName"] except KeyError: log.error( f"Cannot found key in the PVC object {json.dumps(self.pvc_obj.get('spec').get('spec'), indent=3)}" ) # Getting the list of all snapshots try: snapshot_list = self.snapshot.get(all_namespaces=True)["items"] except Exception as err: log.error(f"Cannot get the list of snapshots : {err}") snapshot_list = [] # Deleting al snapshots from the cluster log.info(f"Trying to delete all ({len(snapshot_list)}) Snapshots") log.debug( f"The list of all snapshots is : {json.dumps(snapshot_list, indent=3)}" ) for vs in snapshot_list: snap_name = vs["metadata"]["name"] log.info(f"Try to delete {snap_name}") try: self.snapshot.delete(resource_name=snap_name) except Exception as err: log.error(f"Cannot delete {snap_name} : {err}") # Deleting the pod which wrote data to the pvc log.info(f"Deleting the test POD : {self.pod_obj.name}") try: self.pod_obj.delete() log.info("Wait until the pod is deleted.") self.pod_obj.ocp.wait_for_delete( resource_name=self.pod_obj.name) except Exception as ex: log.error(f"Cannot delete the test pod : {ex}") # Deleting the PVC which used in the test. try: log.info(f"Delete the PVC : {self.pvc_obj.name}") self.pvc_obj.delete() log.info("Wait until the pvc is deleted.") self.pvc_obj.ocp.wait_for_delete( resource_name=self.pvc_obj.name) except Exception as ex: log.error(f"Cannot delete the test pvc : {ex}") # Delete the backend PV of the PVC log.info(f"Try to delete the backend PV : {pv}") try: run_oc_command(f"delete pv {pv}") except Exception as ex: err_msg = f"cannot delete PV {pv} - [{ex}]" log.error(err_msg) # Deleting the StorageClass used in the test log.info(f"Deleting the test StorageClass : {self.sc_obj.name}") try: self.sc_obj.delete() log.info("Wait until the SC is deleted.") self.sc_obj.ocp.wait_for_delete(resource_name=self.sc_obj.name) except Exception as ex: log.error(f"Can not delete the test sc : {ex}") # Deleting the VolumeSnapshotClass used in the test log.info( f"Deleting the test Snapshot Class : {self.snap_class.name}") try: self.snap_class.delete() log.info("Wait until the VSC is deleted.") self.snap_class.ocp.wait_for_delete( resource_name=self.snap_class.name) except Exception as ex: log.error(f"Can not delete the test vsc : {ex}") # Deleting the Data pool log.info(f"Deleting the test storage pool : {self.sc_name}") self.delete_ceph_pool(self.sc_name) # Verify deletion by checking the backend CEPH pools using the toolbox results = self.ceph_cluster.toolbox.exec_cmd_on_pod( "ceph osd pool ls") log.debug(f"Existing pools are : {results}") if self.sc_name in results.split(): log.warning( "The pool did not deleted by CSI, forcing delete it manually" ) self.ceph_cluster.toolbox.exec_cmd_on_pod( f"ceph osd pool delete {self.sc_name} {self.sc_name} " "--yes-i-really-really-mean-it") else: log.info(f"The pool {self.sc_name} was deleted successfully") # Deleting the namespace used by the test log.info(f"Deleting the test namespace : {self.nss_name}") switch_to_default_rook_cluster_project() try: self.proj.delete(resource_name=self.nss_name) self.proj.wait_for_delete(resource_name=self.nss_name, timeout=60, sleep=10) except CommandFailed: log.error(f"Can not delete project {self.nss_name}") raise CommandFailed(f"{self.nss_name} was not created") # After deleting all data from the cluster, we need to wait until it will re-balance ceph_health_check(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, tries=30, delay=60) super(TestPvcMultiSnapshotPerformance, self).teardown()
def create_snapshot(self, snap_num): """ Creating snapshot of volume measure the total snapshot creation time and the CSI creation time Args: snap_num (int) the number of snapshot to create Returns: int: the creation time of the snapshot (in sec.) """ log.info(f"Taking snapshot number {snap_num}") # Getting UTC time before test starting for log retrieve start_time = self.get_time("csi") snap_name = f"pvc-snap-{snap_num}-" snap_name += self.pvc_obj.name.split("-")[-1] self.snap_templ["metadata"]["name"] = snap_name self.snap_templ["spec"][ "volumeSnapshotClassName"] = self.snap_class.name fd, tmpfile = tempfile.mkstemp(suffix=".yaml", prefix="Snap") log.debug(f"Going to create {tmpfile}") with open(tmpfile, "w") as f: yaml.dump(self.snap_templ, f, default_flow_style=False) res = run_oc_command(cmd=f"create -f {tmpfile}", namespace=self.nss_name) if ERRMSG in res[0]: err_msg = f"Failed to create snapshot : {res}" log.error(err_msg) raise Exception(err_msg) # wait until snapshot is ready timeout = 720 sleep_time = 10 snap_con_name = None snap_uid = None while timeout > 0: res = run_oc_command(f"get volumesnapshot {snap_name} -o yaml", namespace=self.nss_name) if ERRMSG not in res[0]: res = yaml.safe_load("\n".join(res)) log.debug(f"The command output is : {yaml.dump(res)}") try: if res["status"]["readyToUse"]: log.info(f"{snap_name} Created and ready to use") snap_con_name = res["status"][ "boundVolumeSnapshotContentName"] snap_uid = res["metadata"]["uid"] break else: log.info( f"{snap_name} is not ready yet, sleep {sleep_time} sec before re-check" ) time.sleep(sleep_time) timeout -= sleep_time except Exception: log.info( f"{snap_name} is not ready yet, sleep {sleep_time} sec before re-check" ) time.sleep(sleep_time) timeout -= sleep_time else: err_msg = f"Can not get snapshot status {res}" log.error(err_msg) raise Exception(err_msg) if snap_con_name: creation_time = performance_lib.measure_total_snapshot_creation_time( snap_name, start_time) csi_creation_time = performance_lib.measure_csi_snapshot_creation_time( self.interface, snap_uid, start_time) return (creation_time, csi_creation_time) else: err_msg = "Snapshot was not created on time" log.error(err_msg) raise TimeoutError(err_msg)
def test_pvc_creation_deletion_measurement_performance( self, interface_type, pvc_size): """ Measuring PVC creation and deletion times for pvc samples. filling up each PVC with 70% of data. Verifying that those times are within the required limits Args: interface_type (str): the interface type to run against - CephBlockPool or CephFileSystem pvc_size (str): the size of the pvc to create """ # Initializing test variables self.interface = interface_type num_of_samples = 5 if self.dev_mode: num_of_samples = 2 accepted_creation_time = 1 accepted_deletion_time = Interface_Info[self.interface]["delete_time"] accepted_creation_deviation_percent = 50 accepted_deletion_deviation_percent = 50 all_mesuring_times = { "create": [], "delete": [], "csi_create": [], "csi_delete": [], } msg_prefix = f"Interface: {self.interface}, PVC size: {pvc_size}." self.set_results_path_and_file( "test_pvc_creation_deletion_measurement_performance") self.start_time = self.get_time() self.get_env_info() # Initialize the results doc file. self.full_results = self.init_full_results( ResultsAnalyse( self.uuid, self.crd_data, self.full_log_path, "pvc_create_delete_fullres", )) self.full_results.add_key("pvc_size", pvc_size) self.full_results.add_key("samples", num_of_samples) self.create_fio_pod_yaml(pvc_size=int(pvc_size.replace("Gi", ""))) # Creating PVC(s) for creation time mesurment start_time = self.create_pvcs_and_wait_for_bound(msg_prefix, num_of_samples, pvc_size, burst=False) # Fillup the PVC with data (70% of the total PVC size) self.run_io() # Deleting PVC(s) for deletion time mesurment log.info("Try to delete all created PVCs") for pvc_obj in self.pvc_objs: pvc_obj.delete() log.info("Wait for all PVC(s) to be deleted") performance_lib.wait_for_resource_bulk_status("pvc", 0, self.namespace, constants.STATUS_BOUND, num_of_samples * 2, 5) log.info("All PVC(s) was deleted") mesure_data = "create" rec_policy = performance_lib.run_oc_command( f'get sc {Interface_Info[self.interface]["sc"]} -o jsonpath="' + '{.reclaimPolicy}"') if rec_policy[0].strip('"') == constants.RECLAIM_POLICY_DELETE: log.info("Wait for all PVC(s) backed PV(s) to be deleted") # Timeout for each PV to be deleted is 20 sec. performance_lib.wait_for_resource_bulk_status( "pv", 0, self.namespace, self.namespace, num_of_samples * 20, 5) log.info("All backed PV(s) was deleted") mesure_data = "all" # Mesuring the time it took to create and delete the PVC(s) log.info("Reading Creation/Deletion time from provisioner logs") self.results_times = performance_lib.get_pvc_provision_times( interface=self.interface, pvc_name=self.pvc_objs, start_time=start_time, time_type="all", op=mesure_data, ) # Analaysing the test results for i, pvc_res in enumerate(self.results_times): data = self.results_times[pvc_res] msg = f"{msg_prefix} PVC number {i + 1} was" for op in Operations_Mesurment: log.info(f"{msg} {op}d in {data[op]['time']} seconds.") if data["create"]["time"] > accepted_creation_time: raise ex.PerformanceException( f"{msg_prefix} PVC creation time is {data['create']['time']} and is greater than " f"{accepted_creation_time} seconds.") if rec_policy == constants.RECLAIM_POLICY_DELETE: if data["delete"]["time"] > accepted_deletion_time: raise ex.PerformanceException( f"{msg_prefix} PVC deletion time is {data['delete']['time']} and is greater than " f"{accepted_deletion_time} seconds.") all_mesuring_times["delete"].append(data["delete"]["time"]) all_mesuring_times["csi_delete"].append( data["csi_delete"]["time"]) all_mesuring_times["create"].append(data["create"]["time"]) all_mesuring_times["csi_create"].append(data["csi_create"]["time"]) for op in Operations_Mesurment: if rec_policy == constants.RECLAIM_POLICY_DELETE and "del" in op: self.process_time_measurements( op, all_mesuring_times[op], accepted_deletion_deviation_percent, msg_prefix, ) if "create" in op: self.process_time_measurements( op, all_mesuring_times[op], accepted_creation_deviation_percent, msg_prefix, ) self.full_results.all_results = self.results_times self.end_time = self.get_time() self.full_results.add_key("test_time", { "start": self.start_time, "end": self.end_time }) if self.full_results.es_write(): res_link = self.full_results.results_link() log.info(f"The Result can be found at : {res_link}") # Create text file with results of all subtest (6 - according to the parameters) self.write_result_to_file(res_link)
def wait_for_wl_to_finish(self, timeout=18000, sleep=300): """ Waiting until the workload is finished and get the test log Args: timeout (int): time in second to wait until the benchmark start sleep (int): Sleep interval seconds Raise: exception for too much restarts of the test. """ log.info(f"Waiting for {self.client_pod_name} to complete") Finished = 0 restarts = 0 while not Finished: results = run_oc_command( "get pod --no-headers -o custom-columns=:metadata.name,:status.phase", namespace="my-ripsaw", ) fname = "" for name in results: if re.search(self.client_pod_name, name): (fname, status) = name.split() continue if not fname == self.client_pod: log.info( f"The pod {self.client_pod} was restart. the new client pod is {fname}" ) self.client_pod = fname restarts += 1 if restarts > 3: err_msg = f"Too much restarts of the benchmark ({restarts})" log.error(err_msg) raise Exception(err_msg) if status == "Succeeded": self.end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime()) self.test_logs = self.pod_obj.exec_oc_cmd( f"logs {self.client_pod}", out_yaml_format=False) log.info(f"{self.client_pod} completed successfully") Finished = 1 else: log.info( f"{self.client_pod} is in {status} State, and wait to Succeeded State." f" wait another {sleep} sec. for benchmark to complete") time.sleep(sleep) self.pod_obj.wait_for_resource( condition=constants.STATUS_COMPLETED, resource_name=self.client_pod, timeout=timeout, sleep=sleep, ) # Getting the end time of the benchmark - for reporting. self.end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime()) self.test_logs = self.pod_obj.exec_oc_cmd(f"logs {self.client_pod}", out_yaml_format=False) # Saving the benchmark internal log into a file at the logs directory log_file_name = f"{self.full_log_path}/test-pod.log" try: with open(log_file_name, "w") as f: f.write(self.test_logs) log.info(f"The Test log can be found at : {log_file_name}") except Exception: log.warning(f"Cannot write the log to the file {log_file_name}") log.info(f"The {self.benchmark_name} benchmark complete")
def teardown(self): """ Cleanup the test environment """ log.info("Starting the test cleanup") # Delete The test POD self.cleanup_testing_pod() # Delete the created clones try: for clone in self.cloned_obj_list: performance_lib.run_oc_command(cmd=f"delete pvc {clone}", namespace=self.namespace) except Exception: log.warning("Clones didn't deleted") # Delete the test PVC self.cleanup_testing_pvc() # Deleting the namespace used by the test self.delete_test_project() # Delete the test StorageClass try: log.info(f"Deleting the test StorageClass : {self.sc_obj.name}") self.sc_obj.delete() log.info("Wait until the SC is deleted.") self.sc_obj.ocp.wait_for_delete(resource_name=self.sc_obj.name) except Exception as ex: log.error(f"Can not delete the test sc : {ex}") # Delete the test storage pool if self.interface == constants.CEPHBLOCKPOOL: log.info(f"Try to delete the Storage pool {self.pool_name}") try: self.delete_ceph_pool(self.pool_name) except Exception: pass finally: # Verify deletion by checking the backend CEPH pools using the toolbox results = self.ceph_cluster.toolbox.exec_cmd_on_pod( "ceph osd pool ls") log.debug(f"Existing pools are : {results}") if self.pool_name in results.split(): log.warning( "The pool did not deleted by CSI, forcing delete it manually" ) self.ceph_cluster.toolbox.exec_cmd_on_pod( f"ceph osd pool delete {self.pool_name} {self.pool_name} " "--yes-i-really-really-mean-it") else: log.info( f"The pool {self.pool_name} was deleted successfully") else: log.info("Deleting the default ceph filesystem") self.ceph_cluster.CEPHFS.delete( resource_name="ocs-storagecluster-cephfilesystem") log.info("Wait until the ceph filesystem re-created") time.sleep(60) log.info("re-create the ceph filesystem csi subvolumegroup") self.ceph_cluster.toolbox.exec_cmd_on_pod( "ceph fs subvolumegroup create ocs-storagecluster-cephfilesystem csi" ) super(TestPvcMultiClonePerformance, self).teardown()