def wait_for_job_completion(job_name, namespace, timeout=600, sleep_time=30):
    """
    Wait for given k8s Job to complete.

    Args:
        job_name (str): name of the job to wait for
        namespace (str): name of the namespace where the job is running
        timeout (int): timeout in seconds
        sleep_time (int): sleep time between consecutive job status checks in seconds

    Raises:
        TimeoutExpiredError: When job fails to complete in given time

    """
    ocp_job = OCP(kind="Job", namespace=namespace, resource_name=job_name)
    try:
        for live_job_d in TimeoutIterator(
            timeout=timeout, sleep=sleep_time, func=ocp_job.get
        ):
            job_status = live_job_d.get("status")
            if job_status is None:
                log.debug("job status not (yet) available")
                continue
            if "completionTime" in job_status:
                log.info(
                    "job %s finished at %s", job_name, job_status["completionTime"]
                )
                break
    except exceptions.TimeoutExpiredError as ex:
        error_msg = f"job/{job_name} failed to complete in {timeout} seconds"
        log.warning(error_msg)
        raise exceptions.TimeoutExpiredError(error_msg) from ex
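

# A minimal usage sketch for wait_for_job_completion, assuming the OCP wrapper,
# exceptions module and log object imported by the surrounding module are in
# scope. The job name, namespace and timings below are hypothetical examples,
# not values taken from the original code.
def example_wait_for_backup_job():
    try:
        wait_for_job_completion(
            job_name="backup-job",          # hypothetical Job name
            namespace="openshift-storage",  # hypothetical namespace
            timeout=300,                    # give up after 5 minutes
            sleep_time=15,                  # poll the Job status every 15 seconds
        )
    except exceptions.TimeoutExpiredError:
        log.error("backup-job did not finish within the allotted time")
        raise
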
def wait_for_wl_to_finish(self, timeout=18000, sleep=300):
    """
    Wait until the workload is finished and get the test log

    Args:
        timeout (int): time in seconds to wait for the benchmark to complete
        sleep (int): sleep interval in seconds between status checks

    Raises:
        Exception : too many restarts of the test pod
        ResourceWrongStatusException : test Failed / Error
        TimeoutExpiredError : test did not complete on time

    """
    log.info(f"Waiting for {self.client_pod_name} to complete")

    finished = False
    restarts = 0
    total_time = timeout
    while not finished and total_time > 0:
        results = run_oc_command(
            "get pod --no-headers -o custom-columns=:metadata.name,:status.phase",
            namespace=benchmark_operator.BMO_NAME,
        )
        (fname, status) = ["", ""]
        for name in results:
            # Looking for the pod which runs the benchmark (not the IO);
            # this pod contains `client` in its name, and there is only one
            # such pod, other pods have `server` in their name.
            (fname, status) = name.split()
            if re.search("client", fname):
                break
        else:
            (fname, status) = ["", ""]

        if fname == "":
            # There is no `client` pod!
            err_msg = f"{self.client_pod} failed to run!"
            log.error(err_msg)
            raise Exception(err_msg)

        if fname != self.client_pod:
            # The client pod name differs from the previous check, so it was restarted
            log.info(
                f"The pod {self.client_pod} was restarted. The new client pod is {fname}"
            )
            self.client_pod = fname
            restarts += 1
            # In case the benchmark restarted, reset the timeout as well
            total_time = timeout

        if restarts > 3:
            # We tolerate only 3 restarts
            err_msg = f"Too many restarts of the benchmark ({restarts})"
            log.error(err_msg)
            raise Exception(err_msg)

        if status == "Succeeded":
            # Getting the end time of the benchmark - for reporting.
            self.end_time = self.get_time()
            self.test_logs = self.pod_obj.exec_oc_cmd(
                f"logs {self.client_pod}", out_yaml_format=False
            )
            log.info(f"{self.client_pod} completed successfully")
            finished = True
        elif (
            status != constants.STATUS_RUNNING
            and status != constants.STATUS_PENDING
        ):
            # If the benchmark pod is not in Running state (and not
            # Completed / Pending), there is no need to wait for the timeout.
            # Note: the pod can be in Pending state in case of a restart.
            err_msg = f"{self.client_pod} failed to run - ({status})"
            log.error(err_msg)
            raise exceptions.ResourceWrongStatusException(
                self.client_pod,
                describe_out=err_msg,
                column="Status",
                expected="Succeeded",
                got=status,
            )
        else:
            log.info(
                f"{self.client_pod} is in {status} state, waiting for it to reach"
                f" the Succeeded state. Waiting another {sleep} sec for the benchmark to complete"
            )
            time.sleep(sleep)
            total_time -= sleep

    if not finished:
        err_msg = (
            f"{self.client_pod} did not complete on time; "
            f"maybe the timeout ({timeout}) needs to be increased"
        )
        log.error(err_msg)
        raise exceptions.TimeoutExpiredError(
            self.client_pod, custom_message=err_msg
        )

    # Saving the benchmark internal log into a file in the logs directory
    log_file_name = f"{self.full_log_path}/test-pod.log"
    try:
        with open(log_file_name, "w") as f:
            f.write(self.test_logs)
        log.info(f"The test log can be found at: {log_file_name}")
    except Exception:
        log.warning(f"Cannot write the log to the file {log_file_name}")
    log.info(f"The {self.benchmark_name} benchmark completed")
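

# A minimal usage sketch for wait_for_wl_to_finish, assuming it is a method of
# a benchmark workload class. The workload object, its run() and cleanup()
# helpers, and the chosen timeout/sleep values are hypothetical illustrations,
# not part of the original code.
def example_run_benchmark(workload):
    workload.run()  # assumed helper that starts the benchmark
    try:
        # Poll every 60 seconds, give up after one hour
        workload.wait_for_wl_to_finish(timeout=3600, sleep=60)
    except exceptions.TimeoutExpiredError:
        log.error("benchmark did not complete in time")
        raise
    finally:
        workload.cleanup()  # assumed helper that tears the workload down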