Example #1
0
def wait_for_job_completion(job_name, namespace, timeout=600, sleep_time=30):
    """
    Wait for given k8s Job to complete.

    Args:
        job_name (str): name of the job to wait for
        namespace (str): name of the namespace where the job is running
        timeout (int): timeout in seconds
        sleep_time (int): sleep time between consequent job status checks in
            seconds

    Raises:
        TimeoutExpiredError: When job fails to complete in given time
    """
    job_resource = OCP(kind="Job", namespace=namespace, resource_name=job_name)
    status_iterator = TimeoutIterator(
        timeout=timeout, sleep=sleep_time, func=job_resource.get
    )
    try:
        for job_dict in status_iterator:
            status = job_dict.get("status")
            if status is None:
                # the Job object may not have reported any status yet
                log.debug("job status not (yet) available")
            elif "completionTime" in status:
                # completionTime is only set once the Job has finished
                log.info(
                    "job %s finished at %s", job_name, status["completionTime"]
                )
                break
    except exceptions.TimeoutExpiredError as ex:
        error_msg = f"job/{job_name} failed to complete in {timeout} seconds"
        log.warning(error_msg)
        raise exceptions.TimeoutExpiredError(error_msg) from ex
Example #2
0
    def wait_for_wl_to_finish(self, timeout=18000, sleep=300):
        """
        Wait until the workload is finished and collect the test log.

        Polls the benchmark namespace for the pod whose name contains
        `client` (the pod which runs the benchmark itself, as opposed to
        the `server` IO pods) and waits for it to reach Succeeded state.
        A restart of the client pod resets the timeout; more than 3
        restarts aborts the wait.

        Args:
            timeout (int): time in seconds to wait for the benchmark to
                complete
            sleep (int): sleep interval in seconds between status checks

        Raises:
            Exception: when the client pod can not be found, or the
                benchmark restarted too many times
            ResourceWrongStatusException: when the test Failed / Errored
            TimeoutExpiredError: when the test did not complete on time

        """
        log.info(f"Waiting for {self.client_pod_name} to complete")

        finished = False
        restarts = 0
        time_left = timeout
        while not finished and time_left > 0:
            results = run_oc_command(
                "get pod --no-headers -o custom-columns=:metadata.name,:status.phase",
                namespace=benchmark_operator.BMO_NAME,
            )
            # Look for the pod which runs the benchmark (not the IO):
            # its name contains `client`, and there is only one pod like
            # this; all other pods have `server` in the name.
            fname, status = "", ""
            for line in results:
                pod_name, pod_phase = line.split()
                if re.search("client", pod_name):
                    fname, status = pod_name, pod_phase
                    break

            if fname == "":  # there is no `client` pod !
                err_msg = f"{self.client_pod} Failed to run !!!"
                log.error(err_msg)
                raise Exception(err_msg)

            if fname != self.client_pod:
                # The client pod name is different from previous check, it was restarted
                log.info(
                    f"The pod {self.client_pod} was restart. the new client pod is {fname}"
                )
                self.client_pod = fname
                restarts += 1
                # in case of restarting the benchmark, reset the timeout as well
                time_left = timeout

            if restarts > 3:  # we are tolerating only 3 restarts
                err_msg = f"Too much restarts of the benchmark ({restarts})"
                log.error(err_msg)
                raise Exception(err_msg)

            if status == "Succeeded":
                # Getting the end time of the benchmark - for reporting.
                self.end_time = self.get_time()
                self.test_logs = self.pod_obj.exec_oc_cmd(
                    f"logs {self.client_pod}", out_yaml_format=False
                )
                log.info(f"{self.client_pod} completed successfully")
                finished = True
            elif status not in (constants.STATUS_RUNNING, constants.STATUS_PENDING):
                # The pod is in a terminal bad state (e.g. Failed / Error),
                # so there is no point in waiting for the timeout.
                # Note: the pod can be in Pending state right after a restart.
                err_msg = f"{self.client_pod} Failed to run - ({status})"
                log.error(err_msg)
                raise exceptions.ResourceWrongStatusException(
                    self.client_pod,
                    describe_out=err_msg,
                    column="Status",
                    expected="Succeeded",
                    got=status,
                )
            else:
                log.info(
                    f"{self.client_pod} is in {status} State, and wait to Succeeded State."
                    f" wait another {sleep} sec. for benchmark to complete"
                )
                time.sleep(sleep)
                time_left -= sleep

        if not finished:
            err_msg = (
                f"{self.client_pod} did not completed on time, "
                f"maybe timeout ({timeout}) need to be increase"
            )
            log.error(err_msg)
            raise exceptions.TimeoutExpiredError(
                self.client_pod, custom_message=err_msg
            )

        # Saving the benchmark internal log into a file at the logs directory
        log_file_name = f"{self.full_log_path}/test-pod.log"
        try:
            with open(log_file_name, "w") as f:
                f.write(self.test_logs)
            log.info(f"The Test log can be found at : {log_file_name}")
        except Exception:
            # best-effort: a failure to persist the log must not fail the test
            log.warning(f"Cannot write the log to the file {log_file_name}")
        log.info(f"The {self.benchmark_name} benchmark complete")