Esempio n. 1
0
    def check_status(self, name=None, job_id=None):
        """Query squeue for one job's status, selected by name or by job ID.

        Args:
            name: Job name passed to ``squeue -n``. Takes precedence over
                ``job_id`` when both are given.
            job_id: Job ID passed to ``squeue -j``; used only when ``name``
                is None.

        Returns:
            HpcJobInfo built from the jobid, name, and state columns, with
            the state mapped through ``self._STATUSES``
            (``HpcJobStatus.UNKNOWN`` for unmapped states). When squeue
            prints nothing, or fails with "Invalid job id specified", the
            result is ``HpcJobInfo("", "", HpcJobStatus.NONE)``.

        Raises:
            AssertionError: If neither ``name`` nor ``job_id`` is given, or
                if the squeue output does not split into exactly one token
                per requested column.
            ExecutionError: If squeue returns non-zero for any other reason.
        """
        # Validate before building the command so a bad call fails without
        # touching instance state or running anything.
        if name is not None:
            job_filter = f" -n {name}"
        elif job_id is not None:
            job_filter = f" -j {job_id}"
        else:
            # Mutual exclusivity should be handled in HpcManager.
            # Raised explicitly: a bare ``assert`` is removed under
            # ``python -O``, and the command would then list every job
            # belonging to the user.
            raise AssertionError("check_status requires name or job_id")

        field_names = ("jobid", "name", "state")
        cmd = f"squeue -u {self.USER} --Format \"{','.join(field_names)}\" -h"
        cmd += job_filter

        output = {}
        ret = run_command(cmd, output)
        if ret != 0:
            stderr = output["stderr"]
            if "Invalid job id specified" in stderr:
                # squeue reports this when the requested job ID is not (or
                # is no longer) known to it; treat that as "no such job"
                # rather than as a command failure.
                return HpcJobInfo("", "", HpcJobStatus.NONE)

            logger.error("Failed to run squeue command=[%s] ret=%s err=%s",
                         cmd, ret, stderr)
            raise ExecutionError(f"squeue command failed: {ret}")

        stdout = output["stdout"]
        logger.debug("squeue output:  [%s]", stdout)
        fields = stdout.split()
        if not fields:
            # No jobs are currently running.
            return HpcJobInfo("", "", HpcJobStatus.NONE)

        # Whitespace splitting yields extra tokens when several jobs match
        # or a column value contains spaces; refuse to guess in that case.
        if len(fields) != len(field_names):
            raise AssertionError(
                f"expected {len(field_names)} squeue fields, "
                f"got {len(fields)}: [{stdout}]")

        job_id_str, job_name, state = fields
        return HpcJobInfo(job_id_str,
                          job_name,
                          self._STATUSES.get(state, HpcJobStatus.UNKNOWN))
Esempio n. 2
0
    def check_status(self, name=None, job_id=None):
        """Report job status based on the state of the subprocess manager.

        Args:
            name: Accepted but not used by this implementation.
            job_id: Echoed back as the first field of the returned info.

        Returns:
            HpcJobInfo with an empty name and a status of
            ``HpcJobStatus.NONE`` when there is no subprocess manager,
            ``HpcJobStatus.RUNNING`` while it reports work in progress,
            and ``HpcJobStatus.COMPLETE`` otherwise.
        """
        manager = self._subprocess_mgr
        if manager is None:
            status = HpcJobStatus.NONE
        elif manager.in_progress():
            status = HpcJobStatus.RUNNING
        else:
            status = HpcJobStatus.COMPLETE

        result = HpcJobInfo(job_id, "", status)
        logger.debug("status=%s", result)
        return result
Esempio n. 3
0
    def check_status(self, name=None, job_id=None):
        """Query squeue for one job's status, selected by name or by job ID.

        Args:
            name: Job name passed to ``squeue -n``. Takes precedence over
                ``job_id`` when both are given.
            job_id: Job ID passed to ``squeue -j``; used only when ``name``
                is None.

        Returns:
            HpcJobInfo built from the jobid, name, and state columns, with
            the state mapped through ``self._STATUSES``
            (``HpcJobStatus.UNKNOWN`` for unmapped states). When squeue
            prints nothing, or fails with "Invalid job id specified", the
            result is ``HpcJobInfo("", "", HpcJobStatus.NONE)``.

        Raises:
            AssertionError: If neither ``name`` nor ``job_id`` is given, or
                if the squeue output does not split into exactly one token
                per requested column.
            ExecutionError: If squeue returns non-zero for any other reason.
        """
        # Validate before building the command so a bad call fails without
        # touching instance state or running anything.
        if name is not None:
            job_filter = f" -n {name}"
        elif job_id is not None:
            job_filter = f" -j {job_id}"
        else:
            # Mutual exclusivity should be handled in HpcManager.
            # Raised explicitly: a bare ``assert`` is removed under
            # ``python -O``, and the command would then list every job
            # belonging to the user.
            raise AssertionError("check_status requires name or job_id")

        field_names = ("jobid", "name", "state")
        cmd = f"squeue -u {self.USER} --Format \"{','.join(field_names)}\" -h"
        cmd += job_filter

        output = {}
        # Single definition of the message, used both for run_command and
        # for the stderr check below, so the two cannot drift apart.
        invalid_job_msg = "Invalid job id specified"
        # Transient failures could be costly. Retry for up to one minute.
        # NOTE(review): error_strings presumably marks errors that should
        # not be retried -- confirm against run_command.
        ret = run_command(cmd,
                          output,
                          num_retries=6,
                          retry_delay_s=10,
                          error_strings=[invalid_job_msg])
        if ret != 0:
            stderr = output["stderr"]
            if invalid_job_msg in stderr:
                return HpcJobInfo("", "", HpcJobStatus.NONE)

            logger.error("Failed to run squeue command=[%s] ret=%s err=%s",
                         cmd, ret, stderr)
            raise ExecutionError(f"squeue command failed: {ret}")

        stdout = output["stdout"]
        logger.debug("squeue output:  [%s]", stdout)
        fields = stdout.split()
        if not fields:
            # No jobs are currently running.
            return HpcJobInfo("", "", HpcJobStatus.NONE)

        # Whitespace splitting yields extra tokens when several jobs match
        # or a column value contains spaces; refuse to guess in that case.
        if len(fields) != len(field_names):
            raise AssertionError(
                f"expected {len(field_names)} squeue fields, "
                f"got {len(fields)}: [{stdout}]")

        job_id_str, job_name, state = fields
        return HpcJobInfo(
            job_id_str, job_name,
            self._STATUSES.get(state, HpcJobStatus.UNKNOWN))
Esempio n. 4
0
 def check_status(self, name=None, job_id=None):
     """Return a placeholder result with status ``HpcJobStatus.NONE``.

     Both ``name`` and ``job_id`` are accepted but not used; the returned
     HpcJobInfo always has empty strings for its first two fields.
     """
     no_job = HpcJobInfo("", "", HpcJobStatus.NONE)
     return no_job