def check_status(self, name=None, job_id=None):
    """Query ``squeue`` for the status of a single job.

    The job is selected by ``name`` (``squeue -n``) or, if ``name`` is None,
    by ``job_id`` (``squeue -j``). ``name`` wins when both are supplied.

    Args:
        name: Job name to filter on, or None.
        job_id: Job ID to filter on, or None. Only used when ``name`` is None.

    Returns:
        HpcJobInfo built from the job ID, job name and state printed by
        squeue; states missing from ``self._STATUSES`` map to
        ``HpcJobStatus.UNKNOWN``. When squeue prints nothing, an HpcJobInfo
        with empty ID/name and ``HpcJobStatus.NONE`` is returned.

    Raises:
        AssertionError: If neither ``name`` nor ``job_id`` is given, or if
            the squeue output does not contain exactly one value per
            requested field (e.g. more than one job matched).
        ExecutionError: If the squeue command returns a non-zero code.
    """
    field_names = ("jobid", "name", "state")
    format_spec = ",".join(field_names)
    cmd = f'squeue -u {self.USER} --Format "{format_spec}" -h'
    if name is not None:
        cmd += f" -n {name}"
    elif job_id is not None:
        cmd += f" -j {job_id}"
    else:
        # Mutual exclusivity should be handled in HpcManager.
        # Raised explicitly rather than via ``assert False``: assert
        # statements are removed under ``python -O``, which would let an
        # unfiltered squeue query run and report some arbitrary job.
        raise AssertionError("check_status requires either name or job_id")

    output = {}
    ret = run_command(cmd, output)
    if ret != 0:
        logger.error(
            "Failed to run squeue command=[%s] ret=%s err=%s",
            cmd,
            ret,
            output["stderr"],
        )
        raise ExecutionError(f"squeue command failed: {ret}")

    stdout = output["stdout"]
    logger.debug("squeue output: [%s]", stdout)
    fields = stdout.split()
    if not fields:
        # No jobs are currently running.
        return HpcJobInfo("", "", HpcJobStatus.NONE)

    # Explicit check (not ``assert``) so that malformed or multi-job output
    # is still rejected under ``python -O`` instead of being misparsed.
    if len(fields) != len(field_names):
        raise AssertionError(
            f"expected {len(field_names)} squeue fields {field_names}, "
            f"got {len(fields)}: {fields}"
        )

    reported_id, reported_name, reported_state = fields
    status = self._STATUSES.get(reported_state, HpcJobStatus.UNKNOWN)
    return HpcJobInfo(reported_id, reported_name, status)
def check_status(self, name=None, job_id=None):
    """Report job status based on the state of ``self._subprocess_mgr``.

    ``name`` is accepted but not used. The returned HpcJobInfo carries
    ``job_id`` as given and an empty name.

    Returns:
        HpcJobInfo whose status is ``NONE`` when there is no subprocess
        manager, ``RUNNING`` while the manager reports work in progress,
        and ``COMPLETE`` otherwise.
    """
    if self._subprocess_mgr is None:
        status = HpcJobStatus.NONE
    elif self._subprocess_mgr.in_progress():
        status = HpcJobStatus.RUNNING
    else:
        status = HpcJobStatus.COMPLETE

    job_info = HpcJobInfo(job_id, "", status)
    logger.debug("status=%s", job_info)
    return job_info
def check_status(self, name=None, job_id=None):
    """Query ``squeue`` for the status of a single job, retrying on failure.

    The job is selected by ``name`` (``squeue -n``) or, if ``name`` is None,
    by ``job_id`` (``squeue -j``). ``name`` wins when both are supplied.

    Args:
        name: Job name to filter on, or None.
        job_id: Job ID to filter on, or None. Only used when ``name`` is None.

    Returns:
        HpcJobInfo built from the job ID, job name and state printed by
        squeue; states missing from ``self._STATUSES`` map to
        ``HpcJobStatus.UNKNOWN``. An HpcJobInfo with empty ID/name and
        ``HpcJobStatus.NONE`` is returned when squeue prints nothing or
        fails with "Invalid job id specified" on stderr.

    Raises:
        AssertionError: If neither ``name`` nor ``job_id`` is given, or if
            the squeue output does not contain exactly one value per
            requested field (e.g. more than one job matched).
        ExecutionError: If the squeue command returns a non-zero code for
            any reason other than an invalid job ID.
    """
    field_names = ("jobid", "name", "state")
    format_spec = ",".join(field_names)
    cmd = f'squeue -u {self.USER} --Format "{format_spec}" -h'
    if name is not None:
        cmd += f" -n {name}"
    elif job_id is not None:
        cmd += f" -j {job_id}"
    else:
        # Mutual exclusivity should be handled in HpcManager.
        # Raised explicitly rather than via ``assert False``: assert
        # statements are removed under ``python -O``, which would let an
        # unfiltered squeue query run and report some arbitrary job.
        raise AssertionError("check_status requires either name or job_id")

    invalid_job_msg = "Invalid job id specified"
    output = {}
    # Transient failures could be costly. Retry for up to one minute.
    # NOTE(review): error_strings presumably lists stderr messages that
    # run_command should not retry on -- confirm against run_command.
    errors = [invalid_job_msg]
    ret = run_command(
        cmd, output, num_retries=6, retry_delay_s=10, error_strings=errors
    )
    if ret != 0:
        if invalid_job_msg in output["stderr"]:
            # squeue does not know this job ID; report it as no job.
            return HpcJobInfo("", "", HpcJobStatus.NONE)

        logger.error(
            "Failed to run squeue command=[%s] ret=%s err=%s",
            cmd,
            ret,
            output["stderr"],
        )
        raise ExecutionError(f"squeue command failed: {ret}")

    stdout = output["stdout"]
    logger.debug("squeue output: [%s]", stdout)
    fields = stdout.split()
    if not fields:
        # No jobs are currently running.
        return HpcJobInfo("", "", HpcJobStatus.NONE)

    # Explicit check (not ``assert``) so that malformed or multi-job output
    # is still rejected under ``python -O`` instead of being misparsed.
    if len(fields) != len(field_names):
        raise AssertionError(
            f"expected {len(field_names)} squeue fields {field_names}, "
            f"got {len(fields)}: {fields}"
        )

    reported_id, reported_name, reported_state = fields
    status = self._STATUSES.get(reported_state, HpcJobStatus.UNKNOWN)
    return HpcJobInfo(reported_id, reported_name, status)
def check_status(self, name=None, job_id=None):
    """Return a placeholder job record; both arguments are ignored.

    Returns:
        HpcJobInfo with empty job ID and name and ``HpcJobStatus.NONE``.
    """
    no_job_status = HpcJobStatus.NONE
    return HpcJobInfo("", "", no_job_status)