def _read_aux_files(self, job: Job, status: JobStatus) -> None:
    try:
        if logger.isEnabledFor(logging.DEBUG):
            launcher_log = self._read_aux_file(path=self.work_directory
                                               / (job.id + '_launcher.log'))
            if launcher_log is not None:
                logger.debug('Job %s: launcher log: %s', job.id, launcher_log)
        if status.state == JobState.CANCELED:
            # exit code and other things are not very meaningful for canceled jobs
            return
        # read exit code and output files
        exit_code_str = self._read_aux_file(job, '.ec')
        if exit_code_str:
            status.exit_code = int(exit_code_str)
            if status.exit_code != 0:
                status.state = JobState.FAILED
        if status.state == JobState.FAILED:
            if status.message is None:
                # only read output from submit script if another error message is not
                # already present
                status.message = self._read_aux_file(job, '.out')
        else:
            self._delete_aux_file(job, '.out')
    except Exception as ex:
        logger.warning('Job %s: failed to read auxiliary files: %s', job.id, ex)
def _done_cb(self, job: Job, fut: flux.job.FluxExecutorFuture) -> None:
    """Callback triggered when Flux job completes.

    Fetch returncode or exception message and update the psij.Job.
    """
    try:
        returncode = fut.result()
    except concurrent.futures.CancelledError:
        status = JobStatus(JobState.CANCELED, time=time.time())
    except Exception as exc:
        if "type=cancel" in str(exc):
            state = JobState.CANCELED
        else:
            state = JobState.FAILED
        status = JobStatus(state, time=time.time(), message=str(exc))
    else:
        if returncode == 0:
            status = JobStatus(JobState.COMPLETED, time=time.time())
        else:
            status = JobStatus(JobState.FAILED, time=time.time(), exit_code=returncode)
    self._set_job_status(job, status)
    # remove future from cache
    del self._futures[job]
def parse_status_output(self, exit_code: int, out: str) -> Dict[str, JobStatus]:
    """See :meth:`~.BatchSchedulerExecutor.parse_status_output`."""
    check_status_exit_code(_QSTAT_COMMAND, exit_code, out)
    r = {}
    report = json.loads(out)
    jobs = report['Jobs']
    for native_id in jobs:
        job_report = jobs[native_id]
        native_state = job_report["job_state"]
        state = self._get_state(native_state)
        if state == JobState.COMPLETED:
            if 'Exit_status' in job_report and job_report['Exit_status'] == 265:
                state = JobState.CANCELED
            elif 'Exit_status' in job_report and job_report['Exit_status'] != 0:
                state = JobState.FAILED
        msg = job_report["comment"]
        r[native_id] = JobStatus(state, message=msg)
    return r
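# Illustrative sketch, not part of the executor: the minimal JSON shape that
# parse_status_output above expects from the PBS status command. Only the keys
# ('Jobs', 'job_state', 'Exit_status', 'comment') come from the parsing code;
# the concrete values are made up.
_EXAMPLE_QSTAT_JSON = '''
{
    "Jobs": {
        "1234.pbs-server": {
            "job_state": "F",
            "Exit_status": 0,
            "comment": "Job finished normally"
        }
    }
}
'''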
def submit(self, job: Job) -> None: """See :func:`~psij.JobExecutor.submit`.""" logger.info('Job %s: submitting', job.id) self._ensure_work_dir() assert (job.spec) job.executor = self context = self._create_script_context(job) # assumes job ids are unique submit_file_path = self.work_directory / (job.id + '.job') with submit_file_path.open('w') as submit_file: self.generate_submit_script(job, context, submit_file) try: logger.debug('Job %s: running submit command', job.id) out = self._run_command( self.get_submit_command(job, submit_file_path)) logger.debug('Job %s: submit command ouput: %s', job.id, out) job._native_id = self.job_id_from_submit_output(out) logger.info('Job %s: native id: %s', job.id, job.native_id) self._set_job_status( job, JobStatus(JobState.QUEUED, metadata={'native_id': job.native_id})) except subprocess.CalledProcessError as ex: raise SubmitException(ex.output) from None self._queue_poll_thread.register_job(job)
def _task_state_cb(self, task: _rp.Task, rp_state: str) -> None:
    jpsi_uid = task.name
    jpsi_job = self._tasks[jpsi_uid][0]

    ec = None
    if task.state in self._rp.FINAL:
        ec = task.exit_code

    old_state = jpsi_job.status.state
    new_state = self._state_map.get(task.state)
    logger.debug('%s --> %s - %s', jpsi_uid, task.state, new_state)

    if new_state is None:
        # not an interesting state transition, ignore
        return
    if old_state == new_state:
        return

    metadata = {'nativeId': task.uid}
    if ec:
        metadata['exit_code'] = ec
    if task.state in self._rp.FINAL:
        metadata['final'] = True

    status = JobStatus(new_state, time=time.time(), metadata=metadata)
    self._set_job_status(jpsi_job, status)
def cancel(self, job: Job) -> None:
    """
    Cancels a job.

    :param job: The job to cancel.
    """
    self._set_job_status(job, JobStatus(JobState.CANCELED))
    self._reaper.cancel(job)
def _jobid_cb(self, job: Job, fut: flux.job.FluxExecutorFuture) -> None:
    """Callback triggered when Flux jobid is ready.

    Fetch the jobid, set it on the psij.Job, and set the job to QUEUED.
    """
    job._native_id = fut.jobid()
    job_status = JobStatus(JobState.QUEUED, time=time.time())
    self._set_job_status(job, job_status)
def submit(self, job: Job) -> None: """ Submits the specified :class:`~psij.Job` to be run locally. Successful return of this method indicates that the job has been started locally and all changes in the job status, including failures, are reported using notifications. If the job specification is invalid, an :class:`~psij.InvalidJobException` is thrown. If the actual submission fails for reasons outside the validity of the job, a :class:`~psij.SubmitException` is thrown. :param job: The job to be submitted. """ spec = job.spec if not spec: raise InvalidJobException('Missing specification') job.executor = self p = _ChildProcessEntry( job, self, self._get_launcher(self._get_launcher_name(spec))) assert p.launcher args = p.launcher.get_launch_command(job) try: with job._status_cv: if job.status.state == JobState.CANCELED: raise SubmitException('Job canceled') logger.debug('Running %s, out=%s, err=%s', args, spec.stdout_path, spec.stderr_path) p.process = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True, cwd=spec.directory, env=_get_env(spec)) self._reaper.register(p) job._native_id = p.process.pid self._set_job_status( job, JobStatus(JobState.QUEUED, time=time.time(), metadata={'nativeId': job._native_id})) self._set_job_status(job, JobStatus(JobState.ACTIVE, time=time.time())) except Exception as ex: raise SubmitException('Failed to submit job', exception=ex)
def _event_cb(self, job: Job, fut: flux.job.FluxExecutorFuture, evt: Any) -> None:
    """Callback triggered when Flux job logs an event.

    Update the status of the psij.Job.
    """
    jpsi_state = self._event_map[evt.name]
    metadata = copy.deepcopy(evt.context)
    job_status = JobStatus(jpsi_state, time=time.time(), metadata=metadata)
    self._set_job_status(job, job_status)
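# Wiring sketch, not part of this module: callbacks with the (job, future) signatures
# used above are typically bound to the psij Job with functools.partial and registered
# on the flux.job.FluxExecutorFuture, assuming its add_jobid_callback,
# add_event_callback, and add_done_callback methods, e.g.:
#
#     from functools import partial
#
#     fut.add_jobid_callback(partial(self._jobid_cb, job))
#     for event_name in self._event_map:
#         fut.add_event_callback(event_name, partial(self._event_cb, job))
#     fut.add_done_callback(partial(self._done_cb, job))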
def parse_status_output(self, exit_code: int, out: str) -> Dict[str, JobStatus]:
    """See :meth:`~.BatchSchedulerExecutor.parse_status_output`."""
    check_status_exit_code(QSTAT_PATH, exit_code, out)
    r = {}
    lines = iter(out.split('\n'))
    for line in lines:
        if not line:
            continue
        cols = line.split(maxsplit=2)
        native_id = cols[0]
        state = self._get_state(cols[1])
        msg = cols[2] if len(cols) == 3 else None
        r[native_id] = JobStatus(state, message=msg)
    return r
def attach(self, job: Job, native_id: str) -> None: """ Attaches a job to a process. The job must be in the :attr:`~psij.JobState.NEW` state. The exit code of the attached job will not be available upon completion and a zero exit code will always be returned for jobs attached by the `LocalJobExecutor`. :param job: The job to attach. :param native_id: The native ID of the process to attached to, as obtained through :func:`~psij.executors.LocalJobExecutor.list` method. """ if job.status.state != JobState.NEW: raise InvalidJobException('Job must be in the NEW state') job.executor = self pid = int(native_id) self._reaper.register( _AttachedProcessEntry(job, psutil.Process(pid), self)) # We assume that the native_id above is a PID that was obtained at some point using # list(). If so, the process is either still running or has completed. Either way, we must # bring it up to ACTIVE state self._set_job_status(job, JobStatus(JobState.QUEUED, time=time.time())) self._set_job_status(job, JobStatus(JobState.ACTIVE, time=time.time()))
def _poll(self) -> None:
    with self._jobs_lock:
        if len(self._jobs) == 0:
            return
        jobs_copy = dict(self._jobs)
    logger.info('Polling for %s jobs', len(jobs_copy))

    try:
        out = self.executor._run_command(self.executor.get_status_command(jobs_copy.keys()))
    except subprocess.CalledProcessError as ex:
        out = ex.output
        exit_code = ex.returncode
    except Exception as ex:
        self._handle_poll_error(True, ex,
                                f'Failed to poll for job status: {traceback.format_exc()}')
        return
    else:
        exit_code = 0
        self._poll_error_count = 0

    logger.debug('Output from status command: %s', out)
    try:
        status_map = self.executor.parse_status_output(exit_code, out)
    except Exception as ex:
        self._handle_poll_error(False, ex,
                                f'Failed to poll for job status: {traceback.format_exc()}')
        return

    try:
        for native_id, job_list in jobs_copy.items():
            try:
                status = self._get_job_status(native_id, status_map)
            except Exception:
                status = JobStatus(JobState.FAILED,
                                   message='Failed to update job status: %s'
                                           % traceback.format_exc())
            for job in job_list:
                self.executor._set_job_status(job, status)
            if status.state.final:
                with self._jobs_lock:
                    del self._jobs[native_id]
    except Exception as ex:
        msg = traceback.format_exc()
        self._handle_poll_error(True, ex, 'Error updating job statuses {}'.format(msg))
def cancel(self, job: Job) -> None:
    """
    Cancels a job.

    :param job: The job to cancel.
    """
    with job._status_cv:
        if job.status.state == JobState.NEW:
            self._set_job_status(job, JobStatus(JobState.CANCELED))
            return

    if job.id not in self._tasks:
        raise ValueError('job not known')

    _, task = self._tasks[job.id]
    self._tmgr.cancel_tasks(uids=task.uid)
def parse_status_output(self, exit_code: int, out: str) -> Dict[str, JobStatus]: """See :meth:`~.BatchSchedulerExecutor.parse_status_output`.""" check_status_exit_code(_SQUEUE_COMMAND, exit_code, out) r = {} lines = iter(out.split('\n')) # skip header lines.__next__() for line in lines: if not line: continue cols = line.split() assert len(cols) == 3 native_id = cols[0] state = self._get_state(cols[1]) msg = self._get_message(cols[2]) if state == JobState.FAILED else None r[native_id] = JobStatus(state, message=msg) return r
def attach(self, job: Job, native_id: str) -> None: """ Attaches a job to a process. The job must be in the :attr:`~psij.job_state.JobState.NEW` state. :param job: The job to attach. :param native_id: The native ID of the process to attached to, as obtained through :func:`~list` method. """ if job.status.state != JobState.NEW: raise InvalidJobException('Job must be in the NEW state') job.executor = self task = self._tmgr.get_tasks(uids=[native_id])[0] self._tasks[job.id] = (job, task) state = self._state_map[task.state] self._set_job_status(job, JobStatus(state, time=time.time()))
def _handle_poll_error(self, immediate: bool, ex: Exception, msg: str) -> None:
    logger.warning('Polling error: %s', msg)
    self._poll_error_count += 1
    if immediate or (self._poll_error_count > self.config.queue_polling_error_threshold):
        self._poll_error_count = 0
        # fail all jobs
        with self._jobs_lock:
            # We should only poll if there is at least one job, so we should not be in a
            # situation when we polled and there were no jobs to poll for.
            # Internal errors are a bit different, since they could, in principle, occur
            # after the last job was processed and removed from self._jobs; in practice,
            # the code in _poll has the job removal from _jobs as the last possible step.
            assert len(self._jobs) > 0
            jobs_copy = dict(self._jobs)
            self._jobs.clear()
        for job_list in jobs_copy.values():
            for job in job_list:
                self.executor._set_job_status(job, JobStatus(JobState.FAILED, message=msg))
def parse_status_output(self, exit_code: int, out: str) -> Dict[str, JobStatus]: """See :meth:`~.BatchSchedulerExecutor.parse_status_output`. Iterate through the RECORDS entry, grabbing JOBID and STAT entries, as well as any state-change reasons if present. """ check_status_exit_code(_BJOBS_COMMAND, exit_code, out) output = json.loads(out) status_map = {} for entry in output["RECORDS"]: if "ERROR" in entry: continue state = self._STATE_MAP[entry["STAT"]] message = None for reason in ("EXIT_REASON", "KILL_REASON", "SUSPEND_REASON"): if entry[reason]: message = entry["reason"] break status_map[entry["JOBID"]] = JobStatus(state, message=message) return status_map
def _process_done(self, p: _ProcessEntry) -> None:
    assert p.exit_code is not None
    message = None
    if p.exit_code == 0:
        state = JobState.COMPLETED
    elif p.exit_code < 0 and p.kill_flag:
        state = JobState.CANCELED
    else:
        # We want to capture errors in the launcher scripts. Since, under normal
        # circumstances, the exit code of the launcher is the exit code of the job, we
        # must use a different mechanism to distinguish between job errors and launcher
        # errors. So we delegate to the launcher implementation to figure out if the
        # error belongs to the job or not.
        if p.launcher and p.out and p.launcher.is_launcher_failure(p.out):
            message = p.launcher.get_launcher_failure_message(p.out)
        state = JobState.FAILED
    self._set_job_status(p.job,
                         JobStatus(state, time=p.done_time, exit_code=p.exit_code,
                                   message=message))
def parse_status_output(self, exit_code: int, out: str) -> Dict[str, JobStatus]: """See :meth:`~.BatchSchedulerExecutor.parse_status_output`.""" # if none of the job ID passed to Cobalt are recognized, qstat returns 1, # but we shouldn't treat that as an error if exit_code != 0 and out == UNKNOWN_ERROR: return {} check_status_exit_code(_QSTAT_COMMAND, exit_code, out) job_statuses = {} index = 0 lines = out.split("\n") while index < len(lines) - 1: jobid_match = _QSTAT_JOBID_REGEX.search(lines[index]) if jobid_match is not None: state_match = _QSTAT_STATE_REGEX.search(lines[index + 1]) if state_match is not None: job_statuses[jobid_match.group(2)] = JobStatus( self._STATE_MAP[state_match.group(2)]) index += 2 else: index += 1 else: index += 1 return job_statuses
def _get_job_status(self, native_id: str, status_map: Dict[str, JobStatus]) -> JobStatus:
    if native_id in status_map:
        return status_map[native_id]
    else:
        return JobStatus(JobState.COMPLETED)