Example #1
    def _read_aux_files(self, job: Job, status: JobStatus) -> None:
        try:
            if logger.isEnabledFor(logging.DEBUG):
                launcher_log = self._read_aux_file(path=self.work_directory /
                                                   (job.id + '_launcher.log'))
                if launcher_log is not None:
                    logger.debug('Job %s: launcher log: %s', job.id,
                                 launcher_log)
            if status.state == JobState.CANCELED:
                # exit code and other things are not very meaningful for canceled jobs
                return
            # read exit code and output files
            exit_code_str = self._read_aux_file(job, '.ec')
            if exit_code_str:
                status.exit_code = int(exit_code_str)
                if status.exit_code != 0:
                    status.state = JobState.FAILED
            if status.state == JobState.FAILED:

                if status.message is None:
                    # only read output from submit script if another error message is not
                    # already present
                    status.message = self._read_aux_file(job, '.out')
                else:
                    self._delete_aux_file(job, '.out')

        except Exception as ex:
            logger.warning('Job %s: failed to read auxiliary files: %s',
                           job.id, ex)
Example #2
    def _done_cb(self, job: Job, fut: flux.job.FluxExecutorFuture) -> None:
        """Callback triggered when Flux job completes.

        Fetch returncode or exception message and update the psij.Job.
        """
        try:
            returncode = fut.result()
        except concurrent.futures.CancelledError:
            status = JobStatus(JobState.CANCELED, time=time.time())
        except Exception as exc:
            if "type=cancel" in str(exc):
                state = JobState.CANCELED
            else:
                state = JobState.FAILED
            status = JobStatus(state, time=time.time(), message=str(exc))
        else:
            if returncode == 0:
                status = JobStatus(JobState.COMPLETED, time=time.time())
            else:
                status = JobStatus(JobState.FAILED,
                                   time=time.time(),
                                   exit_code=returncode)
        self._set_job_status(job, status)
        # remove future from cache
        del self._futures[job]
Example #3
    def parse_status_output(self, exit_code: int,
                            out: str) -> Dict[str, JobStatus]:
        """See :meth:`~.BatchSchedulerExecutor.parse_status_output`."""
        check_status_exit_code(_QSTAT_COMMAND, exit_code, out)
        r = {}

        report = json.loads(out)
        jobs = report['Jobs']
        for native_id in jobs:
            job_report = jobs[native_id]
            native_state = job_report["job_state"]
            state = self._get_state(native_state)

            if state == JobState.COMPLETED:
                if 'Exit_status' in job_report and job_report[
                        'Exit_status'] == 265:
                    state = JobState.CANCELED
                elif 'Exit_status' in job_report and job_report[
                        'Exit_status'] != 0:
                    state = JobState.FAILED

            msg = job_report["comment"]
            r[native_id] = JobStatus(state, message=msg)

        return r
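
For illustration, the report this parser consumes looks roughly like the following. The sample payload and the reduced state table are invented for this sketch (the real executor's _get_state covers many more PBS codes); only the Jobs/job_state/Exit_status/comment fields and the 265-means-killed convention come from the code above.

import json

from psij import JobState, JobStatus

# Hypothetical qstat JSON report containing only the fields read above.
sample_out = json.dumps({
    "Jobs": {
        "101.pbs01": {"job_state": "R", "comment": "job running"},
        "102.pbs01": {"job_state": "F", "Exit_status": 265, "comment": "killed"},
        "103.pbs01": {"job_state": "F", "Exit_status": 1, "comment": "error"},
    }
})

# Simplified stand-in for self._get_state().
_STATES = {"Q": JobState.QUEUED, "R": JobState.ACTIVE, "F": JobState.COMPLETED}

statuses = {}
for native_id, rec in json.loads(sample_out)["Jobs"].items():
    state = _STATES[rec["job_state"]]
    if state == JobState.COMPLETED:
        if rec.get("Exit_status") == 265:      # 256 + SIGKILL: treated as canceled
            state = JobState.CANCELED
        elif rec.get("Exit_status", 0) != 0:
            state = JobState.FAILED
    statuses[native_id] = JobStatus(state, message=rec["comment"])

# statuses["102.pbs01"].state is JobState.CANCELED,
# statuses["103.pbs01"].state is JobState.FAILED
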
Example #4
    def submit(self, job: Job) -> None:
        """See :func:`~psij.JobExecutor.submit`."""
        logger.info('Job %s: submitting', job.id)
        self._ensure_work_dir()
        assert (job.spec)

        job.executor = self
        context = self._create_script_context(job)

        # assumes job ids are unique
        submit_file_path = self.work_directory / (job.id + '.job')
        with submit_file_path.open('w') as submit_file:
            self.generate_submit_script(job, context, submit_file)
        try:
            logger.debug('Job %s: running submit command', job.id)
            out = self._run_command(
                self.get_submit_command(job, submit_file_path))
            logger.debug('Job %s: submit command output: %s', job.id, out)
            job._native_id = self.job_id_from_submit_output(out)
            logger.info('Job %s: native id: %s', job.id, job.native_id)
            self._set_job_status(
                job,
                JobStatus(JobState.QUEUED,
                          metadata={'native_id': job.native_id}))
        except subprocess.CalledProcessError as ex:
            raise SubmitException(ex.output) from None

        self._queue_poll_thread.register_job(job)
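
As a usage sketch, this submit path is what runs when client code drives a batch executor through the public psij API. The 'slurm' name and the executable below are placeholders; any installed batch executor works the same way.

from psij import Job, JobSpec, JobExecutor

executor = JobExecutor.get_instance('slurm')   # placeholder executor name
job = Job(JobSpec(executable='/bin/date'))
executor.submit(job)   # writes <job.id>.job, runs the submit command, sets QUEUED
print(job.native_id)   # native id parsed from the submit command output
job.wait()             # returns once the queue poll thread reports a final state
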
Example #5
    def _task_state_cb(self, task: _rp.Task, rp_state: str) -> None:

        jpsi_uid = task.name
        jpsi_job = self._tasks[jpsi_uid][0]

        ec = None
        if task.state in self._rp.FINAL:
            ec = task.exit_code

        old_state = jpsi_job.status.state
        new_state = self._state_map.get(task.state)

        logger.debug('%s --> %s - %s', jpsi_uid, task.state, new_state)

        if new_state is None:
            # not an interesting state transition, ignore
            return

        if old_state == new_state:
            return

        metadata = {'nativeId': task.uid}

        if ec:
            metadata['exit_code'] = ec

        if task.state in self._rp.FINAL:
            metadata['final'] = True

        status = JobStatus(new_state, time=time.time(), metadata=metadata)
        self._set_job_status(jpsi_job, status)
Example #6
    def cancel(self, job: Job) -> None:
        """
        Cancels a job.

        :param job: The job to cancel.
        """
        self._set_job_status(job, JobStatus(JobState.CANCELED))
        self._reaper.cancel(job)
Example #7
    def _jobid_cb(self, job: Job, fut: flux.job.FluxExecutorFuture) -> None:
        """Callback triggered when Flux jobid is ready.

        Fetch the jobid, set it on the psij.Job, and set the job to QUEUED.
        """
        job._native_id = fut.jobid()
        job_status = JobStatus(JobState.QUEUED, time=time.time())
        self._set_job_status(job, job_status)
Example #8
    def submit(self, job: Job) -> None:
        """
        Submits the specified :class:`~psij.Job` to be run locally.

        Successful return of this method indicates that the job has been started locally and all
        changes in the job status, including failures, are reported using notifications. If the job
        specification is invalid, an :class:`~psij.InvalidJobException` is thrown. If
        the actual submission fails for reasons outside the validity of the job,
        a :class:`~psij.SubmitException` is thrown.

        :param job: The job to be submitted.
        """
        spec = job.spec
        if not spec:
            raise InvalidJobException('Missing specification')
        job.executor = self

        p = _ChildProcessEntry(
            job, self, self._get_launcher(self._get_launcher_name(spec)))
        assert p.launcher
        args = p.launcher.get_launch_command(job)

        try:
            with job._status_cv:
                if job.status.state == JobState.CANCELED:
                    raise SubmitException('Job canceled')
            logger.debug('Running %s,  out=%s, err=%s', args, spec.stdout_path,
                         spec.stderr_path)
            p.process = subprocess.Popen(args,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.STDOUT,
                                         close_fds=True,
                                         cwd=spec.directory,
                                         env=_get_env(spec))
            self._reaper.register(p)
            job._native_id = p.process.pid
            self._set_job_status(
                job,
                JobStatus(JobState.QUEUED,
                          time=time.time(),
                          metadata={'nativeId': job._native_id}))
            self._set_job_status(job,
                                 JobStatus(JobState.ACTIVE, time=time.time()))
        except Exception as ex:
            raise SubmitException('Failed to submit job', exception=ex)
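
A minimal usage sketch for the local executor, assuming the standard psij entry points; the executable and output path are placeholders:

from pathlib import Path

from psij import Job, JobSpec, JobExecutor

executor = JobExecutor.get_instance('local')
spec = JobSpec(executable='/bin/echo', arguments=['hello'],
               stdout_path=Path('/tmp/hello.out'))   # placeholder path
job = Job(spec)
executor.submit(job)   # QUEUED and ACTIVE are set synchronously, as above
job.wait()             # the reaper sets COMPLETED, FAILED, or CANCELED
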
Example #9
    def _event_cb(self, job: Job, fut: flux.job.FluxExecutorFuture,
                  evt: Any) -> None:
        """Callback triggered when Flux job logs an event.

        Update the status of the psij.Job.
        """
        jpsi_state = self._event_map[evt.name]
        metadata = copy.deepcopy(evt.context)
        job_status = JobStatus(jpsi_state, time=time.time(), metadata=metadata)
        self._set_job_status(job, job_status)
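
The Flux callbacks shown here and in the earlier examples (_jobid_cb, _event_cb, _done_cb) only fire once they are attached to the future returned by the Flux executor. Below is a minimal sketch of that wiring; the hook names (add_jobid_callback, add_event_callback, add_done_callback) and the _flux_executor/_event_map attributes are assumptions made for the sketch, not taken from the snippets above.

from functools import partial

# Sketch only: assumes the future returned by flux.job.FluxExecutor.submit()
# exposes these callback hooks and that self._event_map maps Flux eventlog
# names to psij JobStates.
def _attach_callbacks(self, job, fut):
    fut.add_jobid_callback(partial(self._jobid_cb, job))
    for event_name in self._event_map:
        fut.add_event_callback(event_name, partial(self._event_cb, job))
    fut.add_done_callback(partial(self._done_cb, job))
    self._futures[job] = fut   # removed again by _done_cb above
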
Example #10
    def parse_status_output(self, exit_code: int, out: str) -> Dict[str, JobStatus]:
        check_status_exit_code(QSTAT_PATH, exit_code, out)
        r = {}
        lines = iter(out.split('\n'))
        for line in lines:
            if not line:
                continue
            cols = line.split(maxsplit=2)
            native_id = cols[0]
            state = self._get_state(cols[1])
            msg = cols[2] if len(cols) == 3 else None
            r[native_id] = JobStatus(state, message=msg)
        return r
Example #11
    def attach(self, job: Job, native_id: str) -> None:
        """
        Attaches a job to a process.

        The job must be in the :attr:`~psij.JobState.NEW` state. The exit code of the attached job
        will not be available upon completion and a zero exit code will always be returned for jobs
        attached by the `LocalJobExecutor`.

        :param job: The job to attach.
        :param native_id: The native ID of the process to attach to, as obtained through
            the :func:`~psij.executors.LocalJobExecutor.list` method.
        """
        if job.status.state != JobState.NEW:
            raise InvalidJobException('Job must be in the NEW state')
        job.executor = self
        pid = int(native_id)

        self._reaper.register(
            _AttachedProcessEntry(job, psutil.Process(pid), self))
        # We assume that the native_id above is a PID that was obtained at some point using
        # list(). If so, the process is either still running or has completed. Either way, we must
        # bring it up to ACTIVE state
        self._set_job_status(job, JobStatus(JobState.QUEUED, time=time.time()))
        self._set_job_status(job, JobStatus(JobState.ACTIVE, time=time.time()))
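
A usage sketch for attach, assuming list() returns the native ids (PIDs, as strings) of processes the executor can see:

from psij import Job, JobExecutor

executor = JobExecutor.get_instance('local')
for native_id in executor.list():      # native ids of running processes
    job = Job()                        # no spec needed for attached jobs
    executor.attach(job, native_id)    # moves the job to QUEUED, then ACTIVE
    print(job.id, '->', native_id, job.status.state)
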
Example #12
    def _poll(self) -> None:
        with self._jobs_lock:
            if len(self._jobs) == 0:
                return
            jobs_copy = dict(self._jobs)
        logger.info('Polling for %s jobs', len(jobs_copy))
        try:
            out = self.executor._run_command(
                self.executor.get_status_command(jobs_copy.keys()))
        except subprocess.CalledProcessError as ex:
            out = ex.output
            exit_code = ex.returncode
        except Exception as ex:
            self._handle_poll_error(
                True, ex,
                f'Failed to poll for job status: {traceback.format_exc()}')
            return
        else:
            exit_code = 0
            self._poll_error_count = 0
        logger.debug('Output from status command: %s', out)
        try:
            status_map = self.executor.parse_status_output(exit_code, out)
        except Exception as ex:
            self._handle_poll_error(
                False, ex,
                f'Failed to poll for job status: {traceback.format_exc()}')
            return
        try:
            for native_id, job_list in jobs_copy.items():
                try:
                    status = self._get_job_status(native_id, status_map)
                except Exception:
                    status = JobStatus(
                        JobState.FAILED,
                        message='Failed to update job status: %s' %
                        traceback.format_exc())
                for job in job_list:
                    self.executor._set_job_status(job, status)
                if status.state.final:
                    with self._jobs_lock:
                        del self._jobs[native_id]
        except Exception as ex:
            msg = traceback.format_exc()
            self._handle_poll_error(
                True, ex, 'Error updating job statuses {}'.format(msg))
Example #13
    def cancel(self, job: Job) -> None:
        """
        Cancels a job.

        :param job: The job to cancel.
        """
        with job._status_cv:
            if job.status.state == JobState.NEW:
                self._set_job_status(job, JobStatus(JobState.CANCELED))
                return

        if job.id not in self._tasks:
            raise ValueError('job not known')

        _, task = self._tasks[job.id]

        self._tmgr.cancel_tasks(uids=task.uid)
Example #14
    def parse_status_output(self, exit_code: int, out: str) -> Dict[str, JobStatus]:
        """See :meth:`~.BatchSchedulerExecutor.parse_status_output`."""
        check_status_exit_code(_SQUEUE_COMMAND, exit_code, out)
        r = {}
        lines = iter(out.split('\n'))
        # skip header
        lines.__next__()
        for line in lines:
            if not line:
                continue
            cols = line.split()
            assert len(cols) == 3
            native_id = cols[0]
            state = self._get_state(cols[1])
            msg = self._get_message(cols[2]) if state == JobState.FAILED else None
            r[native_id] = JobStatus(state, message=msg)

        return r
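
For illustration, the input this expects is a header line followed by exactly three whitespace-separated columns (job id, state code, reason). The sample text and the tiny state table below are invented for the sketch; only the column layout and the "message only for failed jobs" rule come from the code above.

from psij import JobState, JobStatus

# Hypothetical squeue output: header, then "<id> <state> <reason>" per line.
sample_out = """JOBID STATE REASON
1001 R None
1002 PD Priority
1003 F NonZeroExitCode
"""

_STATES = {'R': JobState.ACTIVE, 'PD': JobState.QUEUED, 'F': JobState.FAILED}

statuses = {}
lines = iter(sample_out.split('\n'))
next(lines)                             # skip the header, as above
for line in lines:
    if not line:
        continue
    native_id, code, reason = line.split()
    state = _STATES[code]
    msg = reason if state == JobState.FAILED else None
    statuses[native_id] = JobStatus(state, message=msg)

# statuses['1003'].message == 'NonZeroExitCode'
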
Example #15
    def attach(self, job: Job, native_id: str) -> None:
        """
        Attaches a job to a process.

        The job must be in the :attr:`~psij.job_state.JobState.NEW` state.

        :param job: The job to attach.
        :param native_id: The native ID of the process to attach to, as
            obtained through the :func:`~list` method.
        """
        if job.status.state != JobState.NEW:
            raise InvalidJobException('Job must be in the NEW state')

        job.executor = self

        task = self._tmgr.get_tasks(uids=[native_id])[0]
        self._tasks[job.id] = (job, task)

        state = self._state_map[task.state]
        self._set_job_status(job, JobStatus(state, time=time.time()))
Example #16
    def _handle_poll_error(self, immediate: bool, ex: Exception,
                           msg: str) -> None:
        logger.warning('Polling error: %s', msg)
        self._poll_error_count += 1
        if immediate or (self._poll_error_count >
                         self.config.queue_polling_error_threshold):
            self._poll_error_count = 0
            # fail all jobs
            with self._jobs_lock:
                # We should only poll if there is at least one job, so we should not be in a
                # situation when we polled and there were no jobs to poll for
                # Internal errors are a bit different, since they could, in principle, occur
                # after the last job was processed and removed from self._jobs; in practice,
                # the code in _poll has the job removal from _jobs as the last possible step
                assert len(self._jobs) > 0
                jobs_copy = dict(self._jobs)
                self._jobs.clear()
            for job_list in jobs_copy.values():
                for job in job_list:
                    self.executor._set_job_status(
                        job, JobStatus(JobState.FAILED, message=msg))
Example #17
    def parse_status_output(self, exit_code: int,
                            out: str) -> Dict[str, JobStatus]:
        """See :meth:`~.BatchSchedulerExecutor.parse_status_output`.

        Iterate through the RECORDS entry, grabbing JOBID and STAT entries, as well
        as any state-change reasons if present.
        """
        check_status_exit_code(_BJOBS_COMMAND, exit_code, out)
        output = json.loads(out)
        status_map = {}
        for entry in output["RECORDS"]:
            if "ERROR" in entry:
                continue
            state = self._STATE_MAP[entry["STAT"]]
            message = None
            for reason in ("EXIT_REASON", "KILL_REASON", "SUSPEND_REASON"):
                if entry[reason]:
                    message = entry[reason]
                    break
            status_map[entry["JOBID"]] = JobStatus(state, message=message)
        return status_map
Example #18
    def _process_done(self, p: _ProcessEntry) -> None:
        assert p.exit_code is not None
        message = None
        if p.exit_code == 0:
            state = JobState.COMPLETED
        elif p.exit_code < 0 and p.kill_flag:
            state = JobState.CANCELED
        else:
            # We want to capture errors in the launcher scripts. Since, under normal circumstances,
            # the exit code of the launcher is the exit code of the job, we must use a different
            # mechanism to distinguish between job errors and launcher errors. So we delegate to
            # the launcher implementation to figure out if the error belongs to the job or not
            if p.launcher and p.out and p.launcher.is_launcher_failure(p.out):
                message = p.launcher.get_launcher_failure_message(p.out)
            state = JobState.FAILED

        self._set_job_status(
            p.job,
            JobStatus(state,
                      time=p.done_time,
                      exit_code=p.exit_code,
                      message=message))
Example #19
    def parse_status_output(self, exit_code: int,
                            out: str) -> Dict[str, JobStatus]:
        """See :meth:`~.BatchSchedulerExecutor.parse_status_output`."""
        # if none of the job IDs passed to Cobalt are recognized, qstat returns 1,
        # but we shouldn't treat that as an error
        if exit_code != 0 and out == UNKNOWN_ERROR:
            return {}
        check_status_exit_code(_QSTAT_COMMAND, exit_code, out)
        job_statuses = {}
        index = 0
        lines = out.split("\n")
        while index < len(lines) - 1:
            jobid_match = _QSTAT_JOBID_REGEX.search(lines[index])
            if jobid_match is not None:
                state_match = _QSTAT_STATE_REGEX.search(lines[index + 1])
                if state_match is not None:
                    job_statuses[jobid_match.group(2)] = JobStatus(
                        self._STATE_MAP[state_match.group(2)])
                    index += 2
                else:
                    index += 1
            else:
                index += 1
        return job_statuses
Example #20
    def _get_job_status(self, native_id: str,
                        status_map: Dict[str, JobStatus]) -> JobStatus:
        if native_id in status_map:
            return status_map[native_id]
        else:
            return JobStatus(JobState.COMPLETED)