def submit(self, job: Job) -> None:
    """See :func:`~psij.JobExecutor.submit`.

    Writes a submit script for `job` into the executor's work directory, runs
    the submit command on that script, stores the native id parsed from the
    command output on the job, reports a ``QUEUED`` status and finally
    registers the job with ``self._queue_poll_thread``.

    :param job: The job to be submitted.
    :raises InvalidJobException: if the job has no specification.
    :raises SubmitException: if the submit command fails; the exception is
        built from the output captured from the failed command.
    """
    # Validate before logging or touching the file system. An explicit
    # exception (instead of `assert`) keeps the check alive under `python -O`
    # and matches what the other executors raise for a missing spec.
    if not job.spec:
        raise InvalidJobException('Missing specification')

    logger.info('Job %s: submitting', job.id)
    self._ensure_work_dir()

    job.executor = self
    context = self._create_script_context(job)

    # assumes job ids are unique
    submit_file_path = self.work_directory / (job.id + '.job')
    with submit_file_path.open('w') as submit_file:
        self.generate_submit_script(job, context, submit_file)

    try:
        logger.debug('Job %s: running submit command', job.id)
        out = self._run_command(self.get_submit_command(job, submit_file_path))
        logger.debug('Job %s: submit command output: %s', job.id, out)
        job._native_id = self.job_id_from_submit_output(out)
        logger.info('Job %s: native id: %s', job.id, job.native_id)
        self._set_job_status(
            job,
            JobStatus(JobState.QUEUED, metadata={'native_id': job.native_id}))
    except subprocess.CalledProcessError as ex:
        # `from None` hides the CalledProcessError from the traceback; the
        # command output is carried by the SubmitException instead.
        raise SubmitException(ex.output) from None

    # Only reached when submission succeeded.
    self._queue_poll_thread.register_job(job)
def submit(self, job: Job) -> None:
    """
    Submit a :class:`~psij.Job` to the pilot.

    When this method returns normally the job has been handed over to RP;
    every later change of the job status, failures included, is delivered
    through notifications.

    :param job: The job to be submitted.
    :raises InvalidJobException: if the job carries no specification.
    :raises SubmitException: if building the task description or submitting
        it fails for any reason; the original error is chained as the cause.
    """
    if not job.spec:
        raise InvalidJobException('Missing specification')

    job.executor = self

    try:
        description = self._job_2_descr(job)
        # Remember the (job, task) pair under the job id.
        self._tasks[job.id] = (job, self._tmgr.submit_tasks(description))
    except Exception as ex:
        raise SubmitException('Failed to submit job') from ex
def attach(self, job: Job, native_id: str) -> None:
    """
    Attaches a job to a process.

    The job must be in the :attr:`~psij.JobState.NEW` state.

    :param job: The job to attach.
    :param native_id: The native ID of the process to attach to, as obtained
        through :func:`~psij.executors.flux.FluxJobExecutor.list` method.
    :raises InvalidJobException: if the job is not in the NEW state.
    """
    # Enforce the documented precondition before taking ownership of the job,
    # in the same way the other executors' `attach` implementations do.
    if job.status.state != JobState.NEW:
        raise InvalidJobException('Job must be in the NEW state')
    job.executor = self
    # Whatever the flux executor's attach() returns is passed to
    # _add_flux_callbacks together with the job.
    self._add_flux_callbacks(job, self._flux_executor.attach(native_id))
def submit(self, job: Job) -> None:
    """
    Submits the specified :class:`~psij.Job` to be run locally.

    Successful return of this method indicates that the job has been started
    locally and all changes in the job status, including failures, are
    reported using notifications. If the job specification is invalid, an
    :class:`~psij.InvalidJobException` is thrown. If the actual submission
    fails for reasons outside the validity of the job, a
    :class:`~psij.SubmitException` is thrown.

    :param job: The job to be submitted.
    :raises InvalidJobException: if the job has no specification.
    :raises SubmitException: with message ``'Job canceled'`` if the job is
        already in the CANCELED state, or ``'Failed to submit job'`` (wrapping
        the underlying error) if starting the process fails.
    """
    spec = job.spec
    if not spec:
        raise InvalidJobException('Missing specification')
    job.executor = self

    p = _ChildProcessEntry(
        job, self, self._get_launcher(self._get_launcher_name(spec)))
    assert p.launcher
    args = p.launcher.get_launch_command(job)

    try:
        # The cancel check is done while holding job._status_cv.
        # NOTE(review): only this check is assumed to run under the lock;
        # confirm against the original layout.
        with job._status_cv:
            if job.status.state == JobState.CANCELED:
                raise SubmitException('Job canceled')
        logger.debug('Running %s, out=%s, err=%s', args, spec.stdout_path,
                     spec.stderr_path)
        # stderr is merged into the stdout pipe of the child process.
        p.process = subprocess.Popen(args, stdout=subprocess.PIPE,
                                     stderr=subprocess.STDOUT, close_fds=True,
                                     cwd=spec.directory, env=_get_env(spec))
        self._reaper.register(p)
        job._native_id = p.process.pid
        self._set_job_status(
            job,
            JobStatus(JobState.QUEUED, time=time.time(),
                      metadata={'nativeId': job._native_id}))
        self._set_job_status(job, JobStatus(JobState.ACTIVE, time=time.time()))
    except SubmitException:
        # Already the right exception type with a precise message (e.g.
        # 'Job canceled'); do not bury it under the generic wrapper below.
        raise
    except Exception as ex:
        raise SubmitException('Failed to submit job', exception=ex)
def submit(self, job: Job) -> None:
    """See :func:`~psij.job_executor.JobExecutor.submit`.

    Builds a Flux ``JobspecV1`` from the job's executable, arguments and
    resource specification, applies the stdin/stdout/stderr paths and the
    duration from the job attributes, submits it through
    ``self._flux_executor`` and passes the returned future to
    ``self._add_flux_callbacks``.

    When the job has no resource specification, one process with one CPU core
    is requested.

    :param job: The job to be submitted.
    :raises InvalidJobException: if the specification or its attributes are
        missing, the resource spec is not a ``ResourceSpecV1``,
        ``processes_per_node`` is set, or no executable is given.
    """
    # Explicit exceptions instead of `assert`, so that validation survives
    # `python -O` and callers get the documented exception type.
    if not job.spec:
        raise InvalidJobException("Missing specification")
    if not job.spec.attributes:
        raise InvalidJobException("Missing job attributes")
    job.executor = self

    if isinstance(job.spec.resources, ResourceSpecV1):
        resources = job.spec.resources
    elif isinstance(job.spec.resources, ResourceSpec):
        raise InvalidJobException(
            f"ResourceSpec version {job.spec.resources.version} not supported"
        )
    else:
        # No resource spec given: fall back to a single process on one core.
        resources = ResourceSpecV1(process_count=1, cpu_cores_per_process=1)

    if resources.exclusive_node_use:
        warnings.warn(
            "Flux does not support exclusive_node_use=True, ignoring...",
            UserWarning,
        )
    if resources.processes_per_node:
        raise InvalidJobException("Flux does not support processes_per_node")
    if not job.spec.executable:
        raise InvalidJobException("Job must have an executable")

    # argv[0] is the executable, followed by a copy of the arguments (if any).
    argv = list(job.spec.arguments) if job.spec.arguments else []
    argv.insert(0, job.spec.executable)

    flux_jobspec = flux.job.JobspecV1.from_command(
        argv,
        num_tasks=resources.process_count,
        cores_per_task=resources.cpu_cores_per_process,
        gpus_per_task=resources.gpu_cores_per_process,
        num_nodes=resources.node_count,
    )
    if job.spec.stdout_path:
        flux_jobspec.stdout = job.spec.stdout_path
    if job.spec.stdin_path:
        flux_jobspec.stdin = job.spec.stdin_path
    if job.spec.stderr_path:
        # Fixed: this used to assign to `flux.jobspec.stderr` (an attribute of
        # the `flux` package) rather than to the jobspec being built, so the
        # requested stderr path was never applied to the job.
        flux_jobspec.stderr = job.spec.stderr_path
    flux_jobspec.duration = job.spec.attributes.duration.total_seconds()

    fut = self._flux_executor.submit(flux_jobspec)
    self._add_flux_callbacks(job, fut)
def attach(self, job: Job, native_id: str) -> None:
    """Bind a PSI/J job to an already existing native job.

    Connects `job` to the native job identified by `native_id` so that the
    job mirrors status updates of the native job. The native id is stored on
    the job, this executor becomes the job's executor, and the job is
    registered with ``self._queue_poll_thread``.

    If the native job was originally submitted through this executor (so an
    *exit code file* and a *script output file* exist), the executor will try
    to recover the exit code and errors of the job. Otherwise it may not be
    possible to tell a failed job from a successfully completed one.

    Parameters
    ----------
    job
        The PSI/J job to attach.
    native_id
        The id of the batch scheduler job to attach to.
    """
    # Same assignment order as two separate statements: native id first,
    # then the executor.
    job._native_id, job.executor = native_id, self
    poller = self._queue_poll_thread
    poller.register_job(job)
def attach(self, job: Job, native_id: str) -> None:
    """
    Attach a job to an existing task.

    Only jobs in the :attr:`~psij.job_state.JobState.NEW` state can be
    attached. The task is looked up by `native_id` through the task manager,
    stored together with the job under the job id, and the job status is set
    to the state that the task's current state maps to.

    :param job: The job to attach.
    :param native_id: The native ID of the process to attach to, as obtained
        through :func:`~list` method.
    :raises InvalidJobException: if the job is not in the NEW state.
    """
    if job.status.state != JobState.NEW:
        raise InvalidJobException('Job must be in the NEW state')

    job.executor = self

    found = self._tmgr.get_tasks(uids=[native_id])
    attached_task = found[0]
    self._tasks[job.id] = (job, attached_task)

    # Translate the task's state through the executor's state map.
    status = JobStatus(self._state_map[attached_task.state], time=time.time())
    self._set_job_status(job, status)
def attach(self, job: Job, native_id: str) -> None:
    """
    Attach a job to a running process.

    Only jobs in the :attr:`~psij.JobState.NEW` state can be attached. The
    exit code of an attached job is not available on completion; jobs
    attached by the `LocalJobExecutor` always report a zero exit code.

    :param job: The job to attach.
    :param native_id: The native ID of the process to attach to, as obtained
        through :func:`~psij.executors.LocalJobExecutor.list` method. It is
        converted to an integer PID.
    :raises InvalidJobException: if the job is not in the NEW state.
    """
    if job.status.state != JobState.NEW:
        raise InvalidJobException('Job must be in the NEW state')

    job.executor = self

    process = psutil.Process(int(native_id))
    entry = _AttachedProcessEntry(job, process, self)
    self._reaper.register(entry)

    # The native id is assumed to be a PID previously obtained from list(),
    # so the process is either still running or already finished. In both
    # cases the job has to be walked up to the ACTIVE state.
    for state in (JobState.QUEUED, JobState.ACTIVE):
        self._set_job_status(job, JobStatus(state, time=time.time()))