Ejemplo n.º 1
0
    def submit(self, job: Job) -> None:
        """Submit a job by running the scheduler's submit command.

        A submit script named ``<job.id>.job`` is written to the work directory and
        passed to the submit command. When the command succeeds, the native id parsed
        from its output is stored on the job, the job is moved to ``JobState.QUEUED``
        and it is registered with the queue polling thread.

        See :func:`~psij.JobExecutor.submit`.

        :param job: The job to submit; its specification must be set.
        :raises SubmitException: If the submit command fails; the exception is built
            from the output of the failed command.
        """
        logger.info('Job %s: submitting', job.id)
        self._ensure_work_dir()
        assert (job.spec)

        job.executor = self
        script_context = self._create_script_context(job)

        # The script name is derived from the job id, so ids are assumed to be unique.
        script_path = self.work_directory / (job.id + '.job')
        with script_path.open('w') as script_file:
            self.generate_submit_script(job, script_context, script_file)

        try:
            logger.debug('Job %s: running submit command', job.id)
            submit_command = self.get_submit_command(job, script_path)
            output = self._run_command(submit_command)
            logger.debug('Job %s: submit command ouput: %s', job.id, output)
            job._native_id = self.job_id_from_submit_output(output)
            logger.info('Job %s: native id: %s', job.id, job.native_id)
            queued = JobStatus(JobState.QUEUED,
                               metadata={'native_id': job.native_id})
            self._set_job_status(job, queued)
        except subprocess.CalledProcessError as err:
            # Only the command output is reported; the original traceback is suppressed.
            raise SubmitException(err.output) from None

        self._queue_poll_thread.register_job(job)
Ejemplo n.º 2
0
def test_failing_job(execparams: ExecutorTestParams) -> None:
    """Check that ``/bin/false`` ends in FAILED with a non-zero exit code."""
    failing_job = Job(JobSpec(executable='/bin/false'))
    executor = _get_executor_instance(execparams, failing_job)
    executor.submit(failing_job)

    final_status = failing_job.wait(timeout=_get_timeout(execparams))

    assert final_status is not None
    assert final_status.state == JobState.FAILED
    exit_code = final_status.exit_code
    assert exit_code is not None
    assert exit_code != 0
Ejemplo n.º 3
0
def test_missing_executable(execparams: ExecutorTestParams) -> None:
    """Check that a job with a non-existent executable is reported as failed.

    It is not known in advance whether the failure surfaces as a
    ``SubmitException`` from ``submit()`` or as a job ending in
    ``JobState.FAILED``, so both outcomes are accepted.
    """
    missing_job = Job(JobSpec(executable='/bin/no_such_file_or_directory'))
    executor = _get_executor_instance(execparams, missing_job)
    try:
        executor.submit(missing_job)
        final_status = missing_job.wait(timeout=_get_timeout(execparams))
        assert final_status is not None
        assert final_status.state == JobState.FAILED
        # The exit code may be unavailable; when present it must signal failure.
        exit_code = final_status.exit_code
        if exit_code is not None:
            assert exit_code != 0
    except SubmitException:
        # Being rejected at submit time is an acceptable way to fail.
        pass
Ejemplo n.º 4
0
    def submit(self, job: Job) -> None:
        """
        Submits the specified :class:`~psij.Job` to the pilot.

        When this method returns normally, the job has been handed to RP; later
        status changes, failures included, are delivered through notifications.

        :param job: The job to be submitted.
        :raises InvalidJobException: If the job has no specification.
        :raises SubmitException: If building the task description or submitting
            the task fails for any reason; the original error is chained as the
            cause.
        """
        if not job.spec:
            raise InvalidJobException('Missing specification')

        job.executor = self
        try:
            description = self._job_2_descr(job)
            submitted_task = self._tmgr.submit_tasks(description)
            # Remember the job/task pair under the job id.
            self._tasks[job.id] = (job, submitted_task)
        except Exception as err:
            raise SubmitException('Failed to submit job') from err
Ejemplo n.º 5
0
def test_simple_job_redirect(execparams: ExecutorTestParams) -> None:
    """Check that a job's standard output is written to ``stdout_path``.

    Runs ``/bin/echo -n _x_`` with stdout redirected to a file inside a
    temporary directory under ``~/.psij/test`` and verifies the file contents.
    """
    _make_test_dir()
    with TemporaryDirectory(dir=Path.home() / '.psij' / 'test') as td:
        outp = Path(td, 'stdout.txt')
        job = Job(
            JobSpec(executable='/bin/echo',
                    arguments=['-n', '_x_'],
                    stdout_path=outp))
        ex = _get_executor_instance(execparams, job)
        ex.submit(job)
        status = job.wait(timeout=_get_timeout(execparams))
        assert_completed(status)
        # A context manager guarantees the file is closed even if reading fails.
        with outp.open("r") as f:
            contents = f.read()
        assert contents == '_x_'
Ejemplo n.º 6
0
    def _jobid_cb(self, job: Job, fut: flux.job.FluxExecutorFuture) -> None:
        """Callback invoked once the Flux jobid is available.

        Stores the jobid obtained from the future as the native id of the
        psij.Job, then moves the job to the QUEUED state.

        :param job: The psij job that receives the native id.
        :param fut: The Flux future whose ``jobid()`` supplies the id.
        """
        job._native_id = fut.jobid()
        self._set_job_status(job, JobStatus(JobState.QUEUED, time=time.time()))
Ejemplo n.º 7
0
    def submit(self, job: Job) -> None:
        """
        Submits the specified :class:`~psij.Job` to be run locally.

        Successful return of this method indicates that the job has been started locally and all
        changes in the job status, including failures, are reported using notifications. If the job
        specification is invalid, an :class:`~psij.InvalidJobException` is thrown. If
        the actual submission fails for reasons outside the validity of the job,
        a :class:`~psij.SubmitException` is thrown.

        :param job: The job to be submitted.
        :raises InvalidJobException: If the job has no specification.
        :raises SubmitException: With the message ``'Job canceled'`` if the job is already in the
            CANCELED state, or ``'Failed to submit job'`` (with the original error attached and
            chained as the cause) if starting the process or updating the status fails.
        """
        spec = job.spec
        if not spec:
            raise InvalidJobException('Missing specification')
        job.executor = self

        p = _ChildProcessEntry(
            job, self, self._get_launcher(self._get_launcher_name(spec)))
        assert p.launcher
        args = p.launcher.get_launch_command(job)

        # The cancellation check lives outside the try block so that its SubmitException
        # reaches the caller as-is instead of being re-wrapped as a generic failure.
        with job._status_cv:
            if job.status.state == JobState.CANCELED:
                raise SubmitException('Job canceled')

        try:
            logger.debug('Running %s,  out=%s, err=%s', args, spec.stdout_path,
                         spec.stderr_path)
            # stderr is merged into the stdout pipe of the child process.
            p.process = subprocess.Popen(args,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.STDOUT,
                                         close_fds=True,
                                         cwd=spec.directory,
                                         env=_get_env(spec))
            self._reaper.register(p)
            # The PID of the child process serves as the native id.
            job._native_id = p.process.pid
            self._set_job_status(
                job,
                JobStatus(JobState.QUEUED,
                          time=time.time(),
                          metadata={'nativeId': job._native_id}))
            self._set_job_status(job,
                                 JobStatus(JobState.ACTIVE, time=time.time()))
        except Exception as ex:
            raise SubmitException('Failed to submit job', exception=ex) from ex
Ejemplo n.º 8
0
def test_env_var(execparams: ExecutorTestParams) -> None:
    """Check that variables in ``JobSpec.environment`` reach the job.

    Runs ``/bin/bash -c 'echo -n $TEST_VAR'`` with ``TEST_VAR`` set to ``_y_``
    and stdout redirected to a file inside a temporary directory under
    ``~/.psij/test``, then verifies the file contents.
    """
    _make_test_dir()
    with TemporaryDirectory(dir=Path.home() / '.psij' / 'test') as td:
        outp = Path(td, 'stdout.txt')
        job = Job(
            JobSpec(executable='/bin/bash',
                    arguments=['-c', 'echo -n $TEST_VAR'],
                    stdout_path=outp))
        assert job.spec is not None
        job.spec.environment = {'TEST_VAR': '_y_'}
        ex = _get_executor_instance(execparams, job)
        ex.submit(job)
        status = job.wait(timeout=_get_timeout(execparams))
        assert_completed(status)
        # A context manager guarantees the file is closed even if reading fails.
        with outp.open("r") as f:
            contents = f.read()
        assert contents == '_y_'
Ejemplo n.º 9
0
def test_cancel(execparams: ExecutorTestParams) -> None:
    """Check that cancelling an active job leaves it in the CANCELED state."""
    sleeper = Job(JobSpec(executable='/bin/sleep', arguments=['60']))
    executor = _get_executor_instance(execparams, sleeper)
    executor.submit(sleeper)

    # Wait until the job is reported ACTIVE before cancelling it.
    sleeper.wait(target_states=[JobState.ACTIVE])
    sleeper.cancel()

    final_status = sleeper.wait(timeout=_get_timeout(execparams))
    assert final_status is not None
    assert final_status.state == JobState.CANCELED
Ejemplo n.º 10
0
    def attach(self, job: Job, native_id: str) -> None:
        """Connect a PSI/J job to an existing native job.

        Records `native_id` on `job`, makes this executor the job's executor and registers
        the job with the queue polling thread, so that the job reflects status updates of the
        native job. If the native job was originally submitted through this executor (and thus
        has an *exit code file* and a *script output file*), the executor will attempt to
        retrieve its exit code and errors. Otherwise it may be impossible for the executor to
        tell a failed job from a successfully completed one.

        Parameters
        ----------
        job
            The PSI/J job to attach.
        native_id
            The id of the batch scheduler job to attach to.
        """
        # Record the native id and owning executor before registering for polling.
        job._native_id = native_id
        job.executor = self
        self._queue_poll_thread.register_job(job)
Ejemplo n.º 11
0
    def attach(self, job: Job, native_id: str) -> None:
        """
        Attaches a job to an existing native job.

        The job is expected to be in the :attr:`~psij.JobState.NEW` state; this
        method does not check that itself.

        :param job: The job to attach.
        :param native_id: The native ID to attach to, as obtained through the
          :func:`~psij.executors.flux.FluxJobExecutor.list` method.
        """
        job.executor = self
        attached_future = self._flux_executor.attach(native_id)
        self._add_flux_callbacks(job, attached_future)
Ejemplo n.º 12
0
def test_attach(execparams: ExecutorTestParams) -> None:
    """Check that a fresh job can attach to a submitted job's native id."""
    original = Job(JobSpec(executable='/bin/sleep', arguments=['1']))
    executor = _get_executor_instance(execparams, original)
    executor.submit(original)
    # The native id is read once the job is reported ACTIVE or COMPLETED.
    original.wait(target_states=[JobState.ACTIVE, JobState.COMPLETED])
    native_id = original.native_id
    assert native_id is not None

    attached = Job()
    executor.attach(attached, native_id)
    final_status = attached.wait(timeout=_get_timeout(execparams))
    assert_completed(final_status)
Ejemplo n.º 13
0
def test_parallel_jobs(execparams: ExecutorTestParams) -> None:
    """Check that two jobs sharing one spec, submitted back to back, both complete."""
    shared_spec = JobSpec(executable='/bin/sleep', arguments=['5'])
    jobs = [Job(shared_spec), Job(shared_spec)]
    executor = _get_executor_instance(execparams, jobs[0])

    # Submit everything first, then wait, so the jobs can overlap.
    for job in jobs:
        executor.submit(job)
    statuses = [job.wait(timeout=_get_timeout(execparams)) for job in jobs]

    for status in statuses:
        assert_completed(status)
Ejemplo n.º 14
0
 def submit(self, job: Job) -> None:
     """See :func:`~psij.job_executor.JobExecutor.submit`.

     Builds a Flux ``JobspecV1`` from the job's executable, arguments and
     resources, applies the stdin/stdout/stderr paths and the duration from
     the job specification, submits it through the Flux executor and attaches
     the Flux callbacks to the returned future.

     :param job: The job to submit; its specification and attributes must be
         set.
     :raises InvalidJobException: If the resource specification is a
         ``ResourceSpec`` other than ``ResourceSpecV1``, if
         ``processes_per_node`` is set, or if the job has no executable.
     """
     assert job.spec
     assert job.spec.attributes
     job.executor = self
     if isinstance(job.spec.resources, ResourceSpecV1):
         resources = job.spec.resources
     elif isinstance(job.spec.resources, ResourceSpec):
         raise InvalidJobException(
             f"ResourceSpec version {job.spec.resources.version} not supported"
         )
     else:
         # No ResourceSpec instance given: fall back to one process on one core.
         resources = ResourceSpecV1(process_count=1,
                                    cpu_cores_per_process=1)
     if resources.exclusive_node_use:
         warnings.warn(
             "Flux does not support exclusive_node_use=True, ignoring...",
             UserWarning,
         )
     if resources.processes_per_node:
         raise InvalidJobException(
             "Flux does not support processes_per_node")
     if not job.spec.executable:
         raise InvalidJobException("Job must have an executable")
     # The command line is the executable followed by its arguments, if any.
     argv = list(job.spec.arguments) if job.spec.arguments else []
     argv.insert(0, job.spec.executable)
     flux_jobspec = flux.job.JobspecV1.from_command(
         argv,
         num_tasks=resources.process_count,
         cores_per_task=resources.cpu_cores_per_process,
         gpus_per_task=resources.gpu_cores_per_process,
         num_nodes=resources.node_count,
     )
     if job.spec.stdout_path:
         flux_jobspec.stdout = job.spec.stdout_path
     if job.spec.stdin_path:
         flux_jobspec.stdin = job.spec.stdin_path
     if job.spec.stderr_path:
         # Set stderr on the jobspec being built, like stdout and stdin above.
         flux_jobspec.stderr = job.spec.stderr_path
     flux_jobspec.duration = job.spec.attributes.duration.total_seconds()
     fut = self._flux_executor.submit(flux_jobspec)
     self._add_flux_callbacks(job, fut)
Ejemplo n.º 15
0
    def attach(self, job: Job, native_id: str) -> None:
        """
        Attaches a job to an existing task.

        The task is looked up in the task manager by `native_id`, recorded
        together with the job, and the job status is set to the state that the
        task's current state maps to.

        :param job: The job to attach; it must be in the
            :attr:`~psij.job_state.JobState.NEW` state.
        :param native_id: The native ID to attach to, as obtained through the
            :func:`~list` method.
        :raises InvalidJobException: If the job is not in the NEW state.
        """
        if job.status.state != JobState.NEW:
            raise InvalidJobException('Job must be in the NEW state')

        job.executor = self

        matching_tasks = self._tmgr.get_tasks(uids=[native_id])
        attached_task = matching_tasks[0]
        self._tasks[job.id] = (job, attached_task)

        # Translate the task's state through the executor's state map.
        mapped_state = self._state_map[attached_task.state]
        self._set_job_status(job, JobStatus(mapped_state, time=time.time()))
Ejemplo n.º 16
0
    def attach(self, job: Job, native_id: str) -> None:
        """
        Attaches a job to a process.

        The exit code of an attached job will not be available upon completion; a zero exit
        code will always be returned for jobs attached by the `LocalJobExecutor`.

        :param job: The job to attach; it must be in the :attr:`~psij.JobState.NEW` state.
        :param native_id: The native ID of the process to attach to, as obtained through the
            :func:`~psij.executors.LocalJobExecutor.list` method. It is converted to an integer
            PID.
        :raises InvalidJobException: If the job is not in the NEW state.
        """
        if job.status.state != JobState.NEW:
            raise InvalidJobException('Job must be in the NEW state')
        job.executor = self

        process = psutil.Process(int(native_id))
        self._reaper.register(_AttachedProcessEntry(job, process, self))

        # The native id is assumed to be a PID previously obtained from list(), so the process
        # is either still running or has already completed. In both cases the job has to be
        # walked through QUEUED up to ACTIVE.
        for state in (JobState.QUEUED, JobState.ACTIVE):
            self._set_job_status(job, JobStatus(state, time=time.time()))
Ejemplo n.º 17
0
def test_simple_job(execparams: ExecutorTestParams) -> None:
    """Check that ``/bin/date`` run with the configured launcher completes."""
    spec = JobSpec(executable='/bin/date', launcher=execparams.launcher)
    date_job = Job(spec)
    executor = _get_executor_instance(execparams, date_job)
    executor.submit(date_job)
    final_status = date_job.wait(timeout=_get_timeout(execparams))
    assert_completed(final_status)
Ejemplo n.º 18
0
#!/usr/bin/env python3

# This is meant as a simple test file to check if psij was installed successfully

import sys

from psij import Job, JobExecutor, JobSpec

if __name__ == '__main__':
    # Usage: <script> [executor_name [executor_url]]
    # The executor name defaults to 'local' and the URL to None.
    name = sys.argv[1] if len(sys.argv) > 1 else 'local'
    url = sys.argv[2] if len(sys.argv) > 2 else None

    # Run /bin/date through the selected executor and wait for it to finish.
    job = Job(JobSpec(executable='/bin/date'))
    ex = JobExecutor.get_instance(name=name, url=url)
    ex.submit(job)
    print('Job submitted')
    status = job.wait()
    print('Job done: {}'.format(status))