Beispiel #1
0
    def _cancel_job(self, test):
        """Scancel the job attached to the given test.

        Runs ``scancel <job_id>``. On a zero exit code the test status is set
        to SCHED_CANCELLED and the run is marked complete. On a non-zero exit
        code the failure is noted in the test status and a SCHED_ERROR
        StatusInfo carrying scancel's stderr text is returned.

        :param pavilion.test_run.TestRun test: The test to cancel.
        :returns: A statusInfo object with the latest scheduler state.
        :rtype: StatusInfo
        """

        cmd = ['scancel', test.job_id]

        proc = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        # stdout is only captured to keep it off the terminal; it isn't used.
        _, stderr = proc.communicate()

        if proc.returncode == 0:
            # Scancel successful.
            msg = "Slurm jobid {} canceled via slurm.".format(test.job_id)
            # Someday I'll add a method to do this in one shot.
            test.status.set(STATES.SCHED_CANCELLED, msg)
            test.set_run_complete()
            return StatusInfo(STATES.SCHED_CANCELLED, msg)

        # Scancel failed. The pipes are opened in binary mode, so decode the
        # output; otherwise the note would contain a b'...' repr.
        err_msg = stderr.decode('utf8', errors='replace')

        # NOTE(review): the state recorded here is SCHED_CANCELLED even though
        # the cancel failed, while the returned StatusInfo is SCHED_ERROR.
        # Confirm that the mismatch is intended.
        test.status.set(
            STATES.SCHED_CANCELLED,
            "Tried (but failed) to cancel job: {}".format(err_msg))
        return StatusInfo(STATES.SCHED_ERROR, err_msg)
Beispiel #2
0
    def _cancel_job(self, test):
        """Scancel the job.

        Runs ``scancel <job_id>``. On a zero exit code the test status is set
        to SCHED_CANCELLED and a matching StatusInfo is returned. On a
        non-zero exit code a SCHED_ERROR StatusInfo carrying scancel's stderr
        text is returned and the test status is left untouched.

        :param pavilion.pav_test.PavTest test: The test to cancel.
        :returns: A status info object describing the outcome.
        :rtype: StatusInfo
        """

        # TODO: check this

        cmd = ['scancel', test.job_id]

        proc = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        # stdout is only captured to keep it off the terminal; it isn't used.
        _, stderr = proc.communicate()

        if proc.returncode == 0:
            # Scancel successful.
            msg = "Slurm jobid {} canceled via slurm.".format(test.job_id)
            # Someday I'll add a method to do this in one shot.
            test.status.set(
                STATES.SCHED_CANCELLED,
                msg
            )
            return StatusInfo(
                STATES.SCHED_CANCELLED,
                msg
            )

        # Scancel failed, pass the stderr message. The pipes are opened in
        # binary mode, so decode it to give the note readable text, not bytes.
        return StatusInfo(
            STATES.SCHED_ERROR,
            stderr.decode('utf8', errors='replace')
        )
Beispiel #3
0
    def job_status(self, pav_cfg, test):
        """Raw jobs will either be scheduled (waiting on a concurrency
        lock), or in an unknown state (as there aren't records of dead jobs).

        The job id is split on its last underscore into a host name and a
        pid. If the host matches this machine, ``/proc/<pid>/cmdline`` is
        inspected to decide whether the kickoff process is still alive.

        :param pav_cfg: Unused here; part of the scheduler interface.
        :param test: The test whose job to check. Its ``job_id``, ``id``,
            ``status`` and ``path`` attributes are read.
        :returns: SCHEDULED if the job is on another host or its kickoff
            process is still running; otherwise the test's current status if
            it has moved past SCHEDULED, else SCHED_ERROR (which is also
            written to the test's status).
        :rtype: StatusInfo
        """

        host, pid = test.job_id.rsplit('_', 1)

        now = time.time()

        local_host = socket.gethostname()
        if host != local_host:
            return StatusInfo(
                when=now,
                state=STATES.SCHEDULED,
                note=("Can't determine the scheduler status of a 'raw' "
                      "test started on a different host ({} vs {}).".format(
                          host, local_host)))

        cmd_fn = Path('/proc') / pid / 'cmdline'

        # A missing file (process gone) or an unreadable one both mean we
        # can't confirm the process; FileNotFoundError is an OSError.
        try:
            with cmd_fn.open('rb') as cmd_file:
                cmdline = cmd_file.read()
        except (IOError, OSError):
            cmdline = None

        if cmdline is not None:
            # Arguments are NUL separated. The pid may now belong to some
            # unrelated process with non-UTF-8 arguments, so never let the
            # decode raise.
            cmdline = cmdline.replace(b'\x00', b' ').decode(
                'utf8', errors='replace')

            # Make sure we're looking at the same job.
            if 'kickoff.sh' in cmdline and '-{}-'.format(test.id) in cmdline:
                return StatusInfo(
                    when=now,
                    state=STATES.SCHEDULED,
                    note="Process is running, and probably waiting on a "
                    "concurrency lock.")

        # The command isn't running because it completed, died, or was killed.
        # Recheck the status file for changes, otherwise call it an error.
        status = test.status.current()
        if status.state != STATES.SCHEDULED:
            return status

        msg = ("Job died or was killed. Check '{}' for more info.".format(
            test.path / 'kickoff.out'))
        test.status.set(STATES.SCHED_ERROR, msg)
        return StatusInfo(when=now, state=STATES.SCHED_ERROR, note=msg)
Beispiel #4
0
    def _cancel_job(self, test):
        """Try to kill the given test's pid (if it is the right pid).

        The job id is split on its last underscore into a host name and a
        pid. The process is sent SIGTERM and then polled for up to
        ``self.CANCEL_TIMEOUT`` seconds to see whether it went away.

        :param pavilion.test_run.TestRun test: The test to cancel.
        :returns: SCHED_CANCELLED if the process was terminated (the test
            status is updated and the run marked complete), the test's
            current status if the process was already gone, or SCHED_ERROR
            if the job is on another host, could not be signalled, or
            outlived the timeout.
        :rtype: StatusInfo
        """

        host, pid = test.job_id.rsplit('_', 1)

        hostname = socket.gethostname()
        if host != hostname:
            # Report the host the job was started on, not the local one.
            return StatusInfo(
                STATES.SCHED_ERROR,
                "Job started on different host ({}).".format(host))

        # _verify_pid presumably confirms the pid still belongs to this test;
        # its implementation isn't visible here.
        if not self._verify_pid(pid, test.id):
            # Test was no longer running, just return its current state.
            return test.status.current()

        try:
            os.kill(int(pid), signal.SIGTERM)
        except PermissionError:
            return StatusInfo(
                STATES.SCHED_ERROR,
                "You don't have permission to kill PID {}".format(pid))
        except OSError as err:
            return StatusInfo(
                STATES.SCHED_ERROR,
                "Unexpected error cancelling job {}: {}".format(pid, str(err)))

        # Give the process a bounded amount of time to exit.
        timeout = time.time() + self.CANCEL_TIMEOUT
        while self._verify_pid(pid, test.id) and time.time() < timeout:
            time.sleep(.1)

        if not self._verify_pid(pid, test.id):
            test.status.set(STATES.SCHED_CANCELLED, "Canceled via pavilion.")
            test.set_run_complete()
            return StatusInfo(STATES.SCHED_CANCELLED,
                              "PID {} was terminated.".format(pid))

        return StatusInfo(STATES.SCHED_ERROR,
                          "PID {} refused to die.".format(pid))
    def cancel_job(self, test):
        """Ask the scheduler to cancel the given test's job, if it has one.

        A test whose ``job_id`` is None has nothing to cancel, so a
        SCHED_CANCELLED status info is returned without touching the
        scheduler. Otherwise the work is delegated to ``self._cancel_job``,
        which is expected to make a best effort: update the test status to
        SCHED_CANCELLED on success, or report SCHED_ERROR on problems.

        :param pavilion.pav_test.PavTest test: The test to cancel.
        :returns: A status info object describing the state. If we actually
            cancel the job the test status will be set to SCHED_CANCELLED.
            This should return SCHED_ERROR when something goes wrong.
        :rtype: StatusInfo
        """

        if test.job_id is not None:
            # There is a job; let the scheduler-specific code handle it.
            return self._cancel_job(test)

        return StatusInfo(STATES.SCHED_CANCELLED, "Job was never started.")
Beispiel #6
0
    def job_status(self, pav_cfg, test):
        """Get the current status of the slurm job for the given test.

        Looks the job up via ``self._scontrol_show`` and maps its 'JobState'
        onto a status using the ``SCHED_WAITING``, ``SCHED_RUN``,
        ``SCHED_ERROR`` and ``SCHED_CANCELLED`` collections on ``self``.

        :param pav_cfg: Unused here; part of the scheduler interface.
        :param test: The test whose job to check. Its ``job_id`` and
            ``status`` are read, and the status may be updated.
        :returns: The status derived from the job state. In the error and
            cancelled branches this is whatever ``test.status.set`` returns
            (presumably a StatusInfo; confirm against its definition).
        :rtype: StatusInfo
        """

        try:
            job_info = self._scontrol_show('job', test.job_id)
        except ValueError as err:
            return StatusInfo(state=STATES.SCHED_ERROR,
                              note=str(err),
                              when=self._now())

        if not job_info:
            return StatusInfo(state=STATES.SCHED_ERROR,
                              note="Could not find job {}".format(test.job_id),
                              when=self._now())

        # scontrol show returns a list. There should only be one item in that
        # list though. Keep the record under its own name so the leftover
        # check below looks at the remaining list, not the popped record.
        job = job_info.pop(0)
        if job_info:
            self.logger.info("Extra items in show job output: %s", job_info)

        job_state = job.get('JobState', 'UNKNOWN')
        if job_state in self.SCHED_WAITING:
            return StatusInfo(
                state=STATES.SCHEDULED,
                note=("Job {} has state '{}', reason '{}'".format(
                    test.job_id, job_state, job.get('Reason'))),
                when=self._now())

        if job_state in self.SCHED_RUN:
            # The job should be running. Check its status again.
            status = test.status.current()
            if status.state != STATES.SCHEDULED:
                return status

            return StatusInfo(
                state=STATES.SCHEDULED,
                note=("Job is running or about to run. Has job state {}".
                      format(job_state)),
                when=self._now())

        if job_state in self.SCHED_ERROR:
            # The job should have run enough to change its state, but
            # might not have.
            status = test.status.current()
            if status.state != STATES.SCHEDULED:
                return status

            return test.status.set(
                STATES.SCHED_ERROR,
                "The scheduler killed the job, it has job state '{}'".
                format(job_state))

        if job_state in self.SCHED_CANCELLED:
            # The job appears to have been cancelled without running.
            test.set_run_complete()
            return test.status.set(
                STATES.SCHED_CANCELLED,
                "Job cancelled, has job state '{}'".format(job_state))

        self.logger.warning(
            "Encountered unhandled job state '%s' for "
            "job '%s'.", job_state, test.job_id)
        # The best we can say is that the test is still SCHEDULED. After all,
        # it might be! Who knows.
        return StatusInfo(
            state=STATES.SCHEDULED,
            note="Job '{}' has unknown/unhandled job state '{}'. We have no "
            "idea what is going on.".format(test.job_id, job_state),
            when=self._now())