Example #1
0
    def submit(self, job):
        '''Submit ``job`` through ``sbatch`` and record its job id.

        If the submission fails and its standard error matches one of the
        patterns in ``self._resubmit_on_errors``, the submission is retried
        after sleeping for 1, 2, 3, 1, ... seconds. Any other submission
        failure is re-raised as is.

        On success, ``job._jobid`` and ``job._submit_time`` are set.

        :raises JobSchedulerError: if no job id can be parsed from the
            ``sbatch`` output.
        '''
        command = f'sbatch {job.script_filename}'
        backoff = itertools.cycle([1, 2, 3])
        while True:
            try:
                completed = _run_strict(command,
                                        timeout=self._submit_timeout)
            except SpawnedProcessError as err:
                # Retry only for the errors the scheduler was configured
                # to tolerate; everything else propagates to the caller
                retry_patt = rf'({"|".join(self._resubmit_on_errors)})'
                matched = re.search(retry_patt, err.stderr)
                if not (self._resubmit_on_errors and matched):
                    raise

                delay = next(backoff)
                self.log(f'encountered a job submission error: '
                         f'{matched.group(1)}: will resubmit after {delay}s')
                time.sleep(delay)
            else:
                break

        found = re.search(r'Submitted batch job (?P<jobid>\d+)',
                          completed.stdout)
        if found is None:
            raise JobSchedulerError(
                'could not retrieve the job id of the submitted job')

        job._jobid = found.group('jobid')
        job._submit_time = time.time()
Example #2
0
    def allnodes(self):
        '''Return the nodes created from ``scontrol -a show -o nodes``.

        Each line of the command output is passed as a node description to
        ``_create_nodes()``.

        :raises JobSchedulerError: if the ``scontrol`` command fails.
        '''
        try:
            result = _run_strict('scontrol -a show -o nodes')
        except SpawnedProcessError as err:
            raise JobSchedulerError(
                'could not retrieve node information') from err
        else:
            return _create_nodes(result.stdout.splitlines())
Example #3
0
 def submit(self, job):
     '''Submit ``job`` by feeding its script to ``bsub`` on standard input.

     On success, ``job._jobid`` and ``job._submit_time`` are set.

     :raises JobSchedulerError: if no job id can be parsed from the
         ``bsub`` output.
     '''
     with open(job.script_filename, 'r') as script:
         result = _run_strict('bsub', stdin=script)

     found = re.search(r'^Job <(?P<jobid>\S+)> is submitted', result.stdout)
     if found is None:
         raise JobSchedulerError(
             'could not retrieve the job id of the submitted job'
         )

     job._jobid = found.group('jobid')
     job._submit_time = time.time()
Example #4
0
    def submit(self, job):
        '''Submit ``job`` with ``bsub``, passing the script as an argument.

        On success, ``job._jobid`` and ``job._submit_time`` are set.

        :raises JobSchedulerError: if no job id can be parsed from the
            ``bsub`` output.
        '''
        result = _run_strict(f'bsub {job.script_filename}',
                             timeout=self._submit_timeout)
        found = re.search(r'^Job <(?P<jobid>\S+)> is submitted',
                          result.stdout)
        if found is None:
            raise JobSchedulerError(
                'could not retrieve the job id of the submitted job'
            )

        job._jobid = found.group('jobid')
        job._submit_time = time.time()
Example #5
0
    def submit(self, job):
        '''Submit ``job`` with ``qsub`` and record its job id.

        The job id is taken to be the first whitespace-delimited token at
        the start of the ``qsub`` output. On success, ``job._jobid`` and
        ``job._submit_time`` are set.

        :raises JobSchedulerError: if no job id can be parsed from the
            ``qsub`` output.
        '''
        # The output files are passed on the command line, because `-o` and
        # `-e` are only recognized there by the PBS Slurm wrappers.
        result = _run_strict(
            f'qsub -o {job.stdout} -e {job.stderr} {job.script_filename}',
            timeout=self._submit_timeout
        )
        found = re.search(r'^(?P<jobid>\S+)', result.stdout)
        if found is None:
            raise JobSchedulerError(
                'could not retrieve the job id of the submitted job'
            )

        job._jobid = found.group('jobid')
        job._submit_time = time.time()
Example #6
0
    def _get_reservation_nodes(self, reservation):
        '''Return the nodes that belong to ``reservation``.

        The ``Nodes=...`` token is first extracted from the output of
        ``scontrol -a show res`` and is then handed back to ``scontrol`` to
        get one description line per node.

        :raises JobSchedulerError: if the reservation output contains no
            ``Nodes=...`` token.
        '''
        res_info = _run_strict('scontrol -a show res %s' % reservation)
        found = re.search(r'(Nodes=\S+)', res_info.stdout)
        if not found:
            raise JobSchedulerError("could not extract the node names for "
                                    "reservation '%s'" % reservation)

        nodes_info = _run_strict('scontrol -a show -o %s' % found[1])
        return _create_nodes(nodes_info.stdout.splitlines())
Example #7
0
    def __init__(self, node_descr):
        '''Initialize the node from its textual description.

        The node name is mandatory. The partitions, active features and
        states default to an empty set when ``_extract_attribute()`` returns
        a false value for them.

        :raises JobSchedulerError: if ``NodeName`` cannot be extracted.
        '''
        self._name = self._extract_attribute('NodeName', node_descr)
        if not self._name:
            raise JobSchedulerError(
                'could not extract NodeName from node description')

        # (instance attribute, key in the description, value separator)
        multi_valued = (
            ('_partitions', 'Partitions', ','),
            ('_active_features', 'ActiveFeatures', ','),
            ('_states', 'State', '+'),
        )
        for attr, key, sep in multi_valued:
            values = self._extract_attribute(key, node_descr, sep=sep)
            setattr(self, attr, values or set())

        self._descr = node_descr
Example #8
0
    def submit(self, job):
        '''Submit ``job`` with ``oarsub`` in batch mode.

        On success, ``job._jobid`` and ``job._submit_time`` are set.

        :raises JobSchedulerError: if no ``OAR_JOB_ID=...`` entry can be
            found in the ``oarsub`` output.
        '''
        # In batch mode, OAR needs the full path to the job script and the
        # `-S` option
        script_path = os.path.join(job.workdir, job.script_filename)
        result = _run_strict(f'oarsub -S {script_path}',
                             timeout=self._submit_timeout)
        found = re.search(r'.*OAR_JOB_ID=(?P<jobid>\S+)', result.stdout)
        if found is None:
            raise JobSchedulerError(
                'could not retrieve the job id of the submitted job'
            )

        job._jobid = found.group('jobid')
        job._submit_time = time.time()
Example #9
0
    def poll(self, *jobs):
        '''Update the state of ``jobs`` from the output of ``qstat -f``.

        ``None`` entries in ``jobs`` are ignored; if nothing is left, the
        method returns without running any command.

        Jobs that ``qstat`` does not report get the ``COMPLETED`` state and
        are flagged as completed as soon as they are cancelled or both of
        their output files exist in their working directory. Pending jobs
        that exceed their ``max_pending_time`` are cancelled.

        :raises JobSchedulerError: if ``qstat`` exits with a code other than
            0, 153 or 35.
        '''

        def outputs_written(job):
            # A job is reported as finished only when its stdout/stderr are
            # written back to the working directory
            paths = [os.path.join(job.workdir, fname)
                     for fname in (job.stdout, job.stderr)]
            return all(map(os.path.exists, paths))

        def assume_completed(job):
            # Used for jobs that the scheduler does not report any more
            job._state = 'COMPLETED'
            if job.cancelled or outputs_written(job):
                self.log(f'Assuming job {job.jobid} completed')
                job._completed = True

        # Filter out non-jobs
        active = [job for job in jobs if job is not None]
        if not active:
            return

        jobids = ' '.join(job.jobid for job in active)
        completed = osext.run_command(f'qstat -f {jobids}')
        rc = completed.returncode

        # Depending on the configuration, completed jobs will remain on the
        # job list for a limited time, or be removed upon completion.
        # If qstat cannot find any of the job IDs, it will return 153 (exit
        # code 35 is handled the same way). Otherwise, it will return 0 and
        # print information only for the jobs it could find.
        if rc in (153, 35):
            self.log(f'Return code is {rc}')
            for job in active:
                assume_completed(job)

            return

        if rc != 0:
            raise JobSchedulerError(
                f'qstat failed with exit code {rc} '
                f'(standard error follows):\n{completed.stderr}')

        # Index the raw per-job records (separated by blank lines) by job id
        jobid_patt = re.compile(r'^Job Id:\s*(?P<jobid>\S+)', re.MULTILINE)
        records = {}
        for record in completed.stdout.split('\n\n'):
            found = jobid_patt.search(record)
            if found:
                records[found.group('jobid')] = record

        for job in active:
            info = records.get(job.jobid)
            if info is None:
                self.log(f'Job {job.jobid} not known to scheduler')
                assume_completed(job)
                continue

            state_match = re.search(r'^\s*job_state = (?P<state>[A-Z])',
                                    info, re.MULTILINE)
            if state_match is None:
                self.log(f'Job state not found (job info follows):\n{info}')
                continue

            job._state = JOB_STATES[state_match.group('state')]

            hosts_match = re.search(
                r'exec_host = (?P<nodespec>[\S\t\n]+)', info, re.MULTILINE
            )
            if hosts_match:
                # The node specification may be wrapped over several lines
                nodespec = re.sub(r'[\n\t]*', '',
                                  hosts_match.group('nodespec'))
                self._update_nodelist(job, nodespec)

            if job.state == 'COMPLETED':
                code_match = re.search(r'^\s*exit_status = (?P<code>\d+)',
                                       info, re.MULTILINE)
                if code_match:
                    job._exitcode = int(code_match.group('code'))

                # A job is reported as finished only when its stdout/stderr
                # are written back to the working directory
                if job.cancelled or outputs_written(job):
                    job._completed = True
            elif (job.state in ['QUEUED', 'HELD', 'WAITING']
                  and job.max_pending_time
                  and time.time() - job.submit_time >= job.max_pending_time):
                self.cancel(job)
                job._exception = JobError('maximum pending time exceeded',
                                          job.jobid)
Example #10
0
    def poll(self, *jobs):
        '''Update the state of ``jobs`` from the output of ``qstat -xml``.

        ``None`` entries in ``jobs`` are ignored; if nothing is left, the
        method returns without running any command.

        Only the jobs of the current user whose job number matches one of
        the polled jobs are updated. Polled jobs that do not show up in the
        ``qstat`` output are assumed to have completed.

        :raises JobSchedulerError: if ``qstat`` exits with a non-zero code.
        '''

        # Filter out non-jobs
        polled = [job for job in jobs if job is not None]
        if not polled:
            return

        user = osext.osuser()
        completed = osext.run_command(f'qstat -xml -u {user}')
        if completed.returncode != 0:
            raise JobSchedulerError(
                f'qstat failed with exit code {completed.returncode} '
                f'(standard error follows):\n{completed.stderr}')

        # For the list of known statuses see `man 5 sge_status`
        # (https://arc.liv.ac.uk/SGE/htmlman/htmlman5/sge_status.html)
        states_by_name = {
            'RUNNING': ('r', 'hr', 't', 'Rr', 'Rt'),
            'PENDING': ('qw', 'Rq', 'hqw', 'hRwq'),
            'SUSPENDED': ('s', 'ts', 'S', 'tS', 'T', 'tT', 'Rs', 'Rts',
                          'RS', 'RtS', 'RT', 'RtT'),
            'ERROR': ('Eqw', 'Ehqw', 'EhRqw'),
            'DELETING': ('dr', 'dt', 'dRr', 'dRt', 'ds', 'dS', 'dT', 'dRs',
                         'dRS', 'dRT'),
            'COMPLETED': ('z',),
        }
        name_of_state = {
            code: name
            for name, codes in states_by_name.items() for code in codes
        }

        # Index the jobs to poll on their jobid
        jobs_by_id = {job.jobid: job for job in polled}

        # Walk through the returned XML and update the jobs relevant to
        # ReFrame; the naming convention of the variables matches that of
        # SGE's XML output
        seen = set()    # jobs known to the SGE scheduler
        for queue_info in ET.fromstring(completed.stdout):
            if queue_info is None:
                raise JobSchedulerError('could not retrieve queue information')

            for job_list in queue_info:
                if job_list.find("JB_owner").text != user:
                    # Not a job of this user.
                    continue

                job = jobs_by_id.get(job_list.find("JB_job_number").text)
                if job is None:
                    # Not a reframe job
                    continue

                state = job_list.find("state").text
                seen.add(job)

                # Unrecognized states leave the job's state untouched
                new_state = name_of_state.get(state)
                if new_state is not None:
                    job._state = new_state

        # Mark any "unknown" job as completed
        for job in set(polled) - seen:
            self.log(f'Job {job.jobid} not known to scheduler, '
                     f'assuming job completed')
            job._state = 'COMPLETED'