def submit(self, job):
    '''Submit *job* through ``sbatch``, retrying on known transient errors.

    On a submission failure whose stderr matches one of the patterns in
    ``self._resubmit_on_errors``, the submission is retried after a short,
    cycling backoff (1s, 2s, 3s, 1s, ...); any other failure propagates.
    '''
    backoff = itertools.cycle([1, 2, 3])
    submit_cmd = f'sbatch {job.script_filename}'
    while True:
        try:
            completed = _run_strict(submit_cmd, timeout=self._submit_timeout)
        except SpawnedProcessError as e:
            patt = '|'.join(self._resubmit_on_errors)
            error_match = re.search(rf'({patt})', e.stderr)
            if not self._resubmit_on_errors or not error_match:
                raise

            wait = next(backoff)
            self.log(f'encountered a job submission error: '
                     f'{error_match.group(1)}: will resubmit after {wait}s')
            time.sleep(wait)
        else:
            break

    match = re.search(r'Submitted batch job (?P<jobid>\d+)', completed.stdout)
    if match is None:
        raise JobSchedulerError(
            'could not retrieve the job id of the submitted job')

    job._jobid = match.group('jobid')
    job._submit_time = time.time()
def allnodes(self):
    '''Return all the nodes known to the scheduler.

    :raises JobSchedulerError: if ``scontrol`` fails.
    '''
    try:
        completed = _run_strict('scontrol -a show -o nodes')
    except SpawnedProcessError as e:
        raise JobSchedulerError(
            'could not retrieve node information') from e

    return _create_nodes(completed.stdout.splitlines())
def submit(self, job):
    '''Submit *job* by piping its script into ``bsub``.'''
    with open(job.script_filename) as fp:
        completed = _run_strict('bsub', stdin=fp)

    match = re.search(r'^Job <(?P<jobid>\S+)> is submitted',
                      completed.stdout)
    if match is None:
        raise JobSchedulerError('could not retrieve the job id '
                                'of the submitted job')

    job._jobid = match.group('jobid')
    job._submit_time = time.time()
def submit(self, job):
    '''Submit *job* through ``bsub`` with the script passed on the command line.'''
    completed = _run_strict(f'bsub {job.script_filename}',
                            timeout=self._submit_timeout)
    match = re.search(r'^Job <(?P<jobid>\S+)> is submitted',
                      completed.stdout)
    if match is None:
        raise JobSchedulerError('could not retrieve the job id '
                                'of the submitted job')

    job._jobid = match.group('jobid')
    job._submit_time = time.time()
def submit(self, job):
    '''Submit *job* through ``qsub``.'''

    # The `-o` and `-e` options are passed on the command line, since that is
    # the only place where the PBS Slurm wrappers recognize them.
    submit_cmd = (f'qsub -o {job.stdout} -e {job.stderr} '
                  f'{job.script_filename}')
    completed = _run_strict(submit_cmd, timeout=self._submit_timeout)

    # qsub prints the job id as the first token of its output
    match = re.search(r'^(?P<jobid>\S+)', completed.stdout)
    if match is None:
        raise JobSchedulerError('could not retrieve the job id '
                                'of the submitted job')

    job._jobid = match.group('jobid')
    job._submit_time = time.time()
def _get_reservation_nodes(self, reservation):
    '''Return the nodes participating in *reservation*.

    :raises JobSchedulerError: if the node names cannot be extracted from
        the ``scontrol`` output.
    '''
    completed = _run_strict('scontrol -a show res %s' % reservation)
    match = re.search(r'(Nodes=\S+)', completed.stdout)
    if not match:
        raise JobSchedulerError("could not extract the node names for "
                                "reservation '%s'" % reservation)

    # Ask scontrol for the full description of the reserved nodes
    completed = _run_strict('scontrol -a show -o %s' % match.group(1))
    return _create_nodes(completed.stdout.splitlines())
def __init__(self, node_descr):
    '''Build a node representation from a single ``scontrol`` node description.

    :raises JobSchedulerError: if the description contains no ``NodeName``.
    '''
    extract = self._extract_attribute
    self._name = extract('NodeName', node_descr)
    if not self._name:
        raise JobSchedulerError(
            'could not extract NodeName from node description')

    # Missing attributes default to an empty set
    self._partitions = extract('Partitions', node_descr, sep=',') or set()
    self._active_features = (extract('ActiveFeatures', node_descr, sep=',') or
                             set())
    self._states = extract('State', node_descr, sep='+') or set()
    self._descr = node_descr
def submit(self, job):
    '''Submit *job* through ``oarsub`` in batch mode.'''

    # OAR batch submission mode needs the full path to the job script
    script_path = os.path.join(job.workdir, job.script_filename)

    # OAR needs -S to submit the job in batch mode
    completed = _run_strict(f'oarsub -S {script_path}',
                            timeout=self._submit_timeout)
    match = re.search(r'.*OAR_JOB_ID=(?P<jobid>\S+)', completed.stdout)
    if match is None:
        raise JobSchedulerError('could not retrieve the job id '
                                'of the submitted job')

    job._jobid = match.group('jobid')
    job._submit_time = time.time()
def poll(self, *jobs):
    '''Poll ``qstat -f`` once and update the state of each of *jobs*.

    Updates the private ``_state``, ``_completed``, ``_exitcode`` and
    ``_exception`` fields of the passed job objects; jobs unknown to the
    scheduler are treated as completed.
    '''

    def output_ready(job):
        # We report a job as finished only when its stdout/stderr are
        # written back to the working directory
        stdout = os.path.join(job.workdir, job.stdout)
        stderr = os.path.join(job.workdir, job.stderr)
        return os.path.exists(stdout) and os.path.exists(stderr)

    if jobs:
        # Filter out non-jobs
        jobs = [job for job in jobs if job is not None]

    if not jobs:
        return

    completed = osext.run_command(
        f'qstat -f {" ".join(job.jobid for job in jobs)}')

    # Depending on the configuration, completed jobs will remain on the job
    # list for a limited time, or be removed upon completion.
    # If qstat cannot find any of the job IDs, it will return 153.
    # Otherwise, it will return with return code 0 and print information
    # only for the jobs it could find.
    # NOTE(review): 35 appears to be an alternative "unknown job id" return
    # code on some qstat variants — treated the same as 153 here.
    if completed.returncode in (153, 35):
        self.log(f'Return code is {completed.returncode}')
        for job in jobs:
            job._state = 'COMPLETED'
            if job.cancelled or output_ready(job):
                self.log(f'Assuming job {job.jobid} completed')
                job._completed = True

        return

    if completed.returncode != 0:
        raise JobSchedulerError(
            f'qstat failed with exit code {completed.returncode} '
            f'(standard error follows):\n{completed.stderr}')

    # Store information for each job separately; qstat separates the
    # per-job records with blank lines
    jobinfo = {}
    for job_raw_info in completed.stdout.split('\n\n'):
        jobid_match = re.search(r'^Job Id:\s*(?P<jobid>\S+)', job_raw_info,
                                re.MULTILINE)
        if jobid_match:
            jobid = jobid_match.group('jobid')
            jobinfo[jobid] = job_raw_info

    for job in jobs:
        if job.jobid not in jobinfo:
            # qstat returned 0 but did not report this job id
            self.log(f'Job {job.jobid} not known to scheduler')
            job._state = 'COMPLETED'
            if job.cancelled or output_ready(job):
                self.log(f'Assuming job {job.jobid} completed')
                job._completed = True

            continue

        info = jobinfo[job.jobid]
        state_match = re.search(r'^\s*job_state = (?P<state>[A-Z])', info,
                                re.MULTILINE)
        if not state_match:
            self.log(f'Job state not found (job info follows):\n{info}')
            continue

        # Translate the scheduler's single-letter state through JOB_STATES
        state = state_match.group('state')
        job._state = JOB_STATES[state]
        nodelist_match = re.search(r'exec_host = (?P<nodespec>[\S\t\n]+)',
                                   info, re.MULTILINE)
        if nodelist_match:
            # Strip the line-wrapping whitespace qstat inserts in long
            # node specifications before parsing them
            nodespec = nodelist_match.group('nodespec')
            nodespec = re.sub(r'[\n\t]*', '', nodespec)
            self._update_nodelist(job, nodespec)

        if job.state == 'COMPLETED':
            exitcode_match = re.search(
                r'^\s*exit_status = (?P<code>\d+)', info, re.MULTILINE,
            )
            if exitcode_match:
                job._exitcode = int(exitcode_match.group('code'))

            # We report a job as finished only when its stdout/stderr are
            # written back to the working directory
            done = job.cancelled or output_ready(job)
            if done:
                job._completed = True
        elif (job.state in ['QUEUED', 'HELD', 'WAITING'] and
              job.max_pending_time):
            # Enforce the job's maximum pending time, if any
            if (time.time() - job.submit_time >= job.max_pending_time):
                self.cancel(job)
                job._exception = JobError('maximum pending time exceeded',
                                          job.jobid)
def poll(self, *jobs):
    '''Poll ``qstat -xml`` once and update the state of each of *jobs*.

    Jobs not reported by the scheduler are assumed completed.
    '''
    if jobs:
        # Filter out non-jobs
        jobs = [job for job in jobs if job is not None]

    if not jobs:
        return

    user = osext.osuser()
    completed = osext.run_command(f'qstat -xml -u {user}')
    if completed.returncode != 0:
        raise JobSchedulerError(
            f'qstat failed with exit code {completed.returncode} '
            f'(standard error follows):\n{completed.stderr}')

    # Index the jobs to poll on their jobid
    jobs_to_poll = {job.jobid: job for job in jobs}

    # Map the scheduler's state codes onto ReFrame job states.
    # For the list of known statuses see `man 5 sge_status`
    # (https://arc.liv.ac.uk/SGE/htmlman/htmlman5/sge_status.html)
    state_map = (
        ({'r', 'hr', 't', 'Rr', 'Rt'}, 'RUNNING'),
        ({'qw', 'Rq', 'hqw', 'hRwq'}, 'PENDING'),
        ({'s', 'ts', 'S', 'tS', 'T', 'tT',
          'Rs', 'Rts', 'RS', 'RtS', 'RT', 'RtT'}, 'SUSPENDED'),
        ({'Eqw', 'Ehqw', 'EhRqw'}, 'ERROR'),
        ({'dr', 'dt', 'dRr', 'dRt', 'ds',
          'dS', 'dT', 'dRs', 'dRS', 'dRT'}, 'DELETING'),
        ({'z'}, 'COMPLETED'),
    )

    # Parse the XML and update the status of the jobs relevant to ReFrame;
    # variable names follow SGE's XML output conventions
    root = ET.fromstring(completed.stdout)
    known_jobs = set()  # jobs known to the SGE scheduler
    for queue_info in root:
        if queue_info is None:
            raise JobSchedulerError('could not retrieve queue information')

        for job_list in queue_info:
            if job_list.find("JB_owner").text != user:
                # Not a job of this user
                continue

            jobid = job_list.find("JB_job_number").text
            if jobid not in jobs_to_poll:
                # Not a reframe job
                continue

            state = job_list.find("state").text
            job = jobs_to_poll[jobid]
            known_jobs.add(job)
            for state_codes, reframe_state in state_map:
                if state in state_codes:
                    job._state = reframe_state
                    break

    # Mark any "unknown" job as completed
    for job in set(jobs) - known_jobs:
        self.log(f'Job {job.jobid} not known to scheduler, '
                 f'assuming job completed')
        job._state = 'COMPLETED'