def check_links_status(self, fail_running=False, fail_pending=False):
    """Summarize the job status of every `Link` held by this `Chain`.

    Each link in ``self._links`` is asked for the status of the job
    stored under its full key; the answers are tallied and reduced to
    a single flag.

    Parameters
    ----------

    fail_running : `bool`
        If True, consider running jobs as failed

    fail_pending : `bool`
        If True, consider pending jobs as failed

    Returns
    -------

    status : `JobStatus`
        Job status flag that summarizes the status of all the jobs,
        as computed by `JobStatusVector.get_status`.

    """
    # Both flags are forwarded unchanged to every link.
    failure_flags = dict(fail_running=fail_running,
                         fail_pending=fail_pending)

    tally = JobStatusVector()
    for current_link in self._links.values():
        fullkey = JobDetails.make_fullkey(current_link.full_linkname)
        outcome = current_link.check_job_status(fullkey, **failure_flags)
        tally[outcome] += 1

    return tally.get_status()
def _check_link_completion(self, link, fail_pending=False, fail_running=False):
    """Internal function to refresh and count the status of dispatched jobs

    Every entry of ``link.jobs`` whose key does not contain
    `JobDetails.topkey` is re-checked through ``self._interface``,
    optionally demoted to failed, stored back on the link and counted.

    Parameters
    ----------

    link : object
        Object exposing a ``jobs`` mapping of job key to job details
        and a ``_set_status_self`` method.

    fail_pending : `bool`
        If True, consider pending jobs as failed

    fail_running : `bool`
        If True, consider running jobs as failed

    Returns
    -------

    status_vect : `JobStatusVector`
        Vector that summarize the number of jobs in various states.

    """
    counts = JobStatusVector()

    for key, details in link.jobs.items():
        # Keys containing the top-level marker are not checked or counted.
        if JobDetails.topkey in key:
            continue

        details.status = self._interface.check_job(details)

        # Optionally treat jobs that have not finished yet as failures.
        demote = ((details.status == JobStatus.pending and fail_pending) or
                  (details.status == JobStatus.running and fail_running))
        if demote:
            details.status = JobStatus.failed

        counts[details.status] += 1

        # Re-assigning an existing key does not resize the mapping, so this
        # is safe while iterating over it.
        link.jobs[key] = details
        link._set_status_self(details.jobkey, details.status)

    return counts
def _invoke(self, argv, stream=sys.stdout, resubmit_failed=False):
    """Invoke this object to perform a particular action

    Parameters
    ----------

    argv : list
        List of command line arguments, passed to helper classes

    stream : `file`
        Stream that this function will print to,
        must have 'write' function.

    resubmit_failed : bool
        Resubmit failed jobs.

    Returns
    -------

    status_vect : `JobStatusVector`
        Vector that summarize the number of jobs in various states.
        The vector is empty when the action matches none of the
        handled cases ('run', 'resubmit', 'check_status', 'config').

        NOTE(review): the 'skip' action returns the bare flag
        `JobStatus.no_job` instead of a vector; this is kept as-is so
        existing callers are not affected.

    """
    args = self._run_argparser(argv)
    action = args.action

    if action not in ACTIONS:
        # Report only; execution deliberately continues below.
        sys.stderr.write(
            "Unrecognized action %s, options are %s\n" % (action, ACTIONS))

    if action == 'skip':
        return JobStatus.no_job
    if action in ('run', 'resubmit', 'check_status', 'config'):
        self._job_configs = self.build_job_configs(args.__dict__)

    self._interface._dry_run = args.dry_run

    # Start from an empty vector so that an action with no branch below
    # (e.g. an unrecognized one) returns a well-defined value instead of
    # raising UnboundLocalError at the final return.
    status_vect = JobStatusVector()

    if action == 'run':
        status_vect = self.run_jobs(stream, resubmit_failed=resubmit_failed)
    elif action == 'resubmit':
        status_vect = self.resubmit(stream, resubmit_failed=resubmit_failed)
    elif action == 'check_status':
        self._build_job_dict()
        status_vect = self.check_status(stream)
    elif action == 'config':
        # Only building the job dictionary is required; report it as done.
        self._build_job_dict()
        status_vect[JobStatus.done] += 1

    return status_vect
def print_update(self, stream=sys.stdout, job_stats=None):
    """Print an update about the current number of jobs in each state

    Parameters
    ----------

    stream : `file`
        Stream that this function will print to,
        must have 'write' function.

    job_stats : `JobStatusVector` or None
        Counts to report.  If None, the counts are built from the jobs
        of ``self._scatter_link``, ignoring jobs whose status is
        `JobStatus.no_job`.

    """
    if job_stats is None:
        # No counts supplied: tally the scatter link's jobs here.  A vector
        # handed in by the caller is reported as given.
        job_stats = JobStatusVector()
        for details in self._scatter_link.jobs.values():
            if details.status != JobStatus.no_job:
                job_stats[details.status] += 1

    # One (template, values) pair per write, in output order.
    report = (
        ("Status :\n Total : %i\n Unknown: %i\n",
         (job_stats.n_total, job_stats[JobStatus.unknown])),
        (" Not Ready: %i\n Ready: %i\n",
         (job_stats[JobStatus.not_ready], job_stats[JobStatus.ready])),
        (" Pending: %i\n Running: %i\n",
         (job_stats[JobStatus.pending], job_stats[JobStatus.running])),
        (" Done: %i\n Failed: %i\n",
         (job_stats[JobStatus.done], job_stats[JobStatus.failed])),
    )
    for template, values in report:
        stream.write(template % values)
def check_status(self, stream=sys.stdout,
                 check_once=False,
                 fail_pending=False, fail_running=False,
                 no_wait=False, do_print=True,
                 write_status=False):
    """Poll the jobs of the scatter link until they have all finished.

    The loop ends when every job is done or failed, after a single pass
    if a one-shot check was requested, or before the second pass when
    ``self.args['dry_run']`` is set.

    Parameters
    -----------

    stream : `file`
        Stream that this function will print to,
        Must have 'write' function.

    check_once : bool
        Check status once and exit loop.

    fail_pending : `bool`
        If True, consider pending jobs as failed

    fail_running : `bool`
        If True, consider running jobs as failed

    no_wait : bool
        Do not sleep before checking jobs.

    do_print : bool
        Print summary stats.

    write_status : bool
        Write the status to the log file.

    Returns
    -------

    status_vect : `JobStatusVector`
        Vector that summarize the number of jobs in various states.

    """
    # When output goes somewhere other than stdout, a compact progress
    # line is echoed on stdout as well.
    if not check_once and stream != sys.stdout:
        sys.stdout.write('Checking status (%is): ' %
                         self.args['job_check_sleep'])
        sys.stdout.flush()

    status_vect = JobStatusVector()
    passes_done = 0
    while True:
        # The first pass checks immediately; later passes wait first.
        if passes_done > 0:
            if self.args['dry_run']:
                break
            if not no_wait:
                stream.write("Sleeping %.0f seconds between status checks\n" %
                             self.args['job_check_sleep'])
                if stream != sys.stdout:
                    sys.stdout.write('.')
                    sys.stdout.flush()
                time.sleep(self.args['job_check_sleep'])
        passes_done += 1

        status_vect = self._check_link_completion(self._scatter_link,
                                                  fail_pending, fail_running)

        # One-shot modes: report (if asked) and stop after a single pass.
        if self.args['check_status_once'] or check_once or no_wait:
            if do_print:
                self.print_update(stream, status_vect)
            break

        if self.args['print_update'] and do_print:
            self.print_update(stream, status_vect)

        if self._job_archive is not None:
            self._job_archive.write_table_file()

        # Stop once every counted job is either done or failed.
        n_finished = status_vect.n_done + status_vect.n_failed
        if n_finished == status_vect.n_total:
            break

    status = status_vect.get_status()
    any_failed = status in (JobStatus.failed, JobStatus.partial_failed)

    if any_failed and do_print:
        self.print_update(stream, status_vect)
        self.print_failed(stream)
    if write_status:
        # The log receives the status flag on failure and 0 otherwise.
        self._write_status_to_log(status if any_failed else 0, stream)

    self._set_status_self(status=status)

    if not check_once and stream != sys.stdout:
        sys.stdout.write("! %s\n" % (JOB_STATUS_STRINGS[status]))

    if self._job_archive is not None:
        self._job_archive.write_table_file()

    return status_vect