def serialize(self, value):
    """
    Serializes *value*, interpreted in the unit *self.unit*, into its human-readable,
    colon-separated string representation.
    """
    if not value:
        value = 0
    value_seconds = parse_duration(value, input_unit=self.unit, unit="s")
    return human_duration(seconds=value_seconds, colon_format=True)
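# Usage sketch (assumption: parse_duration and human_duration are the helpers from law.util,
# and self.unit is the parameter's input unit): serialization accepts numbers or duration
# strings and yields a colon-separated string. Standalone illustration of the same conversion:
from law.util import parse_duration, human_duration

seconds = parse_duration("90 seconds", input_unit="s", unit="s")
print(human_duration(seconds=seconds, colon_format=True))  # e.g. "0:01:30"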
def send(error, transports, t0):
    # do nothing when there are no transports
    if not transports:
        return

    # do nothing on KeyboardInterrupt, or when on_success / on_failure do not match the status
    success = error is None
    if isinstance(error, KeyboardInterrupt):
        return
    elif success and not opts["on_success"]:
        return
    elif not success and not opts["on_failure"]:
        return

    # prepare message content
    duration = human_duration(seconds=round(time.time() - t0, 1))
    status_string = "succeeded" if success else "failed"
    title = "Task {} {}!".format(_task.get_task_family(), status_string)
    parts = collections.OrderedDict([
        ("Host", socket.gethostname()),
        ("Duration", duration),
        ("Last message", "-" if not len(_task._message_cache) else _task._message_cache[-1]),
        ("Task", str(_task)),
    ])
    if not success:
        parts["Traceback"] = traceback.format_exc()
    message = "\n".join("{}: {}".format(*tpl) for tpl in parts.items())

    # dispatch via all transports
    for transport in transports:
        fn = transport["func"]
        raw = transport.get("raw", False)
        colored = transport.get("colored", False)

        # remove color commands if necessary
        if not colored:
            _title = uncolored(title)
            if raw:
                _content = {
                    k: (uncolored(v) if isinstance(v, six.string_types) else v)
                    for k, v in parts.items()
                }
            else:
                _content = uncolored(message)
        else:
            _title = title
            _content = parts.copy() if raw else message

        # invoke the function
        try:
            fn(success, _title, _content, **opts)
        except Exception as e:
            t = traceback.format_exc()
            logger.warning("notification via transport '{}' failed: {}\n{}".format(fn, e, t))
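# Hypothetical transport sketch: send() expects each transport to be a dict with a callable
# under "func" and optional "raw" / "colored" flags (names taken from the lookups above).
# A minimal stdout transport might look like this:
def print_transport(success, title, content, **opts):
    # with raw=False, content arrives as a single pre-rendered string
    print("[{}] {}".format("OK" if success else "FAIL", title))
    print(content)

# send(error=None, transports=[{"func": print_transport, "raw": False, "colored": False}],
#     t0=time.time())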
def log_duration(t0):
    duration = human_duration(seconds=round(time.time() - t0, 1))

    # log
    timeit_logger = logger.getChild("timeit")
    timeit_logger.info("runtime of {}: {}".format(task.task_id, duration))

    # optionally publish a task message to the scheduler
    if opts["publish_message"] and callable(getattr(task, "publish_message", None)):
        task.publish_message("runtime: {}".format(duration))
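# Usage sketch: log_duration closes over task, logger and opts, so it is meant to be defined
# next to the timed code, e.g. inside a decorator wrapping task.run(); t0 must be a
# time.time() timestamp taken before the work starts:
#
#   t0 = time.time()
#   task.run()
#   log_duration(t0)  # -> "runtime of <task_id>: <human-readable duration>"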
@contextlib.contextmanager
def publish_step(self, msg, success_message="done", fail_message="failed", runtime=True,
        scheduler=True):
    self.publish_message(msg, scheduler=scheduler)
    success = False
    t0 = time.time()
    try:
        yield
        success = True
    finally:
        msg = success_message if success else fail_message
        if runtime:
            diff = time.time() - t0
            msg = "{} (took {})".format(msg, human_duration(seconds=diff))
        self.publish_message(msg, scheduler=scheduler)
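# Usage sketch (hypothetical task body): since publish_step yields exactly once, it is used
# as a context manager around a unit of work and reports the measured runtime in the closing
# message:
#
#   def run(self):
#       with self.publish_step("uploading files ...", fail_message="upload failed"):
#           self.do_upload()  # hypothetical helper
#   # -> publishes e.g. "done (took 3.2 seconds)" on success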
def renew_voms_proxy(password="", vo=None, lifetime="8 days", proxy_file=None):
    """
    Renews the voms proxy using a password *password*, an optional virtual organization name
    *vo*, and a default *lifetime* of 8 days, which is internally parsed by
    :py:func:`law.util.parse_duration` where the default input unit is hours. To ensure that
    the *password* is not visible in any process listing, it is written to a temporary file
    first and piped into the ``voms-proxy-init`` command. When *proxy_file* is *None*, it
    defaults to the result of :py:func:`get_voms_proxy_file`.
    """
    # parse and format the lifetime
    lifetime_seconds = max(parse_duration(lifetime, input_unit="h", unit="s"), 60.0)
    lifetime = human_duration(seconds=lifetime_seconds, colon_format="h")

    # cut the seconds part
    normalized = ":".join((2 - lifetime.count(":")) * ["00"] + [""]) + lifetime
    lifetime = ":".join(normalized.rsplit(":", 3)[-3:-1])

    # when proxy_file is None, get the default
    # when empty string, don't add a --out argument
    if proxy_file is None:
        proxy_file = get_voms_proxy_file()

    with tmp_file() as (_, tmp):
        with open(tmp, "w") as f:
            f.write(password)

        cmd = "cat '{}' | voms-proxy-init --valid '{}'".format(tmp, lifetime)
        if vo:
            cmd += " -voms '{}'".format(vo)
        if proxy_file:
            proxy_file = os.path.expandvars(os.path.expanduser(proxy_file))
            cmd += " --out '{}'".format(proxy_file)

        code, out, _ = interruptable_popen(cmd, shell=True, executable="/bin/bash",
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        if code != 0:
            raise Exception("voms-proxy-init failed: {}".format(out))
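# Standalone sketch of the lifetime normalization above (assumption: the helpers come from
# law.util): a lifetime is parsed to seconds, rendered in hour-based colon format, zero-padded
# to three fields, and the seconds field is cut off, yielding the "hours:minutes" form passed
# to --valid:
from law.util import parse_duration, human_duration

seconds = max(parse_duration("8 days", input_unit="h", unit="s"), 60.0)
s = human_duration(seconds=seconds, colon_format="h")
normalized = ":".join((2 - s.count(":")) * ["00"] + [""]) + s
print(":".join(normalized.rsplit(":", 3)[-3:-1]))  # e.g. "192:00"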
def log_duration(t0):
    duration = human_duration(seconds=round(perf_counter() - t0, 1))
    task.logger.info("runtime: {}".format(duration))
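# Usage sketch: unlike the variant further above, this helper times with perf_counter(), a
# monotonic clock that is immune to system clock adjustments, so t0 must come from the same
# clock:
#
#   from time import perf_counter
#   t0 = perf_counter()
#   # ... timed section ...
#   log_duration(t0)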
def poll(self):
    """
    Initiates the job status polling loop.
    """
    task = self.task

    # total job count
    n_jobs = len(self.submission_data)

    # track finished and failed jobs in dicts holding status data
    finished_jobs = OrderedDict()
    failed_jobs = OrderedDict()

    # track number of consecutive polling failures and the start time
    n_poll_fails = 0
    start_time = time.time()

    # get job kwargs for status querying
    query_kwargs = self._get_job_kwargs("query")

    # start the poll loop
    i = -1
    while True:
        i += 1

        # sleep after the first iteration
        if i > 0:
            time.sleep(task.poll_interval * 60)

        # handle scheduler messages, which could change some task parameters
        task._handle_scheduler_messages()

        # walltime exceeded?
        if task.walltime != NO_FLOAT and (time.time() - start_time) > task.walltime * 3600:
            raise Exception("exceeded walltime: {}".format(human_duration(hours=task.walltime)))

        # update variable attributes for polling
        self.poll_data.n_finished_min = task.acceptance * (1 if task.acceptance > 1 else n_jobs)
        self.poll_data.n_failed_max = task.tolerance * (1 if task.tolerance > 1 else n_jobs)

        # determine the currently active jobs, i.e., the jobs whose states should be checked,
        # and also store jobs whose ids are unknown
        active_jobs = OrderedDict()
        unknown_jobs = OrderedDict()
        for job_num, data in six.iteritems(self.submission_data.jobs):
            if job_num in finished_jobs or job_num in failed_jobs:
                continue
            elif self._can_skip_job(job_num, data["branches"]):
                finished_jobs[job_num] = self.status_data_cls.job_data(
                    status=self.job_manager.FINISHED, code=0)
            else:
                data = data.copy()
                if data["job_id"] in (None, self.status_data_cls.dummy_job_id):
                    data["job_id"] = self.status_data_cls.dummy_job_id
                    unknown_jobs[job_num] = data
                else:
                    active_jobs[job_num] = data
        self.poll_data.n_active = len(active_jobs) + len(unknown_jobs)

        # query job states
        job_ids = [data["job_id"] for data in six.itervalues(active_jobs)]  # noqa: F812
        query_data = self.job_manager.query_batch(job_ids, **query_kwargs)

        # separate into actual states and errors that might have occurred during the status query
        states_by_id = {}
        errors = []
        for job_id, state_or_error in six.iteritems(query_data):
            if isinstance(state_or_error, Exception):
                errors.append(state_or_error)
            else:
                states_by_id[job_id] = state_or_error

        # print the first show_errors errors
        if errors:
            print("{} error(s) occurred during job status query of task {}:".format(
                len(errors), task.task_id))
            tmpl = " {}"
            # use a separate counter to avoid shadowing the poll loop counter i
            for j, err in enumerate(errors):
                print(tmpl.format(err))
                if j + 1 >= self.show_errors:
                    remaining = len(errors) - self.show_errors
                    if remaining > 0:
                        print(" ... and {} more".format(remaining))
                    break
            n_poll_fails += 1
            if task.poll_fails > 0 and n_poll_fails > task.poll_fails:
                raise Exception("poll_fails exceeded")
            else:
                continue
        else:
            n_poll_fails = 0

        # states stores job_id's as keys, so replace them by using job_num's
        # from active_jobs (which was used for the list of jobs to query in the first place)
        states_by_num = OrderedDict()
        for job_num, data in six.iteritems(active_jobs):
            job_id = data["job_id"]
            states_by_num[job_num] = self.status_data_cls.job_data(**states_by_id[job_id])

        # consider jobs with unknown ids as retry jobs
        for job_num, data in six.iteritems(unknown_jobs):
            states_by_num[job_num] = self.status_data_cls.job_data(
                status=self.job_manager.RETRY, error="unknown job id")

        # store jobs per status and take further actions depending on the status
        pending_jobs = OrderedDict()
        running_jobs = OrderedDict()
        newly_failed_jobs = OrderedDict()
        retry_jobs = OrderedDict()
        for job_num, data in six.iteritems(states_by_num):
            if data["status"] == self.job_manager.PENDING:
                pending_jobs[job_num] = data
                task.forward_dashboard_event(self.dashboard, data, "status.pending", job_num)
            elif data["status"] == self.job_manager.RUNNING:
                running_jobs[job_num] = data
                task.forward_dashboard_event(self.dashboard, data, "status.running", job_num)
            elif data["status"] == self.job_manager.FINISHED:
                finished_jobs[job_num] = data
                self.poll_data.n_active -= 1
                self.submission_data.jobs[job_num]["job_id"] = self.submission_data.dummy_job_id
                task.forward_dashboard_event(self.dashboard, data, "status.finished", job_num)
            elif data["status"] in (self.job_manager.FAILED, self.job_manager.RETRY):
                newly_failed_jobs[job_num] = data
                self.poll_data.n_active -= 1

                # retry or ultimately failed?
                if self.job_retries[job_num] < task.retries:
                    self.job_retries[job_num] += 1
                    self.submission_data.attempts.setdefault(job_num, 0)
                    self.submission_data.attempts[job_num] += 1
                    data["status"] = self.job_manager.RETRY
                    retry_jobs[job_num] = self.submission_data.jobs[job_num]["branches"]
                    task.forward_dashboard_event(self.dashboard, data, "status.retry", job_num)
                else:
                    failed_jobs[job_num] = data
                    task.forward_dashboard_event(self.dashboard, data, "status.failed", job_num)
            else:
                raise Exception("unknown job status '{}'".format(data["status"]))

        # gather some counts
        n_pending = len(pending_jobs)
        n_running = len(running_jobs)
        n_finished = len(finished_jobs)
        n_retry = len(retry_jobs)
        n_failed = len(failed_jobs)
        n_unsubmitted = len(self.submission_data.unsubmitted_jobs)

        # log the status line
        counts = (n_pending, n_running, n_finished, n_retry, n_failed)
        if self.poll_data.n_parallel != self.n_parallel_max:
            counts = (n_unsubmitted,) + counts
        status_line = self.job_manager.status_line(counts, last_counts=True, sum_counts=n_jobs,
            color=True, align=task.align_polling_status_line)
        status_line = task.modify_polling_status_line(status_line)
        task.publish_message(status_line)
        self.last_status_counts = counts

        # inform the scheduler about the progress
        task.publish_progress(100. * n_finished / n_jobs)

        # log newly failed jobs
        if newly_failed_jobs:
            print("{} failed job(s) in task {}:".format(len(newly_failed_jobs), task.task_id))
            tmpl = " job: {job_num}, branch(es): {branches}, id: {job_id}, " \
                "status: {status}, code: {code}, error: {error}{ext}"
            for j, (job_num, data) in enumerate(six.iteritems(newly_failed_jobs)):
                branches = self.submission_data.jobs[job_num]["branches"]
                log_file = self.submission_data.jobs[job_num]["log_file"]
                ext = ""
                if data["code"] in self.job_error_messages:
                    law_err = self.job_error_messages[data["code"]]
                    ext += ", job script error: {}".format(law_err)
                if log_file:
                    ext += ", log: {}".format(log_file)
                print(tmpl.format(job_num=job_num,
                    branches=",".join(str(b) for b in branches), ext=ext, **data))
                if j + 1 >= self.show_errors:
                    remaining = len(newly_failed_jobs) - self.show_errors
                    if remaining > 0:
                        print(" ... and {} more".format(remaining))
                    break

        # infer the overall status
        reached_end = n_jobs == n_finished + n_failed
        finished = n_finished >= self.poll_data.n_finished_min
        failed = n_failed > self.poll_data.n_failed_max
        unreachable = n_jobs - n_failed < self.poll_data.n_finished_min
        if finished:
            # write status output
            if "status" in self._outputs:
                status_data = self.status_data_cls()
                status_data.jobs.update(finished_jobs)
                status_data.jobs.update(states_by_num)
                self._outputs["status"].dump(status_data, formatter="json", indent=4)
            break
        elif failed:
            failed_nums = [job_num for job_num in failed_jobs if job_num not in retry_jobs]
            raise Exception("tolerance exceeded for jobs {}".format(failed_nums))
        elif unreachable:
            err = None
            if reached_end:
                err = "acceptance of {} not reached, total jobs: {}, failed jobs: {}"
            elif task.check_unreachable_acceptance:
                err = "acceptance of {} unreachable, total jobs: {}, failed jobs: {}"
            if err:
                raise Exception(err.format(self.poll_data.n_finished_min, n_jobs, n_failed))

        # configurable poll callback
        task.poll_callback(self.poll_data)

        # trigger automatic resubmission and submission of unsubmitted jobs if necessary
        if retry_jobs or self.poll_data.n_active < self.poll_data.n_parallel:
            self.submit(retry_jobs)

        # break when no polling is desired
        # we can get to this point when there was already a submission and the no_poll
        # parameter was set so that only failed jobs are resubmitted once
        if task.no_poll:
            break

    duration = round(time.time() - start_time)
    task.publish_message("polling took {}".format(human_duration(seconds=duration)))
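# Hook sketch (hypothetical workflow subclass): the loop above calls back into the task via
# poll_callback and modify_polling_status_line, so both can be overridden to customize
# polling behavior:
#
#   class MyWorkflow(MyRemoteWorkflowBase):  # hypothetical base class
#       def poll_callback(self, poll_data):
#           # e.g. throttle the number of parallel jobs at runtime
#           poll_data.n_parallel = 10
#
#       def modify_polling_status_line(self, status_line):
#           return "{} (my analysis)".format(status_line)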
def handle_scheduler_message(self, msg, _attr_value=None):
    """ handle_scheduler_message(msg)
    Hook that is called when a scheduler message *msg* is received. Returns *True* when the
    message was handled, and *False* otherwise.

    Handled messages in addition to those defined in
    :py:meth:`law.workflow.base.BaseWorkflow.handle_scheduler_message`:

    - ``parallel_jobs = <int>``
    - ``walltime = <str/int/float>``
    - ``poll_fails = <int>``
    - ``poll_interval = <str/int/float>``
    - ``retries = <int>``
    """
    attr, value = _attr_value or (None, None)

    # handle "parallel_jobs"
    if attr is None:
        m = re.match(r"^\s*(parallel_jobs)\s*(=|:)\s*(.*)\s*$", str(msg))
        if m:
            attr = "parallel_jobs"
            # the workflow proxy must be set here
            if not getattr(self, "workflow_proxy", None):
                value = Exception("workflow_proxy not set yet")
            else:
                try:
                    n = self.workflow_proxy._set_parallel_jobs(int(m.group(3)))
                    value = "unlimited" if n == self.workflow_proxy.n_parallel_max else str(n)
                except ValueError as e:
                    value = e

    # handle "walltime"
    if attr is None:
        m = re.match(r"^\s*(walltime)\s*(=|:)\s*(.*)\s*$", str(msg))
        if m:
            attr = "walltime"
            try:
                self.walltime = self.__class__.walltime.parse(m.group(3))
                value = human_duration(hours=self.walltime, colon_format=True)
            except ValueError as e:
                value = e

    # handle "poll_fails"
    if attr is None:
        m = re.match(r"^\s*(poll_fails)\s*(=|:)\s*(.*)\s*$", str(msg))
        if m:
            attr = "poll_fails"
            try:
                self.poll_fails = int(m.group(3))
                value = self.poll_fails
            except ValueError as e:
                value = e

    # handle "poll_interval"
    if attr is None:
        m = re.match(r"^\s*(poll_interval)\s*(=|:)\s*(.*)\s*$", str(msg))
        if m:
            attr = "poll_interval"
            try:
                self.poll_interval = self.__class__.poll_interval.parse(m.group(3))
                value = human_duration(minutes=self.poll_interval, colon_format=True)
            except ValueError as e:
                value = e

    # handle "retries"
    if attr is None:
        m = re.match(r"^\s*(retries)\s*(=|:)\s*(.*)\s*$", str(msg))
        if m:
            attr = "retries"
            try:
                self.retries = int(m.group(3))
                value = self.retries
            except ValueError as e:
                value = e

    return super(BaseRemoteWorkflow, self).handle_scheduler_message(msg, (attr, value))
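# Message-format sketch: scheduler messages are plain "<attr> = <value>" or "<attr>: <value>"
# strings, matched by the regular expressions above. A quick standalone check of the pattern:
import re

m = re.match(r"^\s*(walltime)\s*(=|:)\s*(.*)\s*$", "walltime = 2h")
print(m.group(1), m.group(3))  # -> "walltime 2h"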