def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
    """Poll jobs of specified tasks.

    Any job that is or was submitted or running can be polled, except for
    retrying tasks - which would poll (correctly) as failed. And don't poll
    succeeded tasks by default.

    This method uses _poll_task_jobs_callback() and
    _manip_task_jobs_callback() as help/callback methods.

    _poll_task_job_callback() executes one specific job.
    """
    to_poll_tasks = []
    pollable_statuses = set([
        TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED])
    if poll_succ:
        pollable_statuses.add(TASK_STATUS_SUCCEEDED)
    for itask in itasks:
        if itask.state.status in pollable_statuses:
            to_poll_tasks.append(itask)
        else:
            LOG.debug("skipping %s: not pollable, "
                      "or skipping 'succeeded' tasks" % itask.identity)
    if to_poll_tasks:
        if msg is not None:
            LOG.info(msg)
        self._run_job_cmd(
            self.JOBS_POLL, suite, to_poll_tasks,
            self._poll_task_jobs_callback)
def _process_message_submit_failed(self, itask, event_time):
    """Helper for process_message, handle a submit-failed message."""
    LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
    if event_time is None:
        event_time = get_current_time_string()
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_submit_exit": event_time,
        "submit_status": 1,
    })
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    self.job_pool.set_job_attr(job_d, 'batch_sys_job_id', None)
    itask.summary['submit_method_id'] = None
    self.pflag = True
    if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers
            or itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
        # No submission retry lined up: definitive failure.
        # See github #476.
        if itask.state.reset(TASK_STATUS_SUBMIT_FAILED):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_FAILED,
                'job %s' % self.EVENT_SUBMIT_FAILED)
        self.job_pool.set_job_state(job_d, TASK_STATUS_SUBMIT_FAILED)
    elif itask.state.reset(TASK_STATUS_SUBMIT_RETRYING):
        # There is a submission retry lined up.
        timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
        delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
        if itask.state.is_held:
            delay_msg = "held (%s)" % delay_msg
        msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        self.setup_event_handlers(
            itask, self.EVENT_SUBMIT_RETRY,
            "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
        self.job_pool.set_job_state(job_d, TASK_STATUS_SUBMIT_RETRYING)
    self._reset_job_timers(itask)
def load_db_broadcast_states(self, row_idx, row):
    """Load broadcast variables from runtime DB broadcast states row."""
    if row_idx == 0:
        LOG.info("LOADING broadcast states")
    point, namespace, key, value = row
    sections = []
    cur_key = key
    if "]" in cur_key:
        sections = self.REC_SECTION.findall(cur_key)
        cur_key = cur_key.rsplit(r"]", 1)[-1]
    with self.lock:
        self.broadcasts.setdefault(point, {})
        self.broadcasts[point].setdefault(namespace, {})
        dict_ = self.broadcasts[point][namespace]
        for section in sections:
            dict_.setdefault(section, {})
            dict_ = dict_[section]
        dict_[cur_key] = value
    LOG.info(
        CHANGE_FMT.strip() % {
            "change": CHANGE_PREFIX_SET,
            "point": point,
            "namespace": namespace,
            "key": key,
            "value": value
        })
def _process_message_succeeded(self, itask, event_time):
    """Helper for process_message, handle a succeeded message."""
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    self.job_pool.set_job_time(job_d, 'finished', event_time)
    self.pflag = True
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 0,
        "time_run_exit": event_time,
    })
    # Update mean elapsed time only on task succeeded.
    if itask.summary['started_time'] is not None:
        itask.tdef.elapsed_times.append(
            itask.summary['finished_time'] -
            itask.summary['started_time'])
    if not itask.state.outputs.all_completed():
        msg = ""
        for output in itask.state.outputs.get_not_completed():
            if output not in [
                    TASK_OUTPUT_EXPIRED,
                    TASK_OUTPUT_SUBMIT_FAILED,
                    TASK_OUTPUT_FAILED]:
                msg += "\n  " + output
        if msg:
            LOG.info("[%s] -Succeeded with outputs not completed: %s",
                     itask, msg)
    if itask.state.reset(TASK_STATUS_SUCCEEDED):
        self.setup_event_handlers(itask, "succeeded", "job succeeded")
    self.job_pool.set_job_state(job_d, TASK_STATUS_SUCCEEDED)
    self._reset_job_timers(itask)
def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
    """Poll jobs of specified tasks.

    Any job that is or was submitted or running can be polled, except for
    retrying tasks - which would poll (correctly) as failed. And don't poll
    succeeded tasks by default.

    This method uses _poll_task_jobs_callback() and
    _manip_task_jobs_callback() as help/callback methods.

    _poll_task_job_callback() executes one specific job.
    """
    to_poll_tasks = []
    pollable_statuses = {
        TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED
    }
    if poll_succ:
        pollable_statuses.add(TASK_STATUS_SUCCEEDED)
    for itask in itasks:
        if itask.state(*pollable_statuses):
            to_poll_tasks.append(itask)
        else:
            LOG.debug("skipping %s: not pollable, "
                      "or skipping 'succeeded' tasks" % itask.identity)
    if to_poll_tasks:
        if msg is not None:
            LOG.info(msg)
        self._run_job_cmd(self.JOBS_POLL, suite, to_poll_tasks,
                          self._poll_task_jobs_callback)
def remove_empty_parents(
    path: Union[Path, str], tail: Union[Path, str]
) -> None:
    """Work our way up the tail of path, removing empty dirs only.

    Args:
        path: Absolute path to the directory, e.g. /foo/bar/a/b/c
        tail: The tail of the path to work our way up, e.g. a/b/c

    Example:
        remove_empty_parents('/foo/bar/a/b/c', 'a/b/c') would remove
        /foo/bar/a/b (assuming it's empty), then /foo/bar/a (assuming
        it's empty).
    """
    path = Path(path)
    if not path.is_absolute():
        raise ValueError('path must be absolute')
    tail = Path(tail)
    if tail.is_absolute():
        raise ValueError('tail must not be an absolute path')
    if not str(path).endswith(str(tail)):
        raise ValueError(f"path '{path}' does not end with '{tail}'")
    depth = len(tail.parts) - 1
    for i in range(depth):
        parent = path.parents[i]
        if not parent.is_dir():
            continue
        try:
            parent.rmdir()
            LOG.info(f'Removing directory: {parent}')
        except OSError:
            break
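# --- Usage sketch (not part of the original source) ---
# A minimal, hedged example of how the helper above might be called; the
# import path is an assumption for illustration only, adapt it to wherever
# the function actually lives.
from pathlib import Path

from cylc.flow.pathutil import remove_empty_parents  # assumed import path

# Suppose /foo/bar/a/b/c has just been deleted elsewhere. This call tries
# to rmdir /foo/bar/a/b and then /foo/bar/a, stopping at the first parent
# that is not empty (rmdir raises OSError for non-empty directories).
remove_empty_parents(Path('/foo/bar/a/b/c'), Path('a/b/c'))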
def _process_message_failed(self, itask, event_time, message):
    """Helper for process_message, handle a failed message."""
    if event_time is None:
        event_time = get_current_time_string()
    itask.set_summary_time('finished', event_time)
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    self.job_pool.set_job_time(job_d, 'finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 1,
        "time_run_exit": event_time,
    })
    if (TASK_STATUS_RETRYING not in itask.try_timers
            or itask.try_timers[TASK_STATUS_RETRYING].next() is None):
        # No retry lined up: definitive failure.
        self.pflag = True
        if itask.state.reset(TASK_STATUS_FAILED):
            self.setup_event_handlers(itask, "failed", message)
        self.job_pool.set_job_state(job_d, TASK_STATUS_FAILED)
        LOG.critical("[%s] -job(%02d) %s",
                     itask, itask.submit_num, "failed")
    elif itask.state.reset(TASK_STATUS_RETRYING):
        delay_msg = "retrying in %s" % (
            itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
        if itask.state.is_held:
            delay_msg = "held (%s)" % delay_msg
        msg = "failed, %s" % (delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        self.setup_event_handlers(
            itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
    self._reset_job_timers(itask)
def remote_clean(reg, platform_names, timeout):
    """Run subprocesses to clean workflows on remote install targets
    (skip localhost), given a set of platform names to look up.

    Args:
        reg (str): Workflow name.
        platform_names (list): List of platform names to look up in the
            global config, in order to determine the install targets to
            clean on.
        timeout (str): Number of seconds to wait before cancelling.
    """
    try:
        install_targets_map = (
            get_install_target_to_platforms_map(platform_names))
    except PlatformLookupError as exc:
        raise PlatformLookupError(
            "Cannot clean on remote platforms as the workflow database is "
            f"out of date/inconsistent with the global config - {exc}")
    pool = []
    for target, platforms in install_targets_map.items():
        if target == get_localhost_install_target():
            continue
        shuffle(platforms)
        LOG.info(
            f"Cleaning on install target: {platforms[0]['install target']}")
        # Issue ssh command:
        pool.append(
            (_remote_clean_cmd(reg, platforms[0], timeout), target, platforms)
        )
    failed_targets = []
    # Handle subproc pool results almost concurrently:
    while pool:
        for proc, target, platforms in pool:
            ret_code = proc.poll()
            if ret_code is None:  # proc still running
                continue
            pool.remove((proc, target, platforms))
            out, err = (f.decode() for f in proc.communicate())
            if out:
                LOG.debug(out)
            if ret_code:
                # Try again using the next platform for this install target:
                this_platform = platforms.pop(0)
                excn = TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY, this_platform['name'],
                    " ".join(proc.args), ret_code, out, err)
                LOG.debug(excn)
                if platforms:
                    pool.append(
                        (_remote_clean_cmd(reg, platforms[0], timeout),
                         target, platforms)
                    )
                else:  # Exhausted list of platforms
                    failed_targets.append(target)
            elif err:
                LOG.debug(err)
        time.sleep(0.2)
    if failed_targets:
        raise CylcError(
            f"Could not clean on install targets: "
            f"{', '.join(failed_targets)}")
def rsync_255_fail(ctx, platform=None) -> bool:
    """Tests context for rsync failing to communicate with a host.

    If there has been a failure caused by rsync being unable to connect,
    try a test of ssh connectivity. Necessary because loss of connectivity
    may cause different rsync failures depending on version, and some of
    the failures may be caused by other problems.
    """
    rsync_255_fail = False
    platform_rsync_cmd = (
        platform['rsync command'] if platform is not None else 'rsync')
    rsync_cmd = shlex.split(platform_rsync_cmd)
    if (ctx.cmd[0] == rsync_cmd[0]
            and ctx.ret_code not in [0, 255]
            and is_remote_host(ctx.host)):
        ssh_cmd = (
            platform['ssh command'] if platform is not None else 'ssh')
        ssh_test_cmd = shlex.split(f'{ssh_cmd} {ctx.host} true')
        LOG.info(
            f'testing connectivity for {ctx.host} using {ssh_test_cmd}')
        ssh_test = run(ssh_test_cmd, capture_output=True)
        if ssh_test.returncode == 255:
            rsync_255_fail = True
    elif (ctx.cmd[0] == rsync_cmd[0] and ctx.ret_code == 255):
        rsync_255_fail = True
    return rsync_255_fail
def clear_broadcast(
        self, point_strings=None, namespaces=None, cancel_settings=None):
    """Clear broadcasts globally, or for listed namespaces and/or points.

    Return a tuple (modified_settings, bad_options), where:
    * modified_settings is similar to the return value of the "put"
      method, but for removed broadcasts.
    * bad_options is a dict in the form:
          {"point_strings": ["20020202", ...], ...}
      The dict is only populated if there are options not associated with
      previous broadcasts. The keys can be:
      * point_strings: a list of bad point strings.
      * namespaces: a list of bad namespaces.
      * cancel: a list of tuples. Each tuple contains the keys of a bad
        setting.
    """
    # If cancel_settings defined, only clear specific broadcasts
    cancel_keys_list = self._settings_to_keys_list(cancel_settings)
    # Clear broadcasts
    modified_settings = []
    with self.lock:
        for point_string, point_string_settings in self.broadcasts.items():
            if point_strings and point_string not in point_strings:
                continue
            for namespace, namespace_settings in (
                    point_string_settings.items()):
                if namespaces and namespace not in namespaces:
                    continue
                stuff_stack = [([], namespace_settings)]
                while stuff_stack:
                    keys, stuff = stuff_stack.pop()
                    for key, value in stuff.items():
                        if isinstance(value, dict):
                            stuff_stack.append((keys + [key], value))
                        elif (not cancel_keys_list
                                or keys + [key] in cancel_keys_list):
                            stuff[key] = None
                            setting = {key: value}
                            for rkey in reversed(keys):
                                setting = {rkey: setting}
                            modified_settings.append(
                                (point_string, namespace, setting))
    # Prune any empty branches
    bad_options = self._get_bad_options(
        self._prune(), point_strings, namespaces, cancel_keys_list)
    # Log the broadcast
    self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True)
    LOG.info(
        get_broadcast_change_report(modified_settings, is_cancel=True))
    if bad_options:
        LOG.error(get_broadcast_bad_options_report(bad_options))
    if modified_settings:
        self.data_store_mgr.delta_broadcast()
    return modified_settings, bad_options
def _run_event_handlers_callback(proc_ctx):
    """Callback on completion of a workflow event handler."""
    if proc_ctx.ret_code:
        msg = '%s EVENT HANDLER FAILED' % proc_ctx.cmd_key[1]
        LOG.error(str(proc_ctx))
        LOG.error(msg)
    else:
        LOG.info(str(proc_ctx))
def insert_db_job(self, row_idx, row):
    """Load job element from DB post restart."""
    if row_idx == 0:
        LOG.info("LOADING job data")
    (point_string, name, status, submit_num, time_submit, time_run,
     time_run_exit, batch_sys_name, batch_sys_job_id, platform_name) = row
    if status not in JOB_STATUS_SET:
        return
    t_id = f'{self.workflow_id}{ID_DELIM}{point_string}{ID_DELIM}{name}'
    j_id = f'{t_id}{ID_DELIM}{submit_num}'
    try:
        tdef = self.schd.config.get_taskdef(name)
        j_owner = self.schd.owner
        if platform_name:
            j_host = get_host_from_platform(get_platform(platform_name))
        else:
            j_host = self.schd.host
        j_buf = PbJob(
            stamp=f'{j_id}@{time()}',
            id=j_id,
            submit_num=submit_num,
            state=status,
            task_proxy=t_id,
            submitted_time=time_submit,
            started_time=time_run,
            finished_time=time_run_exit,
            batch_sys_name=batch_sys_name,
            batch_sys_job_id=batch_sys_job_id,
            host=j_host,
            owner=j_owner,
            name=name,
            cycle_point=point_string,
        )
        # Add in log files.
        j_buf.job_log_dir = get_task_job_log(
            self.schd.suite, point_string, name, submit_num)
        overrides = self.schd.task_events_mgr.broadcast_mgr.get_broadcast(
            TaskID.get(name, point_string))
        if overrides:
            rtconfig = pdeepcopy(tdef.rtconfig)
            poverride(rtconfig, overrides, prepend=True)
        else:
            rtconfig = tdef.rtconfig
        j_buf.extra_logs.extend([
            os.path.expanduser(os.path.expandvars(log_file))
            for log_file in rtconfig['extra log files']
        ])
    except SuiteConfigError:
        LOG.exception(
            ('ignoring job %s from the suite run database\n'
             '(its task definition has probably been deleted).') % j_id)
    except Exception:
        LOG.exception('could not load job %s' % j_id)
    else:
        self.added[j_id] = j_buf
        self.task_jobs.setdefault(t_id, set()).add(j_id)
        self.updates_pending = True
def clear_broadcast(
        self, point_strings=None, namespaces=None, cancel_settings=None):
    """Clear broadcasts globally, or for listed namespaces and/or points.

    Return a tuple (modified_settings, bad_options), where:
    * modified_settings is similar to the return value of the "put"
      method, but for removed broadcasts.
    * bad_options is a dict in the form:
          {"point_strings": ["20020202", ...], ...}
      The dict is only populated if there are options not associated with
      previous broadcasts. The keys can be:
      * point_strings: a list of bad point strings.
      * namespaces: a list of bad namespaces.
      * cancel: a list of tuples. Each tuple contains the keys of a bad
        setting.
    """
    # If cancel_settings defined, only clear specific broadcasts
    cancel_keys_list = self._settings_to_keys_list(cancel_settings)
    # Clear broadcasts
    modified_settings = []
    with self.lock:
        for point_string, point_string_settings in self.broadcasts.items():
            if point_strings and point_string not in point_strings:
                continue
            for namespace, namespace_settings in (
                    point_string_settings.items()):
                if namespaces and namespace not in namespaces:
                    continue
                stuff_stack = [([], namespace_settings)]
                while stuff_stack:
                    keys, stuff = stuff_stack.pop()
                    for key, value in stuff.items():
                        if isinstance(value, dict):
                            stuff_stack.append((keys + [key], value))
                        elif (not cancel_keys_list
                                or keys + [key] in cancel_keys_list):
                            stuff[key] = None
                            setting = {key: value}
                            for rkey in reversed(keys):
                                setting = {rkey: setting}
                            modified_settings.append(
                                (point_string, namespace, setting))
    # Prune any empty branches
    bad_options = self._get_bad_options(
        self._prune(), point_strings, namespaces, cancel_keys_list)
    # Log the broadcast
    self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True)
    LOG.info(
        get_broadcast_change_report(modified_settings, is_cancel=True))
    if bad_options:
        LOG.error(get_broadcast_bad_options_report(bad_options))
    return modified_settings, bad_options
def _run_event_handlers_callback(proc_ctx, abort_on_error=False):
    """Callback on completion of a suite event handler."""
    if proc_ctx.ret_code:
        msg = '%s EVENT HANDLER FAILED' % proc_ctx.cmd_key[1]
        LOG.error(str(proc_ctx))
        LOG.error(msg)
        if abort_on_error:
            raise SuiteEventError(msg)
    else:
        LOG.info(str(proc_ctx))
def _authorise(self, *args, user='******', meta=None, **kwargs):
    if not meta:
        meta = {}
    host = meta.get('host', '?')
    prog = meta.get('prog', '?')

    # Hardcoded, for now - but much of this functionality can be
    # removed more swingingly.
    LOG.info(
        '[client-command] %s %s@%s:%s', fcn.__name__, user, host, prog)
    return fcn(self, *args, **kwargs)
def state_reset(
    self, status=None, is_held=None, is_queued=None, is_runahead=None
) -> bool:
    """Set new state and log the change. Return whether it changed."""
    before = str(self)
    if self.state.reset(status, is_held, is_queued, is_runahead):
        LOG.info(f"[{before}] => {self.state}")
        return True
    return False
async def auto_restart(scheduler, _):
    """Automatically restart the workflow if configured to do so."""
    current_glbl_cfg = glbl_cfg(cached=False)
    mode = _should_auto_restart(scheduler, current_glbl_cfg)

    if mode:
        LOG.info('The Cylc workflow host will soon become un-available.')
        _set_auto_restart(
            scheduler,
            restart_delay=current_glbl_cfg.get(
                ['scheduler', 'auto restart delay']),
            mode=mode
        )
def scheduler_cli(parser, options, args, is_restart=False):
    """Implement cylc (run|restart).

    This function should contain all of the command line facing
    functionality of the Scheduler, exit codes, logging, etc.

    The Scheduler itself should be a Python object you can import and
    run in a regular Python session so cannot contain this kind of
    functionality.
    """
    reg = args[0]
    # Check suite is not already running before start of host selection.
    try:
        suite_files.detect_old_contact_file(reg)
    except SuiteServiceFileError as exc:
        sys.exit(exc)

    _check_registration(reg)

    # re-execute on another host if required
    _distribute(options.host, is_restart)

    # print the start message
    if options.no_detach or options.format == 'plain':
        _start_print_blurb()

    # setup the scheduler
    # NOTE: asyncio.run opens an event loop, runs your coro,
    #       then shutdown async generators and closes the event loop
    scheduler = Scheduler(reg, options, is_restart=is_restart)
    asyncio.run(_setup(parser, options, reg, is_restart, scheduler))

    # daemonize if requested
    # NOTE: asyncio event loops cannot persist across daemonization
    #       ensure you have tidied up all threads etc before daemonizing
    if not options.no_detach:
        from cylc.flow.daemonize import daemonize
        daemonize(scheduler)

    # setup loggers
    _open_logs(reg, options.no_detach)

    # run the workflow
    ret = asyncio.run(_run(parser, options, reg, is_restart, scheduler))

    # exit
    # NOTE: we must clean up all asyncio / threading stuff before exiting
    # NOTE: any threads which include sleep statements could cause
    #       sys.exit to hang if not shutdown properly
    LOG.info("DONE")
    _close_logs()
    sys.exit(ret)
def test_value_error_raises_system_exit(
    self, mocked_glbl_cfg, mocked_get_suite_run_log_name,
):
    """Test that a ValueError when writing to a log stream won't result
    in multiple exceptions (which could lead to an infinite loop in some
    occasions). Instead, it **must** raise a SystemExit."""
    with tempfile.NamedTemporaryFile() as tf:
        # mock objects used when creating the file handler
        mocked = mock.MagicMock()
        mocked_glbl_cfg.return_value = mocked
        mocked.get.return_value = 100
        mocked_get_suite_run_log_name.return_value = tf.name
        file_handler = TimestampRotatingFileHandler("suiteA", False)
        # next line is important as pytest can have a "Bad file descriptor"
        # due to a FileHandler with default "a" (pytest tries to r/w).
        file_handler.mode = "a+"

        # enable the logger
        LOG.setLevel(logging.INFO)
        LOG.addHandler(file_handler)

        # Disable raising uncaught exceptions in logging, due to file
        # handler using stdin.fileno. See the following links for more.
        # https://github.com/pytest-dev/pytest/issues/2276 &
        # https://github.com/pytest-dev/pytest/issues/1585
        logging.raiseExceptions = False

        # first message will initialize the stream and the handler
        LOG.info("What could go")

        # here we change the stream of the handler
        old_stream = file_handler.stream
        file_handler.stream = mock.MagicMock()
        file_handler.stream.seek = mock.MagicMock()
        # in this case the stream raises a ValueError on seek
        file_handler.stream.seek.side_effect = ValueError

        try:
            # next call will call the emit method and use the mocked stream
            LOG.info("wrong?!")
            self.fail("Exception SystemExit was not raised")
        except SystemExit:
            pass
        finally:
            # clean up
            file_handler.stream = old_stream
            # for log_handler in LOG.handlers:
            #     log_handler.close()
            file_handler.close()
            LOG.removeHandler(file_handler)
            logging.raiseExceptions = True
def poll_task_jobs(self, suite, itasks, msg=None):
    """Poll jobs of specified tasks.

    This method uses _poll_task_jobs_callback() and
    _manip_task_jobs_callback() as help/callback methods.

    _poll_task_job_callback() executes one specific job.
    """
    if itasks:
        if msg is not None:
            LOG.info(msg)
        self._run_job_cmd(
            self.JOBS_POLL, suite, itasks,
            self._poll_task_jobs_callback)
def satisfy_xclock(self, itask: TaskProxy):
    """Attempt to satisfy itask's clock trigger, if it has one.

    Args:
        itask (TaskProxy): TaskProxy
    """
    label, sig, ctx, satisfied = self._get_xclock(itask)
    if satisfied:
        return
    if wall_clock(*ctx.func_args, **ctx.func_kwargs):
        itask.state.xclock = (label, True)
        self.sat_xclock.append(sig)
        LOG.info('clock xtrigger satisfied: %s = %s' % (label, str(ctx)))
def load_xtrigger_for_restart(self, row_idx: int, row: Tuple[str, str]):
    """Load satisfied xtrigger results from suite DB.

    Args:
        row_idx (int): row index (used for logging)
        row (Tuple[str, str]): tuple with the signature and results (json)

    Raises:
        ValueError: if the row cannot be parsed as JSON
    """
    if row_idx == 0:
        LOG.info("LOADING satisfied xtriggers")
    sig, results = row
    self.sat_xtrig[sig] = json.loads(results)
def put_broadcast(
        self, point_strings=None, namespaces=None, settings=None):
    """Add new broadcast settings (server side interface).

    Return a tuple (modified_settings, bad_options) where:
      modified_settings is list of modified settings in the form:
        [("20200202", "foo", {"script": "true"}), ...]
      bad_options is as described in the docstring for self.clear().
    """
    modified_settings = []
    bad_point_strings = []
    bad_namespaces = []

    with self.lock:
        for setting in settings:
            for point_string in point_strings:
                # Standardise the point and check its validity.
                bad_point = False
                try:
                    point_string = standardise_point_string(point_string)
                except PointParsingError:
                    if point_string != '*':
                        bad_point_strings.append(point_string)
                        bad_point = True
                if not bad_point and point_string not in self.broadcasts:
                    self.broadcasts[point_string] = {}
                for namespace in namespaces:
                    if namespace not in self.linearized_ancestors:
                        bad_namespaces.append(namespace)
                    elif not bad_point:
                        if namespace not in self.broadcasts[point_string]:
                            self.broadcasts[point_string][namespace] = {}
                        addict(
                            self.broadcasts[point_string][namespace],
                            setting)
                        modified_settings.append(
                            (point_string, namespace, setting))

    # Log the broadcast
    self.suite_db_mgr.put_broadcast(modified_settings)
    LOG.info(get_broadcast_change_report(modified_settings))

    bad_options = {}
    if bad_point_strings:
        bad_options["point_strings"] = bad_point_strings
    if bad_namespaces:
        bad_options["namespaces"] = bad_namespaces
    if modified_settings:
        self.data_store_mgr.delta_broadcast()
    return modified_settings, bad_options
def call_xtriggers_async(self, itask: TaskProxy):
    """Call itask's xtrigger functions via the process pool...

    ...if previous call not still in-process and retry period is up.

    Args:
        itask: task proxy to check.
    """
    for label, sig, ctx, _ in self._get_xtrigs(itask, unsat_only=True):
        if sig.startswith("wall_clock"):
            # Special case: quick synchronous clock check.
            if 'absolute_as_seconds' not in ctx.func_kwargs:
                ctx.func_kwargs.update(
                    {
                        'point_as_seconds': itask.get_point_as_seconds()
                    }
                )
            if wall_clock(*ctx.func_args, **ctx.func_kwargs):
                itask.state.xtriggers[label] = True
                self.sat_xtrig[sig] = {}
                self.data_store_mgr.delta_task_xtrigger(sig, True)
                LOG.info('xtrigger satisfied: %s = %s', label, sig)
            continue
        # General case: potentially slow asynchronous function call.
        if sig in self.sat_xtrig:
            if not itask.state.xtriggers[label]:
                itask.state.xtriggers[label] = True
                res = {}
                for key, val in self.sat_xtrig[sig].items():
                    res["%s_%s" % (label, key)] = val
                if res:
                    xtrigger_env = [{'environment': {key: val}}
                                    for key, val in res.items()]
                    self.broadcast_mgr.put_broadcast(
                        [str(itask.point)],
                        [itask.tdef.name],
                        xtrigger_env
                    )
            continue
        if sig in self.active:
            # Already waiting on this result.
            continue
        now = time()
        if sig in self.t_next_call and now < self.t_next_call[sig]:
            # Too soon to call this one again.
            continue
        self.t_next_call[sig] = now + ctx.intvl
        # Queue to the process pool, and record as active.
        self.active.append(sig)
        self.proc_pool.put_command(ctx, self.callback)
def remove_dir_and_target(path: Union[Path, str]) -> None:
    """Delete a directory tree (i.e. including contents), as well as the
    target directory tree if the specified path is a symlink.

    Args:
        path: the absolute path of the directory to delete.
    """
    if not os.path.isabs(path):
        raise ValueError('Path must be absolute')
    if os.path.exists(path) and not os.path.isdir(path):
        raise NotADirectoryError(path)
    if os.path.islink(path):
        if os.path.exists(path):
            target = os.path.realpath(path)
            LOG.info(
                f'Removing symlink target directory: ({path} ->) {target}')
            rmtree(target, onerror=handle_rmtree_err)
            LOG.info(f'Removing symlink: {path}')
        else:
            LOG.info(f'Removing broken symlink: {path}')
        os.remove(path)
    elif not os.path.exists(path):
        raise FileNotFoundError(path)
    else:
        LOG.info(f'Removing directory: {path}')
        rmtree(path, onerror=handle_rmtree_err)
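# --- Usage sketch (not part of the original source) ---
# A small, hedged example of calling the helper above; the import path and
# the run directory shown are assumptions for illustration only.
from cylc.flow.pathutil import remove_dir_and_target  # assumed import path

# Removes the whole directory tree. If the path is a symlink, the target
# tree is removed first and then the symlink itself is unlinked.
remove_dir_and_target('/home/user/cylc-run/myflow')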
def main(parser, options, suite, event_msg, event_id):
    suite = os.path.normpath(suite)
    LOG.info('Send to suite %s: "%s" (%s)', suite, event_msg, event_id)
    pclient = get_client(suite, timeout=options.comms_timeout)

    max_n_tries = int(options.max_n_tries)
    retry_intvl_secs = float(options.retry_intvl_secs)

    mutation_kwargs = {
        'request_string': MUTATION,
        'variables': {
            'wFlows': [suite],
            'eventMsg': event_msg,
            'eventId': event_id,
        }
    }

    for i_try in range(max_n_tries):
        try:
            pclient('graphql', mutation_kwargs)
        except ClientError as exc:
            LOG.exception(exc)
            LOG.info(MSG_SEND_FAILED, i_try + 1, max_n_tries)
            if i_try == max_n_tries - 1:  # final attempt
                raise CylcError('send failed')
            LOG.info(MSG_SEND_RETRY, retry_intvl_secs, options.comms_timeout)
            sleep(retry_intvl_secs)
        else:
            if i_try > 0:
                LOG.info(MSG_SEND_SUCCEED, i_try + 1, max_n_tries)
            break
def main(parser, options, suite, event_msg, event_id):
    LOG.info('Send to suite %s: "%s" (%s)', suite, event_msg, event_id)
    pclient = SuiteRuntimeClient(
        suite, options.owner, options.host, options.port)

    max_n_tries = int(options.max_n_tries)
    retry_intvl_secs = float(options.retry_intvl_secs)

    for i_try in range(max_n_tries):
        try:
            pclient(
                'put_ext_trigger',
                {'message': event_msg, 'id': event_id},
                timeout=options.comms_timeout)
        except ClientError as exc:
            LOG.exception(exc)
            LOG.info(MSG_SEND_FAILED, i_try + 1, max_n_tries)
            if i_try == max_n_tries - 1:  # final attempt
                raise CylcError('send failed')
            else:
                LOG.info(MSG_SEND_RETRY, retry_intvl_secs,
                         options.comms_timeout)
                sleep(retry_intvl_secs)
        else:
            if i_try > 0:
                LOG.info(MSG_SEND_SUCCEED, i_try + 1, max_n_tries)
            break
def main(parser: COP, options: 'Values', workflow: str,
         event_msg: str, event_id: str) -> None:
    workflow, _ = parse_reg(workflow)
    LOG.info('Send to workflow %s: "%s" (%s)', workflow, event_msg, event_id)
    pclient = get_client(workflow, timeout=options.comms_timeout)

    max_n_tries = int(options.max_n_tries)
    retry_intvl_secs = float(options.retry_intvl_secs)

    mutation_kwargs = {
        'request_string': MUTATION,
        'variables': {
            'wFlows': [workflow],
            'eventMsg': event_msg,
            'eventId': event_id,
        }
    }

    for i_try in range(max_n_tries):
        try:
            pclient('graphql', mutation_kwargs)
        except ClientError as exc:
            LOG.exception(exc)
            LOG.info(MSG_SEND_FAILED, i_try + 1, max_n_tries)
            if i_try == max_n_tries - 1:  # final attempt
                raise CylcError('send failed')
            LOG.info(MSG_SEND_RETRY, retry_intvl_secs, options.comms_timeout)
            sleep(retry_intvl_secs)
        else:
            if i_try > 0:
                LOG.info(MSG_SEND_SUCCEED, i_try + 1, max_n_tries)
            break
def test_value_error_raises_system_exit(self, mocked_glbl_cfg):
    """Test that a ValueError when writing to a log stream won't result
    in multiple exceptions (which could lead to an infinite loop in some
    occasions). Instead, it **must** raise a SystemExit."""
    with tempfile.NamedTemporaryFile() as tf:
        # mock objects used when creating the file handler
        mocked = mock.MagicMock()
        mocked_glbl_cfg.return_value = mocked
        mocked.get_derived_host_item.return_value = tf.name
        mocked.get.return_value = 100
        file_handler = TimestampRotatingFileHandler("suiteA", False)
        # next line is important as pytest can have a "Bad file descriptor"
        # due to a FileHandler with default "a" (pytest tries to r/w).
        file_handler.mode = "a+"

        # enable the logger
        LOG.setLevel(logging.INFO)
        LOG.addHandler(file_handler)

        # Disable raising uncaught exceptions in logging, due to file
        # handler using stdin.fileno. See the following links for more.
        # https://github.com/pytest-dev/pytest/issues/2276 &
        # https://github.com/pytest-dev/pytest/issues/1585
        logging.raiseExceptions = False

        # first message will initialize the stream and the handler
        LOG.info("What could go")

        # here we change the stream of the handler
        old_stream = file_handler.stream
        file_handler.stream = mock.MagicMock()
        file_handler.stream.seek = mock.MagicMock()
        # in this case the stream raises a ValueError on seek
        file_handler.stream.seek.side_effect = ValueError

        try:
            # next call will call the emit method and use the mocked stream
            LOG.info("wrong?!")
            self.fail("Exception SystemExit was not raised")
        except SystemExit:
            pass
        finally:
            # clean up
            file_handler.stream = old_stream
            # for log_handler in LOG.handlers:
            #     log_handler.close()
            file_handler.close()
            LOG.removeHandler(file_handler)
            logging.raiseExceptions = True
def _authorise(self, *args, user='******', meta=None, **kwargs):
    if not meta:
        meta = {}
    host = meta.get('host', '?')
    prog = meta.get('prog', '?')

    usr_priv_level = self._get_priv_level(user)
    if usr_priv_level < req_priv_level:
        LOG.warn(
            "[client-connect] DENIED (privilege '%s' < '%s') %s@%s:%s",
            usr_priv_level, req_priv_level, user, host, prog)
        raise Exception('Authorisation failure')
    LOG.info(
        '[client-command] %s %s@%s:%s', fcn.__name__, user, host, prog)
    return fcn(self, *args, **kwargs)
def run_reftest(config, ctx):
    """Run reference test at shutdown."""
    reffilename = config.get_ref_log_name()
    curfilename = get_workflow_test_log_name(ctx.workflow)
    ref = _load_reflog(reffilename)
    cur = _load_reflog(curfilename)
    if ref == cur:
        LOG.info('WORKFLOW REFERENCE TEST PASSED')
    else:
        exc = WorkflowEventError(
            'WORKFLOW REFERENCE TEST FAILED\n'
            'triggering is NOT consistent with the reference log:\n%s\n'
            % '\n'.join(unified_diff(ref, cur, 'reference', 'this run')))
        LOG.exception(exc)
        raise exc
def _authorise(self, *args, user='******', meta=None, **kwargs):
    if not meta:
        meta = {}
    host = meta.get('host', '?')
    prog = meta.get('prog', '?')

    usr_priv_level = self._get_priv_level(user)
    if usr_priv_level < req_priv_level:
        LOG.warn(
            "[client-connect] DENIED (privilege '%s' < '%s') %s@%s:%s",
            usr_priv_level, req_priv_level, user, host, prog)
        raise Exception('Authorisation failure')
    LOG.info('[client-command] %s %s@%s:%s',
             fcn.__name__, user, host, prog)
    return fcn(self, *args, **kwargs)
def check_task_jobs(self, suite, task_pool):
    """Check submission and execution timeout and polling timers.

    Poll tasks that have timed out and/or have reached next polling time.
    """
    now = time()
    poll_tasks = set()
    for itask in task_pool.get_tasks():
        if self.task_events_mgr.check_job_time(itask, now):
            poll_tasks.add(itask)
            if itask.poll_timer.delay is not None:
                LOG.info('[%s] -poll now, (next in %s)',
                         itask, itask.poll_timer.delay_timeout_as_str())
    if poll_tasks:
        self.poll_task_jobs(suite, poll_tasks)
def put_broadcast(
        self, point_strings=None, namespaces=None, settings=None):
    """Add new broadcast settings (server side interface).

    Return a tuple (modified_settings, bad_options) where:
      modified_settings is list of modified settings in the form:
        [("20200202", "foo", {"script": "true"}), ...]
      bad_options is as described in the docstring for self.clear().
    """
    modified_settings = []
    bad_point_strings = []
    bad_namespaces = []

    with self.lock:
        for setting in settings:
            for point_string in point_strings:
                # Standardise the point and check its validity.
                bad_point = False
                try:
                    point_string = standardise_point_string(point_string)
                except PointParsingError:
                    if point_string != '*':
                        bad_point_strings.append(point_string)
                        bad_point = True
                if not bad_point and point_string not in self.broadcasts:
                    self.broadcasts[point_string] = {}
                for namespace in namespaces:
                    if namespace not in self.linearized_ancestors:
                        bad_namespaces.append(namespace)
                    elif not bad_point:
                        if namespace not in self.broadcasts[point_string]:
                            self.broadcasts[point_string][namespace] = {}
                        self._addict(
                            self.broadcasts[point_string][namespace],
                            setting)
                        modified_settings.append(
                            (point_string, namespace, setting))

    # Log the broadcast
    self.suite_db_mgr.put_broadcast(modified_settings)
    LOG.info(get_broadcast_change_report(modified_settings))

    bad_options = {}
    if bad_point_strings:
        bad_options["point_strings"] = bad_point_strings
    if bad_namespaces:
        bad_options["namespaces"] = bad_namespaces
    return modified_settings, bad_options
def upgrade_retry_state(self):
    """Replace the retry state with xtriggers.

    * Change *retrying tasks to waiting
    * Add the required xtrigger

    Note:
        The retry status can be safely removed as this is really a
        display state, the retry logic revolves around the
        TaskActionTimer.

    From:
        cylc<8
    To:
        cylc>=8
    PR:
        #3423

    Returns:
        list - (cycle, name, status) tuples of all retrying tasks.
    """
    conn = self.connect()

    for table in [self.TABLE_TASK_POOL_CHECKPOINTS, self.TABLE_TASK_POOL]:
        tasks = list(
            conn.execute(rf'''
                SELECT cycle, name, status
                FROM {table}
                WHERE status IN ('retrying', 'submit-retrying')
            ''')
        )
        if tasks:
            LOG.info(f'Upgrade retrying tasks in table {table}')
        conn.executemany(
            rf'''
                UPDATE {table}
                SET status='{TASK_STATUS_WAITING}'
                WHERE cycle==? and name==? and status==?
            ''',
            tasks
        )
    conn.commit()
    return tasks
def check_task_jobs(self, suite, task_pool):
    """Check submission and execution timeout and polling timers.

    Poll tasks that have timed out and/or have reached next polling time.
    """
    now = time()
    poll_tasks = set()
    for itask in task_pool.get_tasks():
        if self.task_events_mgr.check_job_time(itask, now):
            poll_tasks.add(itask)
            if itask.poll_timer.delay is not None:
                LOG.info(
                    '[%s] -poll now, (next in %s)',
                    itask, itask.poll_timer.delay_timeout_as_str())
    if poll_tasks:
        self.poll_task_jobs(suite, poll_tasks)
def verify_triggering(self):
    new = self.get_triggered(self.new_loglines)
    ref = self.get_triggered(self.ref_loglines)

    if len(new) == 0:
        raise LogAnalyserError(
            "new log contains no triggering info.")

    if len(ref) == 0:
        raise LogAnalyserError(
            "reference log contains no triggering info.")

    new.sort()
    ref.sort()

    if new != ref:
        diff = unified_diff(new, ref, 'this run', 'reference log')
        raise LogAnalyserError(
            "triggering is NOT consistent with the reference log:" +
            '\n' + '\n'.join(diff) + '\n')
    else:
        LOG.info(
            "LogAnalyser: triggering is consistent with the reference log")
def load_db_broadcast_states(self, row_idx, row):
    """Load broadcast variables from runtime DB broadcast states row."""
    if row_idx == 0:
        LOG.info("LOADING broadcast states")
    point, namespace, key, value = row
    sections = []
    cur_key = key
    if "]" in cur_key:
        sections = self.REC_SECTION.findall(cur_key)
        cur_key = cur_key.rsplit(r"]", 1)[-1]
    with self.lock:
        self.broadcasts.setdefault(point, {})
        self.broadcasts[point].setdefault(namespace, {})
        dict_ = self.broadcasts[point][namespace]
        for section in sections:
            dict_.setdefault(section, {})
            dict_ = dict_[section]
        dict_[cur_key] = value
    LOG.info(CHANGE_FMT.strip() % {
        "change": CHANGE_PREFIX_SET,
        "point": point,
        "namespace": namespace,
        "key": key,
        "value": value})
def _run_event_mail_callback(proc_ctx):
    """Callback the mail command for notification of a suite event."""
    if proc_ctx.ret_code:
        LOG.warning(str(proc_ctx))
    else:
        LOG.info(str(proc_ctx))
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file
    or bad remote initialisation will cause a bad task - leading to
    submission failure.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)

    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()

    if not prepared_tasks:
        return bad_tasks

    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.set_summary_message(self.REMOTE_INIT_MSG)
            continue
        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        if (
            self.batch_sys_mgr.is_job_local_to_host(
                itask.summary['batch_sys_name'])
            and not is_remote_host(host)
        ):
            owner_at_host = get_host()
        else:
            owner_at_host = host
        # Persist
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%02d, owner@host=%s',
                itask, itask.submit_num, owner_at_host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'user_at_host': owner_at_host,
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        remote_mode = False
        kwargs = {}
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(glbl_cfg().get_derived_host_item(
            suite, 'suite job log directory', host, owner))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])
        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    stdin_files.append(
                        get_task_job_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num))
                job_log_dirs.append(get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                itask.state.reset_state(TASK_STATUS_READY)
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                    **kwargs
                ),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks