def recover_pub_from_pri(self):
    """Recover public database from private database."""
    # Nothing to do until the public DB has exhausted its write attempts.
    if self.pub_dao.n_tries < self.pub_dao.MAX_TRIES:
        return
    self.copy_pri_to_pub()
    LOG.warning(f"{self.pub_dao.db_file_name}: recovered from "
                f"{self.pri_dao.db_file_name}")
    # Reset the failure counter now the public DB is fresh again.
    self.pub_dao.n_tries = 0
def _execute_stmt(self, stmt, stmt_args_list): """Helper for "self.execute_queued_items". Execute a statement. If this is the public database, return True on success and False on failure. If this is the private database, return True on success, and raise on failure. """ # Filter out CYLC_TEMPLATE_VARS which breaks executemany because it's: # - a dict # - recursive (contains itself!) if stmt_args_list and stmt_args_list[0]: stmt_args_list = [ i for i in stmt_args_list if i[0] != 'CYLC_TEMPLATE_VARS' ] try: self.connect() self.conn.executemany(stmt, stmt_args_list) except sqlite3.Error: if not self.is_public: raise if cylc.flow.flags.verbosity > 1: traceback.print_exc() err_log = ( "cannot execute database statement:\n" "file=%(file)s:\nstmt=%(stmt)s" ) % {"file": self.db_file_name, "stmt": stmt} for i, stmt_args in enumerate(stmt_args_list): err_log += ("\nstmt_args[%(i)d]=%(stmt_args)s" % { "i": i, "stmt_args": stmt_args}) LOG.warning(err_log) raise
def load(self):
    """Load or reload configuration from files.

    If $CYLC_CONF_PATH is set, only that directory is consulted.
    If it is unset, the default site then user locations are tried.
    NOTE: if $CYLC_CONF_PATH is set to an empty string, neither branch
    runs, so no config file is loaded at all (system defaults only).
    """
    self.sparse.clear()
    self.dense.clear()
    LOG.debug("Loading site/user config files")
    conf_path_str = os.getenv("CYLC_CONF_PATH")
    if conf_path_str:
        # Explicit config file override.
        fname = os.path.join(conf_path_str, self.CONF_BASENAME)
        if os.access(fname, os.F_OK | os.R_OK):
            self.loadcfg(fname, upgrader.USER_CONFIG)
    elif conf_path_str is None:
        # Use default locations.
        # Site config is loaded first so the user config can override it.
        for conf_dir, conf_type in [
            (self.SITE_CONF_DIR, upgrader.SITE_CONFIG),
            (self.USER_CONF_DIR, upgrader.USER_CONFIG)
        ]:
            fname = os.path.join(conf_dir, self.CONF_BASENAME)
            if not os.access(fname, os.F_OK | os.R_OK):
                continue
            try:
                self.loadcfg(fname, conf_type)
            except ParsecError as exc:
                if conf_type == upgrader.SITE_CONFIG:
                    # Warn on bad site file (users can't fix it).
                    LOG.warning(
                        'ignoring bad %s %s:\n%s', conf_type, fname, exc)
                else:
                    # Abort on bad user file (users can fix it).
                    LOG.error('bad %s %s', conf_type, fname)
                    raise
    # (OK if no flow.rc is found, just use system defaults).
    self._transform()
def check_job_time(self, itask, now):
    """Check/handle job timeout and poll timer.

    Args:
        itask: Task proxy whose running/submitted job is being checked.
        now: Current time.

    Returns:
        bool: True if a timeout event was emitted, otherwise the result
        of the poll-timer check.
    """
    can_poll = self.check_poll_time(itask, now)
    if itask.timeout is None or now <= itask.timeout:
        return can_poll
    # Timeout reached for task, emit event and reset itask.timeout.
    # Initialise defensively: if the task is in neither state (e.g. a
    # state change raced with this check) the original code hit an
    # UnboundLocalError at "msg = event".
    time_ref = None
    event = None
    if itask.state(TASK_STATUS_RUNNING):
        time_ref = itask.summary['started_time']
        event = 'execution timeout'
    elif itask.state(TASK_STATUS_SUBMITTED):
        time_ref = itask.summary['submitted_time']
        event = 'submission timeout'
    msg = event
    try:
        msg += ' after %s' % intvl_as_str(itask.timeout - time_ref)
    except (TypeError, ValueError):
        # Badness in time_ref, or no event determined above.
        pass
    itask.timeout = None  # emit event only once
    if msg and event:
        LOG.warning('[%s] -%s', itask, msg)
        self.setup_event_handlers(itask, event, msg)
        return True
    else:
        return can_poll
def load(self):
    """Load or reload configuration from files.

    If $CYLC_CONF_PATH is set, only that directory is consulted.
    If it is unset, each location in self.CONF_DIR_HIERARCHY is tried
    in order, so later entries override earlier ones.
    NOTE: if $CYLC_CONF_PATH is set to an empty string, neither branch
    runs, so no config file is loaded at all (system defaults only).
    """
    self.sparse.clear()
    self.dense.clear()
    LOG.debug("Loading site/user config files")
    conf_path_str = os.getenv("CYLC_CONF_PATH")
    if conf_path_str:
        # Explicit config file override.
        fname = os.path.join(conf_path_str, self.CONF_BASENAME)
        if os.access(fname, os.F_OK | os.R_OK):
            self.loadcfg(fname, upgrader.USER_CONFIG)
    elif conf_path_str is None:
        # Use default locations.
        for conf_type, conf_dir in self.CONF_DIR_HIERARCHY:
            fname = os.path.join(conf_dir, self.CONF_BASENAME)
            if not os.access(fname, os.F_OK | os.R_OK):
                continue
            try:
                self.loadcfg(fname, conf_type)
            except ParsecError as exc:
                if conf_type == upgrader.SITE_CONFIG:
                    # Warn on bad site file (users can't fix it).
                    LOG.warning(
                        f'ignoring bad {conf_type} {fname}:\n{exc}')
                else:
                    # Abort on bad user file (users can fix it).
                    LOG.error(f'bad {conf_type} {fname}')
                    raise
    self._set_default_editors()
def _remove_bad_hosts(self, mock_host_stats=None):
    """Return dictionary of 'good' hosts with their metric stats.

    Run 'get-host-metrics' on each run host in parallel & store extracted
    stats for hosts, else an empty JSON structure. Filter out 'bad' hosts
    whereby either metric data cannot be accessed from the command or at
    least one metric value does not pass a specified threshold.

    Args:
        mock_host_stats: Optional fake {host: stats} data for unit tests;
            bypasses the real metric-gathering command.
    """
    if mock_host_stats:
        # Create fake data for unittest purposes (only).
        host_stats = dict(mock_host_stats)  # Prevent mutable object issues
    else:
        if not self.hosts:
            return {}
        host_stats = self._get_host_metrics()
    # Analyse get-host-metrics results.
    # Iterate over a copy so hosts can be popped from host_stats in-loop.
    for host, data in list(dict(host_stats).items()):
        if not data:
            # No results for host (command failed) -> skip.
            host_stats.pop(host)
            continue
        for measure, cutoff in self.parsed_thresholds.items():
            datum = data[measure]
            # Cutoff is a minimum or maximum depending on measure context:
            # "load*" must stay below the cutoff; "memory" and
            # "disk-space*" must stay above it.
            if ((datum > cutoff and measure.startswith("load")) or
                    (datum < cutoff and (
                        measure == "memory" or
                        measure.startswith("disk-space")))):
                # Alert user that threshold has not been met.
                LOG.warning(
                    "host '%s' did not pass %s threshold " +
                    "(%s %s threshold %s)\n",
                    host, measure, datum,
                    ">" if measure.startswith("load") else "<", cutoff)
                host_stats.pop(host)
                break  # one failed threshold disqualifies the host
    return host_stats
def merge_template_vars(
    native_tvars: Dict[str, Any],
    plugin_result: Dict[str, Any]
) -> Dict[str, Any]:
    """Manage the merger of Cylc Native and Plugin template variables.

    Args:
        native_tvars: Template variables set on the Cylc command line
            using ``-s`` or a template variable file.
        plugin_result: Plugin result which should contain _at least_
            "templating_detected" and "template_variable" keys.

    Returns:
        template_variables.

    Strategy:
        Template variables set in a Cylc Native way override the results
        of plugins.

    Examples:
        >>> a = {'FOO': 42, 'BAR': 'Hello World'}
        >>> tvars = {'FOO': 24, 'BAZ': 3.14159}
        >>> b = {'templating_detected': 'any', 'template_variables': tvars}
        >>> merge_template_vars(a, b)
        {'FOO': 42, 'BAZ': 3.14159, 'BAR': 'Hello World'}
    """
    # No templating detected by the plugin: native variables win outright.
    if plugin_result['templating_detected'] is None:
        return native_tvars
    plugin_tvars = plugin_result['template_variables']
    # Warn about each plugin value about to be clobbered by a native one.
    for key in native_tvars.keys() & plugin_tvars.keys():
        if plugin_tvars[key] != native_tvars[key]:
            LOG.warning(f'Overriding {key}: {plugin_tvars[key]} ->'
                        f' {native_tvars[key]}')
    plugin_tvars.update(native_tvars)
    return plugin_tvars
def _process_message_check(
        self, itask, severity, message, event_time, flag, submit_num,
):
    """Helper for `.process_message`.

    See `.process_message` for argument list
    Check whether to process/skip message.
    Return True if `.process_message` should continue, False otherwise.
    """
    logfmt = r'[%s] status=%s: %s%s at %s for job(%02d)'
    if flag == self.FLAG_RECEIVED and submit_num != itask.submit_num:
        # Ignore received messages from old jobs
        LOG.warning(
            logfmt + r' != current job(%02d)',
            itask, itask.state, self.FLAG_RECEIVED_IGNORED, message,
            event_time, submit_num, itask.submit_num)
        return False
    if itask.state.status in (
            TASK_STATUS_SUBMIT_RETRYING, TASK_STATUS_RETRYING):
        # Ignore polled messages if task is already in retrying statuses
        LOG.warning(
            logfmt,
            itask, itask.state, self.FLAG_POLLED_IGNORED, message,
            event_time, submit_num)
        return False
    # Message accepted: log it at the level mapped from its severity.
    LOG.log(
        self.LEVELS.get(severity, INFO),
        logfmt,
        itask, itask.state, flag, message, event_time, submit_num)
    return True
def deprecation_warnings(config_tree):
    """Check for deprecated items in config.

    Logs a warning for deprecated items:
        - "root-dir"
        - "jinja2:suite.rc"
        - "empy:suite.rc"

    Args:
        config_tree: Loaded Rose config tree; its ``.node`` is iterated
            for section/key names containing deprecated items.
    """
    deprecations = {
        'empy:suite.rc': (
            "'empy:suite.rc' is deprecated."
            " Use [template variables] instead."),
        'jinja2:suite.rc': (
            "'jinja2:suite.rc' is deprecated."
            " Use [template variables] instead."),
        'root-dir': (
            'You have set "root-dir", which is not supported at '
            'Cylc 8. Use `[install] symlink dirs` in global.cylc '
            'instead.')
    }
    for string in list(config_tree.node):
        # Iterate items() directly: avoids a second dict lookup per match.
        for deprecation, warning in deprecations.items():
            if deprecation in string:
                LOG.warning(warning)
def parse_suite_arg(options, arg):
    """From CLI arg "SUITE", return suite name and flow.cylc path.

    If arg is a registered suite, suite name is the registered name.
    If arg is a directory, suite name is the base name of the
    directory.
    If arg is a file, suite name is the base name of its container
    directory.

    Returns:
        tuple: (name, path) of the resolved suite.

    Raises:
        SuiteServiceFileError: if no flow.cylc or suite.rc can be found.
    """
    if arg == '.':
        arg = os.getcwd()
    try:
        # First try resolving arg as a registered suite.
        path = get_flow_file(arg, options.suite_owner)
        name = arg
    except SuiteServiceFileError:
        # Not registered: fall back to treating arg as a filesystem path.
        arg = os.path.abspath(arg)
        if os.path.isdir(arg):
            path = os.path.join(arg, SuiteFiles.FLOW_FILE)
            name = os.path.basename(arg)
            if not os.path.exists(path):
                # Probably using deprecated suite.rc
                path = os.path.join(arg, SuiteFiles.SUITE_RC)
                if not os.path.exists(path):
                    raise SuiteServiceFileError(
                        f'no flow.cylc or suite.rc in {arg}')
                else:
                    LOG.warning(
                        f'The filename "{SuiteFiles.SUITE_RC}" is deprecated '
                        f'in favor of "{SuiteFiles.FLOW_FILE}".')
        else:
            # arg is (assumed to be) a file path.
            path = arg
            name = os.path.basename(os.path.dirname(arg))
    return name, path
def _execute_stmt(self, stmt, stmt_args_list):
    """Helper for "self.execute_queued_items". Execute a statement.

    If this is the public database, log a warning on failure then
    re-raise (the caller tolerates and retries public DB errors).
    If this is the private database, raise immediately on failure.

    NOTE(review): despite what an older docstring said, this method does
    not return True/False — it either succeeds (returns None) or raises.
    """
    try:
        self.connect()
        self.conn.executemany(stmt, stmt_args_list)
    except sqlite3.Error:
        if not self.is_public:
            # Private DB failure is fatal: propagate without logging.
            raise
        if cylc.flow.flags.debug:
            traceback.print_exc()
        # Build one warning message listing the statement and every
        # argument tuple that was queued for it.
        err_log = ("cannot execute database statement:\n"
                   "file=%(file)s:\nstmt=%(stmt)s") % {
            "file": self.db_file_name, "stmt": stmt}
        for i, stmt_args in enumerate(stmt_args_list):
            err_log += ("\nstmt_args[%(i)d]=%(stmt_args)s" % {
                "i": i, "stmt_args": stmt_args})
        LOG.warning(err_log)
        raise
def _job_cmd_out_callback(suite, itask, cmd_ctx, line):
    """Callback on job command STDOUT/STDERR.

    Prefix each output line with "(user@host) " (as available) and append
    it to the task's job activity log, then log it as a warning.
    """
    if cmd_ctx.cmd_kwargs.get("host") and cmd_ctx.cmd_kwargs.get("user"):
        owner_at_host = "(%(user)s@%(host)s) " % cmd_ctx.cmd_kwargs
    elif cmd_ctx.cmd_kwargs.get("host"):
        owner_at_host = "(%(host)s) " % cmd_ctx.cmd_kwargs
    elif cmd_ctx.cmd_kwargs.get("user"):
        owner_at_host = "(%(user)s@localhost) " % cmd_ctx.cmd_kwargs
    else:
        owner_at_host = ""
    try:
        # Expected format: "timestamp|path|content" (exactly two pipes);
        # anything else leaves the line unchanged.
        timestamp, _, content = line.split("|")
    except ValueError:
        pass
    else:
        line = "%s %s" % (timestamp, content)
    job_activity_log = get_task_job_activity_log(
        suite, itask.point, itask.tdef.name)
    try:
        with open(job_activity_log, "ab") as handle:
            if not line.endswith("\n"):
                line += "\n"
            handle.write((owner_at_host + line).encode())
    except IOError as exc:
        # Best-effort: if the activity log can't be written, warn and
        # still emit the line to the suite log below.
        LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
    LOG.warning("[%s] -%s%s", itask, owner_at_host, line)
def _job_cmd_out_callback(suite, itask, cmd_ctx, line):
    """Callback on job command STDOUT/STDERR."""
    # Build the "(user@host) " prefix from whichever kwargs are present.
    host = cmd_ctx.cmd_kwargs.get("host")
    user = cmd_ctx.cmd_kwargs.get("user")
    if host and user:
        prefix = "(%(user)s@%(host)s) " % cmd_ctx.cmd_kwargs
    elif host:
        prefix = "(%(host)s) " % cmd_ctx.cmd_kwargs
    elif user:
        prefix = "(%(user)s@localhost) " % cmd_ctx.cmd_kwargs
    else:
        prefix = ""
    # Reduce "timestamp|path|content" lines to "timestamp content";
    # leave any other shape untouched.
    try:
        timestamp, _, content = line.split("|")
    except ValueError:
        pass
    else:
        line = "%s %s" % (timestamp, content)
    activity_log_path = get_task_job_activity_log(
        suite, itask.point, itask.tdef.name)
    try:
        if not line.endswith("\n"):
            line += "\n"
        with open(activity_log_path, "ab") as handle:
            handle.write((prefix + line).encode())
    except IOError as exc:
        LOG.warning("%s: write failed\n%s" % (activity_log_path, exc))
    LOG.warning("[%s] -%s%s", itask, prefix, line)
def remote_tidy(self):
    """Remove suite contact files from initialised remotes.

    Call "cylc remote-tidy".
    This method is called on suite shutdown, so we want nothing to hang.
    Timeout any incomplete commands after 10 seconds.

    Also remove UUID file on suite host ".service/uuid".
    """
    # Remove UUID file
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
        FILE_BASE_UUID)
    try:
        os.unlink(uuid_fname)
    except OSError:
        pass
    # Issue all SSH commands in parallel
    procs = {}
    for (host, owner), init_with_contact in self.remote_init_map.items():
        if init_with_contact != REMOTE_INIT_DONE:
            continue
        cmd = ['timeout', '10', 'cylc', 'remote-tidy']
        if is_remote_host(host):
            cmd.append('--host=%s' % host)
        if is_remote_user(owner):
            cmd.append('--user=%s' % owner)
        if cylc.flow.flags.debug:
            cmd.append('--debug')
        cmd.append(os.path.join(glbl_cfg().get_derived_host_item(
            self.suite, 'suite run directory', host, owner)))
        # NOTE(review): open(os.devnull) is never closed here; consider
        # subprocess.DEVNULL if available in this module.
        procs[(host, owner)] = (
            cmd,
            Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=open(os.devnull)))
    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        for (host, owner), (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue
            del procs[(host, owner)]
            out, err = (f.decode() for f in proc.communicate())
            if proc.wait():
                LOG.warning(TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY, (host, owner),
                    ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))
    # Terminate any remaining commands
    for (host, owner), (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            pass
        # BUG FIX: communicate() returns bytes; decode for the error
        # message (consistent with the polling loop above).
        out, err = (f.decode() for f in proc.communicate())
        if proc.wait():
            LOG.warning(TaskRemoteMgmtError(
                TaskRemoteMgmtError.MSG_TIDY, (host, owner),
                ' '.join(quote(item) for item in cmd),
                proc.returncode, out, err))
def _get_metrics(hosts, metrics, data=None):
    """Retrieve host metrics using SSH if necessary.

    Note hosts will not appear in the returned results if:

    * They are not contactable.
    * There is an error in the command which returns the results.

    Args:
        hosts (list):
            List of host fqdns.
        metrics (list):
            List in the form [(function, arg1, arg2, ...), ...]
        data (dict):
            Used for logging success/fail outcomes of the form {host: {}}

    Examples:
        Command failure:
        >>> _get_metrics(['localhost'], [['elephant']])
        ({}, {'localhost': {'get_metrics': 'Command failed (exit: 1)'}})

    Returns:
        dict - {host: {(function, arg1, arg2, ...): result}}
    """
    host_stats = {}
    proc_map = {}
    if not data:
        data = {host: dict() for host in hosts}
    # Start up commands on hosts: "cylc psutil" locally, or via SSH for
    # remote hosts; the metric spec is passed on stdin as JSON.
    cmd = ['psutil']
    kwargs = {
        'stdin_str': json.dumps(metrics),
        'capture_process': True
    }
    for host in hosts:
        if is_remote_host(host):
            proc_map[host] = remote_cylc_cmd(cmd, host=host, **kwargs)
        else:
            proc_map[host] = run_cmd(['cylc'] + cmd, **kwargs)
    # Collect results from commands; poll until every process finishes.
    while proc_map:
        for host, proc in list(proc_map.copy().items()):
            if proc.poll() is None:
                continue
            del proc_map[host]
            out, err = (f.decode() for f in proc.communicate())
            if proc.wait():
                # Command failed in verbose/debug mode
                LOG.warning(
                    'Could not evaluate "%s" (return code %d)\n%s',
                    host, proc.returncode, err)
                data[host]['get_metrics'] = (
                    f'Command failed (exit: {proc.returncode})')
            else:
                host_stats[host] = dict(
                    zip(
                        metrics,
                        # convert JSON dicts -> namedtuples
                        _deserialise(metrics, parse_dirty_json(out))))
        # Avoid busy-waiting while processes run.
        sleep(0.01)
    return host_stats, data
def timed_out(self) -> bool:
    """Return whether timed out yet."""
    # Not timed out if no deadline is set, or it hasn't passed yet.
    if self.timeout is None or now() <= self.timeout:
        return False
    LOG.warning(f"{self.name} timed out after {self.interval}")
    # Clear the deadline so the timeout only fires once.
    self.timeout = None
    return True
def _process_message_check(
        self, itask, severity, message, event_time, flag, submit_num,
):
    """Helper for `.process_message`.

    See `.process_message` for argument list
    Check whether to process/skip message.
    Return True if `.process_message` should continue, False otherwise.
    """
    if self.timestamp:
        timestamp = " at %s " % event_time
    else:
        timestamp = ""
    logfmt = r'[%s] status=%s: %s%s%s for job(%02d) flow(%s)'
    if flag == self.FLAG_RECEIVED and submit_num != itask.submit_num:
        # Ignore received messages from old jobs
        LOG.warning(
            logfmt + r' != current job(%02d)',
            itask, itask.state, self.FLAG_RECEIVED_IGNORED, message,
            timestamp, submit_num, itask.flow_label, itask.submit_num)
        return False
    if (
            itask.state(TASK_STATUS_WAITING)
            and
            (
                (
                    # task has a submit-retry lined up
                    TimerFlags.SUBMISSION_RETRY in itask.try_timers
                    and itask.try_timers[
                        TimerFlags.SUBMISSION_RETRY].num > 0
                )
                or
                (
                    # task has an execution-retry lined up
                    TimerFlags.EXECUTION_RETRY in itask.try_timers
                    and itask.try_timers[
                        TimerFlags.EXECUTION_RETRY].num > 0
                )
            )
    ):
        # Ignore polled messages if task has a retry lined up
        LOG.warning(
            logfmt,
            itask, itask.state, self.FLAG_POLLED_IGNORED, message,
            timestamp, submit_num, itask.flow_label)
        return False
    # Message accepted: log it at the level mapped from its severity.
    LOG.log(
        LOG_LEVELS.get(severity, INFO),
        logfmt,
        itask, itask.state, flag, message, timestamp, submit_num,
        itask.flow_label)
    return True
def recover_pub_from_pri(self):
    """Recover public database from private database."""
    pub = self.pub_dao
    # Only recover once the public DB has exhausted its write attempts.
    if pub.n_tries < pub.MAX_TRIES:
        return
    self.copy_pri_to_pub()
    LOG.warning(
        "%(pub_db_name)s: recovered from %(pri_db_name)s" % {
            "pub_db_name": pub.db_file_name,
            "pri_db_name": self.pri_dao.db_file_name})
    pub.n_tries = 0
def _manip_task_jobs_callback(self, ctx, suite, itasks, summary_callback,
                              more_callbacks=None):
    """Callback when submit/poll/kill tasks command exits.

    Parses the command output line by line, dispatching each recognised
    prefixed line to its callback; any task that gets no status line is
    treated as a failure via summary_callback.
    """
    if ctx.ret_code:
        LOG.error(ctx)
    else:
        LOG.debug(ctx)
    # A dict for easy reference of (CYCLE, NAME, SUBMIT_NUM) -> TaskProxy
    #
    # Note for "reload": A TaskProxy instance may be replaced on reload, so
    # the "itasks" list may not reference the TaskProxy objects that
    # replace the old ones. The .reload_successor attribute provides the
    # link(s) for us to get to the latest replacement.
    #
    # Note for "kill": It is possible for a job to trigger its trap and
    # report back to the suite back this logic is called. If so, the task
    # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
    # its output line will be ignored here.
    tasks = {}
    for itask in itasks:
        while itask.reload_successor is not None:
            itask = itask.reload_successor
        if itask.point is not None and itask.submit_num:
            submit_num = "%02d" % (itask.submit_num)
            tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
    handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
    if more_callbacks:
        for prefix, callback in more_callbacks.items():
            handlers.append((prefix, callback))
    out = ctx.out
    if not out:
        out = ""
    # Tasks not accounted for by any output line are "bad" (failed).
    bad_tasks = dict(tasks)
    for line in out.splitlines(True):
        for prefix, callback in handlers:
            if line.startswith(prefix):
                line = line[len(prefix):].strip()
                try:
                    path = line.split("|", 2)[1]  # timestamp, path, status
                    point, name, submit_num = path.split(os.sep, 2)
                    if prefix == self.batch_sys_mgr.OUT_PREFIX_SUMMARY:
                        del bad_tasks[(point, name, submit_num)]
                    itask = tasks[(point, name, submit_num)]
                    callback(suite, itask, ctx, line)
                except (LookupError, ValueError, KeyError) as exc:
                    # Malformed or unknown output line: warn and carry on.
                    LOG.warning(
                        'Unhandled %s output: %s', ctx.cmd_key, line)
                    LOG.exception(exc)
    # Task jobs that are in the original command but did not get a status
    # in the output. Handle as failures.
    for key, itask in sorted(bad_tasks.items()):
        line = ("|".join([ctx.timestamp, os.sep.join(key), "1"]) + "\n")
        summary_callback(suite, itask, ctx, line)
def _backup(tgt: Path) -> None:
    """Make a timestamped backup of a dir or file.

    The backup is created as a sibling of ``tgt`` named
    ``<tgt-name>.<timestamp>``; ``tgt`` itself is moved (not copied).
    """
    tstamp = get_current_time_string(use_basic_format=True)
    backup = Path(tgt).parent / (tgt.name + f'.{tstamp}')
    LOG.warning('Replacing an existing cylc-tutorials folder which will'
                f' be copied to {backup}')
    # NOTE: shutil interfaces don't fully support Path objects at all
    # python versions
    shutil.move(str(tgt), str(backup))
def is_valid_point(self, point):
    """Return True if point is on-sequence and within bounds.

    Logs a warning and returns False if the point matches none of this
    task's sequences.
    """
    for sequence in self.sequences:
        if sequence.is_valid(point):
            return True
    # No sequence matched. (The original used a misleading for/else:
    # with no `break` in the loop the else clause always ran unless a
    # sequence matched and returned, i.e. it was plain post-loop code.)
    LOG.warning("%s%s, %s" % (
        self.ERR_PREFIX_TASK_NOT_ON_SEQUENCE, self.name, point))
    return False
def recover_pub_from_pri(self):
    """Recover public database from private database.

    Only acts once the public DB has exhausted MAX_TRIES write attempts;
    copies the private DB over it, warns, and resets the failure counter.
    """
    if self.pub_dao.n_tries >= self.pub_dao.MAX_TRIES:
        self.copy_pri_to_pub()
        LOG.warning(
            "%(pub_db_name)s: recovered from %(pri_db_name)s" % {
                "pub_db_name": self.pub_dao.db_file_name,
                "pri_db_name": self.pri_dao.db_file_name})
        self.pub_dao.n_tries = 0
def execute_queued_items(self):
    """Execute queued items for each table.

    On sqlite error: private DB raises; public DB rolls back, counts the
    failed attempt and returns (items stay queued for retry). On success
    the queues are cleared and any earlier public-DB failures reported
    as recovered.
    """
    try:
        for table in self.tables.values():
            # DELETE statements may have varying number of WHERE args so we
            # can only executemany for each identical template statement.
            for stmt, stmt_args_list in table.delete_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
            # INSERT statements are uniform for each table, so all INSERT
            # statements can be executed using a single "executemany" call.
            if table.insert_queue:
                self._execute_stmt(table.get_insert_stmt(),
                                   table.insert_queue)
            # UPDATE statements can have varying number of SET and WHERE
            # args so we can only executemany for each identical template
            # statement.
            for stmt, stmt_args_list in table.update_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
        # Connection should only be opened if we have executed something.
        if self.conn is None:
            return
        self.conn.commit()
    except sqlite3.Error:
        if not self.is_public:
            raise
        self.n_tries += 1
        LOG.warning(
            "%(file)s: write attempt (%(attempt)d) did not complete\n" % {
                "file": self.db_file_name, "attempt": self.n_tries
            })
        if self.conn is not None:
            try:
                self.conn.rollback()
            except sqlite3.Error:
                pass
        return
    else:
        # Clear the queues
        for table in self.tables.values():
            table.delete_queues.clear()
            table.insert_queue.clear()
            table.update_queues.clear()
        # Report public database retry recovery if necessary
        if self.n_tries:
            LOG.warning(
                "%(file)s: recovered after (%(attempt)d) attempt(s)\n" % {
                    "file": self.db_file_name, "attempt": self.n_tries
                })
            self.n_tries = 0
    finally:
        # Note: This is not strictly necessary. However, if the suite run
        # directory is removed, a forced reconnection to the private
        # database will ensure that the suite dies.
        self.close()
def remote_tidy(self): """Remove workflow contact files and keys from initialised remotes. Call "cylc remote-tidy". This method is called on workflow shutdown, so we want nothing to hang. Timeout any incomplete commands after 10 seconds. """ # Issue all SSH commands in parallel procs = {} for install_target, message in self.remote_init_map.items(): if message != REMOTE_FILE_INSTALL_DONE: continue if install_target == get_localhost_install_target(): continue platform = get_random_platform_for_install_target(install_target) platform_n = platform['name'] cmd = ['remote-tidy'] if cylc.flow.flags.verbosity > 1: cmd.append('--debug') cmd.append(install_target) cmd.append(get_remote_workflow_run_dir(self.workflow)) cmd = construct_ssh_cmd(cmd, platform, timeout='10s') LOG.debug("Removing authentication keys and contact file " f"from remote: \"{install_target}\"") procs[platform_n] = (cmd, Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=DEVNULL)) # Wait for commands to complete for a max of 10 seconds timeout = time() + 10.0 while procs and time() < timeout: for platform_n, (cmd, proc) in procs.copy().items(): if proc.poll() is None: continue del procs[platform_n] out, err = (f.decode() for f in proc.communicate()) if proc.wait(): LOG.warning( TaskRemoteMgmtError( TaskRemoteMgmtError.MSG_TIDY, platform_n, ' '.join(quote(item) for item in cmd), proc.returncode, out, err)) # Terminate any remaining commands for platform_n, (cmd, proc) in procs.items(): try: proc.terminate() except OSError: pass out, err = (f.decode() for f in proc.communicate()) if proc.wait(): LOG.warning( TaskRemoteMgmtError(TaskRemoteMgmtError.MSG_TIDY, platform_n, ' '.join(quote(item) for item in cmd), proc.returncode, out, err))
def get_rose_vars(srcdir=None, opts=None): """Load template variables from Rose suite configuration. Loads the Rose suite configuration tree from the filesystem using the shell environment. Args: srcdir(pathlib.Path): Path to the Rose suite configuration (the directory containing the ``rose-suite.conf`` file). opts: Options object containing specification of optional configuarations set by the CLI. Returns: dict - A dictionary of sections of rose-suite.conf. For each section either a dictionary or None is returned. E.g. { 'env': {'MYVAR': 42}, 'empy:suite.rc': None, 'jinja2:suite.rc': { 'myJinja2Var': {'yes': 'it is a dictionary!'} } } """ # Set up blank page for returns. config = {'env': {}, 'template_variables': {}, 'templating_detected': None} # Return a blank config dict if srcdir does not exist if not rose_config_exists(srcdir, opts): if (getattr(opts, "opt_conf_keys", None) or getattr(opts, "defines", None) or getattr(opts, "rose_template_vars", None)): raise NotARoseSuiteException() return config # Load the raw config tree config_tree = rose_config_tree_loader(srcdir, opts) # Warn if root-dir set in config for string in list(config_tree.node): if 'root-dir' in string: LOG.warning('You have set "root-dir", which is not supported at ' 'Cylc 8. Use `[install] symlink dirs` in global.cylc ' 'instead.') break # Extract templatevars from the configuration get_rose_vars_from_config_node(config, config_tree.node, os.environ) # Export environment vars for key, val in config['env'].items(): os.environ[key] = val return config
def _manip_task_jobs_callback(
        self, ctx, suite, itasks, summary_callback, more_callbacks=None):
    """Callback when submit/poll/kill tasks command exits.

    Parses the command output line by line, dispatching each recognised
    prefixed line to its callback; any task that gets no status line is
    treated as a failure via summary_callback.
    """
    if ctx.ret_code:
        LOG.error(ctx)
    else:
        LOG.debug(ctx)
    # A dict for easy reference of (CYCLE, NAME, SUBMIT_NUM) -> TaskProxy
    #
    # Note for "reload": A TaskProxy instance may be replaced on reload, so
    # the "itasks" list may not reference the TaskProxy objects that
    # replace the old ones. The .reload_successor attribute provides the
    # link(s) for us to get to the latest replacement.
    #
    # Note for "kill": It is possible for a job to trigger its trap and
    # report back to the suite back this logic is called. If so, the task
    # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
    # its output line will be ignored here.
    tasks = {}
    for itask in itasks:
        while itask.reload_successor is not None:
            itask = itask.reload_successor
        if itask.point is not None and itask.submit_num:
            submit_num = "%02d" % (itask.submit_num)
            tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
    handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
    if more_callbacks:
        for prefix, callback in more_callbacks.items():
            handlers.append((prefix, callback))
    out = ctx.out
    if not out:
        out = ""
    # Tasks not accounted for by any output line are "bad" (failed).
    bad_tasks = dict(tasks)
    for line in out.splitlines(True):
        for prefix, callback in handlers:
            if line.startswith(prefix):
                line = line[len(prefix):].strip()
                try:
                    path = line.split("|", 2)[1]  # timestamp, path, status
                    point, name, submit_num = path.split(os.sep, 2)
                    if prefix == self.batch_sys_mgr.OUT_PREFIX_SUMMARY:
                        del bad_tasks[(point, name, submit_num)]
                    itask = tasks[(point, name, submit_num)]
                    callback(suite, itask, ctx, line)
                except (LookupError, ValueError, KeyError) as exc:
                    # Malformed or unknown output line: warn and carry on.
                    LOG.warning(
                        'Unhandled %s output: %s', ctx.cmd_key, line)
                    LOG.exception(exc)
    # Task jobs that are in the original command but did not get a status
    # in the output. Handle as failures.
    for key, itask in sorted(bad_tasks.items()):
        line = (
            "|".join([ctx.timestamp, os.sep.join(key), "1"]) + "\n")
        summary_callback(suite, itask, ctx, line)
def upg(cfg, descr):
    """Upgrade old suite configuration.

    Args:
        cfg: Nested config dict to upgrade in place.
        descr: Description of the config (for log messages).
    """
    u = upgrader(cfg, descr)
    u.obsolete('6.1.3', ['visualization', 'enable live graph movie'])
    u.obsolete('7.2.2', ['cylc', 'dummy mode'])
    u.obsolete('7.2.2', ['cylc', 'simulation mode'])
    u.obsolete('7.2.2', ['runtime', '__MANY__', 'dummy mode'])
    u.obsolete('7.2.2', ['runtime', '__MANY__', 'simulation mode'])
    u.obsolete('7.6.0', ['runtime', '__MANY__', 'enable resurrection'])
    u.obsolete(
        '7.8.0',
        ['runtime', '__MANY__', 'suite state polling', 'template'])
    u.obsolete('7.8.1', ['cylc', 'events', 'reset timer'])
    u.obsolete('7.8.1', ['cylc', 'events', 'reset inactivity timer'])
    u.obsolete('7.8.1', ['runtime', '__MANY__', 'events', 'reset timer'])
    u.obsolete('8.0.0', ['cylc', 'log resolved dependencies'])
    u.obsolete('8.0.0', ['cylc', 'reference test', 'allow task failures'])
    u.obsolete('8.0.0', ['cylc', 'reference test', 'live mode suite timeout'])
    u.obsolete('8.0.0', ['cylc', 'reference test', 'dummy mode suite timeout'])
    u.obsolete(
        '8.0.0',
        ['cylc', 'reference test', 'dummy-local mode suite timeout'])
    u.obsolete(
        '8.0.0',
        ['cylc', 'reference test', 'simulation mode suite timeout'])
    u.obsolete('8.0.0', ['cylc', 'reference test', 'required run mode'])
    u.obsolete(
        '8.0.0',
        ['cylc', 'reference test', 'suite shutdown event handler'])
    u.deprecate(
        '8.0.0',
        ['cylc', 'abort if any task fails'],
        ['cylc', 'events', 'abort if any task fails'])
    u.obsolete('8.0.0', ['runtime', '__MANY__', 'job', 'shell'])
    u.upgrade()

    # Upgrader cannot do this type of move:
    # [scheduling][dependencies][X]graph -> [scheduling][graph]X
    try:
        keys = set()
        cfg['scheduling'].setdefault('graph', {})
        cfg['scheduling']['graph'].update(
            cfg['scheduling'].pop('dependencies'))
        graphdict = cfg['scheduling']['graph']
        for key, value in graphdict.copy().items():
            if isinstance(value, dict) and 'graph' in value:
                graphdict[key] = value['graph']
                keys.add(key)
        if keys:
            LOG.warning(
                "deprecated graph items were automatically upgraded "
                "in '%s':",
                descr)
            LOG.warning(
                ' * (8.0.0) %s -> %s - for X in:\n%s',
                u.show_keys(['scheduling', 'dependencies', 'X', 'graph']),
                u.show_keys(['scheduling', 'graph', 'X']),
                '\n'.join(sorted(keys)),
            )
    except KeyError:
        # No [scheduling][dependencies] section: nothing to move.
        pass
def upgrade_legacy_ids(*ids: str) -> List[str]:
    """Reformat IDs from legacy to contemporary format:

    If no upgrading is required it returns the identifiers unchanged.

    Args:
        *ids (tuple): Identifier list.

    Returns:
        tuple/list - Identifier list.

        # do nothing to contemporary ids:
        >>> upgrade_legacy_ids('workflow')
        ['workflow']

        >>> upgrade_legacy_ids('workflow', '//cycle')
        ['workflow', '//cycle']

        # upgrade legacy task.cycle ids:
        >>> upgrade_legacy_ids('workflow', 'task.123', 'task.234')
        ['workflow', '//123/task', '//234/task']

        # upgrade legacy cycle/task ids:
        >>> upgrade_legacy_ids('workflow', '123/task', '234/task')
        ['workflow', '//123/task', '//234/task']

        # upgrade mixed legacy ids:
        >>> upgrade_legacy_ids('workflow', 'task.123', '234/task')
        ['workflow', '//123/task', '//234/task']

        # upgrade legacy task states:
        >>> upgrade_legacy_ids('workflow', 'task.123:abc', '234/task:def')
        ['workflow', '//123/task:abc', '//234/task:def']

    """
    if len(ids) < 2:
        # only legacy relative references require upgrade => abort
        return list(ids)
    # First id is the workflow itself; only relative ids are upgraded.
    legacy_ids = [ids[0]]
    for id_ in ids[1:]:
        try:
            tokens = legacy_tokenise(id_)
        except ValueError:
            # not a valid legacy token => abort
            # (all-or-nothing: one non-legacy id leaves every id as-is)
            return list(ids)
        else:
            # upgrade this token
            legacy_ids.append(detokenise(tokens, selectors=True))
    LOG.warning(
        f'Cylc7 format is deprecated, using: {" ".join(legacy_ids)}'
        ' (see "cylc help id")')
    return legacy_ids
def remote_tidy(self):
    """Remove suite contact files and keys from initialised remotes.

    Call "cylc remote-tidy".
    This method is called on suite shutdown, so we want nothing to hang.
    Timeout any incomplete commands after 10 seconds.
    """
    # Issue all SSH commands in parallel
    procs = {}
    for platform, init_with_contact in self.remote_init_map.items():
        platform = get_platform(platform)
        host = get_host_from_platform(platform)
        owner = platform['owner']
        self.install_target = get_install_target_from_platform(platform)
        if init_with_contact != REMOTE_INIT_DONE:
            continue
        cmd = ['remote-tidy']
        if cylc.flow.flags.debug:
            cmd.append('--debug')
        cmd.append(str(f'{self.install_target}'))
        cmd.append(get_remote_suite_run_dir(platform, self.suite))
        if is_remote_platform(platform):
            cmd = construct_platform_ssh_cmd(cmd, platform, timeout='10s')
        else:
            cmd = ['cylc'] + cmd
        procs[(host, owner)] = (
            cmd,
            Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=DEVNULL))
    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        for (host, owner), (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue
            del procs[(host, owner)]
            out, err = (f.decode() for f in proc.communicate())
            if proc.wait():
                LOG.warning(TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY, (host, owner),
                    ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))
    # Terminate any remaining commands
    for (host, owner), (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            pass
        # BUG FIX: communicate() returns bytes; decode for the error
        # message (consistent with the polling loop above).
        out, err = (f.decode() for f in proc.communicate())
        if proc.wait():
            LOG.warning(TaskRemoteMgmtError(
                TaskRemoteMgmtError.MSG_TIDY, (host, owner),
                ' '.join(quote(item) for item in cmd),
                proc.returncode, out, err))
def execute_queued_items(self):
    """Execute queued items for each table.

    DELETE and UPDATE statements are executed per unique template (their
    WHERE/SET argument counts vary), while INSERT statements are uniform
    per table and use a single "executemany" call.

    On sqlite3.Error: a private database re-raises immediately; a public
    database logs the failed attempt, rolls back, and leaves the queues
    intact for a later retry.
    """
    try:
        for table in self.tables.values():
            # DELETE statements may have varying number of WHERE args so we
            # can only executemany for each identical template statement.
            for stmt, stmt_args_list in table.delete_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
            # INSERT statements are uniform for each table, so all INSERT
            # statements can be executed using a single "executemany" call.
            if table.insert_queue:
                self._execute_stmt(
                    table.get_insert_stmt(), table.insert_queue)
            # UPDATE statements can have varying number of SET and WHERE
            # args so we can only executemany for each identical template
            # statement.
            for stmt, stmt_args_list in table.update_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
        # Connection should only be opened if we have executed something.
        if self.conn is None:
            return
        self.conn.commit()
    except sqlite3.Error:
        # Private database: fail hard; public database: count the failed
        # attempt and roll back, queues are retained for retry.
        if not self.is_public:
            raise
        self.n_tries += 1
        LOG.warning(
            "%(file)s: write attempt (%(attempt)d) did not complete\n" % {
                "file": self.db_file_name, "attempt": self.n_tries})
        if self.conn is not None:
            try:
                self.conn.rollback()
            except sqlite3.Error:
                # Rollback failure is non-fatal here; next attempt retries.
                pass
        return
    else:
        # Clear the queues
        for table in self.tables.values():
            table.delete_queues.clear()
            del table.insert_queue[:]  # list.clear avail from Python 3.3
            table.update_queues.clear()
        # Report public database retry recovery if necessary
        if self.n_tries:
            LOG.warning(
                "%(file)s: recovered after (%(attempt)d) attempt(s)\n" % {
                    "file": self.db_file_name, "attempt": self.n_tries})
            self.n_tries = 0
    finally:
        # Note: This is not strictly necessary. However, if the suite run
        # directory is removed, a forced reconnection to the private
        # database will ensure that the suite dies.
        self.close()
def get_rose_vars(srcdir=None, opts=None):
    """Load template variables from Rose suite configuration.

    Loads the Rose suite configuration tree from the filesystem using the
    shell environment.

    Args:
        srcdir(pathlib.Path):
            Path to the Rose suite configuration (the directory containing
            the ``rose-suite.conf`` file).
        opts:
            Options object containing specification of optional
            configuarations set by the CLI.

    Returns:
        dict - A dictionary of sections of rose-suite.conf, with a
        dictionary or None for each section, e.g.::

            {
                'env': {'MYVAR': 42},
                'empy:suite.rc': None,
                'jinja2:suite.rc': {
                    'myJinja2Var': {'yes': 'it is a dictionary!'}
                }
            }
    """
    # Start from an empty result.
    rose_vars = {
        'env': {},
        'template_variables': {},
        'templating_detected': None,
    }

    # No rose-suite.conf for this source => nothing to load.
    if not rose_config_exists(srcdir, opts):
        return rose_vars

    # Load the raw config tree from the filesystem.
    config_tree = rose_config_tree_loader(srcdir, opts)

    # Warn if root-dir set in config:
    if 'root-dir' in config_tree.node:
        LOG.warning(
            'You have set "root-dir", which at Cylc 8 does nothing. '
            'See Cylc Install documentation.'
        )

    # Extract template variables from the configuration.
    get_rose_vars_from_config_node(rose_vars, config_tree.node, os.environ)

    # Export the extracted environment variables to this process.
    os.environ.update(rose_vars['env'])

    return rose_vars
def _get_host_metrics(self):
    """Run "cylc get-host-metrics" commands on hosts.

    Return (dict): {host: host-metrics-dict, ...}

    Hosts whose command fails are omitted from the result; a warning is
    logged instead.
    """
    host_stats = {}
    # Run "cylc get-host-metrics" commands on hosts
    host_proc_map = {}
    cmd = [self.CMD_BASE] + sorted(self._get_host_metrics_opts())
    # Start up commands on hosts
    for host in self.hosts:
        if is_remote_host(host):
            host_proc_map[host] = remote_cylc_cmd(
                cmd, stdin=None, host=host, capture_process=True)
        elif 'localhost' in host_proc_map:
            continue  # Don't duplicate localhost
        else:  # 1st instance of localhost
            host_proc_map['localhost'] = run_cmd(
                ['cylc'] + cmd, capture_process=True)
    # Collect results from commands
    while host_proc_map:
        for host, proc in list(host_proc_map.copy().items()):
            if proc.poll() is None:
                # Command still running; poll again on the next pass.
                continue
            del host_proc_map[host]
            out, err = (f.decode() for f in proc.communicate())
            if proc.wait():
                # Command failed in verbose/debug mode
                LOG.warning(
                    "can't get host metric from '%s'" +
                    "%s # returncode=%d, err=%s\n",
                    host, ' '.join(
                        (quote(item) for item in cmd)),
                    proc.returncode, err)
            else:  # Command OK
                # Users may have profile scripts that write to STDOUT.
                # Drop all output lines until the the first character of a
                # line is '{'. Hopefully this is enough to find us the
                # first line that denotes the beginning of the expected
                # JSON data structure.
                out = ''.join(
                    dropwhile(lambda s: not s.startswith('{'),
                              out.splitlines(True)))
                host_stats[host] = json.loads(out)
        sleep(0.01)
    return host_stats
def kill_task_jobs(self, suite, itasks):
    """Kill jobs of active tasks, and hold the tasks.

    If items is specified, kill active tasks matching given IDs.
    """
    killable = []
    for itask in itasks:
        # Only active tasks can be killed; warn and skip the rest.
        if not itask.state(*TASK_STATUSES_ACTIVE):
            LOG.warning('skipping %s: task not killable' % itask.identity)
            continue
        itask.state.reset(is_held=True)
        killable.append(itask)
    self._run_job_cmd(
        self.JOBS_KILL, suite, killable, self._kill_task_jobs_callback)
def process_mail_footer(
    mail_footer_tmpl: str,
    template_vars,
) -> str:
    """Process mail footer for workflow or task events.

    Args:
        mail_footer_tmpl: %-style template string for the mail footer.
        template_vars: Mapping of template variable names to values.

    Returns:
        The interpolated footer with a trailing newline, or an empty
        string if issues occur in processing.
    """
    try:
        return (mail_footer_tmpl + '\n') % template_vars
    except (KeyError, ValueError, TypeError):
        # KeyError: template references an undefined variable.
        # ValueError: malformed conversion specifier (e.g. "%(x)z").
        # TypeError: conversion applied to an incompatible value
        # (e.g. "%(x)d" with a string) — previously uncaught.
        LOG.warning(
            f'Ignoring bad mail footer template: {mail_footer_tmpl}'
        )
        return ''
def kill_task_jobs(self, suite, itasks):
    """Kill jobs of active tasks, and hold the tasks.

    If items is specified, kill active tasks matching given IDs.
    """
    killable = []
    for itask in itasks:
        # Only active tasks can be killed; warn and skip the rest.
        if itask.state.status not in TASK_STATUSES_ACTIVE:
            LOG.warning('skipping %s: task not killable' % itask.identity)
            continue
        itask.state.set_held()
        killable.append(itask)
    self._run_job_cmd(
        self.JOBS_KILL, suite, killable, self._kill_task_jobs_callback)
def _get_host_metrics(self):
    """Run "cylc get-host-metrics" commands on hosts.

    Return (dict): {host: host-metrics-dict, ...}
    """
    host_stats = {}
    # Launch one "cylc get-host-metrics" command per host, in parallel;
    # localhost is started at most once.
    procs = {}
    cmd = [self.CMD_BASE] + sorted(self._get_host_metrics_opts())
    for host in self.hosts:
        if is_remote_host(host):
            procs[host] = remote_cylc_cmd(
                cmd, stdin=None, host=host, capture_process=True)
        elif 'localhost' not in procs:
            procs['localhost'] = run_cmd(
                ['cylc'] + cmd, capture_process=True)
    # Poll until every command has finished, harvesting each result.
    while procs:
        for host, proc in list(procs.items()):
            if proc.poll() is None:
                continue  # still running
            del procs[host]
            out, err = (stream.decode() for stream in proc.communicate())
            if proc.wait():
                # Command failed in verbose/debug mode
                LOG.warning(
                    "can't get host metric from '%s'" +
                    "%s # returncode=%d, err=%s\n",
                    host, ' '.join((quote(item) for item in cmd)),
                    proc.returncode, err)
            else:
                # Command OK.
                # Users may have profile scripts that write to STDOUT.
                # Drop all output lines until the the first character of a
                # line is '{'. Hopefully this is enough to find us the
                # first line that denotes the beginning of the expected
                # JSON data structure.
                json_text = ''.join(dropwhile(
                    lambda line: not line.startswith('{'),
                    out.splitlines(True)))
                host_stats[host] = json.loads(json_text)
        sleep(0.01)
    return host_stats
def _process_message_started(self, itask, event_time):
    """Helper for process_message, handle a started message.

    Records the job start time, fires 'started' event handlers on a
    genuine state change, and updates the job pool and suite database.
    """
    if itask.job_vacated:
        # A previously vacated job has come back to life.
        itask.job_vacated = False
        LOG.warning("[%s] -Vacated job restarted", itask)
    # NOTE(review): pflag presumably prompts the task pool/scheduler to
    # reassess task processing — confirm against the pool implementation.
    self.pflag = True
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    if itask.state.reset(TASK_STATUS_RUNNING):
        # Only fire event handlers if the reset was an actual change.
        self.setup_event_handlers(itask, 'started', 'job started')
        self.job_pool.set_job_state(job_d, TASK_STATUS_RUNNING)
    itask.set_summary_time('started', event_time)
    self.job_pool.set_job_time(job_d, 'started', event_time)
    self._reset_job_timers(itask)
    # Persist the run start time to the suite database.
    self.suite_db_mgr.put_update_task_jobs(
        itask, {"time_run": itask.summary['started_time_string']})
    # submission was successful so reset submission try number
    if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
        itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
def load(self):
    """Load or reload configuration from files.

    With CYLC_CONF_PATH unset, search the default site then user
    locations (preferring the new conf dir over its legacy fallback);
    with CYLC_CONF_PATH set, load from each path it lists.
    """
    self.sparse.clear()
    self.dense.clear()
    LOG.debug("Loading site/user global config files")
    conf_path_str = os.getenv("CYLC_CONF_PATH")
    if conf_path_str is None:
        # CYLC_CONF_PATH not defined, use default locations.
        for conf_dir_1, conf_dir_2, conf_type in [
                (self.SITE_CONF_DIR, self.SITE_CONF_DIR_OLD,
                 upgrader.SITE_CONFIG),
                (self.USER_CONF_DIR_1, self.USER_CONF_DIR_2,
                 upgrader.USER_CONFIG)]:
            # Prefer the current conf dir; fall back to the old one.
            fname1 = os.path.join(conf_dir_1, self.CONF_BASE)
            fname2 = os.path.join(conf_dir_2, self.CONF_BASE)
            if os.access(fname1, os.F_OK | os.R_OK):
                fname = fname1
            elif os.access(fname2, os.F_OK | os.R_OK):
                fname = fname2
            else:
                continue
            try:
                self.loadcfg(fname, conf_type)
            except ParsecError as exc:
                if conf_type == upgrader.SITE_CONFIG:
                    # Warn on bad site file (users can't fix it).
                    LOG.warning(
                        'ignoring bad %s %s:\n%s', conf_type, fname, exc)
                else:
                    # Abort on bad user file (users can fix it).
                    LOG.error('bad %s %s', conf_type, fname)
                    raise
            # NOTE(review): this break stops at the first config file
            # found, so the user config is skipped whenever a site config
            # loads — confirm this is intended rather than loading both.
            break
    elif conf_path_str:
        # CYLC_CONF_PATH defined with a value
        for path in conf_path_str.split(os.pathsep):
            fname = os.path.join(path, self.CONF_BASE)
            if os.access(fname, os.F_OK | os.R_OK):
                self.loadcfg(fname, upgrader.USER_CONFIG)
    # (OK if no global.rc is found, just use system defaults).
    self.transform()
def _execute_stmt(self, stmt, stmt_args_list):
    """Execute one statement template for "self.execute_queued_items".

    On sqlite3.Error the exception always propagates; for the public
    database the statement and all of its argument sets are logged as a
    warning first (and the traceback printed in debug mode).
    """
    try:
        self.connect()
        self.conn.executemany(stmt, stmt_args_list)
    except sqlite3.Error:
        if not self.is_public:
            raise
        if cylc.flow.flags.debug:
            traceback.print_exc()
        header = (
            "cannot execute database statement:\n"
            "file=%(file)s:\nstmt=%(stmt)s"
        ) % {"file": self.db_file_name, "stmt": stmt}
        details = "".join(
            "\nstmt_args[%d]=%s" % (index, args)
            for index, args in enumerate(stmt_args_list)
        )
        LOG.warning(header + details)
        raise
def _remove_bad_hosts(self, mock_host_stats=None):
    """Return dictionary of 'good' hosts with their metric stats.

    Run 'get-host-metrics' on each run host in parallel & store extracted
    stats for hosts, else an empty JSON structure. Filter out 'bad' hosts
    whereby either metric data cannot be accessed from the command or at
    least one metric value does not pass a specified threshold.
    """
    if mock_host_stats:
        # Create fake data for unittest purposes (only).
        host_stats = dict(mock_host_stats)  # Prevent mutable object issues
    elif not self.hosts:
        return {}
    else:
        host_stats = self._get_host_metrics()
    # Analyse get-host-metrics results
    for host, metrics in list(host_stats.items()):
        if not metrics:
            # No results for host (command failed) -> skip.
            del host_stats[host]
            continue
        for measure, cutoff in self.parsed_thresholds.items():
            value = metrics[measure]
            is_load = measure.startswith("load")
            # Cutoff is a minimum or maximum depending on measure context:
            # load averages must stay below it; memory/disk-space above it.
            failed = (
                (is_load and value > cutoff)
                or (
                    value < cutoff
                    and (
                        measure == "memory"
                        or measure.startswith("disk-space")
                    )
                )
            )
            if failed:
                # Alert user that threshold has not been met.
                LOG.warning(
                    "host '%s' did not pass %s threshold "
                    "(%s %s threshold %s)\n",
                    host, measure, value,
                    ">" if is_load else "<", cutoff)
                del host_stats[host]
                break
    return host_stats
def register(self, reg=None, source=None, redirect=False):
    """Register a suite, or renew its registration.

    Create suite service directory and symlink to suite source location.

    Args:
        reg (str): suite name, default basename($PWD).
        source (str): directory location of suite.rc file, default $PWD.
        redirect (bool): allow reuse of existing name and run directory.

    Return:
        The registered suite name (which may be computed here).

    Raise:
        SuiteServiceFileError:
            No suite.rc file found in source location.
            Illegal name (can look like a relative path, but not absolute).
            Another suite already has this name (unless --redirect).
    """
    if reg is None:
        reg = os.path.basename(os.getcwd())
    if os.path.isabs(reg):
        raise SuiteServiceFileError(
            "suite name cannot be an absolute path: %s" % reg)
    if source is not None:
        if os.path.basename(source) == self.FILE_BASE_SUITE_RC:
            # Caller passed the suite.rc file itself; use its directory.
            source = os.path.dirname(source)
    else:
        source = os.getcwd()
    # suite.rc must exist so we can detect accidentally reversed args.
    source = os.path.abspath(source)
    if not os.path.isfile(os.path.join(source, self.FILE_BASE_SUITE_RC)):
        raise SuiteServiceFileError("no suite.rc in %s" % source)
    # Create service dir if necessary.
    srv_d = self.get_suite_srv_dir(reg)
    os.makedirs(srv_d, exist_ok=True)
    # See if suite already has a source or not
    try:
        orig_source = os.readlink(
            os.path.join(srv_d, self.FILE_BASE_SOURCE))
    except OSError:
        # No readable source symlink: not previously registered.
        orig_source = None
    else:
        if not os.path.isabs(orig_source):
            # Relative symlink: resolve it against the service dir.
            orig_source = os.path.normpath(
                os.path.join(srv_d, orig_source))
    if orig_source is not None and source != orig_source:
        if not redirect:
            raise SuiteServiceFileError(
                "the name '%s' already points to %s.\nUse "
                "--redirect to re-use an existing name and run "
                "directory." % (reg, orig_source))
        LOG.warning(
            "the name '%(reg)s' points to %(old)s.\nIt will now"
            " be redirected to %(new)s.\nFiles in the existing %(reg)s run"
            " directory will be overwritten.\n",
            {'reg': reg, 'old': orig_source, 'new': source})
        # Remove symlink to the original suite.
        os.unlink(os.path.join(srv_d, self.FILE_BASE_SOURCE))
    # Create symlink to the suite, if it doesn't already exist.
    if orig_source is None or source != orig_source:
        target = os.path.join(srv_d, self.FILE_BASE_SOURCE)
        if (os.path.abspath(source) ==
                os.path.abspath(os.path.dirname(srv_d))):
            # If source happens to be the run directory,
            # create .service/source -> ..
            source_str = ".."
        else:
            source_str = source
        os.symlink(source_str, target)
    print('REGISTERED %s -> %s' % (reg, source))
    return reg
def _run_event_mail_callback(proc_ctx):
    """Callback the mail command for notification of a suite event."""
    # Non-zero return code => log the context as a warning, else as info.
    log = LOG.warning if proc_ctx.ret_code else LOG.info
    log(str(proc_ctx))