def list_suites(self, regfilter=None):
    """Return a filtered list of valid suite registrations."""
    rec_regfilter = None
    if regfilter:
        try:
            rec_regfilter = re.compile(regfilter)
        except re.error as exc:
            raise ValueError("%s: %s" % (regfilter, exc))
    run_d = glbl_cfg().get_host_item('run directory')
    results = []
    for dirpath, dnames, _ in os.walk(run_d, followlinks=True):
        # Always descend for top directory, but
        # don't descend further if it has a .service/ dir
        if dirpath != run_d and self.DIR_BASE_SRV in dnames:
            dnames[:] = []
        # Choose only suites with .service and matching filter
        reg = os.path.relpath(dirpath, run_d)
        path = os.path.join(dirpath, self.DIR_BASE_SRV)
        if (not self._locate_item(self.FILE_BASE_SOURCE, path) or
                rec_regfilter and not rec_regfilter.search(reg)):
            continue
        try:
            results.append([
                reg,
                self.get_suite_source_dir(reg),
                self.get_suite_title(reg)])
        except (IOError, SuiteServiceFileError) as exc:
            LOG.error('%s: %s', reg, exc)
    return results

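# A minimal standalone sketch (standard library only; "find_marked_dirs"
# and the ".service" marker argument are illustrative, not part of the
# code above) of the directory-pruning idiom used in list_suites(): with
# a topdown os.walk(), truncating "dnames" in place stops descent into
# subtrees that have already been identified as suites.
import os

def find_marked_dirs(top, marker='.service'):
    """Yield paths, relative to "top", that contain a marker directory."""
    for dirpath, dnames, _ in os.walk(top, followlinks=True):
        if dirpath != top and marker in dnames:
            dnames[:] = []  # prune: don't descend into a found suite
            yield os.path.relpath(dirpath, top)
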
def _manip_task_jobs_callback(
        self, ctx, suite, itasks, summary_callback, more_callbacks=None):
    """Callback when submit/poll/kill tasks command exits."""
    if ctx.ret_code:
        LOG.error(ctx)
    else:
        LOG.debug(ctx)
    # A dict for easy reference of (CYCLE, NAME, SUBMIT_NUM) -> TaskProxy
    #
    # Note for "reload": A TaskProxy instance may be replaced on reload, so
    # the "itasks" list may not reference the TaskProxy objects that
    # replace the old ones. The .reload_successor attribute provides the
    # link(s) for us to get to the latest replacement.
    #
    # Note for "kill": It is possible for a job to trigger its trap and
    # report back to the suite before this logic is called. If so, the
    # task will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING,
    # and its output line will be ignored here.
    tasks = {}
    for itask in itasks:
        while itask.reload_successor is not None:
            itask = itask.reload_successor
        if itask.point is not None and itask.submit_num:
            submit_num = "%02d" % (itask.submit_num)
            tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
    handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
    if more_callbacks:
        for prefix, callback in more_callbacks.items():
            handlers.append((prefix, callback))
    out = ctx.out
    if not out:
        out = ""
    bad_tasks = dict(tasks)
    for line in out.splitlines(True):
        for prefix, callback in handlers:
            if line.startswith(prefix):
                line = line[len(prefix):].strip()
                try:
                    path = line.split("|", 2)[1]  # timestamp, path, status
                    point, name, submit_num = path.split(os.sep, 2)
                    if prefix == self.batch_sys_mgr.OUT_PREFIX_SUMMARY:
                        del bad_tasks[(point, name, submit_num)]
                    itask = tasks[(point, name, submit_num)]
                    callback(suite, itask, ctx, line)
                except (LookupError, ValueError, KeyError) as exc:
                    LOG.warning(
                        'Unhandled %s output: %s', ctx.cmd_key, line)
                    LOG.exception(exc)
    # Task jobs that are in the original command but did not get a status
    # in the output. Handle as failures.
    for key, itask in sorted(bad_tasks.items()):
        line = (
            "|".join([ctx.timestamp, os.sep.join(key), "1"]) + "\n")
        summary_callback(suite, itask, ctx, line)

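# Standalone sketch of the status-line format consumed above; the field
# layout ("<prefix><timestamp>|<cycle>/<name>/<submit_num>|<status>") is
# inferred from the parsing code, and "parse_summary_line" is a
# hypothetical helper, not part of the Cylc API.
import os

def parse_summary_line(line, prefix):
    """Return (point, name, submit_num, status), or None if unmatched."""
    if not line.startswith(prefix):
        return None
    _timestamp, path, status = line[len(prefix):].strip().split("|", 2)
    point, name, submit_num = path.split(os.sep, 2)
    return point, name, submit_num, status
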
def clear_broadcast(
        self, point_strings=None, namespaces=None, cancel_settings=None):
    """Clear broadcasts globally, or for listed namespaces and/or points.

    Return a tuple (modified_settings, bad_options), where:

    * modified_settings is similar to the return value of the "put"
      method, but for removed broadcasts.
    * bad_options is a dict in the form:
      {"point_strings": ["20020202", ..."], ...}
      The dict is only populated if there are options not associated with
      previous broadcasts. The keys can be:
      * point_strings: a list of bad point strings.
      * namespaces: a list of bad namespaces.
      * cancel: a list of tuples. Each tuple contains the keys of a bad
        setting.
    """
    # If cancel_settings defined, only clear specific broadcasts
    cancel_keys_list = self._settings_to_keys_list(cancel_settings)

    # Clear broadcasts
    modified_settings = []
    with self.lock:
        for point_string, point_string_settings in self.broadcasts.items():
            if point_strings and point_string not in point_strings:
                continue
            for namespace, namespace_settings in (
                    point_string_settings.items()):
                if namespaces and namespace not in namespaces:
                    continue
                stuff_stack = [([], namespace_settings)]
                while stuff_stack:
                    keys, stuff = stuff_stack.pop()
                    for key, value in stuff.items():
                        if isinstance(value, dict):
                            stuff_stack.append((keys + [key], value))
                        elif (not cancel_keys_list or
                                keys + [key] in cancel_keys_list):
                            stuff[key] = None
                            setting = {key: value}
                            for rkey in reversed(keys):
                                setting = {rkey: setting}
                            modified_settings.append(
                                (point_string, namespace, setting))

    # Prune any empty branches
    bad_options = self._get_bad_options(
        self._prune(), point_strings, namespaces, cancel_keys_list)

    # Log the broadcast
    self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True)
    LOG.info(
        get_broadcast_change_report(modified_settings, is_cancel=True))
    if bad_options:
        LOG.error(get_broadcast_bad_options_report(bad_options))
    return modified_settings, bad_options

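# Minimal sketch of the stack-based traversal at the heart of
# clear_broadcast() above, on toy data ("clear_leaves" is hypothetical):
# walk a nested settings dict without recursion, null out every leaf,
# and rebuild each cleared leaf as a single-branch nested dict recording
# what was removed.
def clear_leaves(settings):
    """Null every leaf of "settings"; return the removed branches."""
    removed = []
    stack = [([], settings)]
    while stack:
        keys, node = stack.pop()
        for key, value in node.items():
            if isinstance(value, dict):
                stack.append((keys + [key], value))
            else:
                node[key] = None
                branch = {key: value}
                for rkey in reversed(keys):
                    branch = {rkey: branch}
                removed.append(branch)
    return removed

# e.g. clear_leaves({'environment': {'FOO': '1'}}) returns
# [{'environment': {'FOO': '1'}}], leaving {'environment': {'FOO': None}}.
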
def _run_event_handlers_callback(proc_ctx, abort_on_error=False):
    """Callback on completion of a workflow event handler."""
    if proc_ctx.ret_code:
        msg = '%s EVENT HANDLER FAILED' % proc_ctx.cmd_key[1]
        LOG.error(str(proc_ctx))
        LOG.error(msg)
        if abort_on_error:
            raise WorkflowEventError(msg)
    else:
        LOG.info(str(proc_ctx))

def _run_event_handlers_callback(proc_ctx, abort_on_error=False):
    """Callback on completion of a suite event handler."""
    if proc_ctx.ret_code:
        msg = '%s EVENT HANDLER FAILED' % proc_ctx.cmd_key[1]
        LOG.error(str(proc_ctx))
        LOG.error(msg)
        if abort_on_error:
            raise SuiteEventError(msg)
    else:
        LOG.info(str(proc_ctx))

def publish(self, items):
    """Publish topics.

    Args:
        items (iterable): [(topic, data, serializer)]
    """
    try:
        self.loop.run_until_complete(
            gather_coros(self.send_multi, items))
    except Exception as exc:
        LOG.error('publish: %s', exc)

def _subshell_eval_callback(self, proc_ctx, cmd_str):
    """Callback when subshell eval command exits"""
    self.ready = True
    if proc_ctx.ret_code == 0 and proc_ctx.out:
        self.remote_command_map[cmd_str] = proc_ctx.out.splitlines()[0]
    else:
        # Bad status
        LOG.error(proc_ctx)
        self.remote_command_map[cmd_str] = TaskRemoteMgmtError(
            TaskRemoteMgmtError.MSG_SELECT, (cmd_str, None),
            cmd_str, proc_ctx.ret_code, proc_ctx.out, proc_ctx.err)

async def publish(self, items):
    """Publish topics.

    Args:
        items (iterable): [(topic, data, serializer)]
    """
    try:
        await gather_coros(self.send_multi, items)
    except Exception as exc:
        LOG.error('publish: %s', exc)

def _run_event_custom_handlers(self, config, ctx):
    """Helper for "run_event_handlers", custom event handlers."""
    # Look for event handlers
    # 1. Handlers for specific event
    # 2. General handlers
    handlers = self.get_events_conf(config, '%s handler' % ctx.event)
    if not handlers and (ctx.event in self.get_events_conf(
            config, 'handler events', [])):
        handlers = self.get_events_conf(config, 'handlers')
    if not handlers:
        return
    for i, handler in enumerate(handlers):
        cmd_key = ('%s-%02d' % (self.SUITE_EVENT_HANDLER, i), ctx.event)
        # Handler command may be a string for substitution
        abort_on_error = self.get_events_conf(
            config, 'abort if %s handler fails' % ctx.event)
        try:
            handler_data = {
                'event': quote(ctx.event),
                'message': quote(ctx.reason),
                'suite': quote(ctx.suite),
                'suite_uuid': quote(str(ctx.uuid_str)),
            }
            if config.cfg['meta']:
                for key, value in config.cfg['meta'].items():
                    if key == "URL":
                        handler_data["suite_url"] = quote(value)
                    handler_data[key] = quote(value)
            cmd = handler % (handler_data)
        except KeyError as exc:
            message = "%s bad template: %s" % (cmd_key, exc)
            LOG.error(message)
            if abort_on_error:
                raise SuiteEventError(message)
            continue
        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s'" % (
                handler, ctx.event, ctx.suite, ctx.reason)
        proc_ctx = SubProcContext(
            cmd_key, cmd, env=dict(os.environ), shell=True)
        if abort_on_error or self.proc_pool.closed:
            # Run command in foreground if abort on failure is set or if
            # process pool is closed
            self.proc_pool.run_command(proc_ctx)
            self._run_event_handlers_callback(
                proc_ctx, abort_on_error=abort_on_error)
        else:
            # Run command using process pool otherwise
            self.proc_pool.put_command(
                proc_ctx, self._run_event_handlers_callback)

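# Standalone sketch of the two handler interfaces supported above
# ("build_handler_cmd" and its arguments are illustrative): first try
# "%(name)s" template substitution; if the string comes back unchanged,
# nothing was substituted, so fall back to the classic positional form.
from shlex import quote

def build_handler_cmd(handler, event, suite, message):
    data = {
        'event': quote(event),
        'suite': quote(suite),
        'message': quote(message),
    }
    cmd = handler % data
    if cmd == handler:
        # Nothing substituted: classic "handler EVENT SUITE MESSAGE" form.
        cmd = "%s '%s' '%s' '%s'" % (handler, event, suite, message)
    return cmd

# build_handler_cmd("notify --event=%(event)s", 'startup', 'demo', 'ok')
# -> "notify --event=startup"
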
def set_job_state(self, job_d, status):
    """Set job state."""
    update_time = time()
    point, name, sub_num = self.parse_job_item(job_d)
    j_id = (
        f'{self.workflow_id}{ID_DELIM}{point}'
        f'{ID_DELIM}{name}{ID_DELIM}{sub_num}')
    if status in JOB_STATUSES_ALL:
        j_delta = PbJob(
            stamp=f'{j_id}@{update_time}',
            state=status)
        self.updates.setdefault(
            j_id, PbJob(id=j_id)).MergeFrom(j_delta)
        self.updates_pending = True
    else:
        LOG.error(f'Unable to set {j_id} state field to {status}')

def _remote_host_select_callback(self, proc_ctx, cmd_str):
    """Callback when host select command exits"""
    self.ready = True
    if proc_ctx.ret_code == 0 and proc_ctx.out:
        # Good status
        LOG.debug(proc_ctx)
        self.remote_host_str_map[cmd_str] = proc_ctx.out.splitlines()[0]
    else:
        # Bad status
        LOG.error(proc_ctx)
        self.remote_host_str_map[cmd_str] = TaskRemoteMgmtError(
            TaskRemoteMgmtError.MSG_SELECT, (cmd_str, None),
            cmd_str, proc_ctx.ret_code, proc_ctx.out, proc_ctx.err)

def _subshell_eval_callback(self, proc_ctx, cmd_str):
    """Callback when subshell eval command exits"""
    self.ready = True
    if proc_ctx.ret_code == 0 and proc_ctx.out:
        self.remote_command_map[cmd_str] = proc_ctx.out.splitlines()[0]
    else:
        # Bad status
        LOG.error(proc_ctx)
        self.remote_command_map[cmd_str] = PlatformError(
            PlatformError.MSG_SELECT,
            None,
            ctx=proc_ctx,
        )

def add_job_msg(self, job_d, msg):
    """Add message to job."""
    update_time = time()
    point, name, sub_num = self.parse_job_item(job_d)
    j_id = (
        f'{self.workflow_id}{ID_DELIM}{point}'
        f'{ID_DELIM}{name}{ID_DELIM}{sub_num}')
    try:
        j_delta = PbJob(stamp=f'{j_id}@{update_time}')
        j_delta.messages.append(msg)
        self.updates.setdefault(j_id, PbJob(id=j_id)).MergeFrom(j_delta)
        self.updates_pending = True
    except TypeError as exc:
        LOG.error(f'Unable to append to {j_id} message field: {str(exc)}')

def set_job_attr(self, job_d, attr_key, attr_val):
    """Set job attribute."""
    update_time = time()
    point, name, sub_num = self.parse_job_item(job_d)
    j_id = (
        f'{self.workflow_id}{ID_DELIM}{point}'
        f'{ID_DELIM}{name}{ID_DELIM}{sub_num}')
    try:
        j_delta = PbJob(stamp=f'{j_id}@{update_time}')
        setattr(j_delta, attr_key, attr_val)
        self.updates.setdefault(j_id, PbJob(id=j_id)).MergeFrom(j_delta)
        self.updates_pending = True
    except (TypeError, AttributeError) as exc:
        LOG.error(f'Unable to set {j_id} data field: {str(exc)}')

def _remote_init_callback(
        self, proc_ctx, platform, tmphandle, curve_auth,
        client_pub_key_dir):
    """Callback when "cylc remote-init" exits.

    Write public key for install target into client public key
    directory.
    Set remote_init_map status to REMOTE_INIT_DONE on success, which in
    turn will trigger file installation to start.
    Set remote_init_map status to REMOTE_INIT_FAILED on error.
    """
    with suppress(OSError):  # E.g. ignore bad unlink, etc
        tmphandle.close()
    install_target = platform['install target']
    if proc_ctx.ret_code == 0 and "KEYSTART" in proc_ctx.out:
        regex_result = re.search(
            'KEYSTART((.|\n|\r)*)KEYEND', proc_ctx.out)
        key = regex_result.group(1)
        workflow_srv_dir = get_workflow_srv_dir(self.workflow)
        public_key = KeyInfo(
            KeyType.PUBLIC,
            KeyOwner.CLIENT,
            workflow_srv_dir=workflow_srv_dir,
            install_target=install_target)
        old_umask = os.umask(0o177)
        with open(
                public_key.full_key_path, 'w', encoding='utf8'
        ) as text_file:
            text_file.write(key)
        os.umask(old_umask)
        # configure_curve must be called every time certificates are
        # added or removed, in order to update the Authenticator's
        # state.
        curve_auth.configure_curve(domain='*', location=client_pub_key_dir)
        self.remote_init_map[install_target] = REMOTE_INIT_DONE
        self.ready = True
        return
    # Bad status
    LOG.error(
        PlatformError(
            PlatformError.MSG_INIT,
            platform['name'],
            cmd=proc_ctx.cmd,
            ret_code=proc_ctx.ret_code,
            out=proc_ctx.out,
            err=proc_ctx.err,
        ))
    self.remote_init_map[platform['install target']] = REMOTE_INIT_FAILED
    self.ready = True

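# Standalone sketch of the sentinel-delimited key exchange used above:
# the remote prints its public key between KEYSTART/KEYEND markers on
# stdout, and the scheduler cuts it back out with a regex
# ("extract_key" is a hypothetical helper).
import re

def extract_key(output):
    """Return the text between KEYSTART and KEYEND, or None if absent."""
    match = re.search(r'KEYSTART((?:.|\n|\r)*)KEYEND', output)
    return match.group(1) if match else None
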
def _run_event_custom_handlers(self, schd, template_variables, event):
    """Helper for "run_event_handlers", custom event handlers."""
    # Look for event handlers
    # 1. Handlers for specific event
    # 2. General handlers
    config = schd.config
    handlers = self.get_events_conf(config, '%s handlers' % event)
    if not handlers and (
        event in self.get_events_conf(config, 'handler events', [])
    ):
        handlers = self.get_events_conf(config, 'handlers')
    if not handlers:
        return
    for i, handler in enumerate(handlers):
        cmd_key = ('%s-%02d' % (self.WORKFLOW_EVENT_HANDLER, i), event)
        try:
            cmd = handler % (template_variables)
        except KeyError as exc:
            message = f'{cmd_key} bad template: {handler}\n{exc}'
            LOG.error(message)
            continue
        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = (
                f"%(handler)s"
                f" '%({EventData.Event.value})s'"
                f" '%({EventData.Workflow.value})s'"
                f" '%({EventData.Message.value})s'"
            ) % (
                {'handler': handler, **template_variables}
            )
        proc_ctx = SubProcContext(
            cmd_key,
            cmd,
            env=dict(os.environ),
            shell=True  # nosec (designed to run user defined code)
        )
        if self.proc_pool.closed:
            # Run command in foreground if process pool is closed
            self.proc_pool.run_command(proc_ctx)
            self._run_event_handlers_callback(proc_ctx)
        else:
            # Run command using process pool otherwise
            self.proc_pool.put_command(
                proc_ctx, callback=self._run_event_handlers_callback)

def set_job_time(self, job_d, event_key, time_str=None):
    """Set an event time in the job pool object.

    Sets the value of the event_key + '_time' field.
    """
    update_time = time()
    point, name, sub_num = self.parse_job_item(job_d)
    j_id = (
        f'{self.workflow_id}{ID_DELIM}{point}'
        f'{ID_DELIM}{name}{ID_DELIM}{sub_num}')
    # Assign outside the try block so the except clause can reference it.
    time_attr = f'{event_key}_time'
    try:
        j_delta = PbJob(stamp=f'{j_id}@{update_time}')
        setattr(j_delta, time_attr, time_str)
        self.updates.setdefault(j_id, PbJob(id=j_id)).MergeFrom(j_delta)
        self.updates_pending = True
    except (TypeError, AttributeError) as exc:
        LOG.error(f'Unable to set {j_id} {time_attr} field: {str(exc)}')

def _killpg(proc, signal):
    """Kill a process group."""
    try:
        os.killpg(proc.pid, signal)
    except ProcessLookupError:
        # process group has already exited
        return False
    except PermissionError:
        # process group may contain zombie processes which will result in
        # PermissionError on some systems, not sure what happens on others
        #
        # we could go through the processes in the group and call waitpid
        # on them but waitpid is blocking and this would be a messy
        # solution for a problem that shouldn't happen (it's really a bug
        # in the Cylc subproc)
        LOG.error(
            f'Could not kill process group: {proc.pid}'
            f'\nCommand: {" ".join(proc.args)}')
        return False
    return True

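# Hypothetical usage sketch for _killpg(): it only works if the child
# was made its own process-group leader, e.g. with start_new_session=True
# (which calls setsid() in the child), so proc.pid doubles as the
# process-group id. "run_then_kill_group" is illustrative only.
import signal
import subprocess

def run_then_kill_group(cmd):
    proc = subprocess.Popen(cmd, start_new_session=True)
    # ... later, terminate the whole group (the command plus children):
    return _killpg(proc, signal.SIGTERM)
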
def file_install(self, platform):
    """Install required files on the remote install target.

    Included by default in the file installation:
        Files:
            .service/server.key  (required for ZMQ authentication)
        Directories:
            app/
            bin/
            etc/
            lib/
    """
    install_target = platform['install target']
    self.remote_init_map[install_target] = REMOTE_FILE_INSTALL_IN_PROGRESS
    src_path = get_workflow_run_dir(self.workflow)
    dst_path = get_remote_workflow_run_dir(self.workflow)
    try:
        cmd, host = construct_rsync_over_ssh_cmd(
            src_path,
            dst_path,
            platform,
            self.rsync_includes,
            bad_hosts=self.bad_hosts)
        ctx = SubProcContext('file-install', cmd, host)
    except NoHostsError as exc:
        LOG.error(
            PlatformError(
                f'{PlatformError.MSG_INIT}\n{exc}',
                platform['name'],
            ))
        self.remote_init_map[install_target] = REMOTE_FILE_INSTALL_FAILED
        self.bad_hosts -= set(platform['hosts'])
        self.ready = True
    else:
        log_platform_event('file install', platform, host)
        self.proc_pool.put_command(
            ctx,
            bad_hosts=self.bad_hosts,
            callback=self._file_install_callback,
            callback_args=[platform, install_target],
            callback_255=self._file_install_callback_255,
        )

def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
    """Helper for self._prep_submit_task_job. On error."""
    LOG.debug("submit_num %s" % itask.submit_num)
    LOG.debug(traceback.format_exc())
    LOG.error(exc)
    log_task_job_activity(
        SubProcContext(self.JOBS_SUBMIT, action, err=exc, ret_code=1),
        suite, itask.point, itask.tdef.name, submit_num=itask.submit_num)
    if not dry_run:
        # Persist
        self.suite_db_mgr.put_insert_task_jobs(itask, {
            'is_manual_submit': itask.is_manual_submit,
            'try_num': itask.get_try_num(),
            'time_submit': get_current_time_string(),
            'batch_sys_name': itask.summary.get('batch_sys_name'),
        })
        itask.is_manual_submit = False
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)

def _process_message_submit_failed(self, itask, event_time):
    """Helper for process_message, handle a submit-failed message.

    Return True if no retries (hence go to the submit-failed state).
    """
    no_retries = False
    LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
    if event_time is None:
        event_time = get_current_time_string()
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_submit_exit": event_time,
        "submit_status": 1,
    })
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    self.job_pool.set_job_state(job_d, TASK_STATUS_SUBMIT_FAILED)
    itask.summary['submit_method_id'] = None
    self.pflag = True
    if (
            TimerFlags.SUBMISSION_RETRY not in itask.try_timers
            or itask.try_timers[TimerFlags.SUBMISSION_RETRY].next() is None
    ):
        # No submission retry lined up: definitive failure.
        # See github #476.
        no_retries = True
        if itask.state.reset(TASK_STATUS_SUBMIT_FAILED):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_FAILED,
                f'job {self.EVENT_SUBMIT_FAILED}')
    else:
        # There is a submission retry lined up.
        timer = itask.try_timers[TimerFlags.SUBMISSION_RETRY]
        self._retry_task(itask, timer.timeout, submit_retry=True)
        delay_msg = f"submit-retrying in {timer.delay_timeout_as_str()}"
        if itask.state.is_held:
            delay_msg = f"held ({delay_msg})"
        msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        self.setup_event_handlers(
            itask, self.EVENT_SUBMIT_RETRY,
            f"job {self.EVENT_SUBMIT_FAILED}, {delay_msg}")
    self._reset_job_timers(itask)
    return no_retries

def load(self):
    """Load or reload configuration from files."""
    self.sparse.clear()
    self.dense.clear()
    LOG.debug("Loading site/user global config files")
    conf_path_str = os.getenv("CYLC_CONF_PATH")
    if conf_path_str is None:
        # CYLC_CONF_PATH not defined, use default locations.
        for conf_dir_1, conf_dir_2, conf_type in [
                (self.SITE_CONF_DIR, self.SITE_CONF_DIR_OLD,
                 upgrader.SITE_CONFIG),
                (self.USER_CONF_DIR_1, self.USER_CONF_DIR_2,
                 upgrader.USER_CONFIG)]:
            fname1 = os.path.join(conf_dir_1, self.CONF_BASE)
            fname2 = os.path.join(conf_dir_2, self.CONF_BASE)
            if os.access(fname1, os.F_OK | os.R_OK):
                fname = fname1
            elif os.access(fname2, os.F_OK | os.R_OK):
                fname = fname2
            else:
                continue
            try:
                self.loadcfg(fname, conf_type)
            except ParsecError as exc:
                if conf_type == upgrader.SITE_CONFIG:
                    # Warn on bad site file (users can't fix it).
                    LOG.warning(
                        'ignoring bad %s %s:\n%s', conf_type, fname, exc)
                else:
                    # Abort on bad user file (users can fix it).
                    LOG.error('bad %s %s', conf_type, fname)
                    raise
                break
    elif conf_path_str:
        # CYLC_CONF_PATH defined with a value
        for path in conf_path_str.split(os.pathsep):
            fname = os.path.join(path, self.CONF_BASE)
            if os.access(fname, os.F_OK | os.R_OK):
                self.loadcfg(fname, upgrader.USER_CONFIG)
    # (OK if no global.rc is found, just use system defaults).
    self.transform()

def load(self):
    """Load or reload configuration from files."""
    self.sparse.clear()
    self.dense.clear()
    LOG.debug("Loading site/user config files")
    conf_path_str = os.getenv("CYLC_CONF_PATH")
    if conf_path_str:
        # Explicit config file override.
        fname = os.path.join(conf_path_str, self.CONF_BASENAME)
        self._load(fname, upgrader.USER_CONFIG)
    elif conf_path_str is None:
        # Use default locations.
        for conf_type, conf_dir in self.conf_dir_hierarchy:
            fname = os.path.join(conf_dir, self.CONF_BASENAME)
            try:
                self._load(fname, conf_type)
            except ParsecError:
                LOG.error(f'bad {conf_type} {fname}')
                raise
    self._set_default_editors()

def _remote_init_callback(self, proc_ctx, host, owner, tmphandle):
    """Callback when "cylc remote-init" exits"""
    self.ready = True
    try:
        tmphandle.close()
    except OSError:  # E.g. ignore bad unlink, etc
        pass
    if proc_ctx.ret_code == 0:
        for status in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED):
            if status in proc_ctx.out:
                # Good status
                LOG.debug(proc_ctx)
                self.remote_init_map[(host, owner)] = status
                return
    # Bad status
    LOG.error(TaskRemoteMgmtError(
        TaskRemoteMgmtError.MSG_INIT,
        (host, owner), ' '.join(quote(item) for item in proc_ctx.cmd),
        proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))
    LOG.error(proc_ctx)
    self.remote_init_map[(host, owner)] = REMOTE_INIT_FAILED

def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(
        suite, point, name, submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write((ctx_str + '\n').encode())
    except IOError as exc:
        # This happens when there is no job directory, e.g. if a job host
        # selection command causes a submission failure, there will be no
        # job directory. In this case, just send the information to the
        # suite log.
        LOG.exception(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)

def _process_message_submit_failed(self, itask, event_time):
    """Helper for process_message, handle a submit-failed message."""
    LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
    if event_time is None:
        event_time = get_current_time_string()
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_submit_exit": event_time,
        "submit_status": 1,
    })
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    self.job_pool.set_job_attr(job_d, 'batch_sys_job_id', None)
    itask.summary['submit_method_id'] = None
    self.pflag = True
    if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
        # No submission retry lined up: definitive failure.
        # See github #476.
        if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_FAILED,
                'job %s' % self.EVENT_SUBMIT_FAILED)
        self.job_pool.set_job_state(job_d, TASK_STATUS_SUBMIT_FAILED)
    elif itask.state.reset_state(
            TASK_STATUS_SUBMIT_RETRYING, respect_hold_swap=True,
    ):
        # There is a submission retry lined up.
        timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
        delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
        if itask.state.status == TASK_STATUS_HELD:
            delay_msg = "%s (%s)" % (TASK_STATUS_HELD, delay_msg)
        msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        self.setup_event_handlers(
            itask, self.EVENT_SUBMIT_RETRY,
            "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
        self.job_pool.set_job_state(job_d, TASK_STATUS_SUBMIT_RETRYING)
    self._reset_job_timers(itask)

def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
    """Call back when log job retrieval completes."""
    if proc_ctx.ret_code:
        LOG.error(proc_ctx)
    else:
        LOG.debug(proc_ctx)
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            # All completed jobs are expected to have a "job.out".
            fnames = [JOB_LOG_OUT]
            try:
                if key1[1] not in 'succeeded':
                    fnames.append(JOB_LOG_ERR)
            except TypeError:
                pass
            fname_oks = {}
            for fname in fnames:
                fname_oks[fname] = os.path.exists(get_task_job_log(
                    schd_ctx.suite, point, name, submit_num, fname))
            # All expected paths must exist to record a good attempt
            log_ctx = SubProcContext((key1, submit_num), None)
            if all(fname_oks.values()):
                log_ctx.ret_code = 0
                del self.event_timers[id_key]
            else:
                log_ctx.ret_code = 1
                log_ctx.err = "File(s) not retrieved:"
                for fname, exist_ok in sorted(fname_oks.items()):
                    if not exist_ok:
                        log_ctx.err += " %s" % fname
                self.event_timers[id_key].unset_waiting()
            log_task_job_activity(
                log_ctx, schd_ctx.suite, point, name, submit_num)
        except KeyError as exc:
            LOG.exception(exc)

def _file_install_callback(self, ctx, platform, install_target):
    """Callback when file installation exits.

    Sets remote_init_map to REMOTE_FILE_INSTALL_DONE on success and to
    REMOTE_FILE_INSTALL_FAILED on error.
    """
    if ctx.out:
        RSYNC_LOG.info(
            'File installation information for '
            f'{install_target}:\n{ctx.out}')
    if ctx.ret_code == 0:
        # Both file installation and remote init success
        LOG.debug(f"File installation complete for {install_target}")
        self.remote_init_map[install_target] = REMOTE_FILE_INSTALL_DONE
        self.ready = True
        return
    else:
        self.remote_init_map[install_target] = REMOTE_FILE_INSTALL_FAILED
        LOG.error(
            PlatformError(
                PlatformError.MSG_INIT,
                platform['name'],
                ctx=ctx,
            ))
        self.ready = True

def _setup_custom_event_handlers(self, itask, event, message):
    """Set up custom task event handlers."""
    handlers = self._get_events_conf(itask, event + ' handler')
    if (handlers is None and
            event in self._get_events_conf(itask, 'handler events', [])):
        handlers = self._get_events_conf(itask, 'handlers')
    if handlers is None:
        return
    retry_delays = self._get_events_conf(itask, 'handler retry delays')
    if not retry_delays:
        retry_delays = [0]
    # There can be multiple custom event handlers
    for i, handler in enumerate(handlers):
        if event in self.NON_UNIQUE_EVENTS:
            key1 = (
                '%s-%02d' % (self.HANDLER_CUSTOM, i),
                '%s-%d' % (event, itask.non_unique_events.get(event, 1)))
        else:
            key1 = ('%s-%02d' % (self.HANDLER_CUSTOM, i), event)
        id_key = (
            key1, str(itask.point), itask.tdef.name, itask.submit_num)
        if id_key in self.event_timers:
            continue
        # Note: user@host may not always be set for a submit number, e.g.
        # on late event or if host select command fails. Use null string
        # to prevent issues in this case.
        platform_n = itask.summary['platforms_used'].get(
            itask.submit_num, '')
        # Custom event handler can be a command template string
        # or a command that takes 4 arguments (classic interface)
        # Note quote() fails on None, need str(None).
        try:
            handler_data = {
                EventData.BatchSysJobID.value:
                    quote(str(itask.summary['submit_method_id'])),
                EventData.BatchSysName.value:
                    quote(str(itask.summary['batch_sys_name'])),
                EventData.CyclePoint.value: quote(str(itask.point)),
                EventData.Event.value: quote(event),
                EventData.FinishTime.value:
                    quote(str(itask.summary['finished_time_string'])),
                EventData.ID.value: quote(itask.identity),
                EventData.Message.value: quote(message),
                EventData.TaskName.value: quote(itask.tdef.name),
                EventData.PlatformName.value: quote(platform_n),
                EventData.StartTime.value:
                    quote(str(itask.summary['started_time_string'])),
                EventData.SubmitNum.value: itask.submit_num,
                EventData.SubmitTime.value:
                    quote(str(itask.summary['submitted_time_string'])),
                EventData.Suite.value: quote(self.suite),
                EventData.SuiteUUID.value: quote(str(self.uuid_str)),
                EventData.TryNum.value: itask.get_try_num(),
                # task and suite metadata
                **get_event_handler_data(
                    itask.tdef.rtconfig, self.suite_cfg)
            }
            cmd = handler % (handler_data)
        except KeyError as exc:
            message = "%s/%s/%02d %s bad template: %s" % (
                itask.point, itask.tdef.name, itask.submit_num, key1, exc)
            LOG.error(message)
            continue
        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s' '%s'" % (
                handler, event, self.suite, itask.identity, message)
        LOG.debug("[%s] -Queueing %s handler: %s", itask, event, cmd)
        self.event_timers[id_key] = TaskActionTimer(
            CustomTaskEventHandlerContext(
                key1,
                self.HANDLER_CUSTOM,
                cmd,
            ),
            retry_delays)

def _remote_init_callback(
        self, proc_ctx, platform, tmphandle, curve_auth,
        client_pub_key_dir):
    """Callback when "cylc remote-init" exits"""
    self.ready = True
    try:
        tmphandle.close()
    except OSError:  # E.g. ignore bad unlink, etc
        pass
    self.install_target = platform['install target']
    if proc_ctx.ret_code == 0:
        if REMOTE_INIT_DONE in proc_ctx.out:
            src_path = get_suite_run_dir(self.suite)
            dst_path = get_remote_suite_run_dir(platform, self.suite)
            try:
                process = procopen(
                    construct_rsync_over_ssh_cmd(
                        src_path, dst_path, platform,
                        self.rsync_includes),
                    stdoutpipe=True, stderrpipe=True,
                    universal_newlines=True)
                out, err = process.communicate(timeout=600)
                install_target = platform['install target']
                if out:
                    RSYNC_LOG.info(
                        'File installation information for '
                        f'{install_target}:\n {out}')
                if err:
                    LOG.error(
                        'File installation error on '
                        f'{install_target}:\n {err}')
            except Exception as ex:
                LOG.error(f"Problem during rsync: {ex}")
                self.remote_init_map[self.install_target] = (
                    REMOTE_INIT_FAILED)
                return
        if "KEYSTART" in proc_ctx.out:
            regex_result = re.search(
                'KEYSTART((.|\n|\r)*)KEYEND', proc_ctx.out)
            key = regex_result.group(1)
            suite_srv_dir = get_suite_srv_dir(self.suite)
            public_key = KeyInfo(
                KeyType.PUBLIC,
                KeyOwner.CLIENT,
                suite_srv_dir=suite_srv_dir,
                install_target=self.install_target
            )
            old_umask = os.umask(0o177)
            with open(
                    public_key.full_key_path, 'w',
                    encoding='utf8') as text_file:
                text_file.write(key)
            os.umask(old_umask)
            # configure_curve must be called every time certificates are
            # added or removed, in order to update the Authenticator's
            # state.
            curve_auth.configure_curve(
                domain='*', location=client_pub_key_dir)
        for status in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED):
            if status in proc_ctx.out:
                # Good status
                LOG.debug(proc_ctx)
                self.remote_init_map[self.install_target] = status
                return
    # Bad status
    LOG.error(TaskRemoteMgmtError(
        TaskRemoteMgmtError.MSG_INIT,
        platform['install target'],
        ' '.join(quote(item) for item in proc_ctx.cmd),
        proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))
    LOG.error(proc_ctx)
    self.remote_init_map[platform['install target']] = REMOTE_INIT_FAILED

def remote_init(
        self, platform: Dict[str, Any],
        curve_auth: 'ThreadAuthenticator',
        client_pub_key_dir: str) -> None:
    """Initialise a remote host if necessary.

    Call "cylc remote-init" to install workflow items to remote:
        ".service/contact": For TCP task communication
        "python/": if source exists

    Args:
        platform: A dict containing settings relating to platform used
            in this remote installation.
        curve_auth: The ZMQ authenticator.
        client_pub_key_dir: Client public key directory, used by the
            ZMQ authenticator.
    """
    install_target = platform['install target']
    if install_target == get_localhost_install_target():
        self.remote_init_map[install_target] = REMOTE_FILE_INSTALL_DONE
        return
    # Set status of install target to in progress while waiting for
    # remote initialisation to finish
    self.remote_init_map[install_target] = REMOTE_INIT_IN_PROGRESS
    # Determine what items to install
    comms_meth: CommsMeth = CommsMeth(platform['communication method'])
    items = self._remote_init_items(comms_meth)
    # Create a TAR archive with the service files,
    # so they can be sent later via SSH's STDIN to the task remote.
    tmphandle = self.proc_pool.get_temporary_file()
    tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
    for path, arcname in items:
        tarhandle.add(path, arcname=arcname)
    tarhandle.close()
    tmphandle.seek(0)
    # Build the remote-init command to be run over ssh
    cmd = ['remote-init']
    cmd.extend(verbosity_to_opts(cylc.flow.flags.verbosity))
    cmd.append(str(install_target))
    cmd.append(get_remote_workflow_run_dir(self.workflow))
    dirs_to_symlink = get_dirs_to_symlink(install_target, self.workflow)
    for key, value in dirs_to_symlink.items():
        if value is not None:
            cmd.append(f"{key}={quote(value)} ")
    # Create the ssh command
    try:
        host = get_host_from_platform(platform, bad_hosts=self.bad_hosts)
    except NoHostsError as exc:
        LOG.error(
            PlatformError(
                f'{PlatformError.MSG_INIT}\n{exc}',
                platform['name'],
            ))
        self.remote_init_map[
            platform['install target']] = REMOTE_INIT_FAILED
        self.bad_hosts -= set(platform['hosts'])
        self.ready = True
    else:
        log_platform_event('remote init', platform, host)
        cmd = construct_ssh_cmd(cmd, platform, host)
        self.proc_pool.put_command(
            SubProcContext(
                'remote-init', cmd, stdin_files=[tmphandle], host=host),
            bad_hosts=self.bad_hosts,
            callback=self._remote_init_callback,
            callback_args=[
                platform, tmphandle, curve_auth, client_pub_key_dir
            ],
            callback_255=self._remote_init_callback_255,
            callback_255_args=[platform])

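# Standalone sketch of the "pack then stream over ssh stdin" step used
# in remote_init() above ("pack_items" is illustrative): the service
# files are written to a TAR archive in a temporary file, which is then
# rewound so it can be fed to the ssh subprocess's stdin.
import tarfile
import tempfile

def pack_items(items):
    """Return a rewound temp file holding a TAR of (path, arcname) pairs."""
    tmphandle = tempfile.TemporaryFile()
    with tarfile.open(fileobj=tmphandle, mode='w') as tarhandle:
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
    tmphandle.seek(0)  # ready to stream to the remote end for unpacking
    return tmphandle
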
def _test_callback_255(ctx, foo=''):
    """Very Simple test callback function"""
    LOG.error(f'255 callback called.{foo}')

def _setup_custom_event_handlers(self, itask, event, message):
    """Set up custom task event handlers."""
    handlers = self._get_events_conf(itask, event + ' handler')
    if (handlers is None and
            event in self._get_events_conf(itask, 'handler events', [])):
        handlers = self._get_events_conf(itask, 'handlers')
    if handlers is None:
        return
    retry_delays = self._get_events_conf(
        itask,
        'handler retry delays',
        self.get_host_conf(itask, "task event handler retry delays"))
    if not retry_delays:
        retry_delays = [0]
    # There can be multiple custom event handlers
    for i, handler in enumerate(handlers):
        if event in self.NON_UNIQUE_EVENTS:
            key1 = (
                '%s-%02d' % (self.HANDLER_CUSTOM, i),
                '%s-%d' % (event, itask.non_unique_events.get(event, 1)))
        else:
            key1 = ('%s-%02d' % (self.HANDLER_CUSTOM, i), event)
        id_key = (
            key1, str(itask.point), itask.tdef.name, itask.submit_num)
        if id_key in self.event_timers:
            continue
        # Note: user@host may not always be set for a submit number, e.g.
        # on late event or if host select command fails. Use null string
        # to prevent issues in this case.
        user_at_host = itask.summary['job_hosts'].get(itask.submit_num, '')
        if user_at_host and '@' not in user_at_host:
            # (only has 'user@' on the front if user is not suite owner).
            user_at_host = '%s@%s' % (get_user(), user_at_host)
        # Custom event handler can be a command template string
        # or a command that takes 4 arguments (classic interface)
        # Note quote() fails on None, need str(None).
        try:
            handler_data = {
                "event": quote(event),
                "suite": quote(self.suite),
                'suite_uuid': quote(str(self.uuid_str)),
                "point": quote(str(itask.point)),
                "name": quote(itask.tdef.name),
                "submit_num": itask.submit_num,
                "try_num": itask.get_try_num(),
                "id": quote(itask.identity),
                "message": quote(message),
                "batch_sys_name":
                    quote(str(itask.summary['batch_sys_name'])),
                "batch_sys_job_id":
                    quote(str(itask.summary['submit_method_id'])),
                "submit_time":
                    quote(str(itask.summary['submitted_time_string'])),
                "start_time":
                    quote(str(itask.summary['started_time_string'])),
                "finish_time":
                    quote(str(itask.summary['finished_time_string'])),
                "user@host": quote(user_at_host)
            }
            if self.suite_cfg:
                for key, value in self.suite_cfg.items():
                    if key == "URL":
                        handler_data["suite_url"] = quote(value)
                    else:
                        handler_data["suite_" + key] = quote(value)
            if itask.tdef.rtconfig['meta']:
                for key, value in itask.tdef.rtconfig['meta'].items():
                    if key == "URL":
                        handler_data["task_url"] = quote(value)
                    handler_data[key] = quote(value)
            cmd = handler % (handler_data)
        except KeyError as exc:
            message = "%s/%s/%02d %s bad template: %s" % (
                itask.point, itask.tdef.name, itask.submit_num, key1, exc)
            LOG.error(message)
            continue
        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s' '%s'" % (
                handler, event, self.suite, itask.identity, message)
        LOG.debug("[%s] -Queueing %s handler: %s", itask, event, cmd)
        self.event_timers[id_key] = TaskActionTimer(
            CustomTaskEventHandlerContext(
                key1,
                self.HANDLER_CUSTOM,
                cmd,
            ),
            retry_delays)